From 699f8400eeb654103890c4d99f5d5e9d5302aff5 Mon Sep 17 00:00:00 2001 From: Nat Makarevitch Date: Tue, 26 Aug 2025 17:58:43 +0200 Subject: [PATCH 1/7] sampling method, WIP --- pg_sample | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/pg_sample b/pg_sample index 1836943..4f4a87b 100755 --- a/pg_sample +++ b/pg_sample @@ -124,6 +124,14 @@ increase the running time of the script. Schema name to use for the sample database (defaults to _pg_sample). +=item B<--sampling_method=>I + +Sampling method used to select samples, either 'BERNOULLI' or 'SYSTEM'. +Defaults to BERNOULLI, the most random method. +SYSTEM may speed up the sampling phase on a huge database, at the +potential price of a less-random sample. +Implicitly enables --random. + =item B<--schema=>I Limit sampling to the specified schema. By default, all schemas are sampled. @@ -509,6 +517,7 @@ sub notice (@) { random => 0, schema => undef, sample_schema => '_pg_sample', + sampling_method => undef, verbose => 0, ); @@ -530,7 +539,8 @@ GetOptions(\%opt, "ordered_desc|ordered-desc", "ordered_asc|ordered-asc", "random", - "sample_schema=s", + "sample_schema=s", + "sampling_method=s", "schema=s", "trace", "verbose|v", @@ -548,6 +558,11 @@ if ($opt{help}) { exit 0; } +if (defined $opt{sampling_method}) { + die "Option sampling-method accepts either 'BERNOULLI' or 'SYSTEM'" if ('BERNOULLI' ne $opt{sampling_method} and 'SYSTEM' ne $opt{sampling_method}); + $opt{random}=1; +} + $opt{ordered} = $opt{ordered_desc} ? 'DESC' : $opt{ordered_asc} ? 'ASC' : $opt{ordered} ? 'DESC' @@ -693,7 +708,7 @@ foreach my $row (@{$table_info}) { SELECT greatest(count(*), ?) 
FROM $table }, undef, $_->[1]); my $percent = 100 * $_->[1] / $table_num_rows; - $tablesample = "TABLESAMPLE BERNOULLI ($percent)"; + $tablesample = "TABLESAMPLE $opt{sampling_method} ($percent)"; } } elsif ($_->[1] =~ /^\d+(\.\d+)?%$/) { # percent value turned into LIMIT if (not $opt{random} or $pg_version < version->declare('9.5')) { @@ -704,7 +719,7 @@ foreach my $row (@{$table_info}) { $limit = "LIMIT $total_rows"; } else { my $percent = (substr $_->[1], 0, (length $_->[1]) - 1); - $tablesample = "TABLESAMPLE BERNOULLI ($percent)"; + $tablesample = "TABLESAMPLE $opt{sampling_method} ($percent)"; } } else { # otherwise treated as subselect $where = "($_->[1])"; From 998c41697c0f5f16b238ba8d4a2c15d8d64f394a Mon Sep 17 00:00:00 2001 From: Nat Makarevitch Date: Tue, 26 Aug 2025 20:24:17 +0200 Subject: [PATCH 2/7] sampling method: parameter uppercased , default value enforced --- pg_sample | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pg_sample b/pg_sample index 4f4a87b..d31c50e 100755 --- a/pg_sample +++ b/pg_sample @@ -559,8 +559,11 @@ if ($opt{help}) { } if (defined $opt{sampling_method}) { + $opt{sampling_method}=uc($opt{sampling_method}); die "Option sampling-method accepts either 'BERNOULLI' or 'SYSTEM'" if ('BERNOULLI' ne $opt{sampling_method} and 'SYSTEM' ne $opt{sampling_method}); - $opt{random}=1; + $opt{random}=1; # implied +} else { + $opt{sampling_method}='BERNOULLI' if $opt{random}; # default method } $opt{ordered} = $opt{ordered_desc} ? 
'DESC' From 0e1b965797646aeb29450f963de8979b912fad8e Mon Sep 17 00:00:00 2001 From: Nat Makarevitch Date: Tue, 26 Aug 2025 20:45:35 +0200 Subject: [PATCH 3/7] minor --- pg_sample | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/pg_sample b/pg_sample index d31c50e..cdfb631 100755 --- a/pg_sample +++ b/pg_sample @@ -1,5 +1,12 @@ #!/usr/bin/env perl +use strict; +use warnings; +use Carp; +use Getopt::Long qw/ GetOptions :config no_ignore_case /; +use DBI; +use DBD::Pg 2.0.0; + our $VERSION = "1.17"; =head1 NAME @@ -205,13 +212,6 @@ Maurice Aubrey # Finally, the database schema and sample data are output, renaming the sample # table names to reflect the original database. -use strict; -use warnings; -use Carp; -use Getopt::Long qw/ GetOptions :config no_ignore_case /; -use DBI; -use DBD::Pg 2.0.0; - our $MAX_IDENTIFIER_LENGTH = 63; $SIG{TERM} = $SIG{INT} = $SIG{QUIT} = $SIG{HUP} = sub { @@ -348,7 +348,7 @@ my %opt; # closure; all functions have access to options PrintError => 0, HandleError => sub { confess( shift ) }, }, - ) or croak "db connection failed!"; + ) or croak "DB connection failed!"; $dbh->trace(1) if defined $opt{trace}; @@ -561,7 +561,7 @@ if ($opt{help}) { if (defined $opt{sampling_method}) { $opt{sampling_method}=uc($opt{sampling_method}); die "Option sampling-method accepts either 'BERNOULLI' or 'SYSTEM'" if ('BERNOULLI' ne $opt{sampling_method} and 'SYSTEM' ne $opt{sampling_method}); - $opt{random}=1; # implied + $opt{random}=2; # implied } else { $opt{sampling_method}='BERNOULLI' if $opt{random}; # default method } @@ -571,11 +571,12 @@ $opt{ordered} = $opt{ordered_desc} ? 'DESC' : $opt{ordered} ? 
'DESC' : undef; if ($opt{random} && $opt{ordered}) { - print("Error: --random and --ordered are mutually exclusive"); + warn "you used --sampling_method, therefore --random was automatically enabled " if ( 2 == $opt{random} ); + warn "Error: --random and --ordered are mutually exclusive"; exit 1; } -@ARGV or die "\nUsage: $0 [ option... ] [ dbname ]\n\n\t" . +@ARGV or die "Usage: $0 [ option... ] [ dbname ]\n\n\t" . "$0 --help for detailed options\n"; push @{ $opt{limit} }, ".* = 100 "; # append default limit rule From 365d29f0dfda97e1fdbbad4176347581298d5461 Mon Sep 17 00:00:00 2001 From: Nat Makarevitch Date: Tue, 26 Aug 2025 23:56:44 +0200 Subject: [PATCH 4/7] APPROX_table_tuples_count --- pg_sample | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pg_sample b/pg_sample index cdfb631..162f2b3 100755 --- a/pg_sample +++ b/pg_sample @@ -522,6 +522,7 @@ sub notice (@) { ); GetOptions(\%opt, + "approxcount", "data-only|data_only|a", "db_name|db-name=s", "db_user|db-user|db_username|db-username|username|U=s", @@ -827,6 +828,17 @@ sub should_skip_insert_for_fk ($) { return 0; # Do not skip by default } +sub APPROX_table_tuples_count($$) { + my $dbh = ${ shift() }; # dereference the reference to DBI session handle + my $tablename = $dbh->quote_identifier(shift); + + my ($count) = $dbh->do(qq{SELECT c.reltuples::bigint AS approx_row_count FROM pg_class c WHERE c.relname = $tablename AND c.relkind = 'r'}); + if ($count < 1) { + $dbh->do(qq{ANALYZE $tablename}); + ($count) = $dbh->do(qq{SELECT c.reltuples::bigint AS approx_row_count FROM pg_class c WHERE c.relname = $tablename AND c.relkind = 'r'}); + } + return $count; +} # Keep inserting rows to satisfy any fk constraints until no more # are inserted. This should handle circular references. 
From 09a3b924c400a824d463a40739ae02e007d67852 Mon Sep 17 00:00:00 2001 From: Nat Makarevitch Date: Wed, 27 Aug 2025 00:00:31 +0200 Subject: [PATCH 5/7] APPROX_table_tuples_count comments --- pg_sample | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pg_sample b/pg_sample index 162f2b3..d6b02a0 100755 --- a/pg_sample +++ b/pg_sample @@ -828,12 +828,14 @@ sub should_skip_insert_for_fk ($) { return 0; # Do not skip by default } +# Obtains the amount of tuples in a table. +# May over/under-estimate if the table needs an ANALYZE pass. sub APPROX_table_tuples_count($$) { my $dbh = ${ shift() }; # dereference the reference to DBI session handle my $tablename = $dbh->quote_identifier(shift); - my ($count) = $dbh->do(qq{SELECT c.reltuples::bigint AS approx_row_count FROM pg_class c WHERE c.relname = $tablename AND c.relkind = 'r'}); - if ($count < 1) { + my ($count) = $dbh->do(qq{SELECT c.reltuples::bigint FROM pg_class c WHERE c.relname = $tablename AND c.relkind = 'r'}); + if ($count < 1) { # may indicate that the evaluated amount is dubious $dbh->do(qq{ANALYZE $tablename}); ($count) = $dbh->do(qq{SELECT c.reltuples::bigint AS approx_row_count FROM pg_class c WHERE c.relname = $tablename AND c.relkind = 'r'}); } From 25da7eac288dfc7cf1753244fdf0c7b59b3ba7c0 Mon Sep 17 00:00:00 2001 From: Nat Makarevitch Date: Wed, 27 Aug 2025 00:25:12 +0200 Subject: [PATCH 6/7] APPROX_table_tuples_count usage, WIP --- pg_sample | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pg_sample b/pg_sample index d6b02a0..a23a4ba 100755 --- a/pg_sample +++ b/pg_sample @@ -510,6 +510,7 @@ sub notice (@) { %opt = ( +approxcount => 0, db_host => '', db_port => '', keep => 0, @@ -642,6 +643,15 @@ unless ($opt{'data-only'}) { # If running PostgreSQL 9.1 or later, use UNLOGGED tables my $unlogged = $pg_version >= version->declare('9.1') ? 'UNLOGGED' : ''; +# Ensuring that the user can read (GRANT...) 
the 'pg_class' table +eval { + $dbh->selectrow_array(qq{SELECT 1 FROM pg_class LIMIT 1")}); +}; +if ($@) { + warn "I cannot read from the 'pg_class' table, therefore I cannot honor --approxcount: $@\n"; + $opt{approxcount}=0; +} + notice "Creating sample schema $opt{sample_schema}\n"; $dbh->do(qq{ CREATE SCHEMA $opt{sample_schema} }); my $created_schema = 1; # keep track that we actually did it; see END block @@ -834,7 +844,8 @@ sub APPROX_table_tuples_count($$) { my $dbh = ${ shift() }; # dereference the reference to DBI session handle my $tablename = $dbh->quote_identifier(shift); - my ($count) = $dbh->do(qq{SELECT c.reltuples::bigint FROM pg_class c WHERE c.relname = $tablename AND c.relkind = 'r'}); + # todo: take into account the last ANALYZE age, sizeof(tuples)/sizeof(schema def)... + my ($count) = $dbh->do(qq{SELECT c.reltuples::bigint FROM pg_class WHERE relname = $tablename AND relkind = 'r'}); if ($count < 1) { # may indicate that the evaluated amount is dubious $dbh->do(qq{ANALYZE $tablename}); ($count) = $dbh->do(qq{SELECT c.reltuples::bigint AS approx_row_count FROM pg_class c WHERE c.relname = $tablename AND c.relkind = 'r'}); } From df2d8822af2b2419edd2a90b7ad9482ec6f0e68b Mon Sep 17 00:00:00 2001 From: Nat Makarevitch Date: Sat, 22 Nov 2025 17:01:29 +0100 Subject: [PATCH 7/7] vanished tables --- pg_sample | 48 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/pg_sample b/pg_sample index a23a4ba..eb954ec 100755 --- a/pg_sample +++ b/pg_sample @@ -50,6 +50,12 @@ import errors. =over +=item B<--approxcount> + +In order to count the tuples stored in a table do not read them but read the +count stored in PostgreSQL's 'pg_class' table. It may be way off if the table +needs an ANALYZE pass. + =item I Specifies the database to sample.
If not specified, uses the @@ -643,14 +649,15 @@ unless ($opt{'data-only'}) { # If running PostgreSQL 9.1 or later, use UNLOGGED tables my $unlogged = $pg_version >= version->declare('9.1') ? 'UNLOGGED' : ''; -# Ensuring that the user can read (GRANT...) the 'pg_class' table -eval { - $dbh->selectrow_array(qq{SELECT 1 FROM pg_class LIMIT 1")}); -}; -if ($@) { - warn "I cannot read from the 'pg_class' table, therefore I cannot honor --approxcount: $@\n"; - $opt{approxcount}=0; -} +# # Useless(?) +# # Ensuring that the user can read (GRANT...) the 'pg_class' table +# eval { +# $dbh->selectrow_array(qq{SELECT 1 FROM pg_class LIMIT 1")}); +# }; +# if ($@) { +# warn "I cannot read from the 'pg_class' table, therefore I cannot honor --approxcount: $@\n"; +# $opt{approxcount}=0; +# } notice "Creating sample schema $opt{sample_schema}\n"; $dbh->do(qq{ CREATE SCHEMA $opt{sample_schema} }); @@ -697,6 +704,14 @@ foreach my $row (@{$table_info}) { ORDER BY ordinal_position }, { Slice => {} }, ($tname, $sname) ) ]; + my $sthTabelExistsP = $dbh->table_info('', undef, $tname, 'TABLE'); # FOREIGN TABLE, VIEW, MATERIALIZED VIEW + my $table_exists = keys (@{$sthTabelExistsP->fetchall_arrayref({})}); + $sthTabelExistsP->finish(); + if ( 0>= $table_exists ) { + notice "\nThe '$tname' table vanished\n" if ($opt{verbose}); + next; + } + my $table = Table->new($sname, $tname, $columns); push @tables, $table; @@ -782,6 +797,7 @@ foreach my $row (@{$table_info}) { } } + # Find foreign keys my @fks; foreach my $table (@tables) { @@ -821,22 +837,34 @@ foreach my $fk (@fks) { } +{ + my $warn_last_table=''; # To avoid repeating a notice about the same table + # Skip FK inserts if entire table is being included anyway # (useful for cases where we don't support equality operator) sub should_skip_insert_for_fk ($) { my $table_name = shift; $table_name =~ s/"\."/\./g; - notice "$table_name\n"; + my $already_noticed; + if ($warn_last_table ne $table_name) { + # The last notice wasn't about this 
table + $warn_last_table = $table_name; + $already_noticed=0; + } else { + $already_noticed=1; + } + notice "$table_name\n" if (! $already_noticed); foreach my $limit (@limits) { my ($regex, $action) = @$limit; if ($table_name =~ $regex && $action eq '*') { - notice "Skipping $table_name as all data is imported\n"; + notice "Skipping $table_name (regex $regex) as all data is imported\n" if (! $already_noticed); return 1; # Skip if it matches the regex and action is '*' } } return 0; # Do not skip by default } +} # Obtains the amount of tuples in a table. # May over/under-estimate if the table needs an ANALYZE pass.