From 89bfb42d3b6852bb4d273d6bda1cced303ade1d0 Mon Sep 17 00:00:00 2001 From: "Andrei V. Lepikhov" Date: Tue, 11 Nov 2025 15:33:02 +0100 Subject: [PATCH 1/2] zodan.sql: add subscriptions pre-check. With this commit, we require that each node of the Spock cluster has only enabled subscriptions. There is no algorithmic reason to do that, because when adding the node, we are concerned only about the donor's state and the consistency of its progress table. However, such a strict procedure ensures that users can be sure they add a node to a healthy cluster, take action beforehand if it is not healthy, and that replication doesn't delay WAL cutting. Introduce a TAP test that could be used to check Z0DAN checks. XXX: add_node finishes its job with subscriptions in the 'initialising' state. Do we need to take any action here? Also, fix the annoying WARNING on the 'exception_replay_queue_size'. --- samples/Z0DAN/zodan.sql | 34 ++++++++++++++ src/spock.c | 6 +-- tests/tap/schedule | 1 + tests/tap/t/012_zodan_basics.pl | 82 +++++++++++++++++++++++++++++++++ tests/tap/t/SpockTest.pm | 2 +- 5 files changed, 121 insertions(+), 4 deletions(-) create mode 100755 tests/tap/t/012_zodan_basics.pl diff --git a/samples/Z0DAN/zodan.sql b/samples/Z0DAN/zodan.sql index 38baba40..a4c82384 100644 --- a/samples/Z0DAN/zodan.sql +++ b/samples/Z0DAN/zodan.sql @@ -1531,6 +1531,40 @@ BEGIN END IF; END; + -- Check: all nodes, included in the cluster, have only enabled subscriptions. + -- + -- Connect to each node in the cluster and pass through the spock.subscription + -- table to check subscriptions statuses. Using it we try to avoid cases + -- when somewhere in the middle a crash or disconnection happens that may + -- be aggravated by add_node. + DECLARE + status_rec record; + dsn_rec record; + dsns_sql text; + sub_status_sql text; + BEGIN + dsns_sql := 'SELECT if_dsn,node_name + FROM spock.node JOIN spock.node_interface + ON (if_nodeid = node_id) + WHERE node_id NOT IN (SELECT node_id FROM spock.local_node)'; + sub_status_sql := 'SELECT sub_name, sub_enabled FROM spock.subscription'; + + FOR dsn_rec IN SELECT * FROM dblink(src_dsn, dsns_sql) + AS t(dsn text, node name) + LOOP + FOR status_rec IN SELECT * FROM dblink(dsn_rec.dsn, sub_status_sql) + AS t(name text, status text) + LOOP + IF status_rec.status != 't' THEN + RAISE EXCEPTION ' [FAILED] %', rpad('Node ' || dsn_rec.node || ' has disabled subscription ' || status_rec.name, 60, ' '); + ELSIF verb THEN + RAISE NOTICE ' OK: %', rpad('Node with DSN ' || dsn_rec.dsn || ' has enabled subscription ' || status_rec.name, 120, ' '); + END IF; + END LOOP; + END LOOP; + RAISE NOTICE ' OK: %', rpad('Checking each Spock node has only active subscriptions', 120, ' '); + END; + -- Validating new node prerequisites SELECT count(*) INTO new_exists FROM spock.node WHERE node_name = new_node_name; IF new_exists > 0 THEN diff --git a/src/spock.c b/src/spock.c index 27c6ee38..6320ce6d 100644 --- a/src/spock.c +++ b/src/spock.c @@ -1106,11 +1106,11 @@ _PG_init(void) "This setting is deprecated and has no effect. " "The replay queue now dynamically allocates memory as needed.", &spock_replay_queue_size, - 4194304, + 4, 0, - INT_MAX, + MAX_KILOBYTES / 1024, PGC_SIGHUP, - 0, + GUC_UNIT_MB, NULL, NULL, NULL); diff --git a/tests/tap/schedule b/tests/tap/schedule index 6bfe6451..a1bf5672 100644 --- a/tests/tap/schedule +++ b/tests/tap/schedule @@ -15,6 +15,7 @@ test: 004_non_default_repset test: 008_rmgr test: 009_zodan_add_remove_nodes test: 010_zodan_add_remove_python +test: 012_zodan_basics # Tests, consuming too much time to be launched on each check: #test: 011_zodan_sync_third diff --git a/tests/tap/t/012_zodan_basics.pl b/tests/tap/t/012_zodan_basics.pl new file mode 100755 index 00000000..732cad5a --- /dev/null +++ b/tests/tap/t/012_zodan_basics.pl @@ -0,0 +1,82 @@ +use strict; +use warnings; +use Test::More; +use lib '.'; +use lib 't'; +use SpockTest qw(create_cluster destroy_cluster get_test_config psql_or_bail scalar_query); + +my ($result); + +create_cluster(3, 'Create basic Spock test cluster'); + +# Get cluster configuration +my $config = get_test_config(); +my $node_count = $config->{node_count}; +my $node_ports = $config->{node_ports}; +my $host = $config->{host}; +my $dbname = $config->{db_name}; +my $db_user = $config->{db_user}; +my $db_password = $config->{db_password}; +my $pg_bin = $config->{pg_bin}; + +psql_or_bail(2, "SELECT spock.node_drop('n2')"); +psql_or_bail(3, "SELECT spock.node_drop('n3')"); +psql_or_bail(1, "CREATE EXTENSION snowflake"); +psql_or_bail(1, "CREATE EXTENSION lolor"); +psql_or_bail(1, "CREATE EXTENSION amcheck"); +psql_or_bail(2, "CREATE EXTENSION dblink"); +psql_or_bail(3, "CREATE EXTENSION dblink"); +psql_or_bail(2, "\\i ../../samples/Z0DAN/zodan.sql"); +psql_or_bail(3, "\\i ../../samples/Z0DAN/zodan.sql"); +psql_or_bail(1, "CREATE TABLE test(x serial PRIMARY KEY)"); +psql_or_bail(1, "INSERT INTO test DEFAULT VALUES"); + +print STDERR "All supporting stuff has been installed\n"; + +print STDERR "Call Z0DAN: n2 => n1"; +psql_or_bail(2, " + CALL spock.add_node( + src_node_name := 'n1', + src_dsn := 'host=$host dbname=$dbname port=$node_ports->[0] user=$db_user password=$db_password', + new_node_name := 'n2', + new_node_dsn := 'host=$host dbname=$dbname port=$node_ports->[1] user=$db_user password=$db_password', + verb := false + )"); +print STDERR "Z0DAN (n2 => n1) has finished the attach process\n"; +$result = scalar_query(2, "SELECT x FROM test"); +print STDERR "Check result: $result\n"; +ok($result eq '1', "Check state of the test table after the attachment"); + +psql_or_bail(1, "SELECT spock.sub_disable('sub_n1_n2')"); + +print STDERR "Call Z0DAN: n3 => n2\n"; + +scalar_query(3, " + CALL spock.add_node( + src_node_name := 'n2', + src_dsn := 'host=$host dbname=$dbname port=$node_ports->[1] user=$db_user password=$db_password', + new_node_name := 'n3', new_node_dsn := 'host=$host dbname=$dbname port=$node_ports->[2] user=$db_user password=$db_password', + verb := false)"); + +$result = scalar_query(3, "SELECT count(*) FROM spock.local_node"); +ok($result eq '0', "N3 is not in the cluster yet"); +print STDERR "Z0DAN should fail because of a disabled subscription\n"; + +psql_or_bail(1, "SELECT spock.sub_enable('sub_n1_n2')"); +scalar_query(3, " + CALL spock.add_node( + src_node_name := 'n2', + src_dsn := 'host=$host dbname=$dbname port=$node_ports->[1] user=$db_user password=$db_password', + new_node_name := 'n3', new_node_dsn := 'host=$host dbname=$dbname port=$node_ports->[2] user=$db_user password=$db_password', + verb := true)"); + +$result = scalar_query(3, "SELECT count(*) FROM spock.local_node"); +ok($result eq '1', "N3 is in the cluster"); +$result = scalar_query(3, "SELECT x FROM test"); +print STDERR "Check result: $result\n"; +ok($result eq '1', "Check state of the test table on N3 after the attachment"); +print STDERR "Z0DAN should add N3 to the cluster\n"; + +# Clean up +destroy_cluster('Destroy test cluster'); +done_testing(); diff --git a/tests/tap/t/SpockTest.pm b/tests/tap/t/SpockTest.pm index 0333b43e..409f37c5 100644 --- a/tests/tap/t/SpockTest.pm +++ b/tests/tap/t/SpockTest.pm @@ -150,7 +150,7 @@ sub create_postgresql_conf { print $conf "spock.exception_behaviour=sub_disable\n"; print $conf "spock.conflict_resolution=last_update_wins\n"; print $conf "track_commit_timestamp=on\n"; - print $conf "spock.exception_replay_queue_size=1MB\n"; + print $conf "spock.exception_replay_queue_size='1MB'\n"; print $conf "spock.enable_spill=on\n"; print $conf "port=$port\n"; print $conf "listen_addresses='*'\n"; From 2dcef2eefe66967d290eebd054c8d70818e2aef9 Mon Sep 17 00:00:00 2001 From: "Andrei V. Lepikhov" Date: Wed, 12 Nov 2025 12:39:04 +0100 Subject: [PATCH 2/2] Check subscription state in the wait_for_sync_event. If no group's subscription is enabled, wait_for_sync_event may get stuck in an infinite loop. Add a check of the subscription state in the waiting loop to reflect the fact that an apply worker may quietly die, deactivating its subscription. Also, add the correct processing of this behaviour to the Z0DAN and include a TAP test. --- samples/Z0DAN/zodan.sql | 3 +- sql/spock--6.0.0-devel.sql | 22 ++++++++++++++ tests/tap/t/012_zodan_basics.pl | 54 +++++++++++++++++++++++++++++---- 3 files changed, 71 insertions(+), 8 deletions(-) diff --git a/samples/Z0DAN/zodan.sql b/samples/Z0DAN/zodan.sql index a4c82384..bd7345e3 100644 --- a/samples/Z0DAN/zodan.sql +++ b/samples/Z0DAN/zodan.sql @@ -2351,8 +2351,7 @@ BEGIN RAISE NOTICE ' OK: %', rpad('Waiting for sync event from ' || src_node_name || ' on new node ' || new_node_name || '...', 120, ' '); EXCEPTION WHEN OTHERS THEN - RAISE NOTICE ' ✗ %', rpad('Unable to wait for sync event from ' || src_node_name || ' on new node ' || new_node_name || ' (error: ' || SQLERRM || ')', 120, ' '); - RAISE; + RAISE EXCEPTION ' ✗ %', rpad('Unable to wait for sync event from ' || src_node_name || ' on new node ' || new_node_name || ' (error: ' || SQLERRM || ')', 120, ' '); END; END; $$; diff --git a/sql/spock--6.0.0-devel.sql b/sql/spock--6.0.0-devel.sql index 1ad9a9a8..0732398a 100644 --- a/sql/spock--6.0.0-devel.sql +++ b/sql/spock--6.0.0-devel.sql @@ -414,6 +414,17 @@ BEGIN target_id := node_id FROM spock.node_info(); WHILE true LOOP + -- If an unresolvable issue occurs with the apply worker, the LR + -- progress gets stuck, and we need to check the subscription's state + -- carefully. + IF NOT EXISTS (SELECT * FROM spock.subscription + WHERE sub_origin = origin_id AND + sub_target = target_id AND + sub_enabled = true) THEN + RAISE EXCEPTION 'Replication % => % does not have any enabled subscription yet', + origin_id, target_id; + END IF; + SELECT INTO progress_lsn remote_commit_lsn FROM spock.progress WHERE node_id = target_id AND remote_node_id = origin_id; @@ -452,6 +463,17 @@ BEGIN target_id := node_id FROM spock.node_info(); WHILE true LOOP + -- If an unresolvable issue occurs with the apply worker, the LR + -- progress gets stuck, and we need to check the subscription's state + -- carefully. + IF NOT EXISTS (SELECT * FROM spock.subscription + WHERE sub_origin = origin_id AND + sub_target = target_id AND + sub_enabled = true) THEN + RAISE EXCEPTION 'Replication % => % does not have any enabled subscription yet', + origin_id, target_id; + END IF; + SELECT INTO progress_lsn remote_commit_lsn FROM spock.progress WHERE node_id = target_id AND remote_node_id = origin_id; diff --git a/tests/tap/t/012_zodan_basics.pl b/tests/tap/t/012_zodan_basics.pl index 732cad5a..69430255 100755 --- a/tests/tap/t/012_zodan_basics.pl +++ b/tests/tap/t/012_zodan_basics.pl @@ -21,8 +21,6 @@ psql_or_bail(2, "SELECT spock.node_drop('n2')"); psql_or_bail(3, "SELECT spock.node_drop('n3')"); -psql_or_bail(1, "CREATE EXTENSION snowflake"); -psql_or_bail(1, "CREATE EXTENSION lolor"); psql_or_bail(1, "CREATE EXTENSION amcheck"); psql_or_bail(2, "CREATE EXTENSION dblink"); psql_or_bail(3, "CREATE EXTENSION dblink"); @@ -31,9 +29,15 @@ psql_or_bail(1, "CREATE TABLE test(x serial PRIMARY KEY)"); psql_or_bail(1, "INSERT INTO test DEFAULT VALUES"); -print STDERR "All supporting stuff has been installed\n"; +print STDERR "All supporting stuff has been installed successfully\n"; -print STDERR "Call Z0DAN: n2 => n1"; +# ############################################################################## +# +# Basic check that Z0DAN correctly add node to the single-node cluster +# +# ############################################################################## + +print STDERR "Call Z0DAN: n2 => n1\n"; psql_or_bail(2, " CALL spock.add_node( src_node_name := 'n1', @@ -49,8 +53,13 @@ psql_or_bail(1, "SELECT spock.sub_disable('sub_n1_n2')"); -print STDERR "Call Z0DAN: n3 => n2\n"; +# ############################################################################## +# +# Z0DAN reject node addition if some subscriptions are disabled +# +# ############################################################################## +print STDERR "Call Z0DAN: n3 => n2\n"; scalar_query(3, " CALL spock.add_node( src_node_name := 'n2', @@ -63,7 +72,7 @@ print STDERR "Z0DAN should fail because of a disabled subscription\n"; psql_or_bail(1, "SELECT spock.sub_enable('sub_n1_n2')"); -scalar_query(3, " +psql_or_bail(3, " CALL spock.add_node( src_node_name := 'n2', src_dsn := 'host=$host dbname=$dbname port=$node_ports->[1] user=$db_user password=$db_password', @@ -77,6 +86,39 @@ ok($result eq '1', "Check state of the test table on N3 after the attachment"); print STDERR "Z0DAN should add N3 to the cluster\n"; +# ############################################################################## +# +# Test that Z0DAN correctly doesn't add node to the cluster if something happens +# during the SYNC process. +# +# ############################################################################## + +# Remove node from the cluster and data leftovers. +psql_or_bail(3, "\\i ../../samples/Z0DAN/zodremove.sql"); +psql_or_bail(3, "CALL spock.remove_node(target_node_name := 'n3', + target_node_dsn := 'host=$host dbname=$dbname port=$node_ports->[2] user=$db_user password=$db_password', + verbose_mode := true)"); +psql_or_bail(3, "DROP TABLE test"); + +psql_or_bail(1, "CREATE FUNCTION fake_fn() RETURNS integer LANGUAGE sql AS \$\$ SELECT 1\$\$"); +psql_or_bail(3, "CREATE FUNCTION fake_fn() RETURNS integer LANGUAGE sql AS \$\$ SELECT 1\$\$"); +scalar_query(3, " + CALL spock.add_node( + src_node_name := 'n2', + src_dsn := 'host=$host dbname=$dbname port=$node_ports->[1] user=$db_user password=$db_password', + new_node_name := 'n3', new_node_dsn := 'host=$host dbname=$dbname port=$node_ports->[2] user=$db_user password=$db_password', + verb := true)"); + +# TODO: +# It seems that add_node keeps remnants after unsuccessful execution. It is +# happened because we have commited some intermediate results before. +# It would be better to keep remote transaction opened until the end of the +# operation or just remove these remnants at the end pretending to be a +# distributed transaction. +# +# $result = scalar_query(3, "SELECT count(*) FROM spock.local_node"); +# ok($result eq '0', "N3 is not in the cluster"); + # Clean up destroy_cluster('Destroy test cluster'); done_testing();