From: Alexander Korotkov Date: Mon, 5 Jan 2026 17:41:31 +0000 (+0200) Subject: Use WAIT FOR LSN in PostgreSQL::Test::Cluster::wait_for_catchup() X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f30848cb05d4d63e1a5a2d6a9d72604f3b63370d;p=thirdparty%2Fpostgresql.git Use WAIT FOR LSN in PostgreSQL::Test::Cluster::wait_for_catchup() When the standby is passed as a PostgreSQL::Test::Cluster instance, use the WAIT FOR LSN command on the standby server to implement wait_for_catchup() for replay, write, and flush modes. This is more efficient than polling pg_stat_replication on the upstream, as the WAIT FOR LSN command uses a latch-based wakeup mechanism. The optimization applies when: - The standby is passed as a Cluster object (not just a name string) - The mode is 'replay', 'write', or 'flush' (not 'sent') - The standby is in recovery For 'sent' mode, when the standby is passed as a string (e.g., a subscription name for logical replication), or when the standby has been promoted, the function falls back to the original polling-based approach using pg_stat_replication on the upstream. Discussion: https://postgr.es/m/CABPTF7UiArgW-sXj9CNwRzUhYOQrevLzkYcgBydmX5oDes1sjg%40mail.gmail.com Author: Xuneng Zhou Reviewed-by: Alexander Korotkov Reviewed-by: Chao Li Reviewed-by: Alvaro Herrera --- diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm index 955dfc0e7f8..a28ea89aa10 100644 --- a/src/test/perl/PostgreSQL/Test/Cluster.pm +++ b/src/test/perl/PostgreSQL/Test/Cluster.pm @@ -3320,6 +3320,13 @@ If you pass an explicit value of target_lsn, it should almost always be the primary's write LSN; so this parameter is seldom needed except when querying some intermediate replication node rather than the primary. +When the standby is passed as a PostgreSQL::Test::Cluster instance and is +in recovery, this function uses the WAIT FOR LSN command on the standby +for modes replay, write, and flush. This is more efficient than polling +pg_stat_replication on the upstream, as WAIT FOR LSN uses a latch-based +wakeup mechanism. For 'sent' mode, or when the standby is passed as a +string (e.g., a subscription name), it falls back to polling. + If there is no active replication connection from this peer, waits until poll_query_until timeout. @@ -3339,10 +3346,13 @@ sub wait_for_catchup . join(', ', keys(%valid_modes)) unless exists($valid_modes{$mode}); - # Allow passing of a PostgreSQL::Test::Cluster instance as shorthand + # Keep a reference to the standby node if passed as an object, so we can + # use WAIT FOR LSN on it later. + my $standby_node; if (blessed($standby_name) && $standby_name->isa("PostgreSQL::Test::Cluster")) { + $standby_node = $standby_name; $standby_name = $standby_name->name; } if (!defined($target_lsn)) @@ -3367,6 +3377,53 @@ sub wait_for_catchup . $self->name . "\n"; # Before release 12 walreceiver just set the application name to # "walreceiver" + + # Use WAIT FOR LSN on the standby when: + # - The standby was passed as a Cluster object (so we can connect to it) + # - The mode is replay, write, or flush (not 'sent') + # - The standby is in recovery + # This is more efficient than polling pg_stat_replication on the upstream, + # as WAIT FOR LSN uses a latch-based wakeup mechanism. + if (defined($standby_node) && ($mode ne 'sent')) + { + my $standby_in_recovery = + $standby_node->safe_psql('postgres', "SELECT pg_is_in_recovery()"); + chomp($standby_in_recovery); + + if ($standby_in_recovery eq 't') + { + # Map mode names to WAIT FOR LSN mode names + my %mode_map = ( + 'replay' => 'standby_replay', + 'write' => 'standby_write', + 'flush' => 'standby_flush',); + my $wait_mode = $mode_map{$mode}; + my $timeout = $PostgreSQL::Test::Utils::timeout_default; + my $wait_query = + qq[WAIT FOR LSN '${target_lsn}' WITH (MODE '${wait_mode}', timeout '${timeout}s', no_throw);]; + my $output = $standby_node->safe_psql('postgres', $wait_query); + chomp($output); + + if ($output ne 'success') + { + # Fetch additional detail for debugging purposes + my $details = $self->safe_psql('postgres', + "SELECT * FROM pg_catalog.pg_stat_replication"); + diag qq(WAIT FOR LSN failed with status: + ${output}); + diag qq(Last pg_stat_replication contents: + ${details}); + croak "failed waiting for catchup"; + } + print "done\n"; + return; + } + } + + # Fall back to polling pg_stat_replication on the upstream for: + # - 'sent' mode (no corresponding WAIT FOR LSN mode) + # - When standby_name is a string (e.g., subscription name) + # - When the standby is no longer in recovery (was promoted) my $query = qq[SELECT '$target_lsn' <= ${mode}_lsn AND state = 'streaming' FROM pg_catalog.pg_stat_replication WHERE application_name IN ('$standby_name', 'walreceiver')];