git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Fix unconditional WAL receiver shutdown during stream-archive transition
author: Michael Paquier <michael@paquier.xyz>
Tue, 4 Nov 2025 01:52:41 +0000 (10:52 +0900)
committer: Michael Paquier <michael@paquier.xyz>
Tue, 4 Nov 2025 01:52:41 +0000 (10:52 +0900)
Commit b4f584f9d2a1 (affecting v15~, later backpatched down to 13 as of
3635a0a35aaf) introduced an unconditional WAL receiver shutdown when
switching from streaming to archive WAL sources.  This causes problems
during a timeline switch, when a WAL receiver enters WALRCV_WAITING
state but remains alive, waiting for instructions.

The unconditional shutdown can break some monitoring scenarios as the
WAL receiver gets repeatedly terminated and re-spawned, causing
pg_stat_wal_receiver.status to show a "streaming" instead of "waiting"
status, masking the fact that the WAL receiver is waiting for a new TLI
and a new LSN to be able to continue streaming.

This commit changes the WAL receiver behavior so that the shutdown becomes
conditional, with InstallXLogFileSegmentActive being always reset to
prevent the regression fixed by b4f584f9d2a1: only terminate the WAL
receiver when it is actively streaming (WALRCV_STREAMING,
WALRCV_STARTING, or WALRCV_RESTARTING).  When in WALRCV_WAITING state,
just reset the InstallXLogFileSegmentActive flag to allow archive
restoration without killing the process.  WALRCV_STOPPED and
WALRCV_STOPPING are not reachable states in this code path.  For the
latter, the startup process is the one in charge of setting
WALRCV_STOPPING via ShutdownWalRcv(), waiting for the WAL receiver to
reach a WALRCV_STOPPED state after switching walRcvState, so
WaitForWALToBecomeAvailable() cannot be reached while a WAL receiver is
in a WALRCV_STOPPING state.

A regression test is added to check that a WAL receiver is not stopped
on a timeline jump; the test fails when the fix in this commit is reverted.

Reported-by: Ryan Bird <ryanzxg@gmail.com>
Author: Xuneng Zhou <xunengzhou@gmail.com>
Reviewed-by: Noah Misch <noah@leadboat.com>
Reviewed-by: Michael Paquier <michael@paquier.xyz>
Discussion: https://postgr.es/m/19093-c4fff49a608f82a0@postgresql.org
Backpatch-through: 13

src/backend/access/transam/xlog.c
src/backend/access/transam/xlogrecovery.c
src/include/access/xlog.h
src/test/recovery/t/004_timeline_switch.pl

index dc11af7cbcd5395aeed6b66a1f143309c67d9630..0528ac38d59097d6f0ed41ede68f81f7043fb2b4 100644 (file)
@@ -8939,10 +8939,7 @@ void
 XLogShutdownWalRcv(void)
 {
        ShutdownWalRcv();
-
-       LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-       XLogCtl->InstallXLogFileSegmentActive = false;
-       LWLockRelease(ControlFileLock);
+       ResetInstallXLogFileSegmentActive();
 }
 
 /* Enable WAL file recycling and preallocation. */
@@ -8954,6 +8951,15 @@ SetInstallXLogFileSegmentActive(void)
        LWLockRelease(ControlFileLock);
 }
 
+/* Disable WAL file recycling and preallocation. */
+void
+ResetInstallXLogFileSegmentActive(void)
+{
+       LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+       XLogCtl->InstallXLogFileSegmentActive = false;
+       LWLockRelease(ControlFileLock);
+}
+
 bool
 IsInstallXLogFileSegmentActive(void)
 {
index d4c6b7c0ba710dfd1240730fb49e3ebc649994b3..7c6692dee6ebcee84f65ae38355fc27b0d7df34f 100644 (file)
@@ -3588,8 +3588,19 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
                                         * Before we leave XLOG_FROM_STREAM state, make sure that
                                         * walreceiver is not active, so that it won't overwrite
                                         * WAL that we restore from archive.
+                                        *
+                                        * If walreceiver is actively streaming (or attempting to
+                                        * connect), we must shut it down. However, if it's
+                                        * already in WAITING state (e.g., due to timeline
+                                        * divergence), we only need to reset the install flag to
+                                        * allow archive restoration.
                                         */
-                                       XLogShutdownWalRcv();
+                                       if (WalRcvStreaming())
+                                               XLogShutdownWalRcv();
+                                       else
+                                       {
+                                               ResetInstallXLogFileSegmentActive();
+                                       }
 
                                        /*
                                         * Before we sleep, re-scan for possible new timelines if
index cd674c3c23f1adc8cf155845861c2f9a6ecea080..e81a9d6aec954390ba2b89542ea76ba361dfd973 100644 (file)
@@ -257,6 +257,7 @@ extern void SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI
 extern void ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli);
 extern void SetInstallXLogFileSegmentActive(void);
 extern bool IsInstallXLogFileSegmentActive(void);
+extern void ResetInstallXLogFileSegmentActive(void);
 extern void XLogShutdownWalRcv(void);
 
 /*
index 3203d93701643ce12a699dd6b2a9b31b13e5b6e2..ba21f1a5e5e54c3e7b24093c5b5d621b5b0206df 100644 (file)
@@ -69,6 +69,14 @@ my $result =
   $node_standby_2->safe_psql('postgres', "SELECT count(*) FROM tab_int");
 is($result, qq(2000), 'check content of standby 2');
 
+# Check the logs, WAL receiver should not have been stopped while
+# transitioning to its new timeline.  There is no need to rely on an
+# offset in this check of the server logs: a new log file is used on
+# node restart when primary_conninfo is updated above.
+ok( !$node_standby_2->log_contains(
+               "FATAL: .* terminating walreceiver process due to administrator command"
+       ),
+       'WAL receiver should not be stopped across timeline jumps');
 
 # Ensure that a standby is able to follow a primary on a newer timeline
 # when WAL archiving is enabled.