]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Fix slotsync worker blocking promotion when stuck in wait
authorFujii Masao <fujii@postgresql.org>
Wed, 8 Apr 2026 02:24:00 +0000 (11:24 +0900)
committerFujii Masao <fujii@postgresql.org>
Wed, 8 Apr 2026 02:24:00 +0000 (11:24 +0900)
Previously, on standby promotion, the startup process sent SIGUSR1 to
the slotsync worker (or a backend performing slot synchronization) and
waited for it to exit. This worked in most cases, but if the process was
blocked waiting for a response from the primary (e.g., due to a network
failure), SIGUSR1 would not interrupt the wait. As a result, the process
could remain stuck, causing the startup process to wait for a long time
and delaying promotion.

This commit fixes the issue by introducing a new procsignal reason,
PROCSIG_SLOTSYNC_MESSAGE. On promotion, the startup process
sends this signal, and the handler sets interrupt flags so the process
exits (or errors out) promptly at CHECK_FOR_INTERRUPTS(), allowing
promotion to complete without delay.

Backpatch to v17, where slotsync was introduced.

Author: Nisha Moond <nisha.moond412@gmail.com>
Reviewed-by: shveta malik <shveta.malik@gmail.com>
Reviewed-by: Amit Kapila <amit.kapila16@gmail.com>
Reviewed-by: Zhijie Hou <houzj.fnst@fujitsu.com>
Reviewed-by: Fujii Masao <masao.fujii@gmail.com>
Discussion: https://postgr.es/m/CAHGQGwFzNYroAxSoyJhqTU-pH=t4Ej6RyvhVmBZ91Exj_TPMMQ@mail.gmail.com
Backpatch-through: 17

src/backend/replication/logical/slotsync.c
src/backend/storage/ipc/procsignal.c
src/backend/tcop/postgres.c
src/include/replication/slotsync.h
src/include/storage/procsignal.h

index 9ce6160dcfac7f51739b9132d58e4a9ff6722cbe..0b48f55d05755f8532ccb64438721e4df7c67d3b 100644 (file)
  * Struct for sharing information to control slot synchronization.
  *
  * The 'pid' is either the slot sync worker's pid or the backend's pid running
- * the SQL function pg_sync_replication_slots(). When the startup process sets
- * 'stopSignaled' during promotion, it uses this 'pid' to wake up the currently
- * synchronizing process so that the process can immediately stop its
- * synchronizing work on seeing 'stopSignaled' set.
- * Setting 'stopSignaled' is also used to handle the race condition when the
- * postmaster has not noticed the promotion yet and thus may end up restarting
- * the slot sync worker. If 'stopSignaled' is set, the worker will exit in such a
- * case. The SQL function pg_sync_replication_slots() will also error out if
- * this flag is set. Note that we don't need to reset this variable as after
- * promotion the slot sync worker won't be restarted because the pmState
- * changes to PM_RUN from PM_HOT_STANDBY and we don't support demoting
- * primary without restarting the server. See MaybeStartSlotSyncWorker.
+ * the SQL function pg_sync_replication_slots(). On promotion, the startup
+ * process sets 'stopSignaled' and uses this 'pid' to signal the synchronizing
+ * process with PROCSIG_SLOTSYNC_MESSAGE and also to wake it up so that the
+ * process can immediately stop its synchronizing work.
+ * Setting 'stopSignaled' on the other hand is used to handle the race
+ * condition when the postmaster has not noticed the promotion yet and thus may
+ * end up restarting the slot sync worker. If 'stopSignaled' is set, the worker
+ * will exit in such a case. The SQL function pg_sync_replication_slots() will
+ * also error out if this flag is set. Note that we don't need to reset this
+ * variable as after promotion the slot sync worker won't be restarted because
+ * the pmState changes to PM_RUN from PM_HOT_STANDBY and we don't support
+ * demoting primary without restarting the server.
+ * See MaybeStartSlotSyncWorker.
  *
  * The 'syncing' flag is needed to prevent concurrent slot syncs to avoid slot
  * overwrites.
@@ -131,6 +132,13 @@ static long sleep_ms = MIN_SLOTSYNC_WORKER_NAPTIME_MS;
  */
 static bool syncing_slots = false;
 
+/*
+ * Interrupt flag set when PROCSIG_SLOTSYNC_MESSAGE is received, asking the
+ * slotsync worker or pg_sync_replication_slots() to stop because
+ * standby promotion has been triggered.
+ */
+volatile sig_atomic_t SlotSyncShutdownPending = false;
+
 /*
  * Structure to hold information fetched from the primary server about a logical
  * replication slot.
@@ -1186,36 +1194,52 @@ slotsync_reread_config(void)
 }
 
 /*
- * Interrupt handler for process performing slot synchronization.
+ * Handle receipt of an interrupt indicating a slotsync shutdown message.
+ *
+ * This is called within the SIGUSR1 handler.  All we do here is set a flag
+ * that will cause the next CHECK_FOR_INTERRUPTS() to invoke
+ * ProcessSlotSyncMessage().
  */
-static void
-ProcessSlotSyncInterrupts(WalReceiverConn *wrconn)
+void
+HandleSlotSyncMessageInterrupt(void)
 {
-       CHECK_FOR_INTERRUPTS();
+       InterruptPending = true;
+       SlotSyncShutdownPending = true;
+       /* latch will be set by procsignal_sigusr1_handler */
+}
 
-       if (SlotSyncCtx->stopSignaled)
-       {
-               if (AmLogicalSlotSyncWorkerProcess())
-               {
-                       ereport(LOG,
-                                       errmsg("replication slot synchronization worker will stop because promotion is triggered"));
+/*
+ * Handle a PROCSIG_SLOTSYNC_MESSAGE signal, called from ProcessInterrupts().
+ *
+ * If the current process is the slotsync background worker, log a message
+ * and exit cleanly.  If it is a backend executing pg_sync_replication_slots(),
+ * raise an error, unless the sync has already finished, in which case there
+ * is no need to interrupt the caller.
+ */
+void
+ProcessSlotSyncMessage(void)
+{
+       SlotSyncShutdownPending = false;
 
-                       proc_exit(0);
-               }
-               else
-               {
-                       /*
-                        * For the backend executing SQL function
-                        * pg_sync_replication_slots().
-                        */
-                       ereport(ERROR,
-                                       errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-                                       errmsg("replication slot synchronization will stop because promotion is triggered"));
-               }
+       if (AmLogicalSlotSyncWorkerProcess())
+       {
+               ereport(LOG,
+                               errmsg("replication slot synchronization worker will stop because promotion is triggered"));
+               proc_exit(0);
        }
+       else
+       {
+               /*
+                * If sync has already completed, there is no need to interrupt the
+                * caller with an error.
+                */
+               if (!IsSyncingReplicationSlots())
+                       return;
 
-       if (ConfigReloadPending)
-               slotsync_reread_config();
+               ereport(ERROR,
+                               errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                               errmsg("replication slot synchronization will stop because promotion is triggered"));
+       }
 }
 
 /*
@@ -1322,6 +1346,34 @@ check_and_set_sync_info(pid_t sync_process_pid)
 {
        SpinLockAcquire(&SlotSyncCtx->mutex);
 
+       /*
+        * Exit immediately if promotion has been triggered.  This guards against
+        * a new worker (or a call to pg_sync_replication_slots()) that starts
+        * after the old worker was stopped by ShutDownSlotSync().
+        */
+       if (SlotSyncCtx->stopSignaled)
+       {
+               SpinLockRelease(&SlotSyncCtx->mutex);
+
+               if (AmLogicalSlotSyncWorkerProcess())
+               {
+                       ereport(DEBUG1,
+                                       errmsg("replication slot synchronization worker will not start because promotion was triggered"));
+
+                       proc_exit(0);
+               }
+               else
+               {
+                       /*
+                        * For the backend executing SQL function
+                        * pg_sync_replication_slots().
+                        */
+                       ereport(ERROR,
+                                       errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                       errmsg("replication slot synchronization will not start because promotion was triggered"));
+               }
+       }
+
        if (SlotSyncCtx->syncing)
        {
                SpinLockRelease(&SlotSyncCtx->mutex);
@@ -1525,7 +1577,10 @@ ReplSlotSyncWorkerMain(char *startup_data, size_t startup_data_len)
        {
                bool            some_slot_updated = false;
 
-               ProcessSlotSyncInterrupts(wrconn);
+               CHECK_FOR_INTERRUPTS();
+
+               if (ConfigReloadPending)
+                       slotsync_reread_config();
 
                some_slot_updated = synchronize_slots(wrconn);
 
@@ -1625,11 +1680,11 @@ ShutDownSlotSync(void)
        SpinLockRelease(&SlotSyncCtx->mutex);
 
        /*
-        * Signal process doing slotsync, if any. The process will stop upon
-        * detecting that the stopSignaled flag is set to true.
+        * Signal process doing slotsync, if any, asking it to stop.
         */
        if (sync_process_pid != InvalidPid)
-               kill(sync_process_pid, SIGUSR1);
+               SendProcSignal(sync_process_pid, PROCSIG_SLOTSYNC_MESSAGE,
+                                          INVALID_PROC_NUMBER);
 
        /* Wait for slot sync to end */
        for (;;)
@@ -1774,9 +1829,6 @@ SyncReplicationSlots(WalReceiverConn *wrconn)
        {
                check_and_set_sync_info(MyProcPid);
 
-               /* Check for interrupts and config changes */
-               ProcessSlotSyncInterrupts();
-
                validate_remote_info(wrconn);
 
                synchronize_slots(wrconn);
index 4ed9cedcdd4e5aabb086be9694a967fa688ed422..d6857f5a8bb60da34ab2a83f0023f39c684bdb46 100644 (file)
@@ -23,6 +23,7 @@
 #include "pgstat.h"
 #include "port/pg_bitutils.h"
 #include "replication/logicalworker.h"
+#include "replication/slotsync.h"
 #include "replication/walsender.h"
 #include "storage/condition_variable.h"
 #include "storage/ipc.h"
@@ -655,6 +656,9 @@ procsignal_sigusr1_handler(SIGNAL_ARGS)
        if (CheckProcSignal(PROCSIG_PARALLEL_APPLY_MESSAGE))
                HandleParallelApplyMessageInterrupt();
 
+       if (CheckProcSignal(PROCSIG_SLOTSYNC_MESSAGE))
+               HandleSlotSyncMessageInterrupt();
+
        if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_DATABASE))
                HandleRecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_DATABASE);
 
index 9cd1d0abe35fb6b0a4044b89e49cfe710bbaaed1..00d4073c089575373303aba72e853613d10349f4 100644 (file)
@@ -58,6 +58,7 @@
 #include "postmaster/postmaster.h"
 #include "replication/logicallauncher.h"
 #include "replication/logicalworker.h"
+#include "replication/slotsync.h"
 #include "replication/slot.h"
 #include "replication/walsender.h"
 #include "rewrite/rewriteHandler.h"
@@ -3497,6 +3498,9 @@ ProcessInterrupts(void)
 
        if (ParallelApplyMessagePending)
                HandleParallelApplyMessages();
+
+       if (SlotSyncShutdownPending)
+               ProcessSlotSyncMessage();
 }
 
 /*
index e03c2a005a429b4b5070c5d6786d51cb2ffbc776..c7a937911045c91893f4832b1e1c29a1cbef4b6c 100644 (file)
 #ifndef SLOTSYNC_H
 #define SLOTSYNC_H
 
+#include <signal.h>
+
 #include "replication/walreceiver.h"
 
 extern PGDLLIMPORT bool sync_replication_slots;
 
+/* Interrupt flag set by HandleSlotSyncMessageInterrupt() */
+extern PGDLLIMPORT volatile sig_atomic_t SlotSyncShutdownPending;
+
 /*
  * GUCs needed by slot sync worker to connect to the primary
  * server and carry on with slots synchronization.
@@ -34,5 +39,7 @@ extern bool IsSyncingReplicationSlots(void);
 extern Size SlotSyncShmemSize(void);
 extern void SlotSyncShmemInit(void);
 extern void SyncReplicationSlots(WalReceiverConn *wrconn);
+extern void HandleSlotSyncMessageInterrupt(void);
+extern void ProcessSlotSyncMessage(void);
 
 #endif                                                 /* SLOTSYNC_H */
index 7d290ea7d05ccf1725b686a7cf46b1dfcbe435bb..126c44bcf1d742bddb1fa2c782f7558b46cca618 100644 (file)
@@ -36,6 +36,7 @@ typedef enum
        PROCSIG_BARRIER,                        /* global barrier interrupt  */
        PROCSIG_LOG_MEMORY_CONTEXT, /* ask backend to log the memory contexts */
        PROCSIG_PARALLEL_APPLY_MESSAGE, /* Message from parallel apply workers */
+       PROCSIG_SLOTSYNC_MESSAGE,       /* ask slot synchronization to stop */
 
        /* Recovery conflict reasons */
        PROCSIG_RECOVERY_CONFLICT_FIRST,