git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Fix slotsync worker blocking promotion when stuck in wait
author Fujii Masao <fujii@postgresql.org>
Wed, 8 Apr 2026 02:22:21 +0000 (11:22 +0900)
committer Fujii Masao <fujii@postgresql.org>
Wed, 8 Apr 2026 02:22:21 +0000 (11:22 +0900)
Previously, on standby promotion, the startup process sent SIGUSR1 to
the slotsync worker (or a backend performing slot synchronization) and
waited for it to exit. This worked in most cases, but if the process was
blocked waiting for a response from the primary (e.g., due to a network
failure), SIGUSR1 would not interrupt the wait. As a result, the process
could remain stuck, causing the startup process to wait for a long time
and delaying promotion.

This commit fixes the issue by introducing a new procsignal reason,
PROCSIG_SLOTSYNC_MESSAGE. On promotion, the startup process
sends this signal, and the handler sets interrupt flags so the process
exits (or errors out) promptly at CHECK_FOR_INTERRUPTS(), allowing
promotion to complete without delay.

Backpatch to v17, where slotsync was introduced.

Author: Nisha Moond <nisha.moond412@gmail.com>
Reviewed-by: shveta malik <shveta.malik@gmail.com>
Reviewed-by: Amit Kapila <amit.kapila16@gmail.com>
Reviewed-by: Zhijie Hou <houzj.fnst@fujitsu.com>
Reviewed-by: Fujii Masao <masao.fujii@gmail.com>
Discussion: https://postgr.es/m/CAHGQGwFzNYroAxSoyJhqTU-pH=t4Ej6RyvhVmBZ91Exj_TPMMQ@mail.gmail.com
Backpatch-through: 17

src/backend/replication/logical/slotsync.c
src/backend/storage/ipc/procsignal.c
src/backend/tcop/postgres.c
src/include/replication/slotsync.h
src/include/storage/procsignal.h

index ae900f13467605f9f3c0a540c965dcdcc17b0bc7..f90653e52320bd33f283916c4f814dad22062c43 100644 (file)
  * Struct for sharing information to control slot synchronization.
  *
  * The 'pid' is either the slot sync worker's pid or the backend's pid running
- * the SQL function pg_sync_replication_slots(). When the startup process sets
- * 'stopSignaled' during promotion, it uses this 'pid' to wake up the currently
- * synchronizing process so that the process can immediately stop its
- * synchronizing work on seeing 'stopSignaled' set.
- * Setting 'stopSignaled' is also used to handle the race condition when the
- * postmaster has not noticed the promotion yet and thus may end up restarting
- * the slot sync worker. If 'stopSignaled' is set, the worker will exit in such a
- * case. The SQL function pg_sync_replication_slots() will also error out if
- * this flag is set. Note that we don't need to reset this variable as after
- * promotion the slot sync worker won't be restarted because the pmState
- * changes to PM_RUN from PM_HOT_STANDBY and we don't support demoting
- * primary without restarting the server. See LaunchMissingBackgroundProcesses.
+ * the SQL function pg_sync_replication_slots(). On promotion, the startup
+ * process sets 'stopSignaled' and uses this 'pid' to signal the synchronizing
+ * process with PROCSIG_SLOTSYNC_MESSAGE and also to wake it up so that the
+ * process can immediately stop its synchronizing work.
+ * Setting 'stopSignaled' on the other hand is used to handle the race
+ * condition when the postmaster has not noticed the promotion yet and thus may
+ * end up restarting the slot sync worker. If 'stopSignaled' is set, the worker
+ * will exit in such a case. The SQL function pg_sync_replication_slots() will
+ * also error out if this flag is set. Note that we don't need to reset this
+ * variable as after promotion the slot sync worker won't be restarted because
+ * the pmState changes to PM_RUN from PM_HOT_STANDBY and we don't support
+ * demoting primary without restarting the server.
+ * See LaunchMissingBackgroundProcesses.
  *
  * The 'syncing' flag is needed to prevent concurrent slot syncs to avoid slot
  * overwrites.
@@ -150,6 +151,13 @@ static long sleep_ms = MIN_SLOTSYNC_WORKER_NAPTIME_MS;
  */
 static bool syncing_slots = false;
 
+/*
+ * Interrupt flag set when PROCSIG_SLOTSYNC_MESSAGE is received, asking the
+ * slotsync worker or pg_sync_replication_slots() to stop because
+ * standby promotion has been triggered.
+ */
+volatile sig_atomic_t SlotSyncShutdownPending = false;
+
 /*
  * Structure to hold information fetched from the primary server about a logical
  * replication slot.
@@ -1301,36 +1309,52 @@ slotsync_reread_config(void)
 }
 
 /*
- * Interrupt handler for process performing slot synchronization.
+ * Handle receipt of an interrupt indicating a slotsync shutdown message.
+ *
+ * This is called within the SIGUSR1 handler.  All we do here is set a flag
+ * that will cause the next CHECK_FOR_INTERRUPTS() to invoke
+ * ProcessSlotSyncMessage().
  */
-static void
-ProcessSlotSyncInterrupts(void)
+void
+HandleSlotSyncMessageInterrupt(void)
 {
-       CHECK_FOR_INTERRUPTS();
+       InterruptPending = true;
+       SlotSyncShutdownPending = true;
+       /* latch will be set by procsignal_sigusr1_handler */
+}
 
-       if (SlotSyncCtx->stopSignaled)
-       {
-               if (AmLogicalSlotSyncWorkerProcess())
-               {
-                       ereport(LOG,
-                                       errmsg("replication slot synchronization worker will stop because promotion is triggered"));
+/*
+ * Handle a PROCSIG_SLOTSYNC_MESSAGE signal, called from ProcessInterrupts().
+ *
+ * If the current process is the slotsync background worker, log a message
+ * and exit cleanly.  If it is a backend executing pg_sync_replication_slots(),
+ * raise an error, unless the sync has already finished, in which case there
+ * is no need to interrupt the caller.
+ */
+void
+ProcessSlotSyncMessage(void)
+{
+       SlotSyncShutdownPending = false;
 
-                       proc_exit(0);
-               }
-               else
-               {
-                       /*
-                        * For the backend executing SQL function
-                        * pg_sync_replication_slots().
-                        */
-                       ereport(ERROR,
-                                       errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-                                       errmsg("replication slot synchronization will stop because promotion is triggered"));
-               }
+       if (AmLogicalSlotSyncWorkerProcess())
+       {
+               ereport(LOG,
+                               errmsg("replication slot synchronization worker will stop because promotion is triggered"));
+               proc_exit(0);
        }
+       else
+       {
+               /*
+                * If sync has already completed, there is no need to interrupt the
+                * caller with an error.
+                */
+               if (!IsSyncingReplicationSlots())
+                       return;
 
-       if (ConfigReloadPending)
-               slotsync_reread_config();
+               ereport(ERROR,
+                               errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                               errmsg("replication slot synchronization will stop because promotion is triggered"));
+       }
 }
 
 /*
@@ -1437,6 +1461,34 @@ check_and_set_sync_info(pid_t sync_process_pid)
 {
        SpinLockAcquire(&SlotSyncCtx->mutex);
 
+       /*
+        * Exit immediately if promotion has been triggered.  This guards against
+        * a new worker (or a call to pg_sync_replication_slots()) that starts
+        * after the old worker was stopped by ShutDownSlotSync().
+        */
+       if (SlotSyncCtx->stopSignaled)
+       {
+               SpinLockRelease(&SlotSyncCtx->mutex);
+
+               if (AmLogicalSlotSyncWorkerProcess())
+               {
+                       ereport(DEBUG1,
+                                       errmsg("replication slot synchronization worker will not start because promotion was triggered"));
+
+                       proc_exit(0);
+               }
+               else
+               {
+                       /*
+                        * For the backend executing SQL function
+                        * pg_sync_replication_slots().
+                        */
+                       ereport(ERROR,
+                                       errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                       errmsg("replication slot synchronization will not start because promotion was triggered"));
+               }
+       }
+
        if (SlotSyncCtx->syncing)
        {
                SpinLockRelease(&SlotSyncCtx->mutex);
@@ -1652,7 +1704,10 @@ ReplSlotSyncWorkerMain(const void *startup_data, size_t startup_data_len)
                bool            started_tx = false;
                List       *remote_slots;
 
-               ProcessSlotSyncInterrupts();
+               CHECK_FOR_INTERRUPTS();
+
+               if (ConfigReloadPending)
+                       slotsync_reread_config();
 
                /*
                 * The syscache access in fetch_remote_slots() needs a transaction
@@ -1765,11 +1820,11 @@ ShutDownSlotSync(void)
        SpinLockRelease(&SlotSyncCtx->mutex);
 
        /*
-        * Signal process doing slotsync, if any. The process will stop upon
-        * detecting that the stopSignaled flag is set to true.
+        * Signal process doing slotsync, if any, asking it to stop.
         */
        if (sync_process_pid != InvalidPid)
-               kill(sync_process_pid, SIGUSR1);
+               SendProcSignal(sync_process_pid, PROCSIG_SLOTSYNC_MESSAGE,
+                                          INVALID_PROC_NUMBER);
 
        /* Wait for slot sync to end */
        for (;;)
@@ -1942,9 +1997,6 @@ SyncReplicationSlots(WalReceiverConn *wrconn)
 
                check_and_set_sync_info(MyProcPid);
 
-               /* Check for interrupts and config changes */
-               ProcessSlotSyncInterrupts();
-
                validate_remote_info(wrconn);
 
                /* Retry until all the slots are sync-ready */
@@ -1954,7 +2006,10 @@ SyncReplicationSlots(WalReceiverConn *wrconn)
                        bool            some_slot_updated = false;
 
                        /* Check for interrupts and config changes */
-                       ProcessSlotSyncInterrupts();
+                       CHECK_FOR_INTERRUPTS();
+
+                       if (ConfigReloadPending)
+                               slotsync_reread_config();
 
                        /* We must be in a valid transaction state */
                        Assert(IsTransactionState());
index 4e3ee27a058e98ed6250faed6868bcc70f3f2f61..264e4c22ca6a04603dadc4d8448151fe1f051d9a 100644 (file)
@@ -26,6 +26,7 @@
 #include "postmaster/datachecksum_state.h"
 #include "replication/logicalctl.h"
 #include "replication/logicalworker.h"
+#include "replication/slotsync.h"
 #include "replication/walsender.h"
 #include "storage/condition_variable.h"
 #include "storage/ipc.h"
@@ -710,6 +711,9 @@ procsignal_sigusr1_handler(SIGNAL_ARGS)
        if (CheckProcSignal(PROCSIG_REPACK_MESSAGE))
                HandleRepackMessageInterrupt();
 
+       if (CheckProcSignal(PROCSIG_SLOTSYNC_MESSAGE))
+               HandleSlotSyncMessageInterrupt();
+
        if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT))
                HandleRecoveryConflictInterrupt();
 
index 14a061599bce8e840afabac380ccf29262c184d6..aeaf1c6db8f9ecd05c77b552004910d7c71ef9c6 100644 (file)
@@ -59,6 +59,7 @@
 #include "postmaster/postmaster.h"
 #include "replication/logicallauncher.h"
 #include "replication/logicalworker.h"
+#include "replication/slotsync.h"
 #include "replication/slot.h"
 #include "replication/walsender.h"
 #include "rewrite/rewriteHandler.h"
@@ -3599,6 +3600,9 @@ ProcessInterrupts(void)
        if (ParallelApplyMessagePending)
                ProcessParallelApplyMessages();
 
+       if (SlotSyncShutdownPending)
+               ProcessSlotSyncMessage();
+
        if (RepackMessagePending)
                ProcessRepackMessages();
 }
index d2121cd3ed77e918fa047ed898c54c7da1d89046..a55d1f0dccc89ad8ff37c27739b29c00afa76c62 100644 (file)
 #ifndef SLOTSYNC_H
 #define SLOTSYNC_H
 
+#include <signal.h>
+
 #include "replication/walreceiver.h"
 
 extern PGDLLIMPORT bool sync_replication_slots;
 
+/* Interrupt flag set by HandleSlotSyncMessageInterrupt() */
+extern PGDLLIMPORT volatile sig_atomic_t SlotSyncShutdownPending;
+
 /*
  * GUCs needed by slot sync worker to connect to the primary
  * server and carry on with slots synchronization.
@@ -32,5 +37,7 @@ extern void ShutDownSlotSync(void);
 extern bool SlotSyncWorkerCanRestart(void);
 extern bool IsSyncingReplicationSlots(void);
 extern void SyncReplicationSlots(WalReceiverConn *wrconn);
+extern void HandleSlotSyncMessageInterrupt(void);
+extern void ProcessSlotSyncMessage(void);
 
 #endif                                                 /* SLOTSYNC_H */
index 480c02203b0cba6c0380490c95348d946acde474..aaa158bfd6618f1ad0b02570e91a29d12e69b96f 100644 (file)
@@ -36,6 +36,7 @@ typedef enum
        PROCSIG_BARRIER,                        /* global barrier interrupt  */
        PROCSIG_LOG_MEMORY_CONTEXT, /* ask backend to log the memory contexts */
        PROCSIG_PARALLEL_APPLY_MESSAGE, /* Message from parallel apply workers */
+       PROCSIG_SLOTSYNC_MESSAGE,       /* ask slot synchronization to stop */
        PROCSIG_REPACK_MESSAGE,         /* Message from repack worker */
        PROCSIG_RECOVERY_CONFLICT,      /* backend is blocking recovery, check
                                                                 * PGPROC->pendingRecoveryConflicts for the