From: Fujii Masao Date: Wed, 8 Apr 2026 02:22:21 +0000 (+0900) Subject: Fix slotsync worker blocking promotion when stuck in wait X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=db93032a7cbd30199429167e649e22fab89327ed;p=thirdparty%2Fpostgresql.git Fix slotsync worker blocking promotion when stuck in wait Previously, on standby promotion, the startup process sent SIGUSR1 to the slotsync worker (or a backend performing slot synchronization) and waited for it to exit. This worked in most cases, but if the process was blocked waiting for a response from the primary (e.g., due to a network failure), SIGUSR1 would not interrupt the wait. As a result, the process could remain stuck, causing the startup process to wait for a long time and delaying promotion. This commit fixes the issue by introducing a new procsignal reason, PROCSIG_SLOTSYNC_MESSAGE. On promotion, the startup process sends this signal, and the handler sets interrupt flags so the process exits (or errors out) promptly at CHECK_FOR_INTERRUPTS(), allowing promotion to complete without delay. Backpatch to v17, where slotsync was introduced. Author: Nisha Moond Reviewed-by: shveta malik Reviewed-by: Amit Kapila Reviewed-by: Zhijie Hou Reviewed-by: Fujii Masao Discussion: https://postgr.es/m/CAHGQGwFzNYroAxSoyJhqTU-pH=t4Ej6RyvhVmBZ91Exj_TPMMQ@mail.gmail.com Backpatch-through: 17 --- diff --git a/src/backend/replication/logical/slotsync.c b/src/backend/replication/logical/slotsync.c index ae900f13467..f90653e5232 100644 --- a/src/backend/replication/logical/slotsync.c +++ b/src/backend/replication/logical/slotsync.c @@ -86,18 +86,19 @@ * Struct for sharing information to control slot synchronization. * * The 'pid' is either the slot sync worker's pid or the backend's pid running - * the SQL function pg_sync_replication_slots(). When the startup process sets - * 'stopSignaled' during promotion, it uses this 'pid' to wake up the currently - * synchronizing process so that the process can immediately stop its - * synchronizing work on seeing 'stopSignaled' set. - * Setting 'stopSignaled' is also used to handle the race condition when the - * postmaster has not noticed the promotion yet and thus may end up restarting - * the slot sync worker. If 'stopSignaled' is set, the worker will exit in such a - * case. The SQL function pg_sync_replication_slots() will also error out if - * this flag is set. Note that we don't need to reset this variable as after - * promotion the slot sync worker won't be restarted because the pmState - * changes to PM_RUN from PM_HOT_STANDBY and we don't support demoting - * primary without restarting the server. See LaunchMissingBackgroundProcesses. + * the SQL function pg_sync_replication_slots(). On promotion, the startup + * process sets 'stopSignaled' and uses this 'pid' to signal the synchronizing + * process with PROCSIG_SLOTSYNC_MESSAGE and also to wake it up so that the + * process can immediately stop its synchronizing work. + * Setting 'stopSignaled' on the other hand is used to handle the race + * condition when the postmaster has not noticed the promotion yet and thus may + * end up restarting the slot sync worker. If 'stopSignaled' is set, the worker + * will exit in such a case. The SQL function pg_sync_replication_slots() will + * also error out if this flag is set. Note that we don't need to reset this + * variable as after promotion the slot sync worker won't be restarted because + * the pmState changes to PM_RUN from PM_HOT_STANDBY and we don't support + * demoting primary without restarting the server. + * See LaunchMissingBackgroundProcesses. * * The 'syncing' flag is needed to prevent concurrent slot syncs to avoid slot * overwrites. @@ -150,6 +151,13 @@ static long sleep_ms = MIN_SLOTSYNC_WORKER_NAPTIME_MS; */ static bool syncing_slots = false; +/* + * Interrupt flag set when PROCSIG_SLOTSYNC_MESSAGE is received, asking the + * slotsync worker or pg_sync_replication_slots() to stop because + * standby promotion has been triggered. + */ +volatile sig_atomic_t SlotSyncShutdownPending = false; + /* * Structure to hold information fetched from the primary server about a logical * replication slot. @@ -1301,36 +1309,52 @@ slotsync_reread_config(void) } /* - * Interrupt handler for process performing slot synchronization. + * Handle receipt of an interrupt indicating a slotsync shutdown message. + * + * This is called within the SIGUSR1 handler. All we do here is set a flag + * that will cause the next CHECK_FOR_INTERRUPTS() to invoke + * ProcessSlotSyncMessage(). */ -static void -ProcessSlotSyncInterrupts(void) +void +HandleSlotSyncMessageInterrupt(void) { - CHECK_FOR_INTERRUPTS(); + InterruptPending = true; + SlotSyncShutdownPending = true; + /* latch will be set by procsignal_sigusr1_handler */ +} - if (SlotSyncCtx->stopSignaled) - { - if (AmLogicalSlotSyncWorkerProcess()) - { - ereport(LOG, - errmsg("replication slot synchronization worker will stop because promotion is triggered")); +/* + * Handle a PROCSIG_SLOTSYNC_MESSAGE signal, called from ProcessInterrupts(). + * + * If the current process is the slotsync background worker, log a message + * and exit cleanly. If it is a backend executing pg_sync_replication_slots(), + * raise an error, unless the sync has already finished, in which case there + * is no need to interrupt the caller. + */ +void +ProcessSlotSyncMessage(void) +{ + SlotSyncShutdownPending = false; - proc_exit(0); - } - else - { - /* - * For the backend executing SQL function - * pg_sync_replication_slots(). - */ - ereport(ERROR, - errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("replication slot synchronization will stop because promotion is triggered")); - } + if (AmLogicalSlotSyncWorkerProcess()) + { + ereport(LOG, + errmsg("replication slot synchronization worker will stop because promotion is triggered")); + proc_exit(0); } + else + { + /* + * If sync has already completed, there is no need to interrupt the + * caller with an error. + */ + if (!IsSyncingReplicationSlots()) + return; - if (ConfigReloadPending) - slotsync_reread_config(); + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("replication slot synchronization will stop because promotion is triggered")); + } } /* @@ -1437,6 +1461,34 @@ check_and_set_sync_info(pid_t sync_process_pid) { SpinLockAcquire(&SlotSyncCtx->mutex); + /* + * Exit immediately if promotion has been triggered. This guards against + * a new worker (or a call to pg_sync_replication_slots()) that starts + * after the old worker was stopped by ShutDownSlotSync(). + */ + if (SlotSyncCtx->stopSignaled) + { + SpinLockRelease(&SlotSyncCtx->mutex); + + if (AmLogicalSlotSyncWorkerProcess()) + { + ereport(DEBUG1, + errmsg("replication slot synchronization worker will not start because promotion was triggered")); + + proc_exit(0); + } + else + { + /* + * For the backend executing SQL function + * pg_sync_replication_slots(). + */ + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("replication slot synchronization will not start because promotion was triggered")); + } + } + if (SlotSyncCtx->syncing) { SpinLockRelease(&SlotSyncCtx->mutex); @@ -1652,7 +1704,10 @@ ReplSlotSyncWorkerMain(const void *startup_data, size_t startup_data_len) bool started_tx = false; List *remote_slots; - ProcessSlotSyncInterrupts(); + CHECK_FOR_INTERRUPTS(); + + if (ConfigReloadPending) + slotsync_reread_config(); /* * The syscache access in fetch_remote_slots() needs a transaction @@ -1765,11 +1820,11 @@ ShutDownSlotSync(void) SpinLockRelease(&SlotSyncCtx->mutex); /* - * Signal process doing slotsync, if any. The process will stop upon - * detecting that the stopSignaled flag is set to true. + * Signal process doing slotsync, if any, asking it to stop. */ if (sync_process_pid != InvalidPid) - kill(sync_process_pid, SIGUSR1); + SendProcSignal(sync_process_pid, PROCSIG_SLOTSYNC_MESSAGE, + INVALID_PROC_NUMBER); /* Wait for slot sync to end */ for (;;) @@ -1942,9 +1997,6 @@ SyncReplicationSlots(WalReceiverConn *wrconn) check_and_set_sync_info(MyProcPid); - /* Check for interrupts and config changes */ - ProcessSlotSyncInterrupts(); - validate_remote_info(wrconn); /* Retry until all the slots are sync-ready */ @@ -1954,7 +2006,10 @@ SyncReplicationSlots(WalReceiverConn *wrconn) bool some_slot_updated = false; /* Check for interrupts and config changes */ - ProcessSlotSyncInterrupts(); + CHECK_FOR_INTERRUPTS(); + + if (ConfigReloadPending) + slotsync_reread_config(); /* We must be in a valid transaction state */ Assert(IsTransactionState()); diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index 4e3ee27a058..264e4c22ca6 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -26,6 +26,7 @@ #include "postmaster/datachecksum_state.h" #include "replication/logicalctl.h" #include "replication/logicalworker.h" +#include "replication/slotsync.h" #include "replication/walsender.h" #include "storage/condition_variable.h" #include "storage/ipc.h" @@ -710,6 +711,9 @@ procsignal_sigusr1_handler(SIGNAL_ARGS) if (CheckProcSignal(PROCSIG_REPACK_MESSAGE)) HandleRepackMessageInterrupt(); + if (CheckProcSignal(PROCSIG_SLOTSYNC_MESSAGE)) + HandleSlotSyncMessageInterrupt(); + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT)) HandleRecoveryConflictInterrupt(); diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 14a061599bc..aeaf1c6db8f 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -59,6 +59,7 @@ #include "postmaster/postmaster.h" #include "replication/logicallauncher.h" #include "replication/logicalworker.h" +#include "replication/slotsync.h" #include "replication/slot.h" #include "replication/walsender.h" #include "rewrite/rewriteHandler.h" @@ -3599,6 +3600,9 @@ ProcessInterrupts(void) if (ParallelApplyMessagePending) ProcessParallelApplyMessages(); + if (SlotSyncShutdownPending) + ProcessSlotSyncMessage(); + if (RepackMessagePending) ProcessRepackMessages(); } diff --git a/src/include/replication/slotsync.h b/src/include/replication/slotsync.h index d2121cd3ed7..a55d1f0dccc 100644 --- a/src/include/replication/slotsync.h +++ b/src/include/replication/slotsync.h @@ -12,10 +12,15 @@ #ifndef SLOTSYNC_H #define SLOTSYNC_H +#include + #include "replication/walreceiver.h" extern PGDLLIMPORT bool sync_replication_slots; +/* Interrupt flag set by HandleSlotSyncMessageInterrupt() */ +extern PGDLLIMPORT volatile sig_atomic_t SlotSyncShutdownPending; + /* * GUCs needed by slot sync worker to connect to the primary * server and carry on with slots synchronization. @@ -32,5 +37,7 @@ extern void ShutDownSlotSync(void); extern bool SlotSyncWorkerCanRestart(void); extern bool IsSyncingReplicationSlots(void); extern void SyncReplicationSlots(WalReceiverConn *wrconn); +extern void HandleSlotSyncMessageInterrupt(void); +extern void ProcessSlotSyncMessage(void); #endif /* SLOTSYNC_H */ diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index 480c02203b0..aaa158bfd66 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -36,6 +36,7 @@ typedef enum PROCSIG_BARRIER, /* global barrier interrupt */ PROCSIG_LOG_MEMORY_CONTEXT, /* ask backend to log the memory contexts */ PROCSIG_PARALLEL_APPLY_MESSAGE, /* Message from parallel apply workers */ + PROCSIG_SLOTSYNC_MESSAGE, /* ask slot synchronization to stop */ PROCSIG_REPACK_MESSAGE, /* Message from repack worker */ PROCSIG_RECOVERY_CONFLICT, /* backend is blocking recovery, check * PGPROC->pendingRecoveryConflicts for the