git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Fix LOCK_TIMEOUT handling during parallel apply.
author Amit Kapila <akapila@postgresql.org>
Wed, 24 Sep 2025 03:38:27 +0000 (03:38 +0000)
committer Amit Kapila <akapila@postgresql.org>
Wed, 24 Sep 2025 03:38:27 +0000 (03:38 +0000)
Previously, the parallel apply worker used SIGINT to receive a graceful
shutdown signal from the leader apply worker. However, SIGINT is also used
by the LOCK_TIMEOUT handler to trigger a query-cancel interrupt. This
overlap caused the parallel apply worker to miss LOCK_TIMEOUT signals,
leading to incorrect behavior during lock wait/contention.

This patch resolves the conflict by switching the graceful shutdown signal
from SIGINT to SIGUSR2.

Reported-by: Zane Duffield <duffieldzane@gmail.com>
Diagnosed-by: Zhijie Hou <houzj.fnst@fujitsu.com>
Author: Hayato Kuroda <kuroda.hayato@fujitsu.com>
Reviewed-by: Amit Kapila <amit.kapila16@gmail.com>
Backpatch-through: 16, where it was introduced
Discussion: https://postgr.es/m/CACMiCkXyC4au74kvE2g6Y=mCEF8X6r-Ne_ty4r7qWkUjRE4+oQ@mail.gmail.com

src/backend/postmaster/interrupt.c
src/backend/replication/logical/applyparallelworker.c
src/backend/replication/logical/launcher.c

index 6d4bd76bf8194edef8c0d6299f0f50b92ad68040..3041c4a0107bb7942a38a012850bc3959eeb2b26 100644 (file)
@@ -98,9 +98,8 @@ SignalHandlerForCrashExit(SIGNAL_ARGS)
  * shut down and exit.
  *
  * Typically, this handler would be used for SIGTERM, but some processes use
- * other signals. In particular, the checkpointer exits on SIGUSR2, and the WAL
- * writer and the logical replication parallel apply worker exits on either
- * SIGINT or SIGTERM.
+ * other signals. In particular, the checkpointer and parallel apply worker
+ * exit on SIGUSR2, and the WAL writer exits on either SIGINT or SIGTERM.
  *
  * ShutdownRequestPending should be checked at a convenient place within the
  * main loop, or else the main loop should call HandleMainLoopInterrupts.
index a24709ec7a5f4d52bf7861fb3f3eb1221c9f9722..4a2c67f9e40938aa9290b0e6cef50364fb8f6562 100644 (file)
@@ -872,10 +872,17 @@ ParallelApplyWorkerMain(Datum main_arg)
 
        InitializingApplyWorker = true;
 
-       /* Setup signal handling. */
+       /*
+        * Setup signal handling.
+        *
+        * Note: We intentionally used SIGUSR2 to trigger a graceful shutdown
+        * initiated by the leader apply worker. This helps to differentiate it
+        * from the case where we abort the current transaction and exit on
+        * receiving SIGTERM.
+        */
        pqsignal(SIGHUP, SignalHandlerForConfigReload);
-       pqsignal(SIGINT, SignalHandlerForShutdownRequest);
        pqsignal(SIGTERM, die);
+       pqsignal(SIGUSR2, SignalHandlerForShutdownRequest);
        BackgroundWorkerUnblockSignals();
 
        /*
@@ -974,9 +981,9 @@ ParallelApplyWorkerMain(Datum main_arg)
 
        /*
         * The parallel apply worker must not get here because the parallel apply
-        * worker will only stop when it receives a SIGTERM or SIGINT from the
-        * leader, or when there is an error. None of these cases will allow the
-        * code to reach here.
+        * worker will only stop when it receives a SIGTERM or SIGUSR2 from the
+        * leader, or SIGINT from itself, or when there is an error. None of these
+        * cases will allow the code to reach here.
         */
        Assert(false);
 }
index 423d85fd40c8ea5f947ae7ba25a5493207a8f8fe..a49c6648cc202e575b37667a42db8347f712cb3d 100644 (file)
@@ -624,7 +624,7 @@ logicalrep_worker_stop(Oid subid, Oid relid)
 /*
  * Stop the given logical replication parallel apply worker.
  *
- * Node that the function sends SIGINT instead of SIGTERM to the parallel apply
+ * Note that the function sends SIGUSR2 instead of SIGTERM to the parallel apply
  * worker so that the worker exits cleanly.
  */
 void
@@ -662,7 +662,7 @@ logicalrep_pa_worker_stop(ParallelApplyWorkerInfo *winfo)
         * Only stop the worker if the generation matches and the worker is alive.
         */
        if (worker->generation == generation && worker->proc)
-               logicalrep_worker_stop_internal(worker, SIGINT);
+               logicalrep_worker_stop_internal(worker, SIGUSR2);
 
        LWLockRelease(LogicalRepWorkerLock);
 }