]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Fix REPACK decoding worker not cleaned up on FATAL exit
authorÁlvaro Herrera <alvherre@kurilemu.de>
Tue, 19 May 2026 18:37:46 +0000 (11:37 -0700)
committerÁlvaro Herrera <alvherre@kurilemu.de>
Tue, 19 May 2026 18:37:46 +0000 (11:37 -0700)
When the launching backend of REPACK (CONCURRENTLY) is terminated via
pg_terminate_backend(), ProcDiePending causes ereport(FATAL) which
bypasses PG_FINALLY blocks.  As a result, stop_repack_decoding_worker()
is never called, leaving the decoding worker running indefinitely and
holding its temporary replication slot.

Fix by using PG_ENSURE_ERROR_CLEANUP, which handles both ERROR and
FATAL exits.

Author: Baji Shaik <baji.pgdev@gmail.com>
Reviewed-by: Sami Imseih <samimseih@gmail.com>
Reviewed-by: Álvaro Herrera <alvherre@kurilemu.de>
Discussion: https://postgr.es/m/CA+fm-RNoPxL2N7db_A0anMXV_aDu6jWj4PNOPtMtBUAPDPvSXQ@mail.gmail.com

src/backend/commands/repack.c

index 351a3cc32e8f1ca0499d44d04cb9ee32ccb5bcf3..22a1307b38d6f4c5972e3842131181158f0976df 100644 (file)
@@ -64,6 +64,7 @@
 #include "pgstat.h"
 #include "replication/logicalrelation.h"
 #include "storage/bufmgr.h"
+#include "storage/ipc.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
 #include "storage/proc.h"
@@ -211,6 +212,7 @@ static Oid  determine_clustered_index(Relation rel, bool usingindex,
 
 static void start_repack_decoding_worker(Oid relid);
 static void stop_repack_decoding_worker(void);
+static void stop_repack_decoding_worker_cb(int code, Datum arg);
 static Snapshot get_initial_snapshot(DecodingWorker *worker);
 
 static void ProcessRepackMessage(StringInfo msg);
@@ -666,27 +668,26 @@ cluster_rel(RepackCommand cmd, Relation OldHeap, Oid indexOid,
        if (!concurrent)
                TransferPredicateLocksToHeapRelation(OldHeap);
 
-       /* rebuild_relation does all the dirty work */
-       PG_TRY();
-       {
-               rebuild_relation(OldHeap, index, verbose, ident_idx);
-       }
-       PG_FINALLY();
+       /*
+        * rebuild_relation does all the dirty work, and closes OldHeap and index,
+        * if valid.
+        *
+        * In concurrent mode, make sure the worker terminates; normally it does
+        * so by itself, but a PG_ENSURE_ERROR_CLEANUP callback ensures that this
+        * happens even in case this backend dies early on a FATAL exit.  Normal
+        * mode doesn't need that overhead.
+        */
+       if (concurrent)
        {
-               if (concurrent)
+               PG_ENSURE_ERROR_CLEANUP(stop_repack_decoding_worker_cb, 0);
                {
-                       /*
-                        * Since during normal operation the worker was already asked to
-                        * exit, stopping it explicitly is especially important on ERROR.
-                        * However it still seems a good practice to make sure that the
-                        * worker never survives the REPACK command.
-                        */
-                       stop_repack_decoding_worker();
+                       rebuild_relation(OldHeap, index, verbose, ident_idx);
                }
+               PG_END_ENSURE_ERROR_CLEANUP(stop_repack_decoding_worker_cb, 0);
+               stop_repack_decoding_worker();
        }
-       PG_END_TRY();
-
-       /* rebuild_relation closes OldHeap, and index if valid */
+       else
+               rebuild_relation(OldHeap, index, verbose, ident_idx);
 
 out:
        /* Roll back any GUC changes executed by index functions */
@@ -3534,6 +3535,13 @@ stop_repack_decoding_worker(void)
        decoding_worker = NULL;
 }
 
+/* stop_repack_decoding_worker, wrapped as a before_shmem_exit callback */
+static void
+stop_repack_decoding_worker_cb(int code, Datum arg)
+{
+       stop_repack_decoding_worker();
+}
+
 /*
  * Get the initial snapshot from the decoding worker.
  */