]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Fix race when logical decoding activation is concurrently interrupted.
authorMasahiko Sawada <msawada@postgresql.org>
Tue, 9 Jun 2026 18:19:27 +0000 (11:19 -0700)
committerMasahiko Sawada <msawada@postgresql.org>
Tue, 9 Jun 2026 18:19:27 +0000 (11:19 -0700)
EnableLogicalDecoding() sets xlog_logical_info to true, emits a
procsignal barrier, sets logical_decoding_enabled to true, and then
writes a WAL record. If the activating backend is interrupted between
these steps, a PG_ENSURE_ERROR_CLEANUP() callback runs to undo the
partial activation.

The previous callback asserted that logical_decoding_enabled was still
false and then cleared xlog_logical_info. Both actions were unsafe
when a second backend was concurrently activating: the peer backend
might have already observed xlog_logical_info as true, set
logical_decoding_enabled to true, and written the activation WAL
record before our callback fired, causing the first backend to hit the
assertion failure.

Fix this by having the abort callback call
RequestDisableLogicalDecoding(), allowing the checkpointer to undo the
partial activation in the same manner as a normal deactivation. This
simplifies the logic by unifying the activation abort and deactivation
paths. While this approach now wakes up the checkpointer when an
activation is interrupted, this should not be a serious issue in
practice since such interruptions are rare.

Add a test case to 051_effective_wal_level.pl.

Reported-by: Chao Li <li.evan.chao@gmail.com>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Discussion: https://postgr.es/m/788B5B8A-BC22-48D8-818E-7B00416CF84E@gmail.com

src/backend/replication/logical/logicalctl.c
src/test/recovery/t/051_effective_wal_level.pl

index 72f68ec58ef7e332e76d5baad4dca858d9bfa55a..c11d131645085ed72838c6a896e6ceeb1bd20e4c 100644 (file)
@@ -256,33 +256,19 @@ write_logical_decoding_status_update_record(bool status)
 }
 
 /*
- * A PG_ENSURE_ERROR_CLEANUP callback for activating logical decoding, resetting
- * the shared flags to revert the logical decoding activation process.
+ * A PG_ENSURE_ERROR_CLEANUP callback for activating logical decoding.
+ *
+ * Rather than directly reverting xlog_logical_info here, we request
+ * that the checkpointer handle it via the normal disable path. This
+ * avoids race conditions when multiple backends attempt concurrent
+ * activation: the checkpointer will reset xlog_logical_info when
+ * no valid logical slots exist.
  */
 static void
 abort_logical_decoding_activation(int code, Datum arg)
 {
-       Assert(MyReplicationSlot);
-       Assert(!LogicalDecodingCtl->logical_decoding_enabled);
-
        elog(DEBUG1, "aborting logical decoding activation process");
-
-       /*
-        * Abort the change to xlog_logical_info. We don't need to check
-        * CheckLogicalSlotExists() as we're still holding a logical slot.
-        */
-       LWLockAcquire(LogicalDecodingControlLock, LW_EXCLUSIVE);
-       LogicalDecodingCtl->xlog_logical_info = false;
-       LWLockRelease(LogicalDecodingControlLock);
-
-       /*
-        * Some processes might have already started logical info WAL logging, so
-        * tell all running processes to update their caches. We don't need to
-        * wait for all processes to disable xlog_logical_info locally as it's
-        * always safe to write logical information to WAL records, even when not
-        * strictly required.
-        */
-       EmitProcSignalBarrier(PROCSIGNAL_BARRIER_UPDATE_XLOG_LOGICAL_INFO);
+       RequestDisableLogicalDecoding();
 }
 
 /*
@@ -396,6 +382,8 @@ EnableLogicalDecoding(void)
         * long-running read queries, which is practically unacceptable.
         */
 
+       LWLockAcquire(LogicalDecodingControlLock, LW_EXCLUSIVE);
+
        START_CRIT_SECTION();
 
        /*
@@ -403,8 +391,6 @@ EnableLogicalDecoding(void)
         * This sequence ensures logical decoding becomes available on the primary
         * first.
         */
-       LWLockAcquire(LogicalDecodingControlLock, LW_EXCLUSIVE);
-
        LogicalDecodingCtl->logical_decoding_enabled = true;
 
        if (!in_recovery)
@@ -412,10 +398,16 @@ EnableLogicalDecoding(void)
 
        LogicalDecodingCtl->pending_disable = false;
 
-       LWLockRelease(LogicalDecodingControlLock);
-
        END_CRIT_SECTION();
 
+       LWLockRelease(LogicalDecodingControlLock);
+
+       /*
+        * We log the activation message after releasing the slot lock. This is
+        * safe because the activation is performed while holding a logical slot,
+        * meaning, a concurrent deactivation cannot interleave its log message
+        * ahead of ours.
+        */
        if (!in_recovery)
                ereport(LOG,
                                errmsg("logical decoding is enabled upon creating a new logical replication slot"));
@@ -489,16 +481,19 @@ void
 DisableLogicalDecoding(void)
 {
        bool            in_recovery = RecoveryInProgress();
+       bool            was_enabled;
 
        LWLockAcquire(LogicalDecodingControlLock, LW_EXCLUSIVE);
 
        /*
         * Check if we can disable logical decoding.
         *
-        * Skip CheckLogicalSlotExists() check during recovery because the
-        * existing slots will be invalidated after disabling logical decoding.
+        * Nothing to do if both flags are already off, or if valid slots exist
+        * (skip the slot check during recovery because the existing slots will be
+        * invalidated after disabling logical decoding.)
         */
-       if (!LogicalDecodingCtl->logical_decoding_enabled ||
+       if ((!LogicalDecodingCtl->logical_decoding_enabled &&
+                !LogicalDecodingCtl->xlog_logical_info) ||
                (!in_recovery && CheckLogicalSlotExists()))
        {
                LogicalDecodingCtl->pending_disable = false;
@@ -506,6 +501,13 @@ DisableLogicalDecoding(void)
                return;
        }
 
+       /*
+        * Remember if logical decoding was enabled. An interrupted activation can
+        * leave xlog_logical_info=true while logical_decoding_enabled remains
+        * false.
+        */
+       was_enabled = LogicalDecodingCtl->logical_decoding_enabled;
+
        START_CRIT_SECTION();
 
        /*
@@ -516,7 +518,7 @@ DisableLogicalDecoding(void)
        LogicalDecodingCtl->logical_decoding_enabled = false;
 
        /* Write the WAL to disable logical decoding on standbys too */
-       if (!in_recovery)
+       if (!in_recovery && was_enabled)
                write_logical_decoding_status_update_record(false);
 
        /* Now disable logical information WAL logging */
@@ -525,7 +527,12 @@ DisableLogicalDecoding(void)
 
        END_CRIT_SECTION();
 
-       if (!in_recovery)
+       /*
+        * Logging under the lock guarantees our "is disabled" message appears in
+        * the server log before its eventual "is enabled", making server log
+        * diagnostics easy.
+        */
+       if (!in_recovery && was_enabled)
                ereport(LOG,
                                errmsg("logical decoding is disabled because there are no valid logical replication slots"));
 
index c862073c34ec7c05ae225f65dcd0d48484759935..c45eddc7383a7f2a960dcfa5ddd9939ba58a8f34 100644 (file)
@@ -410,8 +410,49 @@ select pg_cancel_backend(pid) from pg_stat_activity where query ~ 'slot_canceled
 
        # Verify that the backend aborted the activation process.
        $primary->wait_for_log("aborting logical decoding activation process");
-       test_wal_level($primary, "replica|replica",
-               "the activation process aborted");
+       wait_for_logical_decoding_disabled($primary);
+       pass("the activation process aborted");
+
+       # Test concurrent activation processes run and one is interrupted.
+       $psql_create_slot = $primary->background_psql('postgres');
+
+       # Start a psql session and stops in the middle of the activation
+       # process.
+       $psql_create_slot->query_until(
+               qr/create_slot_canceled/,
+               q(\echo create_slot_canceled
+select injection_points_set_local();
+select injection_points_attach('logical-decoding-activation', 'wait');
+select pg_create_logical_replication_slot('slot_canceled2', 'pgoutput');
+\q
+));
+       $primary->wait_for_event('client backend', 'logical-decoding-activation');
+       note("injection_point 'logical-decoding-activation' is reached");
+
+       # Another backend concurrently enables logical decoding.
+       $primary->safe_psql('postgres',
+               qq[select pg_create_logical_replication_slot('test_slot2', 'pgoutput')]
+       );
+
+       # Concurrent slot creation should not be blocked. So wait until
+       # test_slot2 is created and logical decoding is enabled.
+       test_wal_level($primary, "replica|logical",
+               "the concurrent activation has done properly");
+
+       # Cancel the backend initiated by $psql_create_slot, aborting its activation
+       # process.
+       my $log_offset = -s $primary->logfile;
+       $primary->safe_psql(
+               'postgres',
+               qq[
+select pg_cancel_backend(pid) from pg_stat_activity where query ~ 'slot_canceled2' and pid <> pg_backend_pid()
+]);
+       # Canceling the backend should not affect the concurrent slot creation.
+       $primary->wait_for_log("canceling statement due to user request",
+               $log_offset);
+       test_wal_level($primary, "replica|logical",
+               "effective_wal_level remains 'logical' even after the concurrent activation is interrupted"
+       );
 }
 
 $primary->stop;