]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Fix invalid checksum state transition in checkpoints
authorDaniel Gustafsson <dgustafsson@postgresql.org>
Thu, 30 Apr 2026 11:41:48 +0000 (13:41 +0200)
committerDaniel Gustafsson <dgustafsson@postgresql.org>
Thu, 30 Apr 2026 11:41:48 +0000 (13:41 +0200)
Commit 78e950cb8 added checksum state handling to all XLOG_CHECKPOINT
records which caused unnecessary state transitions and emission of
procsignal barriers.  Remove as only the _REDO record need to handle
checksum state.  Barrier emission is also consistently made after
controlfile updates to avoid race conditions.

Additionally, interrupts are held between calling ProcSignalInit and
InitLocalDataChecksumState to remove a window where otherwise invalid
state transitions can happen.

Also remove a pointless assertion on Controlfile which will never hit.

Author: Tomas Vondra <tomas@vondra.me>
Author: Daniel Gustafsson <daniel@yesql.se>
Reviewed-by: Ayush Tiwari <ayushtiwari.slg01@gmail.com>
Reviewed-by: SATYANARAYANA NARLAPURAM <satyanarlapuram@gmail.com>
Discussion: https://postgr.es/m/9197F930-DDEB-4CAC-82A2-16FEC715CCE8@yesql.se

src/backend/access/transam/xlog.c
src/backend/postmaster/auxprocess.c
src/backend/postmaster/datachecksum_state.c
src/backend/utils/init/postinit.c

index f74d7a2ab1a282aa8a5402daece40698c0aeeedc..b20c6a2886b1ffae41fed466bd5cb50919e8a722 100644 (file)
@@ -4723,8 +4723,6 @@ SetDataChecksumsOnInProgress(void)
 {
        uint64          barrier;
 
-       Assert(ControlFile != NULL);
-
        /*
         * The state transition is performed in a critical section with
         * checkpoints held off to provide crash safety.
@@ -4738,25 +4736,16 @@ SetDataChecksumsOnInProgress(void)
        XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_ON;
        SpinLockRelease(&XLogCtl->info_lck);
 
-       barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON);
-
-       MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
-       END_CRIT_SECTION();
-
-       /*
-        * Update the controlfile before waiting since if we have an immediate
-        * shutdown while waiting we want to come back up with checksums enabled.
-        */
        LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
        ControlFile->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_ON;
        UpdateControlFile();
        LWLockRelease(ControlFileLock);
 
-       /*
-        * Await state change in all backends to ensure that all backends are in
-        * "inprogress-on". Once done we know that all backends are writing data
-        * checksums.
-        */
+       barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON);
+
+       MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
+       END_CRIT_SECTION();
+
        WaitForProcSignalBarrier(barrier);
 }
 
@@ -4787,8 +4776,6 @@ SetDataChecksumsOn(void)
 {
        uint64          barrier;
 
-       Assert(ControlFile != NULL);
-
        SpinLockAcquire(&XLogCtl->info_lck);
 
        /*
@@ -4818,11 +4805,6 @@ SetDataChecksumsOn(void)
        XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_VERSION;
        SpinLockRelease(&XLogCtl->info_lck);
 
-       barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON);
-
-       MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
-       END_CRIT_SECTION();
-
        /*
         * Update the controlfile before waiting since if we have an immediate
         * shutdown while waiting we want to come back up with checksums enabled.
@@ -4832,12 +4814,12 @@ SetDataChecksumsOn(void)
        UpdateControlFile();
        LWLockRelease(ControlFileLock);
 
-       RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST);
+       barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON);
 
-       /*
-        * Await state transition to "on" in all backends. When done we know that
-        * data checksums are both written and verified in all backends.
-        */
+       MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
+       END_CRIT_SECTION();
+
+       RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST);
        WaitForProcSignalBarrier(barrier);
 }
 
@@ -4859,8 +4841,6 @@ SetDataChecksumsOff(void)
 {
        uint64          barrier;
 
-       Assert(ControlFile != NULL);
-
        SpinLockAcquire(&XLogCtl->info_lck);
 
        /* If data checksums are already disabled there is nothing to do */
@@ -4891,22 +4871,17 @@ SetDataChecksumsOff(void)
                XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_OFF;
                SpinLockRelease(&XLogCtl->info_lck);
 
+               LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+               ControlFile->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_OFF;
+               UpdateControlFile();
+               LWLockRelease(ControlFileLock);
+
                barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF);
 
                MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
                END_CRIT_SECTION();
 
-               LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-               ControlFile->data_checksum_version = PG_DATA_CHECKSUM_OFF;
-               UpdateControlFile();
-               LWLockRelease(ControlFileLock);
-
                RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST);
-
-               /*
-                * Update local state in all backends to ensure that any backend in
-                * "on" state is changed to "inprogress-off".
-                */
                WaitForProcSignalBarrier(barrier);
 
                /*
@@ -4935,18 +4910,17 @@ SetDataChecksumsOff(void)
        XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_OFF;
        SpinLockRelease(&XLogCtl->info_lck);
 
-       barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF);
-
-       MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
-       END_CRIT_SECTION();
-
        LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
        ControlFile->data_checksum_version = PG_DATA_CHECKSUM_OFF;
        UpdateControlFile();
        LWLockRelease(ControlFileLock);
 
-       RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST);
+       barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF);
 
+       MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
+       END_CRIT_SECTION();
+
+       RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST);
        WaitForProcSignalBarrier(barrier);
 }
 
@@ -4961,6 +4935,7 @@ SetDataChecksumsOff(void)
 void
 InitLocalDataChecksumState(void)
 {
+       Assert(InterruptHoldoffCount > 0);
        SpinLockAcquire(&XLogCtl->info_lck);
        SetLocalDataChecksumState(XLogCtl->data_checksum_version);
        SpinLockRelease(&XLogCtl->info_lck);
@@ -5427,7 +5402,6 @@ XLOGShmemInit(void *arg)
 
        /* Use the checksum info from control file */
        XLogCtl->data_checksum_version = ControlFile->data_checksum_version;
-
        SetLocalDataChecksumState(XLogCtl->data_checksum_version);
 
        SpinLockInit(&XLogCtl->Insert.insertpos_lck);
@@ -6650,6 +6624,7 @@ StartupXLOG(void)
        ControlFile->state = DB_IN_PRODUCTION;
 
        SpinLockAcquire(&XLogCtl->info_lck);
+       ControlFile->data_checksum_version = XLogCtl->data_checksum_version;
        XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
        SpinLockRelease(&XLogCtl->info_lck);
 
@@ -7518,7 +7493,9 @@ CreateCheckPoint(int flags)
         * Get the current data_checksum_version value from xlogctl, valid at the
         * time of the checkpoint.
         */
+       SpinLockAcquire(&XLogCtl->info_lck);
        checkPoint.dataChecksumState = XLogCtl->data_checksum_version;
+       SpinLockRelease(&XLogCtl->info_lck);
 
        if (shutdown)
        {
@@ -7639,10 +7616,6 @@ CreateCheckPoint(int flags)
                checkPoint.nextOid += TransamVariables->oidCount;
        LWLockRelease(OidGenLock);
 
-       SpinLockAcquire(&XLogCtl->info_lck);
-       checkPoint.dataChecksumState = XLogCtl->data_checksum_version;
-       SpinLockRelease(&XLogCtl->info_lck);
-
        checkPoint.logicalDecodingEnabled = IsLogicalDecodingEnabled();
 
        MultiXactGetCheckptMulti(shutdown,
@@ -7792,9 +7765,6 @@ CreateCheckPoint(int flags)
        ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
        ControlFile->minRecoveryPointTLI = 0;
 
-       /* make sure we start with the checksum version as of the checkpoint */
-       ControlFile->data_checksum_version = checkPoint.dataChecksumState;
-
        /*
         * Persist unloggedLSN value. It's reset on crash recovery, so this goes
         * unused on non-shutdown checkpoints, but seems useful to store it always
@@ -8871,11 +8841,6 @@ xlog_redo(XLogReaderState *record)
                MultiXactAdvanceOldest(checkPoint.oldestMulti,
                                                           checkPoint.oldestMultiDB);
 
-               SpinLockAcquire(&XLogCtl->info_lck);
-               XLogCtl->data_checksum_version = checkPoint.dataChecksumState;
-               SetLocalDataChecksumState(checkPoint.dataChecksumState);
-               SpinLockRelease(&XLogCtl->info_lck);
-
                /*
                 * No need to set oldestClogXid here as well; it'll be set when we
                 * redo an xl_clog_truncate if it changed since initialization.
@@ -8936,6 +8901,8 @@ xlog_redo(XLogReaderState *record)
                LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
                ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
                ControlFile->data_checksum_version = checkPoint.dataChecksumState;
+
+               UpdateControlFile();
                LWLockRelease(ControlFileLock);
 
                /*
@@ -8962,8 +8929,6 @@ xlog_redo(XLogReaderState *record)
        {
                CheckPoint      checkPoint;
                TimeLineID      replayTLI;
-               bool            new_state = false;
-               int                     old_state;
 
                memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
                /* In an ONLINE checkpoint, treat the XID counter as a minimum */
@@ -9002,8 +8967,6 @@ xlog_redo(XLogReaderState *record)
                /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
                LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
                ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
-               old_state = ControlFile->data_checksum_version;
-               ControlFile->data_checksum_version = checkPoint.dataChecksumState;
                LWLockRelease(ControlFileLock);
 
                /* TLI should not change in an on-line checkpoint */
@@ -9015,18 +8978,6 @@ xlog_redo(XLogReaderState *record)
 
                RecoveryRestartPoint(&checkPoint, record);
 
-               /*
-                * If the data checksum state change we need to emit a barrier.
-                */
-               SpinLockAcquire(&XLogCtl->info_lck);
-               XLogCtl->data_checksum_version = checkPoint.dataChecksumState;
-               if (checkPoint.dataChecksumState != old_state)
-                       new_state = true;
-               SpinLockRelease(&XLogCtl->info_lck);
-
-               if (new_state)
-                       EmitAndWaitDataChecksumsBarrier(checkPoint.dataChecksumState);
-
                /*
                 * After replaying a checkpoint record, free all smgr objects.
                 * Otherwise we would never do so for dropped relations, as the
@@ -9195,6 +9146,7 @@ xlog_redo(XLogReaderState *record)
 
                SpinLockAcquire(&XLogCtl->info_lck);
                XLogCtl->data_checksum_version = redo_rec.data_checksum_version;
+               SetLocalDataChecksumState(redo_rec.data_checksum_version);
                if (redo_rec.data_checksum_version != ControlFile->data_checksum_version)
                        new_state = true;
                SpinLockRelease(&XLogCtl->info_lck);
@@ -9268,6 +9220,11 @@ xlog2_redo(XLogReaderState *record)
                XLogCtl->data_checksum_version = state.new_checksum_state;
                SpinLockRelease(&XLogCtl->info_lck);
 
+               LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+               ControlFile->data_checksum_version = state.new_checksum_state;
+               UpdateControlFile();
+               LWLockRelease(ControlFileLock);
+
                /*
                 * Block on a procsignalbarrier to await all processes having seen the
                 * change to checksum status. Once the barrier has been passed we can
index 8fdc518b3a1e66487ff499d6e75b08dbd9888c68..ba8c9add67ac12d81365051ef4c6c35043def9d9 100644 (file)
@@ -68,6 +68,14 @@ AuxiliaryProcessMainCommon(void)
 
        BaseInit();
 
+       /*
+        * Prevent consuming interrupts between setting ProcSignalInit and setting
+        * the initial local data checksum value.  If a barrier is emitted, and
+        * absorbed, before local cached state is initialized the state transition
+        * can be invalid.
+        */
+       HOLD_INTERRUPTS();
+
        ProcSignalInit(NULL, 0);
 
        /*
@@ -88,6 +96,8 @@ AuxiliaryProcessMainCommon(void)
         */
        InitLocalDataChecksumState();
 
+       RESUME_INTERRUPTS();
+
        /*
         * Auxiliary processes don't run transactions, but they may need a
         * resource owner anyway to manage buffer pins acquired outside
index 77d0316b5cbd51d8f98253ff4c63026ca9cf89de..4a8aa5b5ee21abd5e0e9db8b3a2ae08198877331 100644 (file)
@@ -263,8 +263,8 @@ static const ChecksumBarrierCondition checksum_barriers[7] =
        {PG_DATA_CHECKSUM_INPROGRESS_OFF, PG_DATA_CHECKSUM_VERSION},
 
        /*
-        * If checksums are being enabled when launcher_exit is executed, state
-        * is set to off since we cannot reach on at that point.
+        * If checksums are being enabled when launcher_exit is executed, state is
+        * set to off since we cannot reach on at that point.
         */
        {PG_DATA_CHECKSUM_INPROGRESS_ON, PG_DATA_CHECKSUM_INPROGRESS_OFF},
 };
@@ -1291,8 +1291,8 @@ DataChecksumsShmemRequest(void *arg)
 /*
  * DatabaseExists
  *
- * Scans the system catalog to check if a database with the given Oid exist
- * and returns true if it is found, else false.
+ * Scans the system catalog to check if a database with the given Oid exists
+ * and returns true if it is found  else false.
  */
 static bool
 DatabaseExists(Oid dboid)
index 6f074013aa955c621a53e40bc16aba15e9ea0155..ecf78b9a9860dc22e3c1bc12291db0692d8926bf 100644 (file)
@@ -756,6 +756,14 @@ InitPostgres(const char *in_dbname, Oid dboid,
         */
        SharedInvalBackendInit(false);
 
+       /*
+        * Prevent consuming interrupts between setting ProcSignalInit and setting
+        * the initial local data checksum value.  If a barrier is emitted, and
+        * absorbed, before local cached state is initialized the state transition
+        * can be invalid.
+        */
+       HOLD_INTERRUPTS();
+
        ProcSignalInit(MyCancelKey, MyCancelKeyLength);
 
        /*
@@ -776,6 +784,8 @@ InitPostgres(const char *in_dbname, Oid dboid,
         */
        InitLocalDataChecksumState();
 
+       RESUME_INTERRUPTS();
+
        /*
         * Also set up timeout handlers needed for backend operation.  We need
         * these in every case except bootstrap.