git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Fix multixact backwards-compatibility with CHECKPOINT race condition
author Heikki Linnakangas <heikki.linnakangas@iki.fi>
Mon, 23 Mar 2026 09:53:32 +0000 (11:53 +0200)
committer Heikki Linnakangas <heikki.linnakangas@iki.fi>
Mon, 23 Mar 2026 09:59:07 +0000 (11:59 +0200)
If a CHECKPOINT record with nextMulti N is written to the WAL before
the CREATE_ID record for N, and N happens to be the first multixid on
an offset page, the backwards compatibility logic to tolerate WAL
generated by older minor versions (before commit 789d65364c) failed to
compensate for the missing XLOG_MULTIXACT_ZERO_OFF_PAGE record. In
that case, the latest_page_number was initialized at the start of WAL
replay to the page for nextMulti from the CHECKPOINT record, even if
we had not seen the CREATE_ID record for that multixid yet, which
fooled the backwards compatibility logic into thinking that the page was
already initialized.

To fix, track the last XLOG_MULTIXACT_ZERO_OFF_PAGE that we've seen
separately from latest_page_number. If we haven't seen any
XLOG_MULTIXACT_ZERO_OFF_PAGE records yet, use
SimpleLruDoesPhysicalPageExist() to check if the page needs to be
initialized.

Reported-by: duankunren.dkr <duankunren.dkr@alibaba-inc.com>
Analyzed-by: duankunren.dkr <duankunren.dkr@alibaba-inc.com>
Reviewed-by: Andrey Borodin <x4mmm@yandex-team.ru>
Reviewed-by: Kirill Reshke <reshkekirill@gmail.com>
Discussion: https://www.postgresql.org/message-id/c4ef1737-8cba-458e-b6fd-4e2d6011e985.duankunren.dkr@alibaba-inc.com
Backpatch-through: 14-18

src/backend/access/transam/multixact.c
src/include/access/slru.h

index 0956272eebe194cd312ce25e1d4b8d30e8a23d27..5067c2a3e394d56fac6bf4e2665e742c8be6a707 100644 (file)
@@ -416,7 +416,17 @@ static MemoryContext MXactContext = NULL;
 #define debug_elog6(a,b,c,d,e,f)
 #endif
 
-/* hack to deal with WAL generated with older minor versions */
+/*
+ * Hack to deal with WAL generated with older minor versions.
+ *
+ * last_initialized_offsets_page is the page zeroed by the last
+ * XLOG_MULTIXACT_ZERO_OFF_PAGE record replayed, or -1 if none seen yet.
+ *
+ * pre_initialized_offsets_page is the last page that was implicitly
+ * initialized by replaying a XLOG_MULTIXACT_CREATE_ID record, when we had not
+ * seen a XLOG_MULTIXACT_ZERO_OFF_PAGE record for the page yet.
+ */
+static int64 last_initialized_offsets_page = -1;
 static int64 pre_initialized_offsets_page = -1;
 
 /* internal MultiXactId management */
@@ -976,29 +986,68 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
         * such a version, the next page might not be initialized yet.  Initialize
         * it now.
         */
-       if (InRecovery &&
-               next_pageno != pageno &&
-               pg_atomic_read_u64(&MultiXactOffsetCtl->shared->latest_page_number) == pageno)
+       if (InRecovery && next_pageno != pageno)
        {
-               elog(DEBUG1, "next offsets page is not initialized, initializing it now");
+               bool            init_needed;
 
-               lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno);
-               LWLockAcquire(lock, LW_EXCLUSIVE);
+               /*----------
+                * Check if the page exists, and if not, initialize it now.
+                *
+                * The straightforward way to check if the page exists is to call
+                * SimpleLruDoesPhysicalPageExist().  However, there are two problems
+                * with that:
+                *
+                * 1. It's somewhat expensive to call on every page switch.
+                *
+                * 2. It does not take into account pages that have been initialized
+                *    in the SLRU buffer cache but not yet flushed to disk.  For such
+                *    pages, it will incorrectly return false.
+                *
+                * To fix both of those problems, if we have replayed any
+                * XLOG_MULTIXACT_ZERO_OFF_PAGE records, we assume that the last page
+                * that was zeroed by XLOG_MULTIXACT_ZERO_OFF_PAGE is the last page
+                * that exists.  This works because the XLOG_MULTIXACT_ZERO_OFF_PAGE
+                * records must appear in the WAL in order, unlike CREATE_ID records.
+                * We only resort to SimpleLruDoesPhysicalPageExist() if we haven't
+                * seen any XLOG_MULTIXACT_ZERO_OFF_PAGE records yet, which should
+                * happen at most once after starting WAL recovery.
+                *
+                * As an extra safety measure, if we do resort to
+                * SimpleLruDoesPhysicalPageExist(), flush the SLRU buffers first so
+                * that it will return an accurate result.
+                *----------
+                */
+               if (last_initialized_offsets_page == -1)
+               {
+                       SimpleLruWriteAll(MultiXactOffsetCtl, false);
+                       init_needed = !SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, next_pageno);
+               }
+               else
+                       init_needed = (last_initialized_offsets_page == pageno);
 
-               /* Create and zero the page */
-               slotno = SimpleLruZeroPage(MultiXactOffsetCtl, next_pageno);
+               if (init_needed)
+               {
+                       elog(DEBUG1, "next offsets page is not initialized, initializing it now");
 
-               /* Make sure it's written out */
-               SimpleLruWritePage(MultiXactOffsetCtl, slotno);
-               Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
+                       lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno);
+                       LWLockAcquire(lock, LW_EXCLUSIVE);
 
-               LWLockRelease(lock);
+                       /* Create and zero the page */
+                       slotno = SimpleLruZeroPage(MultiXactOffsetCtl, next_pageno);
 
-               /*
-                * Remember that we initialized the page, so that we don't zero it
-                * again at the XLOG_MULTIXACT_ZERO_OFF_PAGE record.
-                */
-               pre_initialized_offsets_page = next_pageno;
+                       /* Make sure it's written out */
+                       SimpleLruWritePage(MultiXactOffsetCtl, slotno);
+                       Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
+
+                       LWLockRelease(lock);
+
+                       /*
+                        * Remember that we initialized the page, so that we don't zero it
+                        * again at the XLOG_MULTIXACT_ZERO_OFF_PAGE record.
+                        */
+                       pre_initialized_offsets_page = next_pageno;
+                       last_initialized_offsets_page = next_pageno;
+               }
        }
 
        /*
@@ -3554,6 +3603,8 @@ multixact_redo(XLogReaderState *record)
                        Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
 
                        LWLockRelease(lock);
+
+                       last_initialized_offsets_page = pageno;
                }
                else
                        elog(DEBUG1, "skipping initialization of offsets page " INT64_FORMAT " because it was already initialized on multixid creation", pageno);
index 8edc95b4962d75de1e82e576140f7a31525106ca..9409f68035a052942c6b19cb53eb677cbe7f3432 100644 (file)
@@ -110,9 +110,7 @@ typedef struct SlruSharedData
        /*
         * latest_page_number is the page number of the current end of the log;
         * this is not critical data, since we use it only to avoid swapping out
-        * the latest page.  (An exception: an accurate latest_page_number is
-        * needed on pg_multixact/offsets to replay WAL generated with older minor
-        * versions correctly.  See RecordNewMultiXact().)
+        * the latest page.
         */
        pg_atomic_uint64 latest_page_number;