git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Fix multixact backwards-compatibility with CHECKPOINT race condition
author Heikki Linnakangas <heikki.linnakangas@iki.fi>
Mon, 23 Mar 2026 09:53:32 +0000 (11:53 +0200)
committer Heikki Linnakangas <heikki.linnakangas@iki.fi>
Mon, 23 Mar 2026 10:02:27 +0000 (12:02 +0200)
If a CHECKPOINT record with nextMulti N is written to the WAL before
the CREATE_ID record for N, and N happens to be the first multixid on
an offset page, the backwards compatibility logic to tolerate WAL
generated by older minor versions (before commit 789d65364c) failed to
compensate for the missing XLOG_MULTIXACT_ZERO_OFF_PAGE record. In
that case, the latest_page_number was initialized at the start of WAL
replay to the page for nextMulti from the CHECKPOINT record, even if
we had not seen the CREATE_ID record for that multixid yet, which
fooled the backwards compatibility logic into thinking that the page was
already initialized.

To fix, track the last XLOG_MULTIXACT_ZERO_OFF_PAGE that we've seen
separately from latest_page_number. If we haven't seen any
XLOG_MULTIXACT_ZERO_OFF_PAGE records yet, use
SimpleLruDoesPhysicalPageExist() to check if the page needs to be
initialized.

Reported-by: duankunren.dkr <duankunren.dkr@alibaba-inc.com>
Analyzed-by: duankunren.dkr <duankunren.dkr@alibaba-inc.com>
Reviewed-by: Andrey Borodin <x4mmm@yandex-team.ru>
Reviewed-by: Kirill Reshke <reshkekirill@gmail.com>
Discussion: https://www.postgresql.org/message-id/c4ef1737-8cba-458e-b6fd-4e2d6011e985.duankunren.dkr@alibaba-inc.com
Backpatch-through: 14-18

src/backend/access/transam/multixact.c
src/include/access/slru.h

index 26b8d4e1230607e877bcfc44bb01b3361945f3d0..f825579e888fb4d5e205e46ce417640482f1aa75 100644 (file)
@@ -338,8 +338,18 @@ static MemoryContext MXactContext = NULL;
 #define debug_elog6(a,b,c,d,e,f)
 #endif
 
-/* hack to deal with WAL generated with older minor versions */
-static int     pre_initialized_offsets_page = -1;
+/*
+ * Hack to deal with WAL generated with older minor versions.
+ *
+ * last_initialized_offsets_page is the page zeroed by the last
+ * XLOG_MULTIXACT_ZERO_OFF_PAGE record we saw in WAL replay, or -1 if none yet.
+ *
+ * pre_initialized_offsets_page is the last page that was implicitly
+ * initialized by replaying a XLOG_MULTIXACT_CREATE_ID record, when we had not
+ * seen a XLOG_MULTIXACT_ZERO_OFF_PAGE record for the page yet.
+ */
+static int last_initialized_offsets_page = -1;
+static int pre_initialized_offsets_page = -1;
 
 /* internal MultiXactId management */
 static void MultiXactIdSetOldestVisible(void);
@@ -898,24 +908,63 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
         * such a version, the next page might not be initialized yet.  Initialize
         * it now.
         */
-       if (InRecovery &&
-               next_pageno != pageno &&
-               MultiXactOffsetCtl->shared->latest_page_number == pageno)
+       if (InRecovery && next_pageno != pageno)
        {
-               elog(DEBUG1, "next offsets page is not initialized, initializing it now");
+               bool            init_needed;
 
-               /* Create and zero the page */
-               slotno = SimpleLruZeroPage(MultiXactOffsetCtl, next_pageno);
+               /*----------
+                * Check if the page exists, and if not, initialize it now.
+                *
+                * The straightforward way to check if the page exists is to call
+                * SimpleLruDoesPhysicalPageExist().  However, there are two problems with
+                * that:
+                *
+                * 1. It's somewhat expensive to call on every page switch.
+                *
+                * 2. It does not take into account pages that have been initialized
+                *    in the SLRU buffer cache but not yet flushed to disk.  For such
+                *    pages, it will incorrectly return false.
+                *
+                * To fix both of those problems, if we have replayed any
+                * XLOG_MULTIXACT_ZERO_OFF_PAGE records, we assume that the last page
+                * that was zeroed by XLOG_MULTIXACT_ZERO_OFF_PAGE is the last page
+                * that exists.  This works because the XLOG_MULTIXACT_ZERO_OFF_PAGE
+                * records must appear in the WAL in order, unlike CREATE_ID records.
+                * We only resort to SimpleLruDoesPhysicalPageExist() if we haven't
+                * seen any XLOG_MULTIXACT_ZERO_OFF_PAGE records yet, which should
+                * happen at most once after starting WAL recovery.
+                *
+                * As an extra safety measure, if we do resort to
+                * SimpleLruDoesPhysicalPageExist(), flush the SLRU buffers first so
+                * that it will return an accurate result.
+                *----------
+                */
+               if (last_initialized_offsets_page == -1)
+               {
+                       SimpleLruWriteAll(MultiXactOffsetCtl, false);
+                       init_needed = !SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, next_pageno);
+               }
+               else
+                       init_needed = (last_initialized_offsets_page == pageno);
 
-               /* Make sure it's written out */
-               SimpleLruWritePage(MultiXactOffsetCtl, slotno);
-               Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
+               if (init_needed)
+               {
+                       elog(DEBUG1, "next offsets page is not initialized, initializing it now");
 
-               /*
-                * Remember that we initialized the page, so that we don't zero it
-                * again at the XLOG_MULTIXACT_ZERO_OFF_PAGE record.
-                */
-               pre_initialized_offsets_page = next_pageno;
+                       /* Create and zero the page */
+                       slotno = SimpleLruZeroPage(MultiXactOffsetCtl, next_pageno);
+
+                       /* Make sure it's written out */
+                       SimpleLruWritePage(MultiXactOffsetCtl, slotno);
+                       Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
+
+                       /*
+                        * Remember that we initialized the page, so that we don't zero it
+                        * again at the XLOG_MULTIXACT_ZERO_OFF_PAGE record.
+                        */
+                       pre_initialized_offsets_page = next_pageno;
+                       last_initialized_offsets_page = next_pageno;
+               }
        }
 
        /*
@@ -3366,6 +3415,8 @@ multixact_redo(XLogReaderState *record)
                        SimpleLruWritePage(MultiXactOffsetCtl, slotno);
                        Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
                        LWLockRelease(MultiXactOffsetSLRULock);
+
+                       last_initialized_offsets_page = pageno;
                }
                else
                        elog(DEBUG1, "skipping initialization of offsets page %d because it was already initialized on multixid creation", pageno);
index 90b897a88ba46184de9c97df3cdbf805e6c40bfa..a8a424d92da35f7a34da1c6e07609163f4211fae 100644 (file)
@@ -93,9 +93,7 @@ typedef struct SlruSharedData
        /*
         * latest_page_number is the page number of the current end of the log;
         * this is not critical data, since we use it only to avoid swapping out
-        * the latest page.  (An exception: an accurate latest_page_number is
-        * needed on pg_multixact/offsets to replay WAL generated with older minor
-        * versions correctly.  See RecordNewMultiXact().)
+        * the latest page.
         */
        int                     latest_page_number;