]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Rework ginScanToDelete() to pass Buffers instead of BlockNumbers.
authorAlexander Korotkov <akorotkov@postgresql.org>
Sat, 21 Feb 2026 09:08:08 +0000 (11:08 +0200)
committerAlexander Korotkov <akorotkov@postgresql.org>
Fri, 13 Mar 2026 11:50:13 +0000 (13:50 +0200)
Previously, ginScanToDelete() and ginDeletePage() passed BlockNumbers and
re-read pages that were already pinned and locked during the tree walk.  The
caller ginVacuumPostingTree()) held a cleanup-locked root buffer, yet
ginScanToDelete() re-read it by block number with special-case code to skip
re-locking.

At first, this commit gives both functions more appropriate names,
ginScanPostingTreeToDelete() and ginDeletePostingPage(), indicating they deal
with posting trees/pages.  This is more descriptive and similar to the way we
name other GIN functions, for instance, ginVacuumPostingTree() and
ginVacuumPostingTreeLeaves().

Then rework both functions to pass Buffers directly.  DataPageDeleteStack now
carries buffer, myoff (downlink offset in parent), and isRoot per level,
so ginScanPostingTreeToDelete() takes only GinVacuumState and
DataPageDeleteStack pointers.  Also, ginDeletePostingPage() receives the three
Buffers directly, and no longer reads or releases them itself.  The caller
reads and locks child pages before recursing, and manages buffer lifecycle
afterward.

This eliminates the confusing isRoot special cases in buffer management,
including the apparent (but unreachable) double release of the root
buffer identified by Andres Freund.

Add comments explaining the locking protocol and the DataPageDeleteStack
structure.

Reported-by: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/utrlxij43fbguzw4kldte2spc4btoldizutcqyrfakqnbrp3ir@ph3sphpj4asz
Reviewed-by: Pavel Borisov <pashkin.elfe@gmail.com>
Reviewed-by: Xuneng Zhou <xunengzhou@gmail.com>
Reviewed-by: Jinbinge <jinbinge@126.com>
src/backend/access/gin/README
src/backend/access/gin/ginvacuum.c

index 742bcbad499fab30397fd1ff922a772166810135..4c16dc4d4e0ec797b66528f576cf6f9749343b70 100644 (file)
@@ -393,11 +393,11 @@ tree leafs in logical order by rightlinks and removes deletable TIDs from
 posting lists. Posting trees are processed by links from entry tree leafs. They
 are vacuumed in two stages. At first stage, deletable TIDs are removed from
 leafs. If first stage detects at least one empty page, then at the second stage
-ginScanToDelete() deletes empty pages.
+ginScanPostingTreeToDelete() deletes empty pages.
 
-ginScanToDelete() traverses the whole tree in depth-first manner.  It starts
-from the full cleanup lock on the tree root.  This lock prevents all the
-concurrent insertions into this tree while we're deleting pages.  However,
+ginScanPostingTreeToDelete() traverses the whole tree in depth-first manner.
+It starts from the full cleanup lock on the tree root.  This lock prevents all
+the concurrent insertions into this tree while we're deleting pages.  However,
 there are still might be some in-progress readers, who traversed root before
 we locked it.
 
index d5c8bef5ceb18c9a6190cccd5a9160ba2725f163..840543eb6642bd269250a08dd501a5556263c3d3 100644 (file)
@@ -111,31 +111,51 @@ xlogVacuumPage(Relation index, Buffer buffer)
 }
 
 
+/*
+ * Stack entry used during posting tree empty-page deletion scan.
+ *
+ * One DataPageDeleteStack entry is allocated per tree level.  As
+ * ginScanPostingTreeToDelete() recurses down the tree, each entry tracks
+ * the buffer of the page currently being visited at that level ('buffer'),
+ * and the buffer of its left sibling ('leftBuffer').  The left page is kept
+ * pinned and exclusively locked because ginDeletePostingPage() needs it to
+ * update the sibling chain; acquiring it later could deadlock with
+ * ginStepRight(), which locks pages left-to-right.
+ */
 typedef struct DataPageDeleteStack
 {
        struct DataPageDeleteStack *child;
        struct DataPageDeleteStack *parent;
 
-       BlockNumber blkno;                      /* current block number */
-       Buffer          leftBuffer;             /* pinned and locked rightest non-deleted page
-                                                                * on left */
+       Buffer          buffer;                 /* buffer for the page being visited at this
+                                                                * tree level */
+       Buffer          leftBuffer;             /* pinned and locked rightmost non-deleted
+                                                                * sibling to the left of the current page */
+       OffsetNumber myoff;                     /* offset of this page's downlink in the
+                                                                * parent */
        bool            isRoot;
 } DataPageDeleteStack;
 
 
 /*
  * Delete a posting tree page.
+ *
+ * Removes the page identified by dBuffer from the posting tree by updating
+ * the left sibling's rightlink (in lBuffer) to skip over the deleted page,
+ * and removing the downlink from the parent page (in pBuffer).  All three
+ * buffers must already have been pinned and exclusively locked by the caller.
+ *
+ * The buffers are NOT released nor unlocked here; the caller is responsible
+ * for this.
  */
 static void
-ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkno,
-                         BlockNumber parentBlkno, OffsetNumber myoff, bool isParentRoot)
+ginDeletePostingPage(GinVacuumState *gvs, Buffer dBuffer, Buffer lBuffer,
+                                        Buffer pBuffer, OffsetNumber myoff, bool isParentRoot)
 {
-       Buffer          dBuffer;
-       Buffer          lBuffer;
-       Buffer          pBuffer;
        Page            page,
                                parentPage;
        BlockNumber rightlink;
+       BlockNumber deleteBlkno = BufferGetBlockNumber(dBuffer);
 
        /*
         * This function MUST be called only if someone of parent pages hold
@@ -143,12 +163,6 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
         * happen in this subtree. Caller also acquires Exclusive locks on
         * deletable, parent and left pages.
         */
-       lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno,
-                                                                RBM_NORMAL, gvs->strategy);
-       dBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, deleteBlkno,
-                                                                RBM_NORMAL, gvs->strategy);
-       pBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, parentBlkno,
-                                                                RBM_NORMAL, gvs->strategy);
 
        page = BufferGetPage(dBuffer);
        rightlink = GinPageGetOpaque(page)->rightlink;
@@ -227,55 +241,38 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
 
        END_CRIT_SECTION();
 
-       ReleaseBuffer(pBuffer);
-       ReleaseBuffer(lBuffer);
-       ReleaseBuffer(dBuffer);
-
        gvs->result->pages_newly_deleted++;
        gvs->result->pages_deleted++;
 }
 
 
 /*
- * Scans posting tree and deletes empty pages.  Caller must lock root page for
- * cleanup.  During scan path from root to current page is kept exclusively
- * locked.  Also keep left page exclusively locked, because ginDeletePage()
- * needs it.  If we try to relock left page later, it could deadlock with
- * ginStepRight().
+ * Scans a posting tree and deletes empty pages.
+ *
+ * The caller must hold a cleanup lock on the root page to prevent concurrent
+ * inserts.  The entire path from the root down to the current page is kept
+ * exclusively locked throughout the scan.  The left sibling at each level is
+ * also kept locked, because ginDeletePostingPage() needs it to update the
+ * rightlink of the left sibling; re-acquiring the left sibling lock later
+ * could deadlock with ginStepRight(), which acquires page locks
+ * left-to-right.
+ *
+ * All per-level state is carried in 'myStackItem': the buffer to process
+ * (must already be pinned and exclusively locked), the left sibling buffer,
+ * and this page's offset in the parent's downlink array.  The root entry is
+ * set up by ginVacuumPostingTree(); child entries are populated here before
+ * recursing.
+ *
+ * Returns true if the page was deleted, false otherwise.
  */
 static bool
-ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
-                               DataPageDeleteStack *parent, OffsetNumber myoff)
+ginScanPostingTreeToDelete(GinVacuumState *gvs, DataPageDeleteStack *myStackItem)
 {
-       DataPageDeleteStack *me;
-       Buffer          buffer;
+       Buffer          buffer = myStackItem->buffer;
        Page            page;
-       bool            meDelete = false;
+       bool            pageWasDeleted = false;
        bool            isempty;
 
-       if (isRoot)
-       {
-               me = parent;
-       }
-       else
-       {
-               if (!parent->child)
-               {
-                       me = palloc0_object(DataPageDeleteStack);
-                       me->parent = parent;
-                       parent->child = me;
-                       me->leftBuffer = InvalidBuffer;
-               }
-               else
-                       me = parent->child;
-       }
-
-       buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
-                                                               RBM_NORMAL, gvs->strategy);
-
-       if (!isRoot)
-               LockBuffer(buffer, GIN_EXCLUSIVE);
-
        page = BufferGetPage(buffer);
 
        Assert(GinPageIsData(page));
@@ -284,19 +281,48 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
        {
                OffsetNumber i;
 
-               me->blkno = blkno;
-               for (i = FirstOffsetNumber; i <= GinPageGetOpaque(page)->maxoff; i++)
+               for (i = FirstOffsetNumber; i <= GinPageGetOpaque(page)->maxoff;)
                {
                        PostingItem *pitem = GinDataPageGetPostingItem(page, i);
+                       Buffer          childBuffer;
+
+                       childBuffer = ReadBufferExtended(gvs->index,
+                                                                                        MAIN_FORKNUM,
+                                                                                        PostingItemGetBlockNumber(pitem),
+                                                                                        RBM_NORMAL, gvs->strategy);
+                       LockBuffer(childBuffer, GIN_EXCLUSIVE);
+
+                       /* Allocate a child stack entry on first use; reuse thereafter */
+                       if (!myStackItem->child)
+                       {
+                               myStackItem->child = palloc0_object(DataPageDeleteStack);
+                               myStackItem->child->parent = myStackItem;
+                               myStackItem->child->leftBuffer = InvalidBuffer;
+                       }
 
-                       if (ginScanToDelete(gvs, PostingItemGetBlockNumber(pitem), false, me, i))
-                               i--;
+                       myStackItem->child->buffer = childBuffer;
+                       myStackItem->child->isRoot = false;
+                       myStackItem->child->myoff = i;
+
+                       /*
+                        * Recurse into child.  If the child page was deleted, its
+                        * downlink was removed from our page, so re-examine the same
+                        * offset; otherwise advance to the next downlink.
+                        */
+                       if (!ginScanPostingTreeToDelete(gvs, myStackItem->child))
+                               i++;
                }
+               myStackItem->buffer = InvalidBuffer;
 
-               if (GinPageRightMost(page) && BufferIsValid(me->child->leftBuffer))
+               /*
+                * After processing all children at this level, release the child
+                * level's leftBuffer if we're at the rightmost page.  There is no
+                * right sibling that could need it for deletion.
+                */
+               if (GinPageRightMost(page) && BufferIsValid(myStackItem->child->leftBuffer))
                {
-                       UnlockReleaseBuffer(me->child->leftBuffer);
-                       me->child->leftBuffer = InvalidBuffer;
+                       UnlockReleaseBuffer(myStackItem->child->leftBuffer);
+                       myStackItem->child->leftBuffer = InvalidBuffer;
                }
        }
 
@@ -307,34 +333,41 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
 
        if (isempty)
        {
-               /* we never delete the left- or rightmost branch */
-               if (BufferIsValid(me->leftBuffer) && !GinPageRightMost(page))
+               /*
+                * Proceed to the ginDeletePostingPage() if that's not the leftmost or
+                * the rightmost page.
+                */
+               if (BufferIsValid(myStackItem->leftBuffer) && !GinPageRightMost(page))
                {
-                       Assert(!isRoot);
-                       ginDeletePage(gvs, blkno, BufferGetBlockNumber(me->leftBuffer),
-                                                 me->parent->blkno, myoff, me->parent->isRoot);
-                       meDelete = true;
+                       Assert(!myStackItem->isRoot);
+                       ginDeletePostingPage(gvs, buffer, myStackItem->leftBuffer,
+                                                                myStackItem->parent->buffer,
+                                                                myStackItem->myoff,
+                                                                myStackItem->parent->isRoot);
+                       pageWasDeleted = true;
                }
        }
 
-       if (!meDelete)
+       if (!pageWasDeleted)
        {
-               if (BufferIsValid(me->leftBuffer))
-                       UnlockReleaseBuffer(me->leftBuffer);
-               me->leftBuffer = buffer;
+               /*
+                * Keep this page as the new leftBuffer for this level: the next
+                * sibling to the right might need it for deletion.  Release any
+                * previously held left page first.
+                */
+               if (BufferIsValid(myStackItem->leftBuffer))
+                       UnlockReleaseBuffer(myStackItem->leftBuffer);
+               myStackItem->leftBuffer = buffer;
        }
        else
        {
-               if (!isRoot)
-                       LockBuffer(buffer, GIN_UNLOCK);
-
-               ReleaseBuffer(buffer);
+               /*
+                * Page was deleted; release the buffer.  leftBuffer remains the same.
+                */
+               UnlockReleaseBuffer(buffer);
        }
 
-       if (isRoot)
-               ReleaseBuffer(buffer);
-
-       return meDelete;
+       return pageWasDeleted;
 }
 
 
@@ -418,6 +451,7 @@ ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno)
                DataPageDeleteStack root,
                                   *ptr,
                                   *tmp;
+               bool            deleted PG_USED_FOR_ASSERTS_ONLY;
 
                buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, rootBlkno,
                                                                        RBM_NORMAL, gvs->strategy);
@@ -429,10 +463,13 @@ ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno)
                LockBufferForCleanup(buffer);
 
                memset(&root, 0, sizeof(DataPageDeleteStack));
+               root.buffer = buffer;
                root.leftBuffer = InvalidBuffer;
+               root.myoff = InvalidOffsetNumber;
                root.isRoot = true;
 
-               ginScanToDelete(gvs, rootBlkno, true, &root, InvalidOffsetNumber);
+               deleted = ginScanPostingTreeToDelete(gvs, &root);
+               Assert(!deleted);
 
                ptr = root.child;
 
@@ -839,7 +876,7 @@ GinPageIsRecyclable(Page page)
 
        /*
         * If no backend still could view delete_xid as in running, all scans
-        * concurrent with ginDeletePage() must have finished.
+        * concurrent with ginDeletePostingPage() must have finished.
         */
        return GlobalVisCheckRemovableXid(NULL, delete_xid);
 }