]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Adjust btbulkdelete logic so that only one WAL record is issued while
authorTom Lane <tgl@sss.pgh.pa.us>
Sun, 23 Feb 2003 22:43:09 +0000 (22:43 +0000)
committerTom Lane <tgl@sss.pgh.pa.us>
Sun, 23 Feb 2003 22:43:09 +0000 (22:43 +0000)
deleting multiple index entries on a single index page.  This makes for
a very substantial reduction in the amount of WAL traffic during a
large delete operation.

src/backend/access/nbtree/nbtpage.c
src/backend/access/nbtree/nbtree.c
src/backend/access/nbtree/nbtxlog.c
src/include/access/nbtree.h

index 16439972024bbd77b145762d342fb2c21bcfbebd..a9bd393440ca46ef7b347a43fb54e095b4090e92 100644 (file)
@@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.61 2003/02/23 06:17:13 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.62 2003/02/23 22:43:08 tgl Exp $
  *
  *     NOTES
  *        Postgres btree pages look like ordinary relation pages.      The opaque
@@ -618,26 +618,34 @@ _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level)
 }
 
 /*
- * Delete an item from a btree page.
+ * Delete item(s) from a btree page.
  *
  * This must only be used for deleting leaf items.  Deleting an item on a
  * non-leaf page has to be done as part of an atomic action that includes
  * deleting the page it points to.
  *
  * This routine assumes that the caller has pinned and locked the buffer,
- * and will write the buffer afterwards.
+ * and will write the buffer afterwards.  Also, the given itemnos *must*
+ * appear in increasing order in the array.
  */
 void
-_bt_itemdel(Relation rel, Buffer buf, ItemPointer tid)
+_bt_delitems(Relation rel, Buffer buf,
+                        OffsetNumber *itemnos, int nitems)
 {
        Page            page = BufferGetPage(buf);
-       OffsetNumber offno;
-
-       offno = ItemPointerGetOffsetNumber(tid);
+       int                     i;
 
+       /* No elog(ERROR) until changes are logged */
        START_CRIT_SECTION();
 
-       PageIndexTupleDelete(page, offno);
+       /*
+        * Delete the items in reverse order so we don't have to think about
+        * adjusting item numbers for previous deletions.
+        */
+       for (i = nitems - 1; i >= 0; i--)
+       {
+               PageIndexTupleDelete(page, itemnos[i]);
+       }
 
        /* XLOG stuff */
        if (!rel->rd_istemp)
@@ -646,17 +654,30 @@ _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid)
                XLogRecPtr      recptr;
                XLogRecData rdata[2];
 
-               xlrec.target.node = rel->rd_node;
-               xlrec.target.tid = *tid;
+               xlrec.node = rel->rd_node;
+               xlrec.block = BufferGetBlockNumber(buf);
 
                rdata[0].buffer = InvalidBuffer;
                rdata[0].data = (char *) &xlrec;
                rdata[0].len = SizeOfBtreeDelete;
                rdata[0].next = &(rdata[1]);
 
+               /*
+                * The target-offsets array is not in the buffer, but pretend
+                * that it is.  When XLogInsert stores the whole buffer, the offsets
+                * array need not be stored too.
+                */
                rdata[1].buffer = buf;
-               rdata[1].data = NULL;
-               rdata[1].len = 0;
+               if (nitems > 0)
+               {
+                       rdata[1].data = (char *) itemnos;
+                       rdata[1].len = nitems * sizeof(OffsetNumber);
+               }
+               else
+               {
+                       rdata[1].data = NULL;
+                       rdata[1].len = 0;
+               }
                rdata[1].next = NULL;
 
                recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);
index b7a1e7ada1c532be2c6f0996a1b012d3c2afe5d4..d73af9e6782b9bbb2505d9472e01f96a92ab9d01 100644 (file)
@@ -12,7 +12,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.97 2003/02/23 06:17:13 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.98 2003/02/23 22:43:08 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -572,121 +572,110 @@ btbulkdelete(PG_FUNCTION_ARGS)
        IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1);
        void       *callback_state = (void *) PG_GETARG_POINTER(2);
        IndexBulkDeleteResult *result;
-       BlockNumber num_pages;
        double          tuples_removed;
        double          num_index_tuples;
-       IndexScanDesc scan;
-       BTScanOpaque so;
-       ItemPointer current;
+       OffsetNumber deletable[BLCKSZ / sizeof(OffsetNumber)];
+       int                     ndeletable;
+       Buffer          buf;
+       BlockNumber num_pages;
 
        tuples_removed = 0;
        num_index_tuples = 0;
 
        /*
-        * We use a standard IndexScanDesc scan object, but to speed up the
-        * loop, we skip most of the wrapper layers of index_getnext and
-        * instead call _bt_step directly.      This implies holding buffer lock
-        * on a target page throughout the loop over the page's tuples.
-        *
-        * Whenever we step onto a new page, we have to trade in the read
-        * lock acquired by _bt_first or _bt_step for an exclusive write lock
-        * (fortunately, _bt_relbuf doesn't care which kind of lock it's
-        * releasing when it comes time for _bt_step to release our lock).
+        * The outer loop iterates over index leaf pages, the inner over items
+        * on a leaf page.  We issue just one _bt_delitems() call per page,
+        * so as to minimize WAL traffic.
         *
-        * Note that we exclusive-lock every leaf page, or at least every one
-        * containing data items.  It sounds attractive to only exclusive-lock
+        * Note that we exclusive-lock every leaf page containing data items,
+        * in sequence left to right.  It sounds attractive to only exclusive-lock
         * those containing items we need to delete, but unfortunately that
         * is not safe: we could then pass a stopped indexscan, which could
         * in rare cases lead to deleting the item it needs to find when it
         * resumes.  (See _bt_restscan --- this could only happen if an indexscan
         * stops on a deletable item and then a page split moves that item
         * into a page further to its right, which the indexscan will have no
-        * pin on.)
+        * pin on.)  We can skip obtaining exclusive lock on empty pages
+        * though, since no indexscan could be stopped on those.
         */
-       scan = index_beginscan(NULL, rel, SnapshotAny, 0, (ScanKey) NULL);
-       so = (BTScanOpaque) scan->opaque;
-       current = &(scan->currentItemData);
-
-       /* Use _bt_first to get started, then _bt_step to remaining tuples */
-       if (_bt_first(scan, ForwardScanDirection))
+       buf = _bt_get_endpoint(rel, 0, false);
+       if (BufferIsValid(buf))         /* check for empty index */
        {
-               Buffer          buf;
-               BlockNumber lockedBlock = InvalidBlockNumber;
-
-               /* we have the buffer pinned and read-locked */
-               buf = so->btso_curbuf;
-               Assert(BufferIsValid(buf));
-
-               do
+               for (;;)
                {
                        Page            page;
-                       BlockNumber blkno;
-                       OffsetNumber offnum;
-                       BTItem          btitem;
                        BTPageOpaque opaque;
-                       IndexTuple      itup;
-                       ItemPointer htup;
+                       OffsetNumber offnum,
+                                               minoff,
+                                               maxoff;
+                       BlockNumber     nextpage;
 
                        CHECK_FOR_INTERRUPTS();
 
-                       /* current is the next index tuple */
+                       ndeletable = 0;
                        page = BufferGetPage(buf);
-                       blkno = ItemPointerGetBlockNumber(current);
-
-                       /*
-                        * Make sure we have a super-exclusive write lock on this page.
-                        *
-                        * We assume that only concurrent insertions, not deletions,
-                        * can occur while we're not holding the page lock (the
-                        * caller should hold a suitable relation lock to ensure
-                        * this). Therefore, no items can escape being scanned because
-                        * of this temporary lock release.
-                        */
-                       if (blkno != lockedBlock)
+                       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+                       minoff = P_FIRSTDATAKEY(opaque);
+                       maxoff = PageGetMaxOffsetNumber(page);
+                       /* We probably cannot see deleted pages, but skip 'em if so */
+                       if (minoff <= maxoff && !P_ISDELETED(opaque))
                        {
+                               /*
+                                * Trade in the initial read lock for a super-exclusive
+                                * write lock on this page.
+                                */
                                LockBuffer(buf, BUFFER_LOCK_UNLOCK);
                                LockBufferForCleanup(buf);
-                               lockedBlock = blkno;
                                /*
-                                * If the page was formerly rightmost but was split while we
-                                * didn't hold the lock, and ip_posid is pointing to item
-                                * 1, then ip_posid now points at the high key not a valid
-                                * data item. In this case we need to step forward.
+                                * Recompute minoff/maxoff, both of which could have changed
+                                * while we weren't holding the lock.
                                 */
-                               opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-                               if (current->ip_posid < P_FIRSTDATAKEY(opaque))
-                                       current->ip_posid = P_FIRSTDATAKEY(opaque);
-                       }
-
-                       offnum = ItemPointerGetOffsetNumber(current);
-                       btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
-                       itup = &btitem->bti_itup;
-                       htup = &(itup->t_tid);
-
-                       if (callback(htup, callback_state))
-                       {
-                               /* Okay to delete the item from the page */
-                               _bt_itemdel(rel, buf, current);
-
-                               /* Mark buffer dirty, but keep the lock and pin */
-                               WriteNoReleaseBuffer(buf);
-
-                               tuples_removed += 1;
-
+                               minoff = P_FIRSTDATAKEY(opaque);
+                               maxoff = PageGetMaxOffsetNumber(page);
                                /*
-                                * We now need to back up the scan one item, so that the next
-                                * cycle will re-examine the same offnum on this page (which
-                                * now holds the next item).
+                                * Scan over all items to see which ones need deleted
+                                * according to the callback function.
                                 */
-                               current->ip_posid--;
+                               for (offnum = minoff;
+                                        offnum <= maxoff;
+                                        offnum = OffsetNumberNext(offnum))
+                               {
+                                       BTItem          btitem;
+                                       ItemPointer htup;
+
+                                       btitem = (BTItem) PageGetItem(page,
+                                                                                                 PageGetItemId(page, offnum));
+                                       htup = &(btitem->bti_itup.t_tid);
+                                       if (callback(htup, callback_state))
+                                       {
+                                               deletable[ndeletable++] = offnum;
+                                               tuples_removed += 1;
+                                       }
+                                       else
+                                               num_index_tuples += 1;
+                               }
+                       }
+                       /*
+                        * If we need to delete anything, do it and write the buffer;
+                        * else just release the buffer.
+                        */
+                       nextpage = opaque->btpo_next;
+                       if (ndeletable > 0)
+                       {
+                               _bt_delitems(rel, buf, deletable, ndeletable);
+                               _bt_wrtbuf(rel, buf);
                        }
                        else
-                               num_index_tuples += 1;
-               } while (_bt_step(scan, &buf, ForwardScanDirection));
+                       {
+                               _bt_relbuf(rel, buf);
+                       }
+                       /* And advance to next page, if any */
+                       if (nextpage == P_NONE)
+                               break;
+                       buf = _bt_getbuf(rel, nextpage, BT_READ);
+               }
        }
 
-       index_endscan(scan);
-
        /* return statistics */
        num_pages = RelationGetNumberOfBlocks(rel);
 
@@ -765,7 +754,7 @@ btvacuumcleanup(PG_FUNCTION_ARGS)
                        }
                }
                else if ((opaque->btpo_flags & BTP_HALF_DEAD) ||
-                                P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page))
+                                P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page))
                {
                        /* Empty, try to delete */
                        int             ndel;
index 058b13b6a43fc630d773dd1d163328dae58b5b1e..a1a52571fe187734e25e7a5d0d43163375865b92 100644 (file)
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.2 2003/02/23 06:17:13 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.3 2003/02/23 22:43:08 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -379,11 +379,10 @@ btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record)
                return;
 
        xlrec = (xl_btree_delete *) XLogRecGetData(record);
-       reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node);
+       reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->node);
        if (!RelationIsValid(reln))
                return;
-       buffer = XLogReadBuffer(false, reln,
-                                               ItemPointerGetBlockNumber(&(xlrec->target.tid)));
+       buffer = XLogReadBuffer(false, reln, xlrec->block);
        if (!BufferIsValid(buffer))
                elog(PANIC, "btree_delete_redo: block unfound");
        page = (Page) BufferGetPage(buffer);
@@ -396,7 +395,21 @@ btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record)
                return;
        }
 
-       PageIndexTupleDelete(page, ItemPointerGetOffsetNumber(&(xlrec->target.tid)));
+       if (record->xl_len > SizeOfBtreeDelete)
+       {
+               OffsetNumber *unused;
+               OffsetNumber *unend;
+
+               unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete);
+               unend = (OffsetNumber *) ((char *) xlrec + record->xl_len);
+
+               /* be careful to delete from back to front */
+               while (unused < unend)
+               {
+                       unend--;
+                       PageIndexTupleDelete(page, *unend);
+               }
+       }
 
        PageSetLSN(page, lsn);
        PageSetSUI(page, ThisStartUpID);
@@ -853,8 +866,8 @@ btree_desc(char *buf, uint8 xl_info, char *rec)
                {
                        xl_btree_delete *xlrec = (xl_btree_delete *) rec;
 
-                       strcat(buf, "delete: ");
-                       out_target(buf, &(xlrec->target));
+                       sprintf(buf + strlen(buf), "delete: node %u/%u; blk %u",
+                                       xlrec->node.tblNode, xlrec->node.relNode, xlrec->block);
                        break;
                }
                case XLOG_BTREE_DELETE_PAGE:
index 35b6c94e7e247d57272c997881bad023472a5458..d6ea70cd7d1fe3c0059a6903a0105a76aca01cba 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: nbtree.h,v 1.66 2003/02/23 06:17:13 tgl Exp $
+ * $Id: nbtree.h,v 1.67 2003/02/23 22:43:09 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -263,14 +263,18 @@ typedef struct xl_btree_split
 #define SizeOfBtreeSplit       (offsetof(xl_btree_split, leftlen) + sizeof(uint16))
 
 /*
- * This is what we need to know about delete of an individual leaf btitem
+ * This is what we need to know about delete of individual leaf btitems.
+ * The WAL record can represent deletion of any number of btitems on a
+ * single index page.
  */
 typedef struct xl_btree_delete
 {
-       xl_btreetid target;                     /* deleted tuple id */
+       RelFileNode node;
+       BlockNumber block;
+       /* TARGET OFFSET NUMBERS FOLLOW AT THE END */
 } xl_btree_delete;
 
-#define SizeOfBtreeDelete      (offsetof(xl_btreetid, tid) + SizeOfIptrData)
+#define SizeOfBtreeDelete      (offsetof(xl_btree_delete, block) + sizeof(BlockNumber))
 
 /*
  * This is what we need to know about deletion of a btree page.  The target
@@ -453,7 +457,8 @@ extern void _bt_wrtnorelbuf(Relation rel, Buffer buf);
 extern void _bt_pageinit(Page page, Size size);
 extern bool _bt_page_recyclable(Page page);
 extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level);
-extern void _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid);
+extern void _bt_delitems(Relation rel, Buffer buf,
+                                                OffsetNumber *itemnos, int nitems);
 extern int     _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full);
 
 /*