Use fake LSNs in all nbtree critical sections that write a WAL record.
That way we can safely apply the _bt_killitems LSN trick with logged and
unlogged indexes alike. This brings the same benefits to plain scans of
unlogged relations that commit 2ed5b87f brought to plain scans of logged
relations: scans will drop their leaf page pin eagerly (by applying the
"dropPin" optimization), which avoids blocking progress by VACUUM. This
is particularly helpful with applications that allow a scrollable cursor
to remain idle for long periods.
Preparation for an upcoming commit that will add the amgetbatch
interface, and switch nbtree over to it (from amgettuple) to enable I/O
prefetching. The index prefetching read stream's effective prefetch
distance is adversely affected by any buffer pins held by the index AM.
At the same time, it can be useful for prefetching to read dozens of
leaf pages ahead of the scan to maintain an adequate prefetch distance.
The index prefetching patch avoids this tension by always eagerly
dropping index page pins of the kind traditionally held as an interlock
against unsafe concurrent TID recycling by VACUUM (essentially the same
way that amgetbitmap routines have always avoided holding onto pins).
The work from this commit makes that possible during scans of nbtree
unlogged indexes -- without our having to give up on setting LP_DEAD
bits on index tuples altogether.
Follow-up to commit d774072f, which moved the fake LSN infrastructure
out of GiST so that it could be used by other index AMs.
Author: Peter Geoghegan <pg@bowt.ie>
Reviewed-By: Andres Freund <andres@anarazel.de>
Reviewed-By: Tomas Vondra <tomas@vondra.me>
Discussion: https://postgr.es/m/CAH2-WzkehuhxyuA8quc7rRN3EtNXpiKsjPfO8mhb+0Dr2K0Dtg@mail.gmail.com
scans conservatively assume that any change to the leaf page at all
implies that it was reached by btbulkdelete in the interim period when no
buffer pin was held. This is implemented by not setting any LP_DEAD bits
-on the leaf page at all when the page's LSN has changed. (That won't work
-with an unlogged index, so for now we don't ever apply the "don't hold
-onto pin" optimization there.)
+on the leaf page at all when the page's LSN has changed. (This is why we
+implement "fake" LSNs for unlogged index relations.)
Fastpath For Index Insertion
----------------------------
Size pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
bool singlevalstrat = false;
int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+ XLogRecPtr recptr;
/* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
newitemsz += sizeof(ItemIdData);
/* XLOG stuff */
if (RelationNeedsWAL(rel))
{
- XLogRecPtr recptr;
xl_btree_dedup xlrec_dedup;
xlrec_dedup.nintervals = state->nintervals;
state->nintervals * sizeof(BTDedupInterval));
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
-
- PageSetLSN(page, recptr);
}
+ else
+ recptr = XLogGetFakeLSN(rel);
+
+ PageSetLSN(page, recptr);
END_CRIT_SECTION();
IndexTuple oposting = NULL;
IndexTuple origitup = NULL;
IndexTuple nposting = NULL;
+ XLogRecPtr recptr;
page = BufferGetPage(buf);
opaque = BTPageGetOpaque(page);
xl_btree_insert xlrec;
xl_btree_metadata xlmeta;
uint8 xlinfo;
- XLogRecPtr recptr;
uint16 upostingoff;
xlrec.offnum = newitemoff;
}
recptr = XLogInsert(RM_BTREE_ID, xlinfo);
+ }
+ else
+ recptr = XLogGetFakeLSN(rel);
- if (BufferIsValid(metabuf))
- PageSetLSN(metapg, recptr);
- if (!isleaf)
- PageSetLSN(BufferGetPage(cbuf), recptr);
+ if (BufferIsValid(metabuf))
+ PageSetLSN(metapg, recptr);
+ if (!isleaf)
+ PageSetLSN(BufferGetPage(cbuf), recptr);
- PageSetLSN(page, recptr);
- }
+ PageSetLSN(page, recptr);
END_CRIT_SECTION();
bool newitemonleft,
isleaf,
isrightmost;
+ XLogRecPtr recptr;
/*
* origpage is the original page to be split. leftpage is a temporary
{
xl_btree_split xlrec;
uint8 xlinfo;
- XLogRecPtr recptr;
xlrec.level = ropaque->btpo_level;
/* See comments below on newitem, orignewitem, and posting lists */
xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
recptr = XLogInsert(RM_BTREE_ID, xlinfo);
-
- PageSetLSN(origpage, recptr);
- PageSetLSN(rightpage, recptr);
- if (!isrightmost)
- PageSetLSN(spage, recptr);
- if (!isleaf)
- PageSetLSN(BufferGetPage(cbuf), recptr);
}
+ else
+ recptr = XLogGetFakeLSN(rel);
+
+ PageSetLSN(origpage, recptr);
+ PageSetLSN(rightpage, recptr);
+ if (!isrightmost)
+ PageSetLSN(spage, recptr);
+ if (!isleaf)
+ PageSetLSN(BufferGetPage(cbuf), recptr);
END_CRIT_SECTION();
Buffer metabuf;
Page metapg;
BTMetaPageData *metad;
+ XLogRecPtr recptr;
lbkno = BufferGetBlockNumber(lbuf);
rbkno = BufferGetBlockNumber(rbuf);
if (RelationNeedsWAL(rel))
{
xl_btree_newroot xlrec;
- XLogRecPtr recptr;
xl_btree_metadata md;
xlrec.rootblk = rootblknum;
((PageHeader) rootpage)->pd_upper);
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
-
- PageSetLSN(lpage, recptr);
- PageSetLSN(rootpage, recptr);
- PageSetLSN(metapg, recptr);
}
+ else
+ recptr = XLogGetFakeLSN(rel);
+
+ PageSetLSN(lpage, recptr);
+ PageSetLSN(rootpage, recptr);
+ PageSetLSN(metapg, recptr);
END_CRIT_SECTION();
Buffer metabuf;
Page metapg;
BTMetaPageData *metad;
+ XLogRecPtr recptr;
/*
* On-disk compatibility note: The btm_last_cleanup_num_delpages metapage
if (RelationNeedsWAL(rel))
{
xl_btree_metadata md;
- XLogRecPtr recptr;
XLogBeginInsert();
XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
XLogRegisterBufData(0, &md, sizeof(xl_btree_metadata));
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);
-
- PageSetLSN(metapg, recptr);
}
+ else
+ recptr = XLogGetFakeLSN(rel);
+
+ PageSetLSN(metapg, recptr);
END_CRIT_SECTION();
BlockNumber rootblkno;
uint32 rootlevel;
BTMetaPageData *metad;
+ XLogRecPtr recptr;
Assert(access == BT_READ || heaprel != NULL);
if (RelationNeedsWAL(rel))
{
xl_btree_newroot xlrec;
- XLogRecPtr recptr;
xl_btree_metadata md;
XLogBeginInsert();
XLogRegisterData(&xlrec, SizeOfBtreeNewroot);
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
-
- PageSetLSN(rootpage, recptr);
- PageSetLSN(metapg, recptr);
}
+ else
+ recptr = XLogGetFakeLSN(rel);
+
+ PageSetLSN(rootpage, recptr);
+ PageSetLSN(metapg, recptr);
END_CRIT_SECTION();
char *updatedbuf = NULL;
Size updatedbuflen = 0;
OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
+ XLogRecPtr recptr;
/* Shouldn't be called unless there's something to do */
Assert(ndeletable > 0 || nupdatable > 0);
/* XLOG stuff */
if (needswal)
{
- XLogRecPtr recptr;
xl_btree_vacuum xlrec_vacuum;
xlrec_vacuum.ndeleted = ndeletable;
}
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
-
- PageSetLSN(page, recptr);
}
+ else
+ recptr = XLogGetFakeLSN(rel);
+
+ PageSetLSN(page, recptr);
END_CRIT_SECTION();
char *updatedbuf = NULL;
Size updatedbuflen = 0;
OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
+ XLogRecPtr recptr;
/* Shouldn't be called unless there's something to do */
Assert(ndeletable > 0 || nupdatable > 0);
/* XLOG stuff */
if (needswal)
{
- XLogRecPtr recptr;
xl_btree_delete xlrec_delete;
xlrec_delete.snapshotConflictHorizon = snapshotConflictHorizon;
}
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
-
- PageSetLSN(page, recptr);
}
+ else
+ recptr = XLogGetFakeLSN(rel);
+
+ PageSetLSN(page, recptr);
END_CRIT_SECTION();
OffsetNumber nextoffset;
IndexTuple itup;
IndexTupleData trunctuple;
+ XLogRecPtr recptr;
page = BufferGetPage(leafbuf);
opaque = BTPageGetOpaque(page);
if (RelationNeedsWAL(rel))
{
xl_btree_mark_page_halfdead xlrec;
- XLogRecPtr recptr;
xlrec.poffset = poffset;
xlrec.leafblk = leafblkno;
XLogRegisterData(&xlrec, SizeOfBtreeMarkPageHalfDead);
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);
-
- page = BufferGetPage(subtreeparent);
- PageSetLSN(page, recptr);
- page = BufferGetPage(leafbuf);
- PageSetLSN(page, recptr);
}
+ else
+ recptr = XLogGetFakeLSN(rel);
+
+ page = BufferGetPage(subtreeparent);
+ PageSetLSN(page, recptr);
+ page = BufferGetPage(leafbuf);
+ PageSetLSN(page, recptr);
END_CRIT_SECTION();
uint32 targetlevel;
IndexTuple leafhikey;
BlockNumber leaftopparent;
+ XLogRecPtr recptr;
page = BufferGetPage(leafbuf);
opaque = BTPageGetOpaque(page);
xl_btree_unlink_page xlrec;
xl_btree_metadata xlmeta;
uint8 xlinfo;
- XLogRecPtr recptr;
XLogBeginInsert();
xlinfo = XLOG_BTREE_UNLINK_PAGE;
recptr = XLogInsert(RM_BTREE_ID, xlinfo);
+ }
+ else
+ recptr = XLogGetFakeLSN(rel);
- if (BufferIsValid(metabuf))
- {
- PageSetLSN(metapg, recptr);
- }
- page = BufferGetPage(rbuf);
+ if (BufferIsValid(metabuf))
+ PageSetLSN(metapg, recptr);
+ page = BufferGetPage(rbuf);
+ PageSetLSN(page, recptr);
+ page = BufferGetPage(buf);
+ PageSetLSN(page, recptr);
+ if (BufferIsValid(lbuf))
+ {
+ page = BufferGetPage(lbuf);
PageSetLSN(page, recptr);
- page = BufferGetPage(buf);
+ }
+ if (target != leafblkno)
+ {
+ page = BufferGetPage(leafbuf);
PageSetLSN(page, recptr);
- if (BufferIsValid(lbuf))
- {
- page = BufferGetPage(lbuf);
- PageSetLSN(page, recptr);
- }
- if (target != leafblkno)
- {
- page = BufferGetPage(leafbuf);
- PageSetLSN(page, recptr);
- }
}
END_CRIT_SECTION();
* race condition involving VACUUM setting pages all-visible in the VM.
* It's also unsafe for plain index scans that use a non-MVCC snapshot.
*
- * When we drop pins eagerly, the mechanism that marks so->killedItems[]
- * index tuples LP_DEAD has to deal with concurrent TID recycling races.
- * The scheme used to detect unsafe TID recycling won't work when scanning
- * unlogged relations (since it involves saving an affected page's LSN).
- * Opt out of eager pin dropping during unlogged relation scans for now
- * (this is preferable to opting out of kill_prior_tuple LP_DEAD setting).
- *
* Also opt out of dropping leaf page pins eagerly during bitmap scans.
* Pins cannot be held for more than an instant during bitmap scans either
* way, so we might as well avoid wasting cycles on acquiring page LSNs.
*/
so->dropPin = (!scan->xs_want_itup &&
IsMVCCLikeSnapshot(scan->xs_snapshot) &&
- RelationNeedsWAL(scan->indexRelation) &&
scan->heapRelation != NULL);
so->markItemIndex = -1;
* Have to set so->currPos.lsn so that _bt_killitems has a way to detect
* when concurrent heap TID recycling by VACUUM might have taken place.
*/
- Assert(RelationNeedsWAL(rel));
so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
_bt_relbuf(rel, so->currPos.buf);
so->currPos.buf = InvalidBuffer;
XLogRecPtr latestlsn;
Assert(!BTScanPosIsPinned(so->currPos));
- Assert(RelationNeedsWAL(rel));
buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
latestlsn = BufferGetLSNAtomic(buf);