From: Peter Geoghegan Date: Fri, 19 Jun 2020 15:57:24 +0000 (-0700) Subject: Fix deduplication "single value" strategy bug. X-Git-Tag: REL_14_BETA1~2109 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=be14f884d57bc9c8ec8415edafea35ba5d31af59;p=thirdparty%2Fpostgresql.git Fix deduplication "single value" strategy bug. It was possible for deduplication's single value strategy to mistakenly believe that a very small duplicate tuple counts as one of the six large tuples that it aims to leave behind after the page finally splits. This could cause slightly suboptimal space utilization with very low cardinality indexes, though only under fairly narrow conditions. To fix, be particular about what kind of tuple counts as a maxpostingsize-capped tuple. This avoids confusion in the event of a small tuple that gets "wedged" between two large tuples, where all tuples on the page are duplicates of the same value. Discussion: https://postgr.es/m/CAH2-Wz=Y+sgSFc-O3LpiZX-POx2bC+okec2KafERHuzdVa7-rQ@mail.gmail.com Backpatch: 13-, where deduplication was introduced (by commit 0d861bbb) --- diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c index b20faf693da..f6be865b17e 100644 --- a/src/backend/access/nbtree/nbtdedup.c +++ b/src/backend/access/nbtree/nbtdedup.c @@ -62,7 +62,6 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, Page page = BufferGetPage(buf); BTPageOpaque opaque; Page newpage; - int newpagendataitems = 0; OffsetNumber deletable[MaxIndexTuplesPerPage]; BTDedupState state; int ndeletable = 0; @@ -124,6 +123,7 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, */ state = (BTDedupState) palloc(sizeof(BTDedupStateData)); state->deduplicate = true; + state->nmaxitems = 0; state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK); /* Metadata about base tuple of current pending posting list */ state->base = NULL; @@ -204,26 +204,25 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, * reset the state and move on without modifying the page. */ pagesaving += _bt_dedup_finish_pending(newpage, state); - newpagendataitems++; if (singlevalstrat) { /* * Single value strategy's extra steps. * - * Lower maxpostingsize for sixth and final item that might be - * deduplicated by current deduplication pass. When sixth - * item formed/observed, stop deduplicating items. + * Lower maxpostingsize for sixth and final large posting list + * tuple at the point where 5 maxpostingsize-capped tuples + * have either been formed or observed. * - * Note: It's possible that this will be reached even when - * current deduplication pass has yet to merge together some - * existing items. It doesn't matter whether or not the - * current call generated the maxpostingsize-capped duplicate - * tuples at the start of the page. + * When a sixth maxpostingsize-capped item is formed/observed, + * stop merging together tuples altogether. The few tuples + * that remain at the end of the page won't be merged together + * at all (at least not until after a future page split takes + * place). */ - if (newpagendataitems == 5) + if (state->nmaxitems == 5) _bt_singleval_fillfactor(page, state, newitemsz); - else if (newpagendataitems == 6) + else if (state->nmaxitems == 6) { state->deduplicate = false; singlevalstrat = false; /* won't be back here */ @@ -237,7 +236,6 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, /* Handle the last item */ pagesaving += _bt_dedup_finish_pending(newpage, state); - newpagendataitems++; /* * If no items suitable for deduplication were found, newpage must be @@ -404,7 +402,24 @@ _bt_dedup_save_htid(BTDedupState state, IndexTuple itup) (state->nhtids + nhtids) * sizeof(ItemPointerData)); if (mergedtupsz > state->maxpostingsize) + { + /* + * Count this as an oversized item for single value strategy, though + * only when there are 50 TIDs in the final posting list tuple. This + * limit (which is fairly arbitrary) avoids confusion about how many + * 1/6 of a page tuples have been encountered/created by the current + * deduplication pass. + * + * Note: We deliberately don't consider which deduplication pass + * merged together tuples to create this item (could be a previous + * deduplication pass, or current pass). See _bt_do_singleval() + * comments. + */ + if (state->nhtids > 50) + state->nmaxitems++; + return false; + } /* * Save heap TIDs to pending posting list tuple -- itup can be merged into diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 15f10a29d3d..c03998834d4 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -1095,6 +1095,7 @@ _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state, pfree(postingtuple); } + dstate->nmaxitems = 0; dstate->nhtids = 0; dstate->nitems = 0; dstate->phystupsize = 0; @@ -1310,6 +1311,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) dstate = (BTDedupState) palloc(sizeof(BTDedupStateData)); dstate->deduplicate = true; /* unused */ + dstate->nmaxitems = 0; /* unused */ dstate->maxpostingsize = 0; /* set later */ /* Metadata about base tuple of current pending posting list */ dstate->base = NULL; diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 87a8612c28c..5bec59d448d 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -483,6 +483,7 @@ btree_xlog_dedup(XLogReaderState *record) state = (BTDedupState) palloc(sizeof(BTDedupStateData)); state->deduplicate = true; /* unused */ + state->nmaxitems = 0; /* unused */ /* Conservatively use larger maxpostingsize than primary */ state->maxpostingsize = BTMaxItemSize(page); state->base = NULL; diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 3b2bcb22a70..79506c748b2 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -739,6 +739,7 @@ typedef struct BTDedupStateData { /* Deduplication status info for entire pass over page */ bool deduplicate; /* Still deduplicating page? */ + int nmaxitems; /* Number of max-sized tuples so far */ Size maxpostingsize; /* Limit on size of final tuple */ /* Metadata about base tuple of current pending posting list */