From c98dffcb7c7010d216dc16d22cb594ef7d65fde1 Mon Sep 17 00:00:00 2001
From: Tomas Vondra
Date: Tue, 4 Nov 2025 18:46:37 +0100
Subject: [PATCH] Limit the size of TID lists during parallel GIN build

When building intermediate TID lists during parallel GIN builds, split
the sorted lists into smaller chunks, to limit the amount of memory
needed when merging the chunks later. The leader may need to keep in
memory up to one chunk per worker, and possibly one extra chunk (before
evicting some of the data).

The code processing item pointers uses regular palloc/repalloc calls,
which means it's subject to the MaxAllocSize (1GB) limit. We could fix
this by allowing huge allocations, but that'd require changes in many
places without much benefit. Larger chunks do not actually improve
performance, so the memory usage would be wasted.

Fixed by limiting the chunk size to not hit MaxAllocSize. Each worker
gets a fair share. This requires remembering the number of
participating workers, in a place that can be accessed from the
callback. Luckily, the bs_worker_id field in GinBuildState was unused,
so repurpose that.

Report by Greg Smith, investigation and fix by me. Backpatched to 18,
where parallel GIN builds were introduced.

Reported-by: Gregory Smith
Discussion: https://postgr.es/m/CAHLJuCWDwn-PE2BMZE4Kux7x5wWt_6RoWtA0mUQffEDLeZ6sfA@mail.gmail.com
Backpatch-through: 18
---
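Notes (kept out of the commit message; git am ignores text between the
"---" marker and the diffstat): below is a minimal standalone sketch of
the chunk-size arithmetic, assuming the server's 1GB MaxAllocSize and
6-byte item pointers. The struct is a simplified stand-in for the
server's ItemPointerData, and nworkers/nlist are made-up example
values, not server code. With 4 participants, each chunk is capped at
22369621 TIDs (about 22.4 million, roughly 128MB of item pointers), so
even two chunks from every participant together stay under the 1GB
allocation limit.

#include <stdio.h>
#include <stdint.h>

#define MaxAllocSize	((size_t) 0x3fffffff)	/* 1 GB - 1, as in the server */

/* 6 bytes, mirroring the layout of the server's ItemPointerData */
typedef struct
{
	uint16_t	bi_hi;
	uint16_t	bi_lo;
	uint16_t	offsetNumber;
} ItemPointerData;

int
main(void)
{
	size_t		nworkers = 4;	/* hypothetical: participants, incl. leader */
	size_t		nlist = 100000000;	/* hypothetical 100M-entry TID list */
	size_t		offset = 0;

	/*
	 * The leader may hold up to two chunks per participant, so size the
	 * chunks such that all of them together fit under MaxAllocSize.
	 */
	size_t		maxlen = MaxAllocSize / sizeof(ItemPointerData);

	maxlen /= (2 * nworkers);

	printf("max TIDs per chunk: %zu\n", maxlen);

	/* split a sorted TID list into chunks of at most maxlen items */
	while (offset < nlist)
	{
		size_t		len = (nlist - offset < maxlen) ? (nlist - offset) : maxlen;

		/* the server would build and flush one GinTuple for this chunk */
		offset += len;
	}

	return 0;
}

Compiled with any C99 compiler, the sketch prints
"max TIDs per chunk: 22369621".
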
 src/backend/access/gin/gininsert.c | 52 +++++++++++++++++++++++-------
 1 file changed, 41 insertions(+), 11 deletions(-)

diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c
index 3d71b442aa9..61869d9d931 100644
--- a/src/backend/access/gin/gininsert.c
+++ b/src/backend/access/gin/gininsert.c
@@ -152,7 +152,9 @@ typedef struct
 	 * only in the leader process.
 	 */
 	GinLeader  *bs_leader;
-	int			bs_worker_id;
+
+	/* number of participating workers (including leader) */
+	int			bs_num_workers;
 
 	/* used to pass information from workers to leader */
 	double		bs_numtuples;
@@ -483,6 +485,15 @@ ginBuildCallback(Relation index, ItemPointer tid, Datum *values,
 /*
  * ginFlushBuildState
  *		Write all data from BuildAccumulator into the tuplesort.
+ *
+ * The number of TIDs written to the tuplesort at once is limited, to reduce
+ * the amount of memory needed when merging the intermediate results later.
+ * The leader will see up to two chunks per worker, so calculate the limit to
+ * not need more than MaxAllocSize overall.
+ *
+ * We don't need to worry about overflowing maintenance_work_mem. We can't
+ * build chunks larger than work_mem, and that limit was set so that workers
+ * produce sufficiently small chunks.
  */
 static void
 ginFlushBuildState(GinBuildState *buildstate, Relation index)
@@ -493,6 +504,11 @@ ginFlushBuildState(GinBuildState *buildstate, Relation index)
 	uint32		nlist;
 	OffsetNumber attnum;
 	TupleDesc	tdesc = RelationGetDescr(index);
+	uint32		maxlen;
+
+	/* maximum number of TIDs per chunk (two chunks per worker) */
+	maxlen = MaxAllocSize / sizeof(ItemPointerData);
+	maxlen /= (2 * buildstate->bs_num_workers);
 
 	ginBeginBAScan(&buildstate->accum);
 	while ((list = ginGetBAEntry(&buildstate->accum,
@@ -501,20 +517,31 @@ ginFlushBuildState(GinBuildState *buildstate, Relation index)
 		/* information about the key */
 		CompactAttribute *attr = TupleDescCompactAttr(tdesc, (attnum - 1));
 
-		/* GIN tuple and tuple length */
-		GinTuple   *tup;
-		Size		tuplen;
+		/* start of the chunk */
+		uint32		offset = 0;
 
-		/* there could be many entries, so be willing to abort here */
-		CHECK_FOR_INTERRUPTS();
+		/* split the entry into smaller chunks with up to maxlen items */
+		while (offset < nlist)
+		{
+			/* GIN tuple and tuple length */
+			GinTuple   *tup;
+			Size		tuplen;
+			uint32		len = Min(maxlen, nlist - offset);
 
-		tup = _gin_build_tuple(attnum, category,
-							   key, attr->attlen, attr->attbyval,
-							   list, nlist, &tuplen);
+			/* there could be many entries, so be willing to abort here */
+			CHECK_FOR_INTERRUPTS();
+
+			tup = _gin_build_tuple(attnum, category,
+								   key, attr->attlen, attr->attbyval,
+								   &list[offset], len,
+								   &tuplen);
 
-		tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen);
+			offset += len;
 
-		pfree(tup);
+			tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen);
+
+			pfree(tup);
+		}
 	}
 
 	MemoryContextReset(buildstate->tmpCtx);
@@ -2017,6 +2044,9 @@ _gin_parallel_scan_and_build(GinBuildState *state,
 	/* remember how much space is allowed for the accumulated entries */
 	state->work_mem = (sortmem / 2);
 
+	/* remember how many workers participate in the build */
+	state->bs_num_workers = ginshared->scantuplesortstates;
+
 	/* Begin "partial" tuplesort */
 	state->bs_sortstate = tuplesort_begin_index_gin(heap, index,
 													state->work_mem,
-- 
2.47.3