Limit the size of TID lists during parallel GIN build
author     Tomas Vondra <tomas.vondra@postgresql.org>
           Tue, 4 Nov 2025 17:46:37 +0000 (18:46 +0100)
committer  Tomas Vondra <tomas.vondra@postgresql.org>
           Tue, 4 Nov 2025 17:51:17 +0000 (18:51 +0100)
When building intermediate TID lists during parallel GIN builds, split
the sorted lists into smaller chunks, to limit the amount of memory
needed when merging the chunks later.

The leader may need to keep in memory up to one chunk per worker, and
possibly one extra chunk (before evicting some of the data). The code
processing item pointers uses regular palloc/repalloc calls, which means
it's subject to the MaxAllocSize (1GB) limit.

We could fix this by allowing huge allocations, but that'd require
changes in many places without much benefit. Larger chunks do not
actually improve performance, so the memory usage would be wasted.

Fixed by limiting the chunk size to not hit MaxAllocSize. Each worker
gets a fair share.
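
For illustration, the arithmetic behind the limit, as a minimal standalone
sketch. The constants match PostgreSQL's MaxAllocSize (0x3fffffff) and
sizeof(ItemPointerData) (6 bytes); the worker count of 8 is an assumed
example, not something from this commit:

    #include <stdio.h>

    #define MAX_ALLOC_SIZE 0x3fffffffU  /* PostgreSQL's MaxAllocSize: 1GB - 1 */
    #define TID_SIZE       6U           /* sizeof(ItemPointerData) */

    int
    main(void)
    {
        unsigned    nworkers = 8;   /* assumed example; includes the leader */
        unsigned    maxlen;

        /* largest TID array that still fits in a single palloc */
        maxlen = MAX_ALLOC_SIZE / TID_SIZE;

        /* the leader may hold up to two chunks per worker at once */
        maxlen /= (2 * nworkers);

        printf("max TIDs per chunk: %u (~%u MB)\n",
               maxlen, maxlen * TID_SIZE / (1024 * 1024));
        printf("leader worst case:  ~%u MB total\n",
               2 * nworkers * maxlen * TID_SIZE / (1024 * 1024));
        return 0;
    }

With eight participants this caps each chunk at about 11M TIDs (~64MB), and
the leader's worst case lands just under 1GB, i.e. within MaxAllocSize.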

This requires remembering the number of participating workers, in a
place that can be accessed from the callback. Luckily, the bs_worker_id
field in GinBuildState was unused, so repurpose that.

Report by Greg Smith, investigation and fix by me. Backpatched to 18,
where parallel GIN builds were introduced.

Reported-by: Gregory Smith <gregsmithpgsql@gmail.com>
Discussion: https://postgr.es/m/CAHLJuCWDwn-PE2BMZE4Kux7x5wWt_6RoWtA0mUQffEDLeZ6sfA@mail.gmail.com
Backpatch-through: 18

src/backend/access/gin/gininsert.c

index 3d71b442aa9e71da789cc79058d717f590818229..61869d9d93125643dad99e17fe3ad9d3051e7499 100644
@@ -152,7 +152,9 @@ typedef struct
         * only in the leader process.
         */
        GinLeader  *bs_leader;
-       int                     bs_worker_id;
+
+       /* number of participating workers (including leader) */
+       int                     bs_num_workers;
 
        /* used to pass information from workers to leader */
        double          bs_numtuples;
@@ -483,6 +485,15 @@ ginBuildCallback(Relation index, ItemPointer tid, Datum *values,
 /*
  * ginFlushBuildState
  *             Write all data from BuildAccumulator into the tuplesort.
+ *
+ * The number of TIDs written to the tuplesort at once is limited, to reduce
+ * the amount of memory needed when merging the intermediate results later.
+ * The leader will see up to two chunks per worker, so calculate the limit to
+ * not need more than MaxAllocSize overall.
+ *
+ * We don't need to worry about overflowing maintenance_work_mem. We can't
+ * build chunks larger than work_mem, and that limit was set so that workers
+ * produce sufficiently small chunks.
  */
 static void
 ginFlushBuildState(GinBuildState *buildstate, Relation index)
@@ -493,6 +504,11 @@ ginFlushBuildState(GinBuildState *buildstate, Relation index)
        uint32          nlist;
        OffsetNumber attnum;
        TupleDesc       tdesc = RelationGetDescr(index);
+       uint32          maxlen;
+
+       /* maximum number of TIDs per chunk (two chunks per worker) */
+       maxlen = MaxAllocSize / sizeof(ItemPointerData);
+       maxlen /= (2 * buildstate->bs_num_workers);
 
        ginBeginBAScan(&buildstate->accum);
        while ((list = ginGetBAEntry(&buildstate->accum,
@@ -501,20 +517,31 @@ ginFlushBuildState(GinBuildState *buildstate, Relation index)
                /* information about the key */
                CompactAttribute *attr = TupleDescCompactAttr(tdesc, (attnum - 1));
 
-               /* GIN tuple and tuple length */
-               GinTuple   *tup;
-               Size            tuplen;
+               /* start of the chunk */
+               uint32          offset = 0;
 
-               /* there could be many entries, so be willing to abort here */
-               CHECK_FOR_INTERRUPTS();
+               /* split the entry into smaller chunks with up to maxlen items */
+               while (offset < nlist)
+               {
+                       /* GIN tuple and tuple length */
+                       GinTuple   *tup;
+                       Size            tuplen;
+                       uint32          len = Min(maxlen, nlist - offset);
 
-               tup = _gin_build_tuple(attnum, category,
-                                                          key, attr->attlen, attr->attbyval,
-                                                          list, nlist, &tuplen);
+                       /* there could be many entries, so be willing to abort here */
+                       CHECK_FOR_INTERRUPTS();
+
+                       tup = _gin_build_tuple(attnum, category,
+                                                                  key, attr->attlen, attr->attbyval,
+                                                                  &list[offset], len,
+                                                                  &tuplen);
 
-               tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen);
+                       offset += len;
 
-               pfree(tup);
+                       tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen);
+
+                       pfree(tup);
+               }
        }
 
        MemoryContextReset(buildstate->tmpCtx);
@@ -2017,6 +2044,9 @@ _gin_parallel_scan_and_build(GinBuildState *state,
        /* remember how much space is allowed for the accumulated entries */
        state->work_mem = (sortmem / 2);
 
+       /* remember how many workers participate in the build */
+       state->bs_num_workers = ginshared->scantuplesortstates;
+
        /* Begin "partial" tuplesort */
        state->bs_sortstate = tuplesort_begin_index_gin(heap, index,
                                                                                                        state->work_mem,
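
For reference, the shape of the new flush loop as a standalone sketch. The
emit() callback is a hypothetical stand-in for _gin_build_tuple() plus
tuplesort_putgintuple(), and the data is made up; only the chunk-splitting
logic mirrors the patch:

    #include <stdio.h>

    #define Min(a, b) ((a) < (b) ? (a) : (b))

    /* stand-in for building a GinTuple and pushing it into the tuplesort */
    static void
    emit(const unsigned *chunk, unsigned len)
    {
        printf("chunk of %u items, first item %u\n", len, chunk[0]);
    }

    /* write a sorted TID list in chunks of at most maxlen items */
    static void
    flush_in_chunks(const unsigned *list, unsigned nlist, unsigned maxlen)
    {
        unsigned    offset = 0;

        while (offset < nlist)
        {
            unsigned    len = Min(maxlen, nlist - offset);

            emit(&list[offset], len);
            offset += len;
        }
    }

    int
    main(void)
    {
        unsigned    list[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};

        flush_in_chunks(list, 10, 4);   /* emits chunks of 4, 4, and 2 */
        return 0;
    }

No call ever sees more than maxlen items, so the intermediate GIN tuples stay
within the per-chunk budget no matter how long an entry's posting list grows.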