Skip to content

Commit a26b753

Browse files
committed
Limit the size of TID lists during parallel GIN build
When building intermediate TID lists during parallel GIN builds, split the sorted lists into smaller chunks, to limit the amount of memory needed when merging the chunks later. The leader may need to keep in memory up to one chunk per worker, and possibly one extra chunk (before evicting some of the data). The code processing item pointers uses regular palloc/repalloc calls, which means it's subject to the MaxAllocSize (1GB) limit. We could fix this by allowing huge allocations, but that'd require changes in many places without much benefit. Larger chunks do not actually improve performance, so the memory usage would be wasted. Fixed by limiting the chunk size to not hit MaxAllocSize. Each worker gets a fair share. This requires remembering the number of participating workers, in a place that can be accessed from the callback. Luckily, the bs_worker_id field in GinBuildState was unused, so repurpose that. Report by Greg Smith, investigation and fix by me. Batchpatched to 18, where parallel GIN builds were introduced. Reported-by: Gregory Smith <gregsmithpgsql@gmail.com> Discussion: https://postgr.es/m/CAHLJuCWDwn-PE2BMZE4Kux7x5wWt_6RoWtA0mUQffEDLeZ6sfA@mail.gmail.com Backpatch-through: 18
1 parent 8864016 commit a26b753

File tree

1 file changed

+41
-11
lines changed

1 file changed

+41
-11
lines changed

src/backend/access/gin/gininsert.c

Lines changed: 41 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,9 @@ typedef struct
152152
* only in the leader process.
153153
*/
154154
GinLeader *bs_leader;
155-
int bs_worker_id;
155+
156+
/* number of participating workers (including leader) */
157+
int bs_num_workers;
156158

157159
/* used to pass information from workers to leader */
158160
double bs_numtuples;
@@ -479,6 +481,15 @@ ginBuildCallback(Relation index, ItemPointer tid, Datum *values,
479481
/*
480482
* ginFlushBuildState
481483
* Write all data from BuildAccumulator into the tuplesort.
484+
*
485+
* The number of TIDs written to the tuplesort at once is limited, to reduce
486+
* the amount of memory needed when merging the intermediate results later.
487+
* The leader will see up to two chunks per worker, so calculate the limit to
488+
* not need more than MaxAllocSize overall.
489+
*
490+
* We don't need to worry about overflowing maintenance_work_mem. We can't
491+
* build chunks larger than work_mem, and that limit was set so that workers
492+
* produce sufficiently small chunks.
482493
*/
483494
static void
484495
ginFlushBuildState(GinBuildState *buildstate, Relation index)
@@ -489,6 +500,11 @@ ginFlushBuildState(GinBuildState *buildstate, Relation index)
489500
uint32 nlist;
490501
OffsetNumber attnum;
491502
TupleDesc tdesc = RelationGetDescr(index);
503+
uint32 maxlen;
504+
505+
/* maximum number of TIDs per chunk (two chunks per worker) */
506+
maxlen = MaxAllocSize / sizeof(ItemPointerData);
507+
maxlen /= (2 * buildstate->bs_num_workers);
492508

493509
ginBeginBAScan(&buildstate->accum);
494510
while ((list = ginGetBAEntry(&buildstate->accum,
@@ -497,20 +513,31 @@ ginFlushBuildState(GinBuildState *buildstate, Relation index)
497513
/* information about the key */
498514
Form_pg_attribute attr = TupleDescAttr(tdesc, (attnum - 1));
499515

500-
/* GIN tuple and tuple length */
501-
GinTuple *tup;
502-
Size tuplen;
516+
/* start of the chunk */
517+
uint32 offset = 0;
503518

504-
/* there could be many entries, so be willing to abort here */
505-
CHECK_FOR_INTERRUPTS();
519+
/* split the entry into smaller chunks with up to maxlen items */
520+
while (offset < nlist)
521+
{
522+
/* GIN tuple and tuple length */
523+
GinTuple *tup;
524+
Size tuplen;
525+
uint32 len = Min(maxlen, nlist - offset);
506526

507-
tup = _gin_build_tuple(attnum, category,
508-
key, attr->attlen, attr->attbyval,
509-
list, nlist, &tuplen);
527+
/* there could be many entries, so be willing to abort here */
528+
CHECK_FOR_INTERRUPTS();
529+
530+
tup = _gin_build_tuple(attnum, category,
531+
key, attr->attlen, attr->attbyval,
532+
&list[offset], len,
533+
&tuplen);
510534

511-
tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen);
535+
offset += len;
512536

513-
pfree(tup);
537+
tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen);
538+
539+
pfree(tup);
540+
}
514541
}
515542

516543
MemoryContextReset(buildstate->tmpCtx);
@@ -2013,6 +2040,9 @@ _gin_parallel_scan_and_build(GinBuildState *state,
20132040
/* remember how much space is allowed for the accumulated entries */
20142041
state->work_mem = (sortmem / 2);
20152042

2043+
/* remember how many workers participate in the build */
2044+
state->bs_num_workers = ginshared->scantuplesortstates;
2045+
20162046
/* Begin "partial" tuplesort */
20172047
state->bs_sortstate = tuplesort_begin_index_gin(heap, index,
20182048
state->work_mem,

0 commit comments

Comments
 (0)