git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Revise the TIDBitmap API to support multiple concurrent iterations over a bitmap.
author Tom Lane <tgl@sss.pgh.pa.us>
Sat, 10 Jan 2009 21:08:36 +0000 (21:08 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Sat, 10 Jan 2009 21:08:36 +0000 (21:08 +0000)
This is extracted from Greg Stark's posix_fadvise patch; it seems
worth committing separately, since it's potentially useful independently of
posix_fadvise.

src/backend/access/gin/ginget.c
src/backend/access/gin/ginscan.c
src/backend/executor/nodeBitmapHeapscan.c
src/backend/nodes/tidbitmap.c
src/include/access/gin.h
src/include/nodes/execnodes.h
src/include/nodes/tidbitmap.h

index d4559815340774e607479635c0298c5a02a02694..182981498c10ddb1c437934d4da5a4b0d4f5849d 100644 (file)
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *                     $PostgreSQL: pgsql/src/backend/access/gin/ginget.c,v 1.21 2009/01/01 17:23:34 momjian Exp $
+ *                     $PostgreSQL: pgsql/src/backend/access/gin/ginget.c,v 1.22 2009/01/10 21:08:36 tgl Exp $
  *-------------------------------------------------------------------------
  */
 
@@ -290,6 +290,7 @@ startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry)
        entry->list = NULL;
        entry->nlist = 0;
        entry->partialMatch = NULL;
+       entry->partialMatchIterator = NULL;
        entry->partialMatchResult = NULL;
        entry->reduceResult = FALSE;
        entry->predictNumberResult = 0;
@@ -311,6 +312,9 @@ startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry)
                         */
                        if ( entry->partialMatch )
                        {
+                               if (entry->partialMatchIterator)
+                                       tbm_end_iterate(entry->partialMatchIterator);
+                               entry->partialMatchIterator = NULL;
                                tbm_free( entry->partialMatch );
                                entry->partialMatch = NULL;
                        }
@@ -323,7 +327,7 @@ startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry)
 
                if ( entry->partialMatch && !tbm_is_empty(entry->partialMatch) )
                {
-                       tbm_begin_iterate(entry->partialMatch);
+                       entry->partialMatchIterator = tbm_begin_iterate(entry->partialMatch);
                        entry->isFinished = FALSE;
                }
        }
@@ -534,11 +538,13 @@ entryGetItem(Relation index, GinScanEntry entry)
                {
                        if ( entry->partialMatchResult == NULL || entry->offset >= entry->partialMatchResult->ntuples )
                        {
-                               entry->partialMatchResult = tbm_iterate( entry->partialMatch );
+                               entry->partialMatchResult = tbm_iterate( entry->partialMatchIterator );
 
                                if ( entry->partialMatchResult == NULL )
                                {
                                        ItemPointerSet(&entry->curItem, InvalidBlockNumber, InvalidOffsetNumber);
+                                       tbm_end_iterate(entry->partialMatchIterator);
+                                       entry->partialMatchIterator = NULL;
                                        entry->isFinished = TRUE;
                                        break;
                                }
index 9c122cb526f06e49d77b2d746afa6e32e753cbf7..ba3774192229fd3ae3f1f638f30c9bc90dd115af 100644 (file)
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *                     $PostgreSQL: pgsql/src/backend/access/gin/ginscan.c,v 1.20 2009/01/01 17:23:34 momjian Exp $
+ *                     $PostgreSQL: pgsql/src/backend/access/gin/ginscan.c,v 1.21 2009/01/10 21:08:36 tgl Exp $
  *-------------------------------------------------------------------------
  */
 
@@ -61,6 +61,8 @@ fillScanKey(GinState *ginstate, GinScanKey key, OffsetNumber attnum, Datum query
                key->scanEntry[i].offset = InvalidOffsetNumber;
                key->scanEntry[i].buffer = InvalidBuffer;
                key->scanEntry[i].partialMatch = NULL;
+               key->scanEntry[i].partialMatchIterator = NULL;
+               key->scanEntry[i].partialMatchResult = NULL;
                key->scanEntry[i].strategy = strategy;
                key->scanEntry[i].list = NULL;
                key->scanEntry[i].nlist = 0;
@@ -107,6 +109,7 @@ resetScanKeys(GinScanKey keys, uint32 nkeys)
                        key->scanEntry[j].list = NULL;
                        key->scanEntry[j].nlist = 0;
                        key->scanEntry[j].partialMatch = NULL;
+                       key->scanEntry[j].partialMatchIterator = NULL;
                        key->scanEntry[j].partialMatchResult = NULL;
                }
        }
@@ -132,6 +135,8 @@ freeScanKeys(GinScanKey keys, uint32 nkeys)
                                ReleaseBuffer(key->scanEntry[j].buffer);
                        if (key->scanEntry[j].list)
                                pfree(key->scanEntry[j].list);
+                       if (key->scanEntry[j].partialMatchIterator)
+                               tbm_end_iterate(key->scanEntry[j].partialMatchIterator);
                        if (key->scanEntry[j].partialMatch)
                                tbm_free(key->scanEntry[j].partialMatch);
                }
index a74efe686cf7f9becb69876f078a075a23d72801..880b9c9590eae67d2b1e5fa55cf2acf6c7e7a783 100644 (file)
@@ -21,7 +21,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/executor/nodeBitmapHeapscan.c,v 1.31 2009/01/01 17:23:41 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/executor/nodeBitmapHeapscan.c,v 1.32 2009/01/10 21:08:36 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -65,6 +65,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
        HeapScanDesc scan;
        Index           scanrelid;
        TIDBitmap  *tbm;
+       TBMIterator *tbmiterator;
        TBMIterateResult *tbmres;
        OffsetNumber targoffset;
        TupleTableSlot *slot;
@@ -78,6 +79,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
        scan = node->ss.ss_currentScanDesc;
        scanrelid = ((BitmapHeapScan *) node->ss.ps.plan)->scan.scanrelid;
        tbm = node->tbm;
+       tbmiterator = node->tbmiterator;
        tbmres = node->tbmres;
 
        /*
@@ -111,7 +113,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
 
        /*
         * If we haven't yet performed the underlying index scan, do it, and
-        * prepare the bitmap to be iterated over.
+        * begin the iteration over the bitmap.
         */
        if (tbm == NULL)
        {
@@ -121,9 +123,8 @@ BitmapHeapNext(BitmapHeapScanState *node)
                        elog(ERROR, "unrecognized result from subplan");
 
                node->tbm = tbm;
+               node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
                node->tbmres = tbmres = NULL;
-
-               tbm_begin_iterate(tbm);
        }
 
        for (;;)
@@ -136,7 +137,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
                 */
                if (tbmres == NULL)
                {
-                       node->tbmres = tbmres = tbm_iterate(tbm);
+                       node->tbmres = tbmres = tbm_iterate(tbmiterator);
                        if (tbmres == NULL)
                        {
                                /* no more entries in the bitmap */
@@ -376,9 +377,12 @@ ExecBitmapHeapReScan(BitmapHeapScanState *node, ExprContext *exprCtxt)
        /* rescan to release any page pin */
        heap_rescan(node->ss.ss_currentScanDesc, NULL);
 
+       if (node->tbmiterator)
+               tbm_end_iterate(node->tbmiterator);
        if (node->tbm)
                tbm_free(node->tbm);
        node->tbm = NULL;
+       node->tbmiterator = NULL;
        node->tbmres = NULL;
 
        /*
@@ -423,6 +427,8 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node)
        /*
         * release bitmap if any
         */
+       if (node->tbmiterator)
+               tbm_end_iterate(node->tbmiterator);
        if (node->tbm)
                tbm_free(node->tbm);
 
@@ -466,6 +472,7 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
        scanstate->ss.ps.state = estate;
 
        scanstate->tbm = NULL;
+       scanstate->tbmiterator = NULL;
        scanstate->tbmres = NULL;
 
        /*
index 54acf18fbf2e7ebc36b87ca4c98a571204584438..e214bbb763403b7d65ab5286871c99686b08191c 100644 (file)
@@ -32,7 +32,7 @@
  * Copyright (c) 2003-2009, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.16 2009/01/01 17:23:43 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.17 2009/01/10 21:08:36 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -136,9 +136,20 @@ struct TIDBitmap
        int                     nchunks;                /* number of lossy entries in pagetable */
        bool            iterating;              /* tbm_begin_iterate called? */
        PagetableEntry entry1;          /* used when status == TBM_ONE_PAGE */
-       /* the remaining fields are used while producing sorted output: */
+       /* these are valid when iterating is true: */
        PagetableEntry **spages;        /* sorted exact-page list, or NULL */
        PagetableEntry **schunks;       /* sorted lossy-chunk list, or NULL */
+};
+
+/*
+ * When iterating over a bitmap in sorted order, a TBMIterator is used to
+ * track our progress.  There can be several iterators scanning the same
+ * bitmap concurrently.  Note that the bitmap becomes read-only as soon as
+ * any iterator is created.
+ */
+struct TBMIterator
+{
+       TIDBitmap  *tbm;                        /* TIDBitmap we're iterating over */
        int                     spageptr;               /* next spages index */
        int                     schunkptr;              /* next schunks index */
        int                     schunkbit;              /* next bit to check in current schunk */
@@ -172,16 +183,9 @@ tbm_create(long maxbytes)
        TIDBitmap  *tbm;
        long            nbuckets;
 
-       /*
-        * Create the TIDBitmap struct, with enough trailing space to serve the
-        * needs of the TBMIterateResult sub-struct.
-        */
-       tbm = (TIDBitmap *) palloc(sizeof(TIDBitmap) +
-                                                          MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
-       /* Zero all the fixed fields */
-       MemSetAligned(tbm, 0, sizeof(TIDBitmap));
+       /* Create the TIDBitmap struct and zero all its fields */
+       tbm = makeNode(TIDBitmap);
 
-       tbm->type = T_TIDBitmap;        /* Set NodeTag */
        tbm->mcxt = CurrentMemoryContext;
        tbm->status = TBM_EMPTY;
 
@@ -533,60 +537,80 @@ tbm_is_empty(const TIDBitmap *tbm)
 /*
  * tbm_begin_iterate - prepare to iterate through a TIDBitmap
  *
+ * The TBMIterator struct is created in the caller's memory context.
+ * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
+ * okay to just allow the memory context to be released, too.  It is caller's
+ * responsibility not to touch the TBMIterator anymore once the TIDBitmap
+ * is freed.
+ *
  * NB: after this is called, it is no longer allowed to modify the contents
  * of the bitmap.  However, you can call this multiple times to scan the
- * contents repeatedly.
+ * contents repeatedly, including parallel scans.
  */
-void
+TBMIterator *
 tbm_begin_iterate(TIDBitmap *tbm)
 {
-       HASH_SEQ_STATUS status;
-       PagetableEntry *page;
-       int                     npages;
-       int                     nchunks;
-
-       tbm->iterating = true;
+       TBMIterator *iterator;
 
        /*
-        * Reset iteration pointers.
+        * Create the TBMIterator struct, with enough trailing space to serve the
+        * needs of the TBMIterateResult sub-struct.
         */
-       tbm->spageptr = 0;
-       tbm->schunkptr = 0;
-       tbm->schunkbit = 0;
+       iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
+                                                                         MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
+       iterator->tbm = tbm;
 
        /*
-        * Nothing else to do if no entries, nor if we don't have a hashtable.
+        * Initialize iteration pointers.
         */
-       if (tbm->nentries == 0 || tbm->status != TBM_HASH)
-               return;
+       iterator->spageptr = 0;
+       iterator->schunkptr = 0;
+       iterator->schunkbit = 0;
 
        /*
-        * Create and fill the sorted page lists if we didn't already.
+        * If we have a hashtable, create and fill the sorted page lists,
+        * unless we already did that for a previous iterator.  Note that the
+        * lists are attached to the bitmap not the iterator, so they can be
+        * used by more than one iterator.
         */
-       if (!tbm->spages && tbm->npages > 0)
-               tbm->spages = (PagetableEntry **)
-                       MemoryContextAlloc(tbm->mcxt,
-                                                          tbm->npages * sizeof(PagetableEntry *));
-       if (!tbm->schunks && tbm->nchunks > 0)
-               tbm->schunks = (PagetableEntry **)
-                       MemoryContextAlloc(tbm->mcxt,
-                                                          tbm->nchunks * sizeof(PagetableEntry *));
-
-       hash_seq_init(&status, tbm->pagetable);
-       npages = nchunks = 0;
-       while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+       if (tbm->status == TBM_HASH && !tbm->iterating)
        {
-               if (page->ischunk)
-                       tbm->schunks[nchunks++] = page;
-               else
-                       tbm->spages[npages++] = page;
+               HASH_SEQ_STATUS status;
+               PagetableEntry *page;
+               int                     npages;
+               int                     nchunks;
+
+               if (!tbm->spages && tbm->npages > 0)
+                       tbm->spages = (PagetableEntry **)
+                               MemoryContextAlloc(tbm->mcxt,
+                                                                  tbm->npages * sizeof(PagetableEntry *));
+               if (!tbm->schunks && tbm->nchunks > 0)
+                       tbm->schunks = (PagetableEntry **)
+                               MemoryContextAlloc(tbm->mcxt,
+                                                                  tbm->nchunks * sizeof(PagetableEntry *));
+
+               hash_seq_init(&status, tbm->pagetable);
+               npages = nchunks = 0;
+               while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+               {
+                       if (page->ischunk)
+                               tbm->schunks[nchunks++] = page;
+                       else
+                               tbm->spages[npages++] = page;
+               }
+               Assert(npages == tbm->npages);
+               Assert(nchunks == tbm->nchunks);
+               if (npages > 1)
+                       qsort(tbm->spages, npages, sizeof(PagetableEntry *),
+                                 tbm_comparator);
+               if (nchunks > 1)
+                       qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
+                                 tbm_comparator);
        }
-       Assert(npages == tbm->npages);
-       Assert(nchunks == tbm->nchunks);
-       if (npages > 1)
-               qsort(tbm->spages, npages, sizeof(PagetableEntry *), tbm_comparator);
-       if (nchunks > 1)
-               qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *), tbm_comparator);
+
+       tbm->iterating = true;
+
+       return iterator;
 }
 
 /*
@@ -602,9 +626,10 @@ tbm_begin_iterate(TIDBitmap *tbm)
  * testing, recheck is always set true when ntuples < 0.)
  */
 TBMIterateResult *
-tbm_iterate(TIDBitmap *tbm)
+tbm_iterate(TBMIterator *iterator)
 {
-       TBMIterateResult *output = &(tbm->output);
+       TIDBitmap *tbm = iterator->tbm;
+       TBMIterateResult *output = &(iterator->output);
 
        Assert(tbm->iterating);
 
@@ -612,10 +637,10 @@ tbm_iterate(TIDBitmap *tbm)
         * If lossy chunk pages remain, make sure we've advanced schunkptr/
         * schunkbit to the next set bit.
         */
-       while (tbm->schunkptr < tbm->nchunks)
+       while (iterator->schunkptr < tbm->nchunks)
        {
-               PagetableEntry *chunk = tbm->schunks[tbm->schunkptr];
-               int                     schunkbit = tbm->schunkbit;
+               PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
+               int                     schunkbit = iterator->schunkbit;
 
                while (schunkbit < PAGES_PER_CHUNK)
                {
@@ -628,37 +653,37 @@ tbm_iterate(TIDBitmap *tbm)
                }
                if (schunkbit < PAGES_PER_CHUNK)
                {
-                       tbm->schunkbit = schunkbit;
+                       iterator->schunkbit = schunkbit;
                        break;
                }
                /* advance to next chunk */
-               tbm->schunkptr++;
-               tbm->schunkbit = 0;
+               iterator->schunkptr++;
+               iterator->schunkbit = 0;
        }
 
        /*
         * If both chunk and per-page data remain, must output the numerically
         * earlier page.
         */
-       if (tbm->schunkptr < tbm->nchunks)
+       if (iterator->schunkptr < tbm->nchunks)
        {
-               PagetableEntry *chunk = tbm->schunks[tbm->schunkptr];
+               PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
                BlockNumber chunk_blockno;
 
-               chunk_blockno = chunk->blockno + tbm->schunkbit;
-               if (tbm->spageptr >= tbm->npages ||
-                       chunk_blockno < tbm->spages[tbm->spageptr]->blockno)
+               chunk_blockno = chunk->blockno + iterator->schunkbit;
+               if (iterator->spageptr >= tbm->npages ||
+                       chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
                {
                        /* Return a lossy page indicator from the chunk */
                        output->blockno = chunk_blockno;
                        output->ntuples = -1;
                        output->recheck = true;
-                       tbm->schunkbit++;
+                       iterator->schunkbit++;
                        return output;
                }
        }
 
-       if (tbm->spageptr < tbm->npages)
+       if (iterator->spageptr < tbm->npages)
        {
                PagetableEntry *page;
                int                     ntuples;
@@ -668,7 +693,7 @@ tbm_iterate(TIDBitmap *tbm)
                if (tbm->status == TBM_ONE_PAGE)
                        page = &tbm->entry1;
                else
-                       page = tbm->spages[tbm->spageptr];
+                       page = tbm->spages[iterator->spageptr];
 
                /* scan bitmap to extract individual offset numbers */
                ntuples = 0;
@@ -692,7 +717,7 @@ tbm_iterate(TIDBitmap *tbm)
                output->blockno = page->blockno;
                output->ntuples = ntuples;
                output->recheck = page->recheck;
-               tbm->spageptr++;
+               iterator->spageptr++;
                return output;
        }
 
@@ -700,6 +725,19 @@ tbm_iterate(TIDBitmap *tbm)
        return NULL;
 }
 
+/*
+ * tbm_end_iterate - finish an iteration over a TIDBitmap
+ *
+ * Currently this is just a pfree, but it might do more someday.  (For
+ * instance, it could be useful to count open iterators and allow the
+ * bitmap to return to read/write status when there are no more iterators.)
+ */
+void
+tbm_end_iterate(TBMIterator *iterator)
+{
+       pfree(iterator);
+}
+
 /*
  * tbm_find_pageentry - find a PagetableEntry for the pageno
  *
index 78269a415a0153dd4105329c06555656775f5b0a..1425333221d487ab4a0b58f60c173dcc4dde53ee 100644 (file)
@@ -4,7 +4,7 @@
  *
  *     Copyright (c) 2006-2009, PostgreSQL Global Development Group
  *
- *     $PostgreSQL: pgsql/src/include/access/gin.h,v 1.27 2009/01/01 17:23:55 momjian Exp $
+ *     $PostgreSQL: pgsql/src/include/access/gin.h,v 1.28 2009/01/10 21:08:36 tgl Exp $
  *--------------------------------------------------------------------------
  */
 
@@ -380,6 +380,7 @@ typedef struct GinScanEntryData
        /* partial match support */
        bool            isPartialMatch;
        TIDBitmap  *partialMatch;
+       TBMIterator *partialMatchIterator;
        TBMIterateResult *partialMatchResult;
        StrategyNumber strategy;
 
index 4b2b64c300ee0540ba561903b1428c4d94d8ffac..506605df0014676ded4f42770ea11469d4087148 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.199 2009/01/01 17:23:59 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.200 2009/01/10 21:08:36 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1152,6 +1152,7 @@ typedef struct BitmapIndexScanState
  *
  *             bitmapqualorig     execution state for bitmapqualorig expressions
  *             tbm                                bitmap obtained from child index scan(s)
+ *             tbmiterator                iterator for scanning current pages
  *             tbmres                     current-page data
  * ----------------
  */
@@ -1160,6 +1161,7 @@ typedef struct BitmapHeapScanState
        ScanState       ss;                             /* its first field is NodeTag */
        List       *bitmapqualorig;
        TIDBitmap  *tbm;
+       TBMIterator *tbmiterator;
        TBMIterateResult *tbmres;
 } BitmapHeapScanState;
 
index e6ce0db892d433dd881d856c58c325df1fe5142f..93658543e4287b315bcd3e166b17326dc7476b4c 100644 (file)
@@ -15,7 +15,7 @@
  *
  * Copyright (c) 2003-2009, PostgreSQL Global Development Group
  *
- * $PostgreSQL: pgsql/src/include/nodes/tidbitmap.h,v 1.8 2009/01/01 17:24:00 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/tidbitmap.h,v 1.9 2009/01/10 21:08:36 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -31,6 +31,9 @@
  */
 typedef struct TIDBitmap TIDBitmap;
 
+/* Likewise, TBMIterator is private */
+typedef struct TBMIterator TBMIterator;
+
 /* Result structure for tbm_iterate */
 typedef struct
 {
@@ -55,7 +58,8 @@ extern void tbm_intersect(TIDBitmap *a, const TIDBitmap *b);
 
 extern bool tbm_is_empty(const TIDBitmap *tbm);
 
-extern void tbm_begin_iterate(TIDBitmap *tbm);
-extern TBMIterateResult *tbm_iterate(TIDBitmap *tbm);
+extern TBMIterator *tbm_begin_iterate(TIDBitmap *tbm);
+extern TBMIterateResult *tbm_iterate(TBMIterator *iterator);
+extern void tbm_end_iterate(TBMIterator *iterator);
 
 #endif   /* TIDBITMAP_H */