author     Tom Lane <tgl@sss.pgh.pa.us>  Thu, 26 Apr 2007 23:24:57 +0000 (23:24 +0000)
committer  Tom Lane <tgl@sss.pgh.pa.us>  Thu, 26 Apr 2007 23:24:57 +0000 (23:24 +0000)

Fix dynahash.c to suppress hash bucket splits while a hash_seq_search() scan
is in progress on the same hashtable.  This seems the least invasive way to
fix the recently-recognized problem that a split could cause the scan to
visit entries twice or (with much lower probability) miss them entirely.
The only field-reported problem caused by this is the "failed to re-find
shared lock object" PANIC in COMMIT PREPARED reported by Michel Dorochevsky,
which was caused by multiply visited entries.  However, it seems certain
that mdsync() is vulnerable to missing required fsync's due to missed
entries, and I am fearful that RelationCacheInitializePhase2() might be at
risk as well.  Because of that and the generalized hazard presented by this
bug, back-patch to all the supported branches.
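
As illustration, here is a minimal sketch (table, entry type, and predicate
are hypothetical) of the usage contract this patch establishes: if
hash_seq_search() runs to NULL it does the end-of-scan cleanup itself, but a
scan abandoned early must call the new hash_seq_term(), exactly as the
mdsync() and tbm_lossify() fixes below do.

    HASH_SEQ_STATUS status;
    MyEntry    *entry;                  /* hypothetical entry type */

    hash_seq_init(&status, my_hash_table);      /* registers the scan */
    while ((entry = hash_seq_search(&status)) != NULL)
    {
        if (entry_matches(entry))       /* hypothetical predicate */
        {
            /* abandoning the scan early: must deregister it ourselves */
            hash_seq_term(&status);
            break;
        }
    }
    /* If the loop ran to NULL, hash_seq_search already cleaned up; a scan
     * still open at commit draws the new "leaked hash_seq_search scan"
     * WARNING from AtEOXact_HashTables(). */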

Along the way, fix pg_prepared_statement() and pg_cursor() to not assume
that the hashtables they are examining will stay static between calls.
This is risky regardless of the newly noted dynahash problem, because
hash_seq_search() has never promised to cope with deletion of table entries
other than the just-returned one.  There may be no bug here because the only
supported way to call these functions is via ExecMakeTableFunctionResult()
which will cycle them to completion before doing anything very interesting,
but it seems best to get rid of the assumption.  This affects 8.2 and HEAD
only, since those functions weren't there earlier.
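
The patch also offers an alternative to explicit termination, sketched below
under the same hypothetical names: hash_freeze() locks a non-shared table
against further insertions, after which scans are not registered and need no
hash_seq_term() bookkeeping at all.  This is what the reworked
ResetTupleHashIterator macro relies on for nodeAgg.c readout.

    HASH_SEQ_STATUS status;
    MyEntry    *entry;                  /* hypothetical entry type */

    hash_freeze(my_hash_table);         /* no further inserts, hence no splits */
    hash_seq_init(&status, my_hash_table);      /* frozen: scan not registered */
    while ((entry = hash_seq_search(&status)) != NULL)
    {
        process(entry);                 /* hypothetical per-entry work */
        if (seen_enough())              /* hypothetical early-exit condition */
            break;                      /* safe: no hash_seq_term() needed */
    }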

src/backend/access/transam/xact.c
src/backend/commands/prepare.c
src/backend/executor/nodeSubplan.c
src/backend/nodes/tidbitmap.c
src/backend/storage/smgr/md.c
src/backend/utils/hash/dynahash.c
src/backend/utils/mmgr/portalmem.c
src/include/nodes/execnodes.h
src/include/utils/hsearch.h

diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 673a34ad034b45cb93b87a11a721710f5cf0407a..1b5cfaa7cc19c62e2b21009bb334b4a1d16658b6 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -10,7 +10,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.229 2006/11/23 01:14:59 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.229.2.1 2007/04/26 23:24:56 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1627,6 +1627,7 @@ CommitTransaction(void)
        AtEOXact_Namespace(true);
        /* smgrcommit already done */
        AtEOXact_Files();
+       AtEOXact_HashTables(true);
        pgstat_count_xact_commit();
 
        CurrentResourceOwner = NULL;
@@ -1842,6 +1843,7 @@ PrepareTransaction(void)
        AtEOXact_Namespace(true);
        /* smgrcommit already done */
        AtEOXact_Files();
+       AtEOXact_HashTables(true);
 
        CurrentResourceOwner = NULL;
        ResourceOwnerDelete(TopTransactionResourceOwner);
@@ -1993,6 +1995,7 @@ AbortTransaction(void)
        AtEOXact_Namespace(false);
        smgrabort();
        AtEOXact_Files();
+       AtEOXact_HashTables(false);
        pgstat_count_xact_rollback();
 
        /*
@@ -3704,6 +3707,7 @@ CommitSubTransaction(void)
                                                  s->parent->subTransactionId);
        AtEOSubXact_Files(true, s->subTransactionId,
                                          s->parent->subTransactionId);
+       AtEOSubXact_HashTables(true, s->nestingLevel);
 
        /*
         * We need to restore the upper transaction's read-only state, in case the
@@ -3815,6 +3819,7 @@ AbortSubTransaction(void)
                                                          s->parent->subTransactionId);
                AtEOSubXact_Files(false, s->subTransactionId,
                                                  s->parent->subTransactionId);
+               AtEOSubXact_HashTables(false, s->nestingLevel);
        }
 
        /*
diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c
index 6b7c11a189460e83828ac007dbf0be38bc4ebad8..d5ec457732aec0adf0e8b7c80eb8d9c50f711546 100644
--- a/src/backend/commands/prepare.c
+++ b/src/backend/commands/prepare.c
@@ -10,7 +10,7 @@
  * Copyright (c) 2002-2006, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/commands/prepare.c,v 1.66 2006/10/04 00:29:51 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/commands/prepare.c,v 1.66.2.1 2007/04/26 23:24:56 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -21,7 +21,7 @@
 #include "catalog/pg_type.h"
 #include "commands/explain.h"
 #include "commands/prepare.h"
-#include "funcapi.h"
+#include "miscadmin.h"
 #include "rewrite/rewriteHandler.h"
 #include "tcop/pquery.h"
 #include "tcop/tcopprot.h"
@@ -674,91 +674,98 @@ ExplainExecuteQuery(ExplainStmt *stmt, ParamListInfo params,
 Datum
 pg_prepared_statement(PG_FUNCTION_ARGS)
 {
-       FuncCallContext *funcctx;
-       HASH_SEQ_STATUS *hash_seq;
-       PreparedStatement *prep_stmt;
-
-       /* stuff done only on the first call of the function */
-       if (SRF_IS_FIRSTCALL())
-       {
-               TupleDesc       tupdesc;
-               MemoryContext oldcontext;
-
-               /* create a function context for cross-call persistence */
-               funcctx = SRF_FIRSTCALL_INIT();
+       ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+       TupleDesc       tupdesc;
+       Tuplestorestate *tupstore;
+       MemoryContext per_query_ctx;
+       MemoryContext oldcontext;
+
+       /* check to see if caller supports us returning a tuplestore */
+       if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+               ereport(ERROR,
+                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                errmsg("set-valued function called in context that cannot accept a set")));
+       if (!(rsinfo->allowedModes & SFRM_Materialize))
+               ereport(ERROR,
+                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                errmsg("materialize mode required, but it is not " \
+                                               "allowed in this context")));
 
-               /*
-                * switch to memory context appropriate for multiple function calls
-                */
-               oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+       /* need to build tuplestore in query context */
+       per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+       oldcontext = MemoryContextSwitchTo(per_query_ctx);
 
-               /* allocate memory for user context */
-               if (prepared_queries)
-               {
-                       hash_seq = (HASH_SEQ_STATUS *) palloc(sizeof(HASH_SEQ_STATUS));
-                       hash_seq_init(hash_seq, prepared_queries);
-                       funcctx->user_fctx = (void *) hash_seq;
-               }
-               else
-                       funcctx->user_fctx = NULL;
+       /*
+        * build tupdesc for result tuples. This must match the definition of
+        * the pg_prepared_statements view in system_views.sql
+        */
+       tupdesc = CreateTemplateTupleDesc(5, false);
+       TupleDescInitEntry(tupdesc, (AttrNumber) 1, "name",
+                                          TEXTOID, -1, 0);
+       TupleDescInitEntry(tupdesc, (AttrNumber) 2, "statement",
+                                          TEXTOID, -1, 0);
+       TupleDescInitEntry(tupdesc, (AttrNumber) 3, "prepare_time",
+                                          TIMESTAMPTZOID, -1, 0);
+       TupleDescInitEntry(tupdesc, (AttrNumber) 4, "parameter_types",
+                                          REGTYPEARRAYOID, -1, 0);
+       TupleDescInitEntry(tupdesc, (AttrNumber) 5, "from_sql",
+                                          BOOLOID, -1, 0);
 
-               /*
-                * build tupdesc for result tuples. This must match the definition of
-                * the pg_prepared_statements view in system_views.sql
-                */
-               tupdesc = CreateTemplateTupleDesc(5, false);
-               TupleDescInitEntry(tupdesc, (AttrNumber) 1, "name",
-                                                  TEXTOID, -1, 0);
-               TupleDescInitEntry(tupdesc, (AttrNumber) 2, "statement",
-                                                  TEXTOID, -1, 0);
-               TupleDescInitEntry(tupdesc, (AttrNumber) 3, "prepare_time",
-                                                  TIMESTAMPTZOID, -1, 0);
-               TupleDescInitEntry(tupdesc, (AttrNumber) 4, "parameter_types",
-                                                  REGTYPEARRAYOID, -1, 0);
-               TupleDescInitEntry(tupdesc, (AttrNumber) 5, "from_sql",
-                                                  BOOLOID, -1, 0);
-
-               funcctx->tuple_desc = BlessTupleDesc(tupdesc);
-               MemoryContextSwitchTo(oldcontext);
-       }
+       /*
+        * We put all the tuples into a tuplestore in one scan of the hashtable.
+        * This avoids any issue of the hashtable possibly changing between calls.
+        */
+       tupstore = tuplestore_begin_heap(true, false, work_mem);
 
-       /* stuff done on every call of the function */
-       funcctx = SRF_PERCALL_SETUP();
-       hash_seq = (HASH_SEQ_STATUS *) funcctx->user_fctx;
+       /* hash table might be uninitialized */
+       if (prepared_queries)
+       {
+               HASH_SEQ_STATUS hash_seq;
+               PreparedStatement *prep_stmt;
 
-       /* if the hash table is uninitialized, we're done */
-       if (hash_seq == NULL)
-               SRF_RETURN_DONE(funcctx);
+               hash_seq_init(&hash_seq, prepared_queries);
+               while ((prep_stmt = hash_seq_search(&hash_seq)) != NULL)
+               {
+                       HeapTuple       tuple;
+                       Datum           values[5];
+                       bool            nulls[5];
 
-       prep_stmt = hash_seq_search(hash_seq);
-       if (prep_stmt)
-       {
-               Datum           result;
-               HeapTuple       tuple;
-               Datum           values[5];
-               bool            nulls[5];
+                       /* generate junk in short-term context */
+                       MemoryContextSwitchTo(oldcontext);
 
-               MemSet(nulls, 0, sizeof(nulls));
+                       MemSet(nulls, 0, sizeof(nulls));
 
-               values[0] = DirectFunctionCall1(textin,
+                       values[0] = DirectFunctionCall1(textin,
                                                                          CStringGetDatum(prep_stmt->stmt_name));
 
-               if (prep_stmt->query_string == NULL)
-                       nulls[1] = true;
-               else
-                       values[1] = DirectFunctionCall1(textin,
+                       if (prep_stmt->query_string == NULL)
+                               nulls[1] = true;
+                       else
+                               values[1] = DirectFunctionCall1(textin,
                                                                   CStringGetDatum(prep_stmt->query_string));
 
-               values[2] = TimestampTzGetDatum(prep_stmt->prepare_time);
-               values[3] = build_regtype_array(prep_stmt->argtype_list);
-               values[4] = BoolGetDatum(prep_stmt->from_sql);
+                       values[2] = TimestampTzGetDatum(prep_stmt->prepare_time);
+                       values[3] = build_regtype_array(prep_stmt->argtype_list);
+                       values[4] = BoolGetDatum(prep_stmt->from_sql);
 
-               tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
-               result = HeapTupleGetDatum(tuple);
-               SRF_RETURN_NEXT(funcctx, result);
+                       tuple = heap_form_tuple(tupdesc, values, nulls);
+
+                       /* switch to appropriate context while storing the tuple */
+                       MemoryContextSwitchTo(per_query_ctx);
+                       tuplestore_puttuple(tupstore, tuple);
+               }
        }
 
-       SRF_RETURN_DONE(funcctx);
+       /* clean up and return the tuplestore */
+       tuplestore_donestoring(tupstore);
+
+       MemoryContextSwitchTo(oldcontext);
+
+       rsinfo->returnMode = SFRM_Materialize;
+       rsinfo->setResult = tupstore;
+       rsinfo->setDesc = tupdesc;
+
+       return (Datum) 0;
 }
 
 /*
diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c
index 9b751b7c305fbbbb8365bdc7f0952919132d7a6e..02a00b15a7b4a3db048c0595db2402208a12622a 100644
--- a/src/backend/executor/nodeSubplan.c
+++ b/src/backend/executor/nodeSubplan.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/executor/nodeSubplan.c,v 1.80.2.2 2007/02/02 00:07:28 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/executor/nodeSubplan.c,v 1.80.2.3 2007/04/26 23:24:56 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -569,7 +569,7 @@ findPartialMatch(TupleHashTable hashtable, TupleTableSlot *slot)
        TupleHashIterator hashiter;
        TupleHashEntry entry;
 
-       ResetTupleHashIterator(hashtable, &hashiter);
+       InitTupleHashIterator(hashtable, &hashiter);
        while ((entry = ScanTupleHashTable(&hashiter)) != NULL)
        {
                ExecStoreMinimalTuple(entry->firstTuple, hashtable->tableslot, false);
@@ -577,8 +577,12 @@ findPartialMatch(TupleHashTable hashtable, TupleTableSlot *slot)
                                                           numCols, keyColIdx,
                                                           hashtable->eqfunctions,
                                                           hashtable->tempcxt))
+               {
+                       TermTupleHashIterator(&hashiter);
                        return true;
+               }
        }
+       /* No TermTupleHashIterator call needed here */
        return false;
 }
 
diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c
index 0e01ab71ef2d112fe91ae852d96fce88eaccbcc4..e901659d5834d7ff51d5e05a6b72206b6f725e81 100644
--- a/src/backend/nodes/tidbitmap.c
+++ b/src/backend/nodes/tidbitmap.c
@@ -23,7 +23,7 @@
  * Copyright (c) 2003-2006, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.10 2006/07/13 17:47:01 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.10.2.1 2007/04/26 23:24:56 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -907,7 +907,11 @@ tbm_lossify(TIDBitmap *tbm)
                tbm_mark_page_lossy(tbm, page->blockno);
 
                if (tbm->nentries <= tbm->maxentries)
-                       return;                         /* we have done enough */
+               {
+                       /* we have done enough */
+                       hash_seq_term(&status);
+                       break;
+               }
 
                /*
                 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 3480b6c1027e0136266af1d404472df517af0776..4c91278a1d33e5b3ce0ab882e7fd31bd58ca7615 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.123.2.2 2007/04/12 17:11:00 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.123.2.3 2007/04/26 23:24:57 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -919,6 +919,7 @@ mdsync(void)
                                                                        entry->tag.rnode.spcNode,
                                                                        entry->tag.rnode.dbNode,
                                                                        entry->tag.rnode.relNode)));
+                                       hash_seq_term(&hstat);
                                        return false;
                                }
                                else
diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index 23b48c0b6344829b6e51ac3c5cd75b3f7a9d75ad..d61a952aefc7cf50d11d687b74cdda9464c3a835 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -26,7 +26,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/hash/dynahash.c,v 1.73 2006/10/04 00:30:02 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/hash/dynahash.c,v 1.73.2.1 2007/04/26 23:24:57 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -63,6 +63,7 @@
 
 #include "postgres.h"
 
+#include "access/xact.h"
 #include "storage/shmem.h"
 #include "storage/spin.h"
 #include "utils/dynahash.h"
@@ -160,6 +161,9 @@ struct HTAB
        char       *tabname;            /* table name (for error messages) */
        bool            isshared;               /* true if table is in shared memory */
 
+       /* freezing a shared table isn't allowed, so we can keep state here */
+       bool            frozen;                 /* true = no more inserts allowed */
+
        /* We keep local copies of these fixed values to reduce contention */
        Size            keysize;                /* hash key length in bytes */
        long            ssize;                  /* segment size --- must be power of 2 */
@@ -195,6 +199,9 @@ static void hdefault(HTAB *hashp);
 static int     choose_nelem_alloc(Size entrysize);
 static bool init_htab(HTAB *hashp, long nelem);
 static void hash_corrupted(HTAB *hashp);
+static void register_seq_scan(HTAB *hashp);
+static void deregister_seq_scan(HTAB *hashp);
+static bool has_seq_scans(HTAB *hashp);
 
 
 /*
@@ -356,6 +363,8 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
                                         errmsg("out of memory")));
        }
 
+       hashp->frozen = false;
+
        hdefault(hashp);
 
        hctl = hashp->hctl;
@@ -898,6 +907,10 @@ hash_search_with_hash_value(HTAB *hashp,
                        if (currBucket != NULL)
                                return (void *) ELEMENTKEY(currBucket);
 
+                       /* disallow inserts if frozen */
+                       if (hashp->frozen)
+                               elog(ERROR, "cannot insert into a frozen hashtable");
+
                        currBucket = get_hash_entry(hashp);
                        if (currBucket == NULL)
                        {
@@ -925,10 +938,15 @@ hash_search_with_hash_value(HTAB *hashp,
 
                        /* caller is expected to fill the data field on return */
 
-                       /* Check if it is time to split a bucket */
-                       /* Can't split if running in partitioned mode */
+                       /*
+                        * Check if it is time to split a bucket.  Can't split if running
+                        * in partitioned mode, nor if table is the subject of any active
+                        * hash_seq_search scans.  Strange order of these tests is to try
+                        * to check cheaper conditions first.
+                        */
                        if (!IS_PARTITIONED(hctl) &&
-                        hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor)
+                               hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
+                               !has_seq_scans(hashp))
                        {
                                /*
                                 * NOTE: failure to expand table is not a fatal error, it just
@@ -1001,18 +1019,30 @@ hash_get_num_entries(HTAB *hashp)
 }
 
 /*
- * hash_seq_init/_search
+ * hash_seq_init/_search/_term
  *                     Sequentially search through hash table and return
  *                     all the elements one by one, return NULL when no more.
  *
+ * hash_seq_term should be called if and only if the scan is abandoned before
+ * completion; if hash_seq_search returns NULL then it has already done the
+ * end-of-scan cleanup.
+ *
  * NOTE: caller may delete the returned element before continuing the scan.
  * However, deleting any other element while the scan is in progress is
  * UNDEFINED (it might be the one that curIndex is pointing at!).  Also,
  * if elements are added to the table while the scan is in progress, it is
  * unspecified whether they will be visited by the scan or not.
  *
+ * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
+ * worry about hash_seq_term cleanup, if the hashtable is first locked against
+ * further insertions by calling hash_freeze.  This is used by nodeAgg.c,
+ * wherein it is inconvenient to track whether a scan is still open, and
+ * there's no possibility of further insertions after readout has begun.
+ *
  * NOTE: to use this with a partitioned hashtable, caller had better hold
  * at least shared lock on all partitions of the table throughout the scan!
+ * We can cope with insertions or deletions by our own backend, but *not*
+ * with concurrent insertions or deletions by another.
  */
 void
 hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
@@ -1020,6 +1050,8 @@ hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
        status->hashp = hashp;
        status->curBucket = 0;
        status->curEntry = NULL;
+       if (!hashp->frozen)
+               register_seq_scan(hashp);
 }
 
 void *
@@ -1054,7 +1086,10 @@ hash_seq_search(HASH_SEQ_STATUS *status)
        max_bucket = hctl->max_bucket;
 
        if (curBucket > max_bucket)
+       {
+               hash_seq_term(status);
                return NULL;                    /* search is done */
+       }
 
        /*
         * first find the right segment in the table directory.
@@ -1076,6 +1111,7 @@ hash_seq_search(HASH_SEQ_STATUS *status)
                if (++curBucket > max_bucket)
                {
                        status->curBucket = curBucket;
+                       hash_seq_term(status);
                        return NULL;            /* search is done */
                }
                if (++segment_ndx >= ssize)
@@ -1094,6 +1130,36 @@ hash_seq_search(HASH_SEQ_STATUS *status)
        return (void *) ELEMENTKEY(curElem);
 }
 
+void
+hash_seq_term(HASH_SEQ_STATUS *status)
+{
+       if (!status->hashp->frozen)
+               deregister_seq_scan(status->hashp);
+}
+
+/*
+ * hash_freeze
+ *                     Freeze a hashtable against future insertions (deletions are
+ *                     still allowed)
+ *
+ * The reason for doing this is that by preventing any more bucket splits,
+ * we no longer need to worry about registering hash_seq_search scans,
+ * and thus caller need not be careful about ensuring hash_seq_term gets
+ * called at the right times.
+ *
+ * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
+ * with active scans (since hash_seq_term would then do the wrong thing).
+ */
+void
+hash_freeze(HTAB *hashp)
+{
+       if (hashp->isshared)
+               elog(ERROR, "cannot freeze shared hashtable");
+       if (!hashp->frozen && has_seq_scans(hashp))
+               elog(ERROR, "cannot freeze hashtable with active scans");
+       hashp->frozen = true;
+}
+
 
 /********************************* UTILITIES ************************/
 
@@ -1324,3 +1390,136 @@ my_log2(long num)
                ;
        return i;
 }
+
+
+/************************* SEQ SCAN TRACKING ************************/
+
+/*
+ * We track active hash_seq_search scans here.  The need for this mechanism
+ * comes from the fact that a scan will get confused if a bucket split occurs
+ * while it's in progress: it might visit entries twice, or even miss some
+ * entirely (if it's partway through the same bucket that splits).  Hence
+ * we want to inhibit bucket splits if there are any active scans on the
+ * table being inserted into.  This is a fairly rare case in current usage,
+ * so just postponing the split until the next insertion seems sufficient.
+ *
+ * Given present usages of the function, only a few scans are likely to be
+ * open concurrently; so a finite-size stack of open scans seems sufficient,
+ * and we don't worry that linear search is too slow.  Note that we do
+ * allow multiple scans of the same hashtable to be open concurrently.
+ *
+ * This mechanism can support concurrent scan and insertion in a shared
+ * hashtable if it's the same backend doing both.  It would fail otherwise,
+ * but locking reasons seem to preclude any such scenario anyway, so we don't
+ * worry.
+ *
+ * This arrangement is reasonably robust if a transient hashtable is deleted
+ * without notifying us.  The absolute worst case is we might inhibit splits
+ * in another table created later at exactly the same address.  We will give
+ * a warning at transaction end for reference leaks, so any bugs leading to
+ * lack of notification should be easy to catch.
+ */
+
+#define MAX_SEQ_SCANS 100
+
+static HTAB *seq_scan_tables[MAX_SEQ_SCANS];   /* tables being scanned */
+static int     seq_scan_level[MAX_SEQ_SCANS];          /* subtransaction nest level */
+static int     num_seq_scans = 0;
+
+
+/* Register a table as having an active hash_seq_search scan */
+static void
+register_seq_scan(HTAB *hashp)
+{
+       if (num_seq_scans >= MAX_SEQ_SCANS)
+               elog(ERROR, "too many active hash_seq_search scans");
+       seq_scan_tables[num_seq_scans] = hashp;
+       seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel();
+       num_seq_scans++;
+}
+
+/* Deregister an active scan */
+static void
+deregister_seq_scan(HTAB *hashp)
+{
+       int             i;
+
+       /* Search backward since it's most likely at the stack top */
+       for (i = num_seq_scans - 1; i >= 0; i--)
+       {
+               if (seq_scan_tables[i] == hashp)
+               {
+                       seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
+                       seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
+                       num_seq_scans--;
+                       return;
+               }
+       }
+       elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
+                hashp->tabname);
+}
+
+/* Check if a table has any active scan */
+static bool
+has_seq_scans(HTAB *hashp)
+{
+       int             i;
+
+       for (i = 0; i < num_seq_scans; i++)
+       {
+               if (seq_scan_tables[i] == hashp)
+                       return true;
+       }
+       return false;
+}
+
+/* Clean up any open scans at end of transaction */
+void
+AtEOXact_HashTables(bool isCommit)
+{
+       /*
+        * During abort cleanup, open scans are expected; just silently clean 'em
+        * out.  An open scan at commit means someone forgot a hash_seq_term()
+        * call, so complain.
+        *
+        * Note: it's tempting to try to print the tabname here, but refrain for
+        * fear of touching deallocated memory.  This isn't a user-facing message
+        * anyway, so it needn't be pretty.
+        */
+       if (isCommit)
+       {
+               int             i;
+
+               for (i = 0; i < num_seq_scans; i++)
+               {
+                       elog(WARNING, "leaked hash_seq_search scan for hash table %p",
+                                seq_scan_tables[i]);
+               }
+       }
+       num_seq_scans = 0;
+}
+
+/* Clean up any open scans at end of subtransaction */
+void
+AtEOSubXact_HashTables(bool isCommit, int nestDepth)
+{
+       int             i;
+
+       /*
+        * Search backward to make cleanup easy.  Note we must check all entries,
+        * not only those at the end of the array, because deletion technique
+        * doesn't keep them in order.
+        */
+       for (i = num_seq_scans - 1; i >= 0; i--)
+       {
+               if (seq_scan_level[i] >= nestDepth)
+               {
+                       if (isCommit)
+                               elog(WARNING, "leaked hash_seq_search scan for hash table %p",
+                                        seq_scan_tables[i]);
+                       seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
+                       seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
+                       num_seq_scans--;
+               }
+       }
+}
diff --git a/src/backend/utils/mmgr/portalmem.c b/src/backend/utils/mmgr/portalmem.c
index 4f8cba9ba5a864b720b1481525f96a2437286ead..3e3505caf05dbf94af722cc1526d20683298df0d 100644
--- a/src/backend/utils/mmgr/portalmem.c
+++ b/src/backend/utils/mmgr/portalmem.c
@@ -12,7 +12,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mmgr/portalmem.c,v 1.97 2006/11/23 01:14:59 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mmgr/portalmem.c,v 1.97.2.1 2007/04/26 23:24:57 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -22,7 +22,6 @@
 #include "access/xact.h"
 #include "catalog/pg_type.h"
 #include "commands/portalcmds.h"
-#include "funcapi.h"
 #include "miscadmin.h"
 #include "utils/builtins.h"
 #include "utils/memutils.h"
@@ -577,7 +576,9 @@ AtCommit_Portals(void)
                /* Zap all non-holdable portals */
                PortalDrop(portal, true);
 
-               /* Restart the iteration */
+               /* Restart the iteration in case that led to other drops */
+               /* XXX is this really necessary? */
+               hash_seq_term(&status);
                hash_seq_init(&status, PortalHashTable);
        }
 }
@@ -806,79 +807,68 @@ AtSubCleanup_Portals(SubTransactionId mySubid)
 Datum
 pg_cursor(PG_FUNCTION_ARGS)
 {
-       FuncCallContext *funcctx;
-       HASH_SEQ_STATUS *hash_seq;
+       ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+       TupleDesc       tupdesc;
+       Tuplestorestate *tupstore;
+       MemoryContext per_query_ctx;
+       MemoryContext oldcontext;
+       HASH_SEQ_STATUS hash_seq;
        PortalHashEnt *hentry;
 
-       /* stuff done only on the first call of the function */
-       if (SRF_IS_FIRSTCALL())
-       {
-               MemoryContext oldcontext;
-               TupleDesc       tupdesc;
-
-               /* create a function context for cross-call persistence */
-               funcctx = SRF_FIRSTCALL_INIT();
-
-               /*
-                * switch to memory context appropriate for multiple function calls
-                */
-               oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
-
-               if (PortalHashTable)
-               {
-                       hash_seq = (HASH_SEQ_STATUS *) palloc(sizeof(HASH_SEQ_STATUS));
-                       hash_seq_init(hash_seq, PortalHashTable);
-                       funcctx->user_fctx = (void *) hash_seq;
-               }
-               else
-                       funcctx->user_fctx = NULL;
-
-               /*
-                * build tupdesc for result tuples. This must match the definition of
-                * the pg_cursors view in system_views.sql
-                */
-               tupdesc = CreateTemplateTupleDesc(6, false);
-               TupleDescInitEntry(tupdesc, (AttrNumber) 1, "name",
-                                                  TEXTOID, -1, 0);
-               TupleDescInitEntry(tupdesc, (AttrNumber) 2, "statement",
-                                                  TEXTOID, -1, 0);
-               TupleDescInitEntry(tupdesc, (AttrNumber) 3, "is_holdable",
-                                                  BOOLOID, -1, 0);
-               TupleDescInitEntry(tupdesc, (AttrNumber) 4, "is_binary",
-                                                  BOOLOID, -1, 0);
-               TupleDescInitEntry(tupdesc, (AttrNumber) 5, "is_scrollable",
-                                                  BOOLOID, -1, 0);
-               TupleDescInitEntry(tupdesc, (AttrNumber) 6, "creation_time",
-                                                  TIMESTAMPTZOID, -1, 0);
-
-               funcctx->tuple_desc = BlessTupleDesc(tupdesc);
-               MemoryContextSwitchTo(oldcontext);
-       }
-
-       /* stuff done on every call of the function */
-       funcctx = SRF_PERCALL_SETUP();
-       hash_seq = (HASH_SEQ_STATUS *) funcctx->user_fctx;
+       /* check to see if caller supports us returning a tuplestore */
+       if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+               ereport(ERROR,
+                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                errmsg("set-valued function called in context that cannot accept a set")));
+       if (!(rsinfo->allowedModes & SFRM_Materialize))
+               ereport(ERROR,
+                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                errmsg("materialize mode required, but it is not " \
+                                               "allowed in this context")));
+
+       /* need to build tuplestore in query context */
+       per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+       oldcontext = MemoryContextSwitchTo(per_query_ctx);
 
-       /* if the hash table is uninitialized, we're done */
-       if (hash_seq == NULL)
-               SRF_RETURN_DONE(funcctx);
+       /*
+        * build tupdesc for result tuples. This must match the definition of
+        * the pg_cursors view in system_views.sql
+        */
+       tupdesc = CreateTemplateTupleDesc(6, false);
+       TupleDescInitEntry(tupdesc, (AttrNumber) 1, "name",
+                                          TEXTOID, -1, 0);
+       TupleDescInitEntry(tupdesc, (AttrNumber) 2, "statement",
+                                          TEXTOID, -1, 0);
+       TupleDescInitEntry(tupdesc, (AttrNumber) 3, "is_holdable",
+                                          BOOLOID, -1, 0);
+       TupleDescInitEntry(tupdesc, (AttrNumber) 4, "is_binary",
+                                          BOOLOID, -1, 0);
+       TupleDescInitEntry(tupdesc, (AttrNumber) 5, "is_scrollable",
+                                          BOOLOID, -1, 0);
+       TupleDescInitEntry(tupdesc, (AttrNumber) 6, "creation_time",
+                                          TIMESTAMPTZOID, -1, 0);
 
-       /* loop until we find a visible portal or hit the end of the list */
-       while ((hentry = hash_seq_search(hash_seq)) != NULL)
-       {
-               if (hentry->portal->visible)
-                       break;
-       }
+       /*
+        * We put all the tuples into a tuplestore in one scan of the hashtable.
+        * This avoids any issue of the hashtable possibly changing between calls.
+        */
+       tupstore = tuplestore_begin_heap(true, false, work_mem);
 
-       if (hentry)
+       hash_seq_init(&hash_seq, PortalHashTable);
+       while ((hentry = hash_seq_search(&hash_seq)) != NULL)
        {
-               Portal          portal;
-               Datum           result;
+               Portal          portal = hentry->portal;
                HeapTuple       tuple;
                Datum           values[6];
                bool            nulls[6];
 
-               portal = hentry->portal;
+               /* report only "visible" entries */
+               if (!portal->visible)
+                       continue;
+
+               /* generate junk in short-term context */
+               MemoryContextSwitchTo(oldcontext);
+
                MemSet(nulls, 0, sizeof(nulls));
 
                values[0] = DirectFunctionCall1(textin, CStringGetDatum(portal->name));
@@ -892,10 +882,21 @@ pg_cursor(PG_FUNCTION_ARGS)
                values[4] = BoolGetDatum(portal->cursorOptions & CURSOR_OPT_SCROLL);
                values[5] = TimestampTzGetDatum(portal->creation_time);
 
-               tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
-               result = HeapTupleGetDatum(tuple);
-               SRF_RETURN_NEXT(funcctx, result);
+               tuple = heap_form_tuple(tupdesc, values, nulls);
+
+               /* switch to appropriate context while storing the tuple */
+               MemoryContextSwitchTo(per_query_ctx);
+               tuplestore_puttuple(tupstore, tuple);
        }
 
-       SRF_RETURN_DONE(funcctx);
+       /* clean up and return the tuplestore */
+       tuplestore_donestoring(tupstore);
+
+       MemoryContextSwitchTo(oldcontext);
+
+       rsinfo->returnMode = SFRM_Materialize;
+       rsinfo->setResult = tupstore;
+       rsinfo->setDesc = tupdesc;
+
+       return (Datum) 0;
 }
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index cbed243c8a47c0a637193b31c85829aa8681a5f1..e981a6a933bc7fb11af9c3033418141d0d98cd8f 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.161.2.1 2006/12/26 21:37:28 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.161.2.2 2007/04/26 23:24:57 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -390,8 +390,20 @@ typedef struct TupleHashTableData
 
 typedef HASH_SEQ_STATUS TupleHashIterator;
 
-#define ResetTupleHashIterator(htable, iter) \
+/*
+ * Use InitTupleHashIterator/TermTupleHashIterator for a read/write scan.
+ * Use ResetTupleHashIterator if the table can be frozen (in this case no
+ * explicit scan termination is needed).
+ */
+#define InitTupleHashIterator(htable, iter) \
        hash_seq_init(iter, (htable)->hashtab)
+#define TermTupleHashIterator(iter) \
+       hash_seq_term(iter)
+#define ResetTupleHashIterator(htable, iter) \
+       do { \
+               hash_freeze((htable)->hashtab); \
+               hash_seq_init(iter, (htable)->hashtab); \
+       } while (0)
 #define ScanTupleHashTable(iter) \
        ((TupleHashEntry) hash_seq_search(iter))
 
diff --git a/src/include/utils/hsearch.h b/src/include/utils/hsearch.h
index 5582b88ee666f069605ce7a21d6a66a7662c5287..15c95f463915d6ef9ae7528ebb31040c49091f48 100644
--- a/src/include/utils/hsearch.h
+++ b/src/include/utils/hsearch.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/hsearch.h,v 1.45 2006/10/04 00:30:10 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/utils/hsearch.h,v 1.45.2.1 2007/04/26 23:24:57 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -130,9 +130,13 @@ extern void *hash_search_with_hash_value(HTAB *hashp, const void *keyPtr,
 extern long hash_get_num_entries(HTAB *hashp);
 extern void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp);
 extern void *hash_seq_search(HASH_SEQ_STATUS *status);
+extern void hash_seq_term(HASH_SEQ_STATUS *status);
+extern void hash_freeze(HTAB *hashp);
 extern Size hash_estimate_size(long num_entries, Size entrysize);
 extern long hash_select_dirsize(long num_entries);
 extern Size hash_get_shared_size(HASHCTL *info, int flags);
+extern void AtEOXact_HashTables(bool isCommit);
+extern void AtEOSubXact_HashTables(bool isCommit, int nestDepth);
 
 /*
  * prototypes for functions in hashfn.c