git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Improve comments about partitioned hash table freelists.
author: Tom Lane <tgl@sss.pgh.pa.us>
Sat, 22 Jul 2017 22:02:26 +0000 (18:02 -0400)
committer: Tom Lane <tgl@sss.pgh.pa.us>
Sat, 22 Jul 2017 22:02:26 +0000 (18:02 -0400)
While I couldn't find any live bugs in commit 44ca4022f, the comments
seemed pretty far from adequate; in particular it was not made plain that
"borrowing" entries from other freelists is critical for correctness.
Try to improve the commentary.  A couple of very minor code style
tweaks, as well.

Discussion: https://postgr.es/m/10593.1500670709@sss.pgh.pa.us

src/backend/utils/hash/dynahash.c

index ebd55da997e12718b96488df1a3bb8d310da1b81..a58a479ae569be3071588f674e18facbf05696eb 100644 (file)
@@ -15,7 +15,8 @@
  * to hash_create.  This prevents any attempt to split buckets on-the-fly.
  * Therefore, each hash bucket chain operates independently, and no fields
  * of the hash header change after init except nentries and freeList.
- * A partitioned table uses spinlocks to guard changes of those fields.
+ * (A partitioned table uses multiple copies of those fields, guarded by
+ * spinlocks, for additional concurrency.)
  * This lets any subset of the hash buckets be treated as a separately
  * lockable partition.  We expect callers to use the low-order bits of a
  * lookup key's hash value as a partition number --- this will work because
@@ -121,15 +122,27 @@ typedef HASHELEMENT *HASHBUCKET;
 typedef HASHBUCKET *HASHSEGMENT;
 
 /*
- * Using array of FreeListData instead of separate arrays of mutexes, nentries
- * and freeLists prevents, at least partially, sharing one cache line between
- * different mutexes (see below).
+ * Per-freelist data.
+ *
+ * In a partitioned hash table, each freelist is associated with a specific
+ * set of hashcodes, as determined by the FREELIST_IDX() macro below.
+ * nentries tracks the number of live hashtable entries having those hashcodes
+ * (NOT the number of entries in the freelist, as you might expect).
+ *
+ * The coverage of a freelist might be more or less than one partition, so it
+ * needs its own lock rather than relying on caller locking.  Relying on that
+ * wouldn't work even if the coverage was the same, because of the occasional
+ * need to "borrow" entries from another freelist; see get_hash_entry().
+ *
+ * Using an array of FreeListData instead of separate arrays of mutexes,
+ * nentries and freeLists helps to reduce sharing of cache lines between
+ * different mutexes.
  */
 typedef struct
 {
-       slock_t         mutex;                  /* spinlock */
-       long            nentries;               /* number of entries */
-       HASHELEMENT *freeList;          /* list of free elements */
+       slock_t         mutex;                  /* spinlock for this freelist */
+       long            nentries;               /* number of entries in associated buckets */
+       HASHELEMENT *freeList;          /* chain of free elements */
 } FreeListData;
 
 /*
@@ -143,12 +156,14 @@ typedef struct
 struct HASHHDR
 {
        /*
-        * The freelist can become a point of contention on high-concurrency hash
-        * tables, so we use an array of freelist, each with its own mutex and
-        * nentries count, instead of just a single one.
+        * The freelist can become a point of contention in high-concurrency hash
+        * tables, so we use an array of freelists, each with its own mutex and
+        * nentries count, instead of just a single one.  Although the freelists
+        * normally operate independently, we will scavenge entries from freelists
+        * other than a hashcode's default freelist when necessary.
         *
-        * If hash table is not partitioned only freeList[0] is used and spinlocks
-        * are not used at all.
+        * If the hash table is not partitioned, only freeList[0] is used and its
+        * spinlock is not used at all; callers' locking is assumed sufficient.
         */
        FreeListData freeList[NUM_FREELISTS];
 
@@ -184,7 +199,7 @@ struct HASHHDR
 #define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)
 
 #define FREELIST_IDX(hctl, hashcode) \
-       (IS_PARTITIONED(hctl) ? hashcode % NUM_FREELISTS : 0)
+       (IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
 
 /*
  * Top control structure for a hashtable --- in a shared table, each backend
@@ -506,8 +521,8 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
                                        nelem_alloc_first;
 
                /*
-                * If hash table is partitioned all freeLists have equal number of
-                * elements. Otherwise only freeList[0] is used.
+                * If hash table is partitioned, give each freelist an equal share of
+                * the initial allocation.  Otherwise only freeList[0] is used.
                 */
                if (IS_PARTITIONED(hashp->hctl))
                        freelist_partitions = NUM_FREELISTS;
@@ -515,10 +530,13 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
                        freelist_partitions = 1;
 
                nelem_alloc = nelem / freelist_partitions;
-               if (nelem_alloc == 0)
+               if (nelem_alloc <= 0)
                        nelem_alloc = 1;
 
-               /* Make sure all memory will be used */
+               /*
+                * Make sure we'll allocate all the requested elements; freeList[0]
+                * gets the excess if the request isn't divisible by NUM_FREELISTS.
+                */
                if (nelem_alloc * freelist_partitions < nelem)
                        nelem_alloc_first =
                                nelem - nelem_alloc * (freelist_partitions - 1);
@@ -620,7 +638,7 @@ init_htab(HTAB *hashp, long nelem)
        int                     i;
 
        /*
-        * initialize mutex if it's a partitioned table
+        * initialize mutexes if it's a partitioned table
         */
        if (IS_PARTITIONED(hctl))
                for (i = 0; i < NUM_FREELISTS; i++)
@@ -902,6 +920,7 @@ hash_search_with_hash_value(HTAB *hashp,
                                                        bool *foundPtr)
 {
        HASHHDR    *hctl = hashp->hctl;
+       int                     freelist_idx = FREELIST_IDX(hctl, hashvalue);
        Size            keysize;
        uint32          bucket;
        long            segment_num;
@@ -910,7 +929,6 @@ hash_search_with_hash_value(HTAB *hashp,
        HASHBUCKET      currBucket;
        HASHBUCKET *prevBucketPtr;
        HashCompareFunc match;
-       int                     freelist_idx = FREELIST_IDX(hctl, hashvalue);
 
 #if HASH_STATISTICS
        hash_accesses++;
@@ -993,13 +1011,14 @@ hash_search_with_hash_value(HTAB *hashp,
                                if (IS_PARTITIONED(hctl))
                                        SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));
 
+                               /* delete the record from the appropriate nentries counter. */
                                Assert(hctl->freeList[freelist_idx].nentries > 0);
                                hctl->freeList[freelist_idx].nentries--;
 
                                /* remove record from hash bucket's chain. */
                                *prevBucketPtr = currBucket->link;
 
-                               /* add the record to the freelist for this table.  */
+                               /* add the record to the appropriate freelist. */
                                currBucket->link = hctl->freeList[freelist_idx].freeList;
                                hctl->freeList[freelist_idx].freeList = currBucket;
 
@@ -1220,14 +1239,15 @@ hash_update_hash_key(HTAB *hashp,
 }
 
 /*
- * create a new entry if possible
+ * Allocate a new hashtable entry if possible; return NULL if out of memory.
+ * (Or, if the underlying space allocator throws error for out-of-memory,
+ * we won't return at all.)
  */
 static HASHBUCKET
 get_hash_entry(HTAB *hashp, int freelist_idx)
 {
        HASHHDR    *hctl = hashp->hctl;
        HASHBUCKET      newElement;
-       int                     borrow_from_idx;
 
        for (;;)
        {
@@ -1244,19 +1264,32 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
                if (IS_PARTITIONED(hctl))
                        SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
-               /* no free elements.  allocate another chunk of buckets */
+               /*
+                * No free elements in this freelist.  In a partitioned table, there
+                * might be entries in other freelists, but to reduce contention we
+                * prefer to first try to get another chunk of buckets from the main
+                * shmem allocator.  If that fails, though, we *MUST* root through all
+                * the other freelists before giving up.  There are multiple callers
+                * that assume that they can allocate every element in the initially
+                * requested table size, or that deleting an element guarantees they
+                * can insert a new element, even if shared memory is entirely full.
+                * Failing because the needed element is in a different freelist is
+                * not acceptable.
+                */
                if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
                {
+                       int                     borrow_from_idx;
+
                        if (!IS_PARTITIONED(hctl))
                                return NULL;    /* out of memory */
 
-                       /* try to borrow element from another partition */
+                       /* try to borrow element from another freelist */
                        borrow_from_idx = freelist_idx;
                        for (;;)
                        {
                                borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
                                if (borrow_from_idx == freelist_idx)
-                                       break;
+                                       break;          /* examined all freelists, fail */
 
                                SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
                                newElement = hctl->freeList[borrow_from_idx].freeList;
@@ -1266,17 +1299,19 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
                                        hctl->freeList[borrow_from_idx].freeList = newElement->link;
                                        SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
 
+                                       /* careful: count the new element in its proper freelist */
                                        SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
                                        hctl->freeList[freelist_idx].nentries++;
                                        SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
-                                       break;
+                                       return newElement;
                                }
 
                                SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
                        }
 
-                       return newElement;
+                       /* no elements available to borrow either, so out of memory */
+                       return NULL;
                }
        }
 
@@ -1300,15 +1335,15 @@ hash_get_num_entries(HTAB *hashp)
        long            sum = hashp->hctl->freeList[0].nentries;
 
        /*
-        * We currently don't bother with the mutex; it's only sensible to call
-        * this function if you've got lock on all partitions of the table.
+        * We currently don't bother with acquiring the mutexes; it's only
+        * sensible to call this function if you've got lock on all partitions of
+        * the table.
         */
-
-       if (!IS_PARTITIONED(hashp->hctl))
-               return sum;
-
-       for (i = 1; i < NUM_FREELISTS; i++)
-               sum += hashp->hctl->freeList[i].nentries;
+       if (IS_PARTITIONED(hashp->hctl))
+       {
+               for (i = 1; i < NUM_FREELISTS; i++)
+                       sum += hashp->hctl->freeList[i].nentries;
+       }
 
        return sum;
 }