*/
#include "postgres.h"
+#include "access/htup_details.h"
#include "access/parallel.h"
#include "common/hashfn.h"
#include "executor/executor.h"
MemoryContextReset(hashtable->tuplescxt);
}
+/*
+ * Estimate how much memory a TupleHashTable will consume for nentries
+ * entries, assuming tuples of average data width tupleWidth plus
+ * additionalsize bytes of caller-requested space per entry.
+ *
+ * Returns SIZE_MAX if the total would overflow size_t.
+ *
+ * nentries is a "double" since the intended caller is the planner, which
+ * works with double rowcount estimates; the clamp to an integer has to
+ * happen somewhere, and here is as good a place as any.  The value must
+ * not be NaN or negative, else the result will be garbage.
+ */
+Size
+EstimateTupleHashTableSpace(double nentries,
+							Size tupleWidth,
+							Size additionalsize)
+{
+	Size		bucket_bytes;
+	double		entry_bytes;
+
+	/* Ask simplehash how much memory its bucket array will need */
+	bucket_bytes = tuplehash_estimate_space(nentries);
+
+	/* The SIZE_MAX sentinel means the bucket array alone is too big */
+	if (bucket_bytes >= SIZE_MAX)
+		return bucket_bytes;
+
+	/*
+	 * Now add the space consumed by the hashed tuples plus their additional
+	 * data.  nentries is expected to be somewhat sane, so computing this
+	 * product should be safe.
+	 *
+	 * The tuples are presumed to live in a BumpContext, which imposes no
+	 * per-tuple overhead.
+	 *
+	 * (With MEMORY_CONTEXT_CHECKING enabled, bump.c does prepend a
+	 * MemoryChunk header to each tuple, making this an underestimate.
+	 * But it seems undesirable for debug builds to make different planning
+	 * choices than production builds, so we assume the production behavior
+	 * always.)
+	 */
+	entry_bytes = nentries * (MAXALIGN(SizeofMinimalTupleHeader) +
+							  MAXALIGN(tupleWidth) +
+							  MAXALIGN(additionalsize));
+
+	/*
+	 * Guard against size_t overflow.  This test is subtler than it looks:
+	 * on 64-bit hardware SIZE_MAX has no exact double representation, so
+	 * we cast it explicitly (silencing inexact-conversion warnings) and
+	 * rely on any double value comparing strictly below "(double) SIZE_MAX"
+	 * converting to a representable size_t.
+	 */
+	if (bucket_bytes + entry_bytes >= (double) SIZE_MAX)
+		return SIZE_MAX;
+
+	/* Miscellaneous bookkeeping overhead is intentionally not counted */
+	return (Size) (bucket_bytes + entry_bytes);
+}
+
/*
* Find or create a hashtable entry for the tuple group containing the
* given tuple. The tuple must be the same type as the hashtable entries.
false);
}
+/* Planner support routine to estimate space needed for hash table */
+Size
+EstimateSetOpHashTableSpace(double nentries, Size tupleWidth)
+{
+	/* each entry carries a SetOpStatePerGroupData as its extra payload */
+	Size		pergroupsize = sizeof(SetOpStatePerGroupData);
+
+	return EstimateTupleHashTableSpace(nentries, tupleWidth, pergroupsize);
+}
+
/*
* We've completed processing a tuple group. Decide how many copies (if any)
* of its representative row to emit, and store the count into numOutput.
node->tab_hash_funcs,
node->tab_collations,
nbuckets,
- 0,
+ 0, /* no additional data */
node->planstate->state->es_query_cxt,
node->tuplesContext,
innerecontext->ecxt_per_tuple_memory,
node->tab_hash_funcs,
node->tab_collations,
nbuckets,
- 0,
+ 0, /* no additional data */
node->planstate->state->es_query_cxt,
node->tuplesContext,
innerecontext->ecxt_per_tuple_memory,
MemoryContextSwitchTo(oldcontext);
}
+/* Planner support routine to estimate space needed for hash table(s) */
+Size
+EstimateSubplanHashTableSpace(double nentries,
+							  Size tupleWidth,
+							  bool unknownEqFalse)
+{
+	Size		mainspace;
+	Size		nullspace;
+
+	/* Space for the primary hashtable (no additional per-entry data) */
+	mainspace = EstimateTupleHashTableSpace(nentries, tupleWidth, 0);
+
+	/* If that already overflowed, no point in continuing */
+	if (mainspace >= SIZE_MAX)
+		return mainspace;
+
+	/* A hashnulls table is only required when unknownEqFalse is not set */
+	if (unknownEqFalse)
+		return mainspace;
+
+	/*
+	 * Scale down the rowcount estimate the same way buildSubPlanHash will,
+	 * minus its special case for a single hash column.  (Supplying the
+	 * column count here would be notationally painful for our caller, and
+	 * this table contributes relatively little to the total estimate
+	 * anyway.)
+	 */
+	nentries /= 16;
+	if (nentries < 1)
+		nentries = 1;
+
+	/*
+	 * Arguably tupleWidth could be reduced as well, but we also aren't
+	 * charging for the space taken by the tuples' null bitmaps, so leave
+	 * it alone for now.
+	 */
+	nullspace = EstimateTupleHashTableSpace(nentries, tupleWidth, 0);
+
+	/* Avoid overflowing the sum of the two estimates */
+	if (nullspace >= SIZE_MAX - mainspace)
+		return SIZE_MAX;
+
+	return mainspace + nullspace;
+}
+
/*
* execTuplesUnequal
* Return true if two tuples are definitely unequal in the indicated
#include "catalog/pg_operator.h"
#include "catalog/pg_type.h"
#include "executor/executor.h"
+#include "executor/nodeSubplan.h"
#include "miscadmin.h"
#include "nodes/makefuncs.h"
#include "nodes/nodeFuncs.h"
List *subst_nodes);
static Node *convert_testexpr_mutator(Node *node,
convert_testexpr_context *context);
-static bool subplan_is_hashable(Plan *plan);
-static bool subpath_is_hashable(Path *path);
+static bool subplan_is_hashable(Plan *plan, bool unknownEqFalse);
+static bool subpath_is_hashable(Path *path, bool unknownEqFalse);
static bool testexpr_is_hashable(Node *testexpr, List *param_ids);
static bool test_opexpr_is_hashable(OpExpr *testexpr, List *param_ids);
static bool hash_ok_operator(OpExpr *expr);
best_path = final_rel->cheapest_total_path;
/* Now we can check if it'll fit in hash_mem */
- if (subpath_is_hashable(best_path))
+ if (subpath_is_hashable(best_path, true))
{
SubPlan *hashplan;
AlternativeSubPlan *asplan;
*/
if (subLinkType == ANY_SUBLINK &&
splan->parParam == NIL &&
- subplan_is_hashable(plan) &&
+ subplan_is_hashable(plan, unknownEqFalse) &&
testexpr_is_hashable(splan->testexpr, splan->paramIds))
splan->useHashTable = true;
* is suitable for hashing. We only look at the subquery itself.
*/
static bool
-subplan_is_hashable(Plan *plan)
+subplan_is_hashable(Plan *plan, bool unknownEqFalse)
{
- double subquery_size;
+ Size hashtablesize;
/*
- * The estimated size of the subquery result must fit in hash_mem. (Note:
- * we use heap tuple overhead here even though the tuples will actually be
- * stored as MinimalTuples; this provides some fudge factor for hashtable
- * overhead.)
+ * The estimated size of the hashtable holding the subquery result must
+ * fit in hash_mem. (Note: reject on equality, to ensure that an estimate
+ * of SIZE_MAX disables hashing regardless of the hash_mem limit.)
*/
- subquery_size = plan->plan_rows *
- (MAXALIGN(plan->plan_width) + MAXALIGN(SizeofHeapTupleHeader));
- if (subquery_size > get_hash_memory_limit())
+ hashtablesize = EstimateSubplanHashTableSpace(plan->plan_rows,
+ plan->plan_width,
+ unknownEqFalse);
+ if (hashtablesize >= get_hash_memory_limit())
return false;
return true;
* Identical to subplan_is_hashable, but work from a Path for the subplan.
*/
static bool
-subpath_is_hashable(Path *path)
+subpath_is_hashable(Path *path, bool unknownEqFalse)
{
- double subquery_size;
+ Size hashtablesize;
/*
- * The estimated size of the subquery result must fit in hash_mem. (Note:
- * we use heap tuple overhead here even though the tuples will actually be
- * stored as MinimalTuples; this provides some fudge factor for hashtable
- * overhead.)
+ * The estimated size of the hashtable holding the subquery result must
+ * fit in hash_mem. (Note: reject on equality, to ensure that an estimate
+ * of SIZE_MAX disables hashing regardless of the hash_mem limit.)
*/
- subquery_size = path->rows *
- (MAXALIGN(path->pathtarget->width) + MAXALIGN(SizeofHeapTupleHeader));
- if (subquery_size > get_hash_memory_limit())
+ hashtablesize = EstimateSubplanHashTableSpace(path->rows,
+ path->pathtarget->width,
+ unknownEqFalse);
+ if (hashtablesize >= get_hash_memory_limit())
return false;
return true;
#include <math.h>
#include "access/htup_details.h"
+#include "executor/nodeSetOp.h"
#include "foreign/fdwapi.h"
#include "miscadmin.h"
#include "nodes/extensible.h"
}
else
{
- Size hashentrysize;
+ Size hashtablesize;
/*
* In hashed mode, we must read all the input before we can emit
/*
* Also disable if it doesn't look like the hashtable will fit into
- * hash_mem.
+ * hash_mem. (Note: reject on equality, to ensure that an estimate of
+ * SIZE_MAX disables hashing regardless of the hash_mem limit.)
*/
- hashentrysize = MAXALIGN(leftpath->pathtarget->width) +
- MAXALIGN(SizeofMinimalTupleHeader);
- if (hashentrysize * numGroups > get_hash_memory_limit())
+ hashtablesize = EstimateSetOpHashTableSpace(numGroups,
+ leftpath->pathtarget->width);
+ if (hashtablesize >= get_hash_memory_limit())
pathnode->path.disabled_nodes++;
}
pathnode->path.rows = outputRows;
ExprState *eqcomp,
ExprState *hashexpr);
extern void ResetTupleHashTable(TupleHashTable hashtable);
+extern Size EstimateTupleHashTableSpace(double nentries,
+ Size tupleWidth,
+ Size additionalsize);
#ifndef FRONTEND
/*
extern void ExecEndSetOp(SetOpState *node);
extern void ExecReScanSetOp(SetOpState *node);
+extern Size EstimateSetOpHashTableSpace(double nentries, Size tupleWidth);
+
#endif /* NODESETOP_H */
extern Datum ExecSubPlan(SubPlanState *node, ExprContext *econtext, bool *isNull);
+extern Size EstimateSubplanHashTableSpace(double nentries,
+ Size tupleWidth,
+ bool unknownEqFalse);
+
extern void ExecReScanSetParamPlan(SubPlanState *node, PlanState *parent);
extern void ExecSetParamPlan(SubPlanState *node, ExprContext *econtext);
#define SH_ITERATE SH_MAKE_NAME(iterate)
#define SH_ALLOCATE SH_MAKE_NAME(allocate)
#define SH_FREE SH_MAKE_NAME(free)
+#define SH_ESTIMATE_SPACE SH_MAKE_NAME(estimate_space)
#define SH_STAT SH_MAKE_NAME(stat)
/* internal helper functions (no externally visible prototypes) */
/* <element> *<prefix>_iterate(<prefix>_hash *tb, <prefix>_iterator *iter) */
SH_SCOPE SH_ELEMENT_TYPE *SH_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter);
-/* void <prefix>_stat(<prefix>_hash *tb */
+/* size_t <prefix>_estimate_space(double nentries) */
+SH_SCOPE size_t SH_ESTIMATE_SPACE(double nentries);
+
+/* void <prefix>_stat(<prefix>_hash *tb) */
SH_SCOPE void SH_STAT(SH_TYPE * tb);
#endif /* SH_DECLARE */
/*
* Compute allocation size for hashtable. Result can be passed to
- * SH_UPDATE_PARAMETERS.
+ * SH_UPDATE_PARAMETERS. (Keep SH_ESTIMATE_SPACE in sync with this!)
*/
static inline uint64
SH_COMPUTE_SIZE(uint64 newsize)
return NULL;
}
+/*
+ * Estimate the memory required for a hashtable holding nentries entries;
+ * returns SIZE_MAX if that many entries cannot be supported.
+ *
+ * nentries is a "double" because the expected caller is the planner, which
+ * deals in double rowcount estimates; the clamp to an integer has to occur
+ * somewhere, so it might as well be here.  The value must not be NaN or
+ * negative, else the result will be garbage.
+ */
+SH_SCOPE size_t
+SH_ESTIMATE_SPACE(double nentries)
+{
+	uint64		nbuckets;
+	uint64		data_bytes;
+
+	/* inflate the request by SH_FILLFACTOR, mirroring SH_CREATE */
+	nentries = nentries / SH_FILLFACTOR;
+
+	/* refuse anything that would reach or exceed SH_MAX_SIZE entries */
+	if (nentries >= SH_MAX_SIZE)
+		return SIZE_MAX;
+
+	/* now small enough that converting to uint64 is safe */
+	nbuckets = (uint64) nentries;
+
+	/* zero-sized hashes would complicate matters, so enforce a floor */
+	nbuckets = Max(nbuckets, 2);
+
+	/* bucketing needs a power-of-2 table size, so round up to one */
+	nbuckets = pg_nextpower2_64(nbuckets);
+
+	/* bytes needed for the ->data array */
+	data_bytes = ((uint64) sizeof(SH_ELEMENT_TYPE)) * nbuckets;
+
+	/* verify that allocating ->data is possible on this platform */
+	if (data_bytes >= SIZE_MAX / 2)
+		return SIZE_MAX;
+
+	return (size_t) data_bytes + sizeof(SH_TYPE);
+}
+
/*
* Report some statistics about the state of the hashtable. For
* debugging/profiling purposes only.
#undef SH_ITERATE
#undef SH_ALLOCATE
#undef SH_FREE
+#undef SH_ESTIMATE_SPACE
#undef SH_STAT
/* internal function names */