From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Wed, 4 Mar 2026 20:33:15 +0000 (-0500)
Subject: Fix estimate_hash_bucket_stats's correction for skewed data.
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e6a1d8f5acbc661d3165d92d0cc1ff6b8f997f16;p=thirdparty%2Fpostgresql.git

Fix estimate_hash_bucket_stats's correction for skewed data.

The previous idea was "scale up the bucketsize estimate by the ratio
of the MCV's frequency to the average value's frequency".  But we
should have been suspicious of that plan, since it frequently led to
impossible (> 1) values which we had to apply an ad-hoc clamp to.
Joel Jacobson demonstrated that it sometimes leads to making the
wrong choice about which side of the hash join should be inner.

Instead, drop the whole business of estimating average frequency, and
just clamp the bucketsize estimate to be at least the MCV's frequency.
This corresponds to the bucket size we'd get if only the MCV appears
in a bucket, and the MCV's frequency is not affected by the
WHERE-clause filters.  (We were already making the latter assumption.)
This also matches the coding used since 4867d7f62 in the case where
only a default ndistinct estimate is available.

Interestingly, this change affects no existing regression test cases.
Add one to demonstrate that it helps pick the smaller table to be
hashed when the MCV is common enough to affect the results.

This leaves estimate_hash_bucket_stats not considering the effects of
null join keys at all, which we should probably improve.  However,
I have a different patch in the queue that will change the executor's
handling of null join keys, so it seems appropriate to wait till
that's in before doing anything more here.

Reported-by: Joel Jacobson <joel@compiler.org>
Author: Tom Lane <tgl@sss.pgh.pa.us>
Reviewed-by: Joel Jacobson <joel@compiler.org>
Discussion: https://postgr.es/m/341b723c-da45-4058-9446-1514dedb17c1@app.fastmail.com
---

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index dd7e11c0ca5..d4da0e8dea9 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -4373,8 +4373,8 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner,
  * exactly those that will be probed most often.  Therefore, the "average"
  * bucket size for costing purposes should really be taken as something close
  * to the "worst case" bucket size.  We try to estimate this by adjusting the
- * fraction if there are too few distinct data values, and then scaling up
- * by the ratio of the most common value's frequency to the average frequency.
+ * fraction if there are too few distinct data values, and then clamping to
+ * at least the bucket size implied by the most common value's frequency.
  *
  * If no statistics are available, use a default estimate of 0.1.  This will
  * discourage use of a hash rather strongly if the inner relation is large,
@@ -4394,9 +4394,7 @@ estimate_hash_bucket_stats(PlannerInfo *root, Node *hashkey, double nbuckets,
 {
 	VariableStatData vardata;
 	double		estfract,
-				ndistinct,
-				stanullfrac,
-				avgfreq;
+				ndistinct;
 	bool		isdefault;
 	AttStatsSlot sslot;
 
@@ -4446,20 +4444,6 @@ estimate_hash_bucket_stats(PlannerInfo *root, Node *hashkey, double nbuckets,
 		return;
 	}
 
-	/* Get fraction that are null */
-	if (HeapTupleIsValid(vardata.statsTuple))
-	{
-		Form_pg_statistic stats;
-
-		stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
-		stanullfrac = stats->stanullfrac;
-	}
-	else
-		stanullfrac = 0.0;
-
-	/* Compute avg freq of all distinct data values in raw relation */
-	avgfreq = (1.0 - stanullfrac) / ndistinct;
-
 	/*
 	 * Adjust ndistinct to account for restriction clauses.  Observe we are
 	 * assuming that the data distribution is affected uniformly by the
@@ -4485,20 +4469,11 @@ estimate_hash_bucket_stats(PlannerInfo *root, Node *hashkey, double nbuckets,
 		estfract = 1.0 / ndistinct;
 
 	/*
-	 * Adjust estimated bucketsize upward to account for skewed distribution.
-	 */
-	if (avgfreq > 0.0 && *mcv_freq > avgfreq)
-		estfract *= *mcv_freq / avgfreq;
-
-	/*
-	 * Clamp bucketsize to sane range (the above adjustment could easily
-	 * produce an out-of-range result).  We set the lower bound a little above
-	 * zero, since zero isn't a very sane result.
+	 * Clamp the bucketsize fraction to be not less than the MCV frequency,
+	 * since whichever bucket the MCV values end up in will have at least that
+	 * size.  This has no effect if *mcv_freq is still zero.
 	 */
-	if (estfract < 1.0e-6)
-		estfract = 1.0e-6;
-	else if (estfract > 1.0)
-		estfract = 1.0;
+	estfract = Max(estfract, *mcv_freq);
 
 	*bucketsize_frac = (Selectivity) estfract;
 
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index 41521f275c8..ea3dabff77c 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -3192,6 +3192,31 @@ ORDER BY 1;
 
 reset enable_nestloop;
 --
+-- test that estimate_hash_bucket_stats estimates correctly with skewed data
+-- (we should choose to hash the filtered table)
+--
+create temp table skewedtable (val int not null, filt int not null);
+insert into skewedtable
+select
+    case when g <= 100 then 0 else (g % 100) + 1 end,
+    g % 10
+from generate_series(1, 1000) g;
+analyze skewedtable;
+explain (costs off)
+select * from skewedtable t1 join skewedtable t2 on t1.val = t2.val
+where t1.filt = 5;
+               QUERY PLAN               
+----------------------------------------
+ Hash Join
+   Hash Cond: (t2.val = t1.val)
+   ->  Seq Scan on skewedtable t2
+   ->  Hash
+         ->  Seq Scan on skewedtable t1
+               Filter: (filt = 5)
+(6 rows)
+
+drop table skewedtable;
+--
 -- basic semijoin and antijoin recognition tests
 --
 explain (costs off)
diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql
index 4acd2512004..ad90c326c00 100644
--- a/src/test/regress/sql/join.sql
+++ b/src/test/regress/sql/join.sql
@@ -834,6 +834,25 @@ ORDER BY 1;
 
 reset enable_nestloop;
 
+--
+-- test that estimate_hash_bucket_stats estimates correctly with skewed data
+-- (we should choose to hash the filtered table)
+--
+
+create temp table skewedtable (val int not null, filt int not null);
+insert into skewedtable
+select
+    case when g <= 100 then 0 else (g % 100) + 1 end,
+    g % 10
+from generate_series(1, 1000) g;
+analyze skewedtable;
+
+explain (costs off)
+select * from skewedtable t1 join skewedtable t2 on t1.val = t2.val
+where t1.filt = 5;
+
+drop table skewedtable;
+
 --
 -- basic semijoin and antijoin recognition tests
 --