* exactly those that will be probed most often. Therefore, the "average"
* bucket size for costing purposes should really be taken as something close
* to the "worst case" bucket size. We try to estimate this by adjusting the
- * fraction if there are too few distinct data values, and then scaling up
- * by the ratio of the most common value's frequency to the average frequency.
+ * fraction if there are too few distinct data values, and then clamping to
+ * at least the bucket size implied by the most common value's frequency.
*
* If no statistics are available, use a default estimate of 0.1. This will
* discourage use of a hash rather strongly if the inner relation is large,
{
VariableStatData vardata;
double estfract,
- ndistinct,
- stanullfrac,
- avgfreq;
+ ndistinct;
bool isdefault;
AttStatsSlot sslot;
return;
}
- /* Get fraction that are null */
- if (HeapTupleIsValid(vardata.statsTuple))
- {
- Form_pg_statistic stats;
-
- stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
- stanullfrac = stats->stanullfrac;
- }
- else
- stanullfrac = 0.0;
-
- /* Compute avg freq of all distinct data values in raw relation */
- avgfreq = (1.0 - stanullfrac) / ndistinct;
-
/*
* Adjust ndistinct to account for restriction clauses. Observe we are
* assuming that the data distribution is affected uniformly by the
estfract = 1.0 / ndistinct;
/*
- * Adjust estimated bucketsize upward to account for skewed distribution.
- */
- if (avgfreq > 0.0 && *mcv_freq > avgfreq)
- estfract *= *mcv_freq / avgfreq;
-
- /*
- * Clamp bucketsize to sane range (the above adjustment could easily
- * produce an out-of-range result). We set the lower bound a little above
- * zero, since zero isn't a very sane result.
+ * Clamp the bucketsize fraction to be not less than the MCV frequency,
+ * since the bucket that receives the most common value will hold at least
+ * that fraction of the rows. This has no effect if *mcv_freq is still zero.
*/
- if (estfract < 1.0e-6)
- estfract = 1.0e-6;
- else if (estfract > 1.0)
- estfract = 1.0;
+ estfract = Max(estfract, *mcv_freq);
*bucketsize_frac = (Selectivity) estfract;
reset enable_nestloop;
--
+-- test that estimate_hash_bucket_stats estimates correctly with skewed data
+-- (we should choose to hash the filtered table)
+--
+create temp table skewedtable (val int not null, filt int not null);
+insert into skewedtable
+select
+ case when g <= 100 then 0 else (g % 100) + 1 end,
+ g % 10
+from generate_series(1, 1000) g;
+analyze skewedtable;
+explain (costs off)
+select * from skewedtable t1 join skewedtable t2 on t1.val = t2.val
+where t1.filt = 5;
+ QUERY PLAN
+----------------------------------------
+ Hash Join
+ Hash Cond: (t2.val = t1.val)
+ -> Seq Scan on skewedtable t2
+ -> Hash
+ -> Seq Scan on skewedtable t1
+ Filter: (filt = 5)
+(6 rows)
+
+drop table skewedtable;
+--
-- basic semijoin and antijoin recognition tests
--
explain (costs off)
reset enable_nestloop;
+--
+-- test that estimate_hash_bucket_stats estimates correctly with skewed data
+-- (we should choose to hash the filtered table)
+--
+
+create temp table skewedtable (val int not null, filt int not null);
+insert into skewedtable
+select
+ case when g <= 100 then 0 else (g % 100) + 1 end,
+ g % 10
+from generate_series(1, 1000) g;
+analyze skewedtable;
+
+explain (costs off)
+select * from skewedtable t1 join skewedtable t2 on t1.val = t2.val
+where t1.filt = 5;
+
+drop table skewedtable;
+
--
-- basic semijoin and antijoin recognition tests
--