From a1a9120c797d689be1dafd6fac8d956e3f667ba6 Mon Sep 17 00:00:00 2001 From: Richard Guo Date: Thu, 2 Jan 2025 18:02:02 +0900 Subject: [PATCH] Ignore nullingrels when looking up statistics When looking up statistical data about an expression, we do not need to concern ourselves with the outer joins that could null the Vars/PHVs contained in the expression. Accounting for nullingrels in the expression could cause estimate_num_groups to count the same Var multiple times if it's marked with different nullingrels. This is incorrect, and could lead to "ERROR: corrupt MVNDistinct entry" when searching for multivariate n-distinct. Furthermore, the nullingrels could prevent us from matching an expression to expressional index columns or to the expressions in extended statistics, leading to inaccurate estimates. To fix, strip out all the nullingrels from the expression before we look up statistical data about it. There is one ensuing plan change in the regression tests, but it looks reasonable and does not compromise its original purpose. This patch could result in plan changes, but it fixes an actual bug, so back-patch to v16 where the outer-join-aware-Var infrastructure was introduced. Author: Richard Guo Discussion: https://postgr.es/m/CAMbWs4-2Z4k+nFTiZe0Qbu5n8juUWenDAtMzi98bAZQtwHx0-w@mail.gmail.com --- src/backend/utils/adt/selfuncs.c | 28 +++++++++++++++++++++++++--- src/test/regress/expected/join.out | 26 ++++++++++++++++++++++++-- src/test/regress/sql/join.sql | 13 +++++++++++++ 3 files changed, 62 insertions(+), 5 deletions(-) diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index c4fcd0076ea..771781e28ca 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -120,6 +120,7 @@ #include "optimizer/plancat.h" #include "parser/parse_clause.h" #include "parser/parsetree.h" +#include "rewrite/rewriteManip.h" #include "statistics/statistics.h" #include "storage/bufmgr.h" #include "utils/acl.h" @@ -3273,6 +3274,15 @@ add_unique_group_var(PlannerInfo *root, List *varinfos, ndistinct = get_variable_numdistinct(vardata, &isdefault); + /* + * The nullingrels bits within the var could cause the same var to be + * counted multiple times if it's marked with different nullingrels. They + * could also prevent us from matching the var to the expressions in + * extended statistics (see estimate_multivariate_ndistinct). So strip + * them out first. + */ + var = remove_nulling_relids(var, root->outer_join_rels, NULL); + foreach(lc, varinfos) { varinfo = (GroupVarInfo *) lfirst(lc); @@ -4980,6 +4990,7 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid, { Node *basenode; Relids varnos; + Relids basevarnos; RelOptInfo *onerel; /* Make sure we don't return dangling pointers in vardata */ @@ -5021,10 +5032,11 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid, * relation are considered "real" vars. */ varnos = pull_varnos(root, basenode); + basevarnos = bms_difference(varnos, root->outer_join_rels); onerel = NULL; - switch (bms_membership(varnos)) + switch (bms_membership(basevarnos)) { case BMS_EMPTY_SET: /* No Vars at all ... must be pseudo-constant clause */ @@ -5033,7 +5045,7 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid, if (varRelid == 0 || bms_is_member(varRelid, varnos)) { onerel = find_base_rel(root, - (varRelid ? varRelid : bms_singleton_member(varnos))); + (varRelid ? varRelid : bms_singleton_member(basevarnos))); vardata->rel = onerel; node = basenode; /* strip any relabeling */ } @@ -5057,7 +5069,7 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid, break; } - bms_free(varnos); + bms_free(basevarnos); vardata->var = node; vardata->atttype = exprType(node); @@ -5082,6 +5094,14 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid, ListCell *slist; Oid userid; + /* + * The nullingrels bits within the expression could prevent us from + * matching it to expressional index columns or to the expressions in + * extended statistics. So strip them out first. + */ + if (bms_overlap(varnos, root->outer_join_rels)) + node = remove_nulling_relids(node, root->outer_join_rels, NULL); + /* * Determine the user ID to use for privilege checks: either * onerel->userid if it's set (e.g., in case we're accessing the table @@ -5352,6 +5372,8 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid, } } } + + bms_free(varnos); } /* diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 9af8d61a732..84e35981ed8 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -2517,10 +2517,11 @@ where t1.f1 = coalesce(t2.f1, 1); -> Materialize -> Seq Scan on int4_tbl t2 Filter: (f1 > 1) - -> Seq Scan on int4_tbl t3 + -> Materialize + -> Seq Scan on int4_tbl t3 -> Materialize -> Seq Scan on int4_tbl t4 -(13 rows) +(14 rows) explain (costs off) select * from int4_tbl t1 @@ -7981,3 +7982,24 @@ where exists (select 1 from j3 (13 rows) drop table j3; +-- Test that we do not account for nullingrels when looking up statistics +CREATE TABLE group_tbl (a INT, b INT); +INSERT INTO group_tbl SELECT 1, 1; +CREATE STATISTICS group_tbl_stat (ndistinct) ON a, b FROM group_tbl; +ANALYZE group_tbl; +EXPLAIN (COSTS OFF) +SELECT 1 FROM group_tbl t1 + LEFT JOIN (SELECT a c1, COALESCE(a) c2 FROM group_tbl t2) s ON TRUE +GROUP BY s.c1, s.c2; + QUERY PLAN +-------------------------------------------- + Group + Group Key: t2.a, (COALESCE(t2.a)) + -> Sort + Sort Key: t2.a, (COALESCE(t2.a)) + -> Nested Loop Left Join + -> Seq Scan on group_tbl t1 + -> Seq Scan on group_tbl t2 +(7 rows) + +DROP TABLE group_tbl; diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index 41949d41dd6..d6f646a1d50 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -2928,3 +2928,16 @@ where exists (select 1 from j3 and t1.unique1 < 1; drop table j3; + +-- Test that we do not account for nullingrels when looking up statistics +CREATE TABLE group_tbl (a INT, b INT); +INSERT INTO group_tbl SELECT 1, 1; +CREATE STATISTICS group_tbl_stat (ndistinct) ON a, b FROM group_tbl; +ANALYZE group_tbl; + +EXPLAIN (COSTS OFF) +SELECT 1 FROM group_tbl t1 + LEFT JOIN (SELECT a c1, COALESCE(a) c2 FROM group_tbl t2) s ON TRUE +GROUP BY s.c1, s.c2; + +DROP TABLE group_tbl; -- 2.39.5