]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Consider collation when proving subquery uniqueness
authorRichard Guo <rguo@postgresql.org>
Tue, 5 May 2026 01:23:31 +0000 (10:23 +0900)
committerRichard Guo <rguo@postgresql.org>
Tue, 5 May 2026 01:23:31 +0000 (10:23 +0900)
rel_is_distinct_for()'s RTE_SUBQUERY branch passed only the equality
operator from each join clause to query_is_distinct_for(), discarding
the operator's input collation.  query_is_distinct_for() then verified
opfamily compatibility but never checked collations, so a DISTINCT /
GROUP BY / set-op operating under one collation was trusted to prove
uniqueness for a comparison performed under an unrelated collation.
As with the recent fix in relation_has_unique_index_for(), this is
unsound for nondeterministic collations and yields wrong query results
in any optimization that consumes the proof.

Fix by carrying each clause's operator input collation into
query_is_distinct_for() and validating it at every check-site against
the subquery target expression's collation.

Back-patch to all supported branches.  query_is_distinct_for() is
declared in an installed header, so on stable branches the existing
two-list signature is retained as a thin wrapper that forwards to a
new collation-aware entry point; external callers continue to receive
the historical collation-blind answer.

Author: Richard Guo <guofenglinux@gmail.com>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>
Discussion: https://postgr.es/m/CAMbWs4_XUUSTyzCaRjUeeahWNqi=8ZOA5Q4coi8zUVEDSBkM6A@mail.gmail.com
Backpatch-through: 14

src/backend/optimizer/plan/analyzejoins.c
src/include/nodes/pathnodes.h
src/include/optimizer/planmain.h
src/test/regress/expected/collate.icu.utf8.out
src/test/regress/sql/collate.icu.utf8.sql
src/tools/pgindent/typedefs.list

index 0f82ab9facb2daa2458a6cee394e089859a7bb5d..b07cb731401db7e5372e5e9d57e8811b3a0885c3 100644 (file)
@@ -68,7 +68,7 @@ static List *remove_rel_from_joinlist(List *joinlist, int relid, int *nremoved);
 static bool rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel);
 static bool rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel,
                                                                List *clause_list, List **extra_clauses);
-static Oid     distinct_col_search(int colno, List *colnos, List *opids);
+static DistinctColInfo *distinct_col_search(int colno, List *distinct_cols);
 static bool is_innerrel_unique_for(PlannerInfo *root,
                                                                   Relids joinrelids,
                                                                   Relids outerrelids,
@@ -1032,15 +1032,17 @@ rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list,
        {
                Index           relid = rel->relid;
                Query      *subquery = root->simple_rte_array[relid]->subquery;
-               List       *colnos = NIL;
-               List       *opids = NIL;
+               List       *distinct_cols = NIL;
                ListCell   *l;
 
                /*
-                * Build the argument lists for query_is_distinct_for: a list of
-                * output column numbers that the query needs to be distinct over, and
-                * a list of equality operators that the output columns need to be
-                * distinct according to.
+                * Build the argument list for query_is_distinct_for: a list of
+                * DistinctColInfo entries, each holding an output column number that
+                * the query needs to be distinct over, the equality operator that the
+                * column needs to be distinct according to, and that operator's input
+                * collation.  The collation matters because the subquery's own
+                * DISTINCT / GROUP BY / set-op proves uniqueness under its own
+                * collation, which need not agree with the operator's.
                 *
                 * (XXX we are not considering restriction clauses attached to the
                 * subquery; is that worth doing?)
@@ -1048,18 +1050,18 @@ rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list,
                foreach(l, clause_list)
                {
                        RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
-                       Oid                     op;
+                       OpExpr     *opexpr;
                        Var                *var;
+                       DistinctColInfo *dcinfo;
 
                        /*
-                        * Get the equality operator we need uniqueness according to.
-                        * (This might be a cross-type operator and thus not exactly the
-                        * same operator the subquery would consider; that's all right
-                        * since query_is_distinct_for can resolve such cases.)  The
-                        * caller's mergejoinability test should have selected only
-                        * OpExprs.
+                        * The caller's mergejoinability test should have selected only
+                        * OpExprs.  The operator might be a cross-type operator and thus
+                        * not exactly the same operator the subquery would consider;
+                        * that's all right since query_is_distinct_for can resolve such
+                        * cases.
                         */
-                       op = castNode(OpExpr, rinfo->clause)->opno;
+                       opexpr = castNode(OpExpr, rinfo->clause);
 
                        /* caller identified the inner side for us */
                        if (rinfo->outer_is_left)
@@ -1083,11 +1085,14 @@ rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list,
                                var->varno != relid || var->varlevelsup != 0)
                                continue;
 
-                       colnos = lappend_int(colnos, var->varattno);
-                       opids = lappend_oid(opids, op);
+                       dcinfo = palloc(sizeof(DistinctColInfo));
+                       dcinfo->colno = var->varattno;
+                       dcinfo->opid = opexpr->opno;
+                       dcinfo->collid = opexpr->inputcollid;
+                       distinct_cols = lappend(distinct_cols, dcinfo);
                }
 
-               if (query_is_distinct_for(subquery, colnos, opids))
+               if (query_is_distinct_for(subquery, distinct_cols))
                        return true;
        }
        return false;
@@ -1131,25 +1136,32 @@ query_supports_distinctness(Query *query)
  * query is a not-yet-planned subquery (in current usage, it's always from
  * a subquery RTE, which the planner avoids scribbling on).
  *
- * colnos is an integer list of output column numbers (resno's).  We are
- * interested in whether rows consisting of just these columns are certain
- * to be distinct.  "Distinctness" is defined according to whether the
- * corresponding upper-level equality operators listed in opids would think
- * the values are distinct.  (Note: the opids entries could be cross-type
- * operators, and thus not exactly the equality operators that the subquery
- * would use itself.  We use equality_ops_are_compatible() to check
- * compatibility.  That looks at opfamily membership for index AMs that have
- * declared that they support consistent equality semantics within an
+ * distinct_cols is a list of DistinctColInfo, one per requested output column.
+ * Each entry names the subquery output column number we want distinct, the
+ * upper-level equality operator we'll compare values with, and that operator's
+ * input collation.  We are interested in whether rows consisting of just these
+ * columns are certain to be distinct.
+ *
+ * "Distinctness" is defined according to whether the corresponding upper-level
+ * equality operators would think the values are distinct.  (Note: each opid
+ * could be a cross-type operator, and thus not exactly the equality operator
+ * that the subquery would use itself.  We use equality_ops_are_compatible() to
+ * check compatibility.  That looks at opfamily membership for index AMs that
+ * have declared that they support consistent equality semantics within an
  * opfamily, and so should give trustworthy answers for all operators that we
  * might need to deal with here.)
+ *
+ * The collid must also agree on equality with the collation the subquery's own
+ * DISTINCT/GROUP BY/set-op uses to deduplicate the column, else the subquery's
+ * distinctness does not carry over to the caller's equality semantics.  Two
+ * collations agree on equality if they match or if both are deterministic (in
+ * which case both reduce equality to byte-equality; see CREATE COLLATION).
  */
 bool
-query_is_distinct_for(Query *query, List *colnos, List *opids)
+query_is_distinct_for(Query *query, List *distinct_cols)
 {
        ListCell   *l;
-       Oid                     opid;
-
-       Assert(list_length(colnos) == list_length(opids));
+       DistinctColInfo *dcinfo;
 
        /*
         * DISTINCT (including DISTINCT ON) guarantees uniqueness if all the
@@ -1165,9 +1177,11 @@ query_is_distinct_for(Query *query, List *colnos, List *opids)
                        TargetEntry *tle = get_sortgroupclause_tle(sgc,
                                                                                                           query->targetList);
 
-                       opid = distinct_col_search(tle->resno, colnos, opids);
-                       if (!OidIsValid(opid) ||
-                               !equality_ops_are_compatible(opid, sgc->eqop))
+                       dcinfo = distinct_col_search(tle->resno, distinct_cols);
+                       if (dcinfo == NULL ||
+                               !equality_ops_are_compatible(dcinfo->opid, sgc->eqop) ||
+                               !collations_agree_on_equality(dcinfo->collid,
+                                                                                         exprCollation((Node *) tle->expr)))
                                break;                  /* exit early if no match */
                }
                if (l == NULL)                  /* had matches for all? */
@@ -1196,9 +1210,11 @@ query_is_distinct_for(Query *query, List *colnos, List *opids)
                        TargetEntry *tle = get_sortgroupclause_tle(sgc,
                                                                                                           query->targetList);
 
-                       opid = distinct_col_search(tle->resno, colnos, opids);
-                       if (!OidIsValid(opid) ||
-                               !equality_ops_are_compatible(opid, sgc->eqop))
+                       dcinfo = distinct_col_search(tle->resno, distinct_cols);
+                       if (dcinfo == NULL ||
+                               !equality_ops_are_compatible(dcinfo->opid, sgc->eqop) ||
+                               !collations_agree_on_equality(dcinfo->collid,
+                                                                                         exprCollation((Node *) tle->expr)))
                                break;                  /* exit early if no match */
                }
                if (l == NULL)                  /* had matches for all? */
@@ -1268,9 +1284,11 @@ query_is_distinct_for(Query *query, List *colnos, List *opids)
                                sgc = (SortGroupClause *) lfirst(lg);
                                lg = lnext(topop->groupClauses, lg);
 
-                               opid = distinct_col_search(tle->resno, colnos, opids);
-                               if (!OidIsValid(opid) ||
-                                       !equality_ops_are_compatible(opid, sgc->eqop))
+                               dcinfo = distinct_col_search(tle->resno, distinct_cols);
+                               if (dcinfo == NULL ||
+                                       !equality_ops_are_compatible(dcinfo->opid, sgc->eqop) ||
+                                       !collations_agree_on_equality(dcinfo->collid,
+                                                                                                 exprCollation((Node *) tle->expr)))
                                        break;          /* exit early if no match */
                        }
                        if (l == NULL)          /* had matches for all? */
@@ -1292,22 +1310,21 @@ query_is_distinct_for(Query *query, List *colnos, List *opids)
 /*
  * distinct_col_search - subroutine for query_is_distinct_for
  *
- * If colno is in colnos, return the corresponding element of opids,
- * else return InvalidOid.  (Ordinarily colnos would not contain duplicates,
- * but if it does, we arbitrarily select the first match.)
+ * If colno matches the colno field of an entry in distinct_cols, return a
+ * pointer to that entry; else return NULL.  (Ordinarily distinct_cols would
+ * not contain duplicate colnos, but if it does, we arbitrarily select the
+ * first match.)
  */
-static Oid
-distinct_col_search(int colno, List *colnos, List *opids)
+static DistinctColInfo *
+distinct_col_search(int colno, List *distinct_cols)
 {
-       ListCell   *lc1,
-                          *lc2;
-
-       forboth(lc1, colnos, lc2, opids)
+       foreach_ptr(DistinctColInfo, dcinfo, distinct_cols)
        {
-               if (colno == lfirst_int(lc1))
-                       return lfirst_oid(lc2);
+               if (dcinfo->colno == colno)
+                       return dcinfo;
        }
-       return InvalidOid;
+
+       return NULL;
 }
 
 
index 693b879f76d1a63c4308155d21981ba2c619c49a..27a2c6815b70fc182c00865a4a62926725ebd3b4 100644 (file)
@@ -3375,6 +3375,20 @@ typedef struct RowIdentityVarInfo
        Relids          rowidrels;              /* RTE indexes of target rels using this */
 } RowIdentityVarInfo;
 
+/*
+ * One element of the list passed to query_is_distinct_for().  Each entry
+ * names a subquery output column that the caller needs to be distinct over,
+ * plus the upper-level equality operator and its input collation, so that
+ * the subquery's own DISTINCT/GROUP BY/set-op clauses can be compared for
+ * compatibility.
+ */
+typedef struct DistinctColInfo
+{
+       int                     colno;                  /* subquery output column resno */
+       Oid                     opid;                   /* upper-level equality operator */
+       Oid                     collid;                 /* input collation of opid */
+} DistinctColInfo;
+
 /*
  * For each distinct placeholder expression generated during planning, we
  * store a PlaceHolderInfo node in the PlannerInfo node's placeholder_list.
index d0dc3761b13fc0efdcb55b4e4e61c8d46bd10d98..71c043a25e80b45b7e5c600331f6766ba09772c0 100644 (file)
@@ -111,7 +111,7 @@ extern void match_foreign_keys_to_quals(PlannerInfo *root);
 extern List *remove_useless_joins(PlannerInfo *root, List *joinlist);
 extern void reduce_unique_semijoins(PlannerInfo *root);
 extern bool query_supports_distinctness(Query *query);
-extern bool query_is_distinct_for(Query *query, List *colnos, List *opids);
+extern bool query_is_distinct_for(Query *query, List *distinct_cols);
 extern bool innerrel_is_unique(PlannerInfo *root,
                                                           Relids joinrelids, Relids outerrelids, RelOptInfo *innerrel,
                                                           JoinType jointype, List *restrictlist, bool force_cache);
index 25bfd53787210b68f72c0e35275cae33c285a8b1..8c3a369e212ea41a7990b6aaff93172c575e5f6d 100644 (file)
@@ -1777,6 +1777,187 @@ ORDER BY 1;
  ghi
 (4 rows)
 
+--
+-- A DISTINCT / GROUP BY / set-op on a subquery does not prove uniqueness
+-- under a different collation, so the planner must not use such a proof for
+-- any optimization.
+--
+-- Ensure that we do not use inner-unique join execution
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+                                QUERY PLAN                                 
+---------------------------------------------------------------------------
+ Sort
+   Output: t1.x, test3cs.x
+   Sort Key: t1.x COLLATE case_sensitive, test3cs.x COLLATE case_sensitive
+   ->  Hash Join
+         Output: t1.x, test3cs.x
+         Hash Cond: ((t1.x)::text = (test3cs.x)::text)
+         ->  Seq Scan on collate_tests.test1cs t1
+               Output: t1.x
+         ->  Hash
+               Output: test3cs.x
+               ->  HashAggregate
+                     Output: test3cs.x
+                     Group Key: test3cs.x
+                     ->  Seq Scan on collate_tests.test3cs
+                           Output: test3cs.x
+(15 rows)
+
+SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+  x  |  x  
+-----+-----
+ abc | abc
+ abc | ABC
+ ABC | abc
+ ABC | ABC
+ def | def
+ ghi | ghi
+(6 rows)
+
+-- Same with GROUP BY
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+                                QUERY PLAN                                 
+---------------------------------------------------------------------------
+ Sort
+   Output: t1.x, test3cs.x
+   Sort Key: t1.x COLLATE case_sensitive, test3cs.x COLLATE case_sensitive
+   ->  Hash Join
+         Output: t1.x, test3cs.x
+         Hash Cond: ((t1.x)::text = (test3cs.x)::text)
+         ->  Seq Scan on collate_tests.test1cs t1
+               Output: t1.x
+         ->  Hash
+               Output: test3cs.x
+               ->  HashAggregate
+                     Output: test3cs.x
+                     Group Key: test3cs.x
+                     ->  Seq Scan on collate_tests.test3cs
+                           Output: test3cs.x
+(15 rows)
+
+SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+  x  |  x  
+-----+-----
+ abc | abc
+ abc | ABC
+ ABC | abc
+ ABC | ABC
+ def | def
+ ghi | ghi
+(6 rows)
+
+-- Same with set-op
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+                                QUERY PLAN                                 
+---------------------------------------------------------------------------
+ Sort
+   Output: t1.x, test3cs.x
+   Sort Key: t1.x COLLATE case_sensitive, test3cs.x COLLATE case_sensitive
+   ->  Hash Join
+         Output: t1.x, test3cs.x
+         Hash Cond: ((test3cs.x)::text = (t1.x)::text)
+         ->  HashAggregate
+               Output: test3cs.x
+               Group Key: test3cs.x
+               ->  Append
+                     ->  Seq Scan on collate_tests.test3cs
+                           Output: test3cs.x
+                     ->  Seq Scan on collate_tests.test3cs test3cs_1
+                           Output: test3cs_1.x
+         ->  Hash
+               Output: t1.x
+               ->  Seq Scan on collate_tests.test1cs t1
+                     Output: t1.x
+(18 rows)
+
+SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+  x  |  x  
+-----+-----
+ abc | abc
+ abc | ABC
+ ABC | abc
+ ABC | ABC
+ def | def
+ ghi | ghi
+(6 rows)
+
+-- Ensure that left-join is not removed
+EXPLAIN (COSTS OFF)
+SELECT t1.* FROM test3cs t1
+       LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1;
+                  QUERY PLAN                   
+-----------------------------------------------
+ Sort
+   Sort Key: t1.x COLLATE case_sensitive
+   ->  Hash Left Join
+         Hash Cond: (t1.x = (test3cs.x)::text)
+         ->  Seq Scan on test3cs t1
+         ->  Hash
+               ->  HashAggregate
+                     Group Key: test3cs.x
+                     ->  Seq Scan on test3cs
+(9 rows)
+
+SELECT t1.* FROM test3cs t1
+       LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1;
+  x  
+-----
+ abc
+ abc
+ ABC
+ ABC
+ def
+ ghi
+(6 rows)
+
+-- Ensure that semijoin is not reduced to innerjoin
+EXPLAIN (COSTS OFF)
+SELECT * FROM test3cs t1
+  WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2
+                WHERE t1.x = t2.x COLLATE case_insensitive)
+ORDER BY 1;
+                      QUERY PLAN                       
+-------------------------------------------------------
+ Sort
+   Sort Key: t1.x COLLATE case_sensitive
+   ->  Hash Semi Join
+         Hash Cond: ((t1.x)::text = (test3cs.x)::text)
+         ->  Seq Scan on test3cs t1
+         ->  Hash
+               ->  HashAggregate
+                     Group Key: test3cs.x
+                     ->  Seq Scan on test3cs
+(9 rows)
+
+SELECT * FROM test3cs t1
+  WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2
+                WHERE t1.x = t2.x COLLATE case_insensitive)
+ORDER BY 1;
+  x  
+-----
+ abc
+ ABC
+ def
+ ghi
+(4 rows)
+
 CREATE TABLE test1ci (x text COLLATE case_insensitive);
 CREATE TABLE test2ci (x text COLLATE case_insensitive);
 CREATE TABLE test3ci (x text COLLATE case_insensitive);
index 81d32c03e1a322e312fcbaaf4f2cbae1de551ac2..fdcdb2094f8a98ecda48a968a1dc76194cea8e8e 100644 (file)
@@ -657,6 +657,64 @@ SELECT * FROM test3cs t1
   WHERE EXISTS (SELECT 1 FROM test3cs t2 WHERE t1.x = t2.x COLLATE case_insensitive)
 ORDER BY 1;
 
+--
+-- A DISTINCT / GROUP BY / set-op on a subquery does not prove uniqueness
+-- under a different collation, so the planner must not use such a proof for
+-- any optimization.
+--
+
+-- Ensure that we do not use inner-unique join execution
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+-- Same with GROUP BY
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+-- Same with set-op
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+-- Ensure that left-join is not removed
+EXPLAIN (COSTS OFF)
+SELECT t1.* FROM test3cs t1
+       LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1;
+
+SELECT t1.* FROM test3cs t1
+       LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1;
+
+-- Ensure that semijoin is not reduced to innerjoin
+EXPLAIN (COSTS OFF)
+SELECT * FROM test3cs t1
+  WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2
+                WHERE t1.x = t2.x COLLATE case_insensitive)
+ORDER BY 1;
+
+SELECT * FROM test3cs t1
+  WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2
+                WHERE t1.x = t2.x COLLATE case_insensitive)
+ORDER BY 1;
+
 CREATE TABLE test1ci (x text COLLATE case_insensitive);
 CREATE TABLE test2ci (x text COLLATE case_insensitive);
 CREATE TABLE test3ci (x text COLLATE case_insensitive);
index 0abdb2d37e2d2dd28838e9d1bc8da639b349a6a1..3ade7c08b6d11ae67f9f8307203c7d335876b281 100644 (file)
@@ -688,6 +688,7 @@ DiscardMode
 DiscardStmt
 DispatchOption
 DistanceValue
+DistinctColInfo
 DistinctExpr
 DoState
 DoStmt