Consider collation when proving uniqueness from unique indexes

author Richard Guo <rguo@postgresql.org>

Tue, 5 May 2026 01:22:53 +0000 (10:22 +0900)

committer Richard Guo <rguo@postgresql.org>

Tue, 5 May 2026 01:22:53 +0000 (10:22 +0900)
diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c

index f76a5373c4bea2d20bb7abd9f0da1fa543ba0a40..3f5d4fa3182fef6b90493b25226e41372e73c97a 100644 (file)
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -4226,16 +4226,19 @@ relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
                                  * The condition's equality operator must be a member of the
                                  * index opfamily, else it is not asserting the right kind of
                                  * equality behavior for this index.  We check this first
-                                * since it's probably cheaper than match_index_to_operand().
+                                * since it's probably the cheapest test.
                                  */
                                 if (!list_member_oid(rinfo->mergeopfamilies, ind->opfamily[c]))
                                         continue;
  
                                 /*
-                                * XXX at some point we may need to check collations here too.
-                                * For the moment we assume all collations reduce to the same
-                                * notion of equality.
+                                * The index's collation must agree with the clause's input
+                                * collation on equality, else the index's uniqueness does not
+                                * imply uniqueness under the clause's equality semantics.
                                  */
+                               if (!collations_agree_on_equality(ind->indexcollations[c],
+                                                                                                 exprInputCollation((Node *) rinfo->clause)))
+                                       continue;
  
                                 /* OK, see if the condition operand matches the index key */
                                 if (rinfo->outer_is_left)
diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c

index 1574f0c68c1211eca1917403cd22f1f63c56ef06..3de10d4df7e0a48f6522f70c316d98ba3304be5e 100644 (file)
--- a/src/backend/utils/cache/lsyscache.c
+++ b/src/backend/utils/cache/lsyscache.c
@@ -860,6 +860,44 @@ comparison_ops_are_compatible(Oid opno1, Oid opno2)
         return result;
  }
  
+/*
+ * collations_agree_on_equality
+ *             Return true if the two collations have equivalent notions of equality,
+ *             so that a uniqueness or equality proof established under one side
+ *             carries over to a comparison performed under the other side.
+ *
+ * Note: this is equality compatibility only.  Do NOT use this to reason
+ * about ordering.
+ *
+ * An InvalidOid on either side denotes the absence of a collation -- that
+ * side's operation is not collation-sensitive (e.g. a non-collatable column
+ * type).  Absence of a collation cannot conflict with the other side's
+ * collation, so we treat such pairs as agreeing on equality.  This generalizes
+ * the asymmetric treatment in IndexCollMatchesExprColl().
+ *
+ * Otherwise the collations have equivalent equality if they match, or if both
+ * are deterministic: a deterministic collation treats two strings as equal
+ * iff they are byte-wise equal (see CREATE COLLATION), so any two of them
+ * share the same equality relation.  The only false result is therefore for
+ * two distinct collations at least one of which is nondeterministic: their
+ * equality relations may disagree, which would make the proof unsound.
+ */
+bool
+collations_agree_on_equality(Oid coll1, Oid coll2)
+{
+       if (!OidIsValid(coll1) || !OidIsValid(coll2))
+               return true;
+
+       if (coll1 == coll2)
+               return true;
+
+       if (!get_collation_isdeterministic(coll1) ||
+               !get_collation_isdeterministic(coll2))
+               return false;
+
+       return true;
+}
+
  /*
   * op_is_safe_index_member
   *             Check if the operator is a member of a B-tree or Hash operator family.
diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h

index 2e0258d877e3c1368b54df00c328ab59f6bdb2bd..8d5e92e07be25f54801d91006f7545f59b3fff6f 100644 (file)
--- a/src/include/utils/lsyscache.h
+++ b/src/include/utils/lsyscache.h
@@ -89,6 +89,7 @@ extern bool get_op_hash_functions(Oid opno,
  extern List *get_op_index_interpretation(Oid opno);
  extern bool equality_ops_are_compatible(Oid opno1, Oid opno2);
  extern bool comparison_ops_are_compatible(Oid opno1, Oid opno2);
+extern bool collations_agree_on_equality(Oid coll1, Oid coll2);
  extern bool op_is_safe_index_member(Oid opno);
  extern Oid     get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype,
                                                           int16 procnum);
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out

index b20640514ce99ee5d92a69fa6cab007d8d39a3d4..25bfd53787210b68f72c0e35275cae33c285a8b1 100644 (file)
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -1654,6 +1654,129 @@ SELECT string_to_array('ABCDEFGHI' COLLATE case_sensitive, NULL, 'b');
   {A,B,C,D,E,F,G,H,I}
  (1 row)
  
+--
+-- A unique index under one collation does not prove uniqueness under
+-- another, so the planner must not use such a proof for any optimization.
+--
+-- Ensure that we do not use inner-unique join execution
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT * FROM test1cs t1, test3cs t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+                              QUERY PLAN                              
+----------------------------------------------------------------------
+ Sort
+   Output: t1.x, t2.x
+   Sort Key: t1.x COLLATE case_sensitive, t2.x COLLATE case_sensitive
+   ->  Hash Join
+         Output: t1.x, t2.x
+         Hash Cond: ((t2.x)::text = (t1.x)::text)
+         ->  Seq Scan on collate_tests.test3cs t2
+               Output: t2.x
+         ->  Hash
+               Output: t1.x
+               ->  Seq Scan on collate_tests.test1cs t1
+                     Output: t1.x
+(12 rows)
+
+SELECT * FROM test1cs t1, test3cs t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+  x  |  x  
+-----+-----
+ abc | abc
+ abc | ABC
+ ABC | abc
+ ABC | ABC
+ def | def
+ ghi | ghi
+(6 rows)
+
+-- Ensure that left-join is not removed
+EXPLAIN (COSTS OFF)
+SELECT t1.* FROM test3cs t1
+       LEFT JOIN test3cs t2 ON t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1;
+                QUERY PLAN                
+------------------------------------------
+ Sort
+   Sort Key: t1.x COLLATE case_sensitive
+   ->  Hash Left Join
+         Hash Cond: (t1.x = (t2.x)::text)
+         ->  Seq Scan on test3cs t1
+         ->  Hash
+               ->  Seq Scan on test3cs t2
+(7 rows)
+
+SELECT t1.* FROM test3cs t1
+       LEFT JOIN test3cs t2 ON t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1;
+  x  
+-----
+ abc
+ abc
+ ABC
+ ABC
+ def
+ ghi
+(6 rows)
+
+-- Ensure that self-join is not removed
+EXPLAIN (COSTS OFF)
+SELECT * FROM test3cs t1, test3cs t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+                              QUERY PLAN                              
+----------------------------------------------------------------------
+ Sort
+   Sort Key: t1.x COLLATE case_sensitive, t2.x COLLATE case_sensitive
+   ->  Hash Join
+         Hash Cond: ((t1.x)::text = (t2.x)::text)
+         ->  Seq Scan on test3cs t1
+         ->  Hash
+               ->  Seq Scan on test3cs t2
+(7 rows)
+
+SELECT * FROM test3cs t1, test3cs t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+  x  |  x  
+-----+-----
+ abc | abc
+ abc | ABC
+ ABC | abc
+ ABC | ABC
+ def | def
+ ghi | ghi
+(6 rows)
+
+-- Ensure that semijoin is not reduced to innerjoin
+EXPLAIN (COSTS OFF)
+SELECT * FROM test3cs t1
+  WHERE EXISTS (SELECT 1 FROM test3cs t2 WHERE t1.x = t2.x COLLATE case_insensitive)
+ORDER BY 1;
+                    QUERY PLAN                    
+--------------------------------------------------
+ Sort
+   Sort Key: t1.x COLLATE case_sensitive
+   ->  Hash Semi Join
+         Hash Cond: ((t1.x)::text = (t2.x)::text)
+         ->  Seq Scan on test3cs t1
+         ->  Hash
+               ->  Seq Scan on test3cs t2
+(7 rows)
+
+SELECT * FROM test3cs t1
+  WHERE EXISTS (SELECT 1 FROM test3cs t2 WHERE t1.x = t2.x COLLATE case_insensitive)
+ORDER BY 1;
+  x  
+-----
+ abc
+ ABC
+ def
+ ghi
+(4 rows)
+
  CREATE TABLE test1ci (x text COLLATE case_insensitive);
  CREATE TABLE test2ci (x text COLLATE case_insensitive);
  CREATE TABLE test3ci (x text COLLATE case_insensitive);
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql

index d4640c0eb14330f4c280b13fc767161fda0913d7..81d32c03e1a322e312fcbaaf4f2cbae1de551ac2 100644 (file)
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -612,6 +612,51 @@ CREATE UNIQUE INDEX ON test3cs (x);  -- ok
  SELECT string_to_array('ABC,DEF,GHI' COLLATE case_sensitive, ',', 'abc');
  SELECT string_to_array('ABCDEFGHI' COLLATE case_sensitive, NULL, 'b');
  
+--
+-- A unique index under one collation does not prove uniqueness under
+-- another, so the planner must not use such a proof for any optimization.
+--
+
+-- Ensure that we do not use inner-unique join execution
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT * FROM test1cs t1, test3cs t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+SELECT * FROM test1cs t1, test3cs t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+-- Ensure that left-join is not removed
+EXPLAIN (COSTS OFF)
+SELECT t1.* FROM test3cs t1
+       LEFT JOIN test3cs t2 ON t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1;
+
+SELECT t1.* FROM test3cs t1
+       LEFT JOIN test3cs t2 ON t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1;
+
+-- Ensure that self-join is not removed
+EXPLAIN (COSTS OFF)
+SELECT * FROM test3cs t1, test3cs t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+SELECT * FROM test3cs t1, test3cs t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+-- Ensure that semijoin is not reduced to innerjoin
+EXPLAIN (COSTS OFF)
+SELECT * FROM test3cs t1
+  WHERE EXISTS (SELECT 1 FROM test3cs t2 WHERE t1.x = t2.x COLLATE case_insensitive)
+ORDER BY 1;
+
+SELECT * FROM test3cs t1
+  WHERE EXISTS (SELECT 1 FROM test3cs t2 WHERE t1.x = t2.x COLLATE case_insensitive)
+ORDER BY 1;
+
  CREATE TABLE test1ci (x text COLLATE case_insensitive);
  CREATE TABLE test2ci (x text COLLATE case_insensitive);
  CREATE TABLE test3ci (x text COLLATE case_insensitive);
author	Richard Guo <rguo@postgresql.org>
	Tue, 5 May 2026 01:22:53 +0000 (10:22 +0900)
committer	Richard Guo <rguo@postgresql.org>
	Tue, 5 May 2026 01:22:53 +0000 (10:22 +0900)
src/backend/optimizer/path/indxpath.c		patch \| blob \| blame \| history
src/backend/utils/cache/lsyscache.c		patch \| blob \| blame \| history
src/include/utils/lsyscache.h		patch \| blob \| blame \| history
src/test/regress/expected/collate.icu.utf8.out		patch \| blob \| blame \| history
src/test/regress/sql/collate.icu.utf8.sql		patch \| blob \| blame \| history