Consider opfamily and collation when removing redundant GROUP BY columns

author Richard Guo <rguo@postgresql.org>

Fri, 8 May 2026 03:45:51 +0000 (12:45 +0900)

committer Richard Guo <rguo@postgresql.org>

Fri, 8 May 2026 03:45:51 +0000 (12:45 +0900)
author Richard Guo <rguo@postgresql.org>
Fri, 8 May 2026 03:45:51 +0000 (12:45 +0900)
committer Richard Guo <rguo@postgresql.org>
Fri, 8 May 2026 03:45:51 +0000 (12:45 +0900)
diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c

index 96ee312ebdf523e9627237869915f68716663833..b38422c47a4166f05a10e40307d23a57ca08bab6 100644 (file)
--- a/src/backend/optimizer/plan/initsplan.c
+++ b/src/backend/optimizer/plan/initsplan.c
@@ -82,6 +82,18 @@ typedef struct JoinTreeItem
                                                                          * lateral references */
  } JoinTreeItem;
  
+/*
+ * Compatibility info for one GROUP BY item, precomputed for use by
+ * remove_useless_groupby_columns() when matching unique-index columns against
+ * GROUP BY items.
+ */
+typedef struct GroupByColInfo
+{
+       AttrNumber      attno;                  /* var->varattno */
+       List       *eq_opfamilies;      /* mergejoin opfamilies of sgc->eqop */
+       Oid                     coll;                   /* var->varcollid */
+} GroupByColInfo;
+
  
  static bool is_partial_agg_memory_risky(PlannerInfo *root);
  static void create_agg_clause_infos(PlannerInfo *root);
@@ -421,6 +433,7 @@ remove_useless_groupby_columns(PlannerInfo *root)
  {
         Query      *parse = root->parse;
         Bitmapset **groupbyattnos;
+       List      **groupbycols;
         Bitmapset **surplusvars;
         bool            tryremove = false;
         ListCell   *lc;
@@ -437,14 +450,20 @@ remove_useless_groupby_columns(PlannerInfo *root)
         /*
          * Scan the GROUP BY clause to find GROUP BY items that are simple Vars.
          * Fill groupbyattnos[k] with a bitmapset of the column attnos of RTE k
-        * that are GROUP BY items.
+        * that are GROUP BY items, and groupbycols[k] with a parallel list of
+        * GroupByColInfo records.  We need the latter so that, when checking a
+        * unique index against this rel's GROUP BY items, we can verify that the
+        * index's notion of equality agrees with at least one GROUP BY item per
+        * index column.
          */
         groupbyattnos = palloc0_array(Bitmapset *, list_length(parse->rtable) + 1);
+       groupbycols = palloc0_array(List *, list_length(parse->rtable) + 1);
         foreach(lc, root->processed_groupClause)
         {
                 SortGroupClause *sgc = lfirst_node(SortGroupClause, lc);
                 TargetEntry *tle = get_sortgroupclause_tle(sgc, parse->targetList);
                 Var                *var = (Var *) tle->expr;
+               GroupByColInfo *info;
  
                 /*
                  * Ignore non-Vars and Vars from other query levels.
@@ -470,6 +489,12 @@ remove_useless_groupby_columns(PlannerInfo *root)
                 tryremove |= !bms_is_empty(groupbyattnos[relid]);
                 groupbyattnos[relid] = bms_add_member(groupbyattnos[relid],
                                                                                           var->varattno - FirstLowInvalidHeapAttributeNumber);
+
+               info = palloc(sizeof(GroupByColInfo));
+               info->attno = var->varattno;
+               info->eq_opfamilies = get_mergejoin_opfamilies(sgc->eqop);
+               info->coll = var->varcollid;
+               groupbycols[relid] = lappend(groupbycols[relid], info);
         }
  
         /*
@@ -524,7 +549,7 @@ remove_useless_groupby_columns(PlannerInfo *root)
                 foreach_node(IndexOptInfo, index, rel->indexlist)
                 {
                         Bitmapset  *ind_attnos;
-                       bool            nulls_check_ok;
+                       bool            index_check_ok;
  
                         /*
                          * Skip any non-unique and deferrable indexes.  Predicate indexes
@@ -539,9 +564,14 @@ remove_useless_groupby_columns(PlannerInfo *root)
                                 continue;
  
                         ind_attnos = NULL;
-                       nulls_check_ok = true;
+                       index_check_ok = true;
                         for (int i = 0; i < index->nkeycolumns; i++)
                         {
+                               AttrNumber      indkey_attno = index->indexkeys[i];
+                               Oid                     indkey_opfamily = index->opfamily[i];
+                               Oid                     indkey_coll = index->indexcollations[i];
+                               ListCell   *lc2;
+
                                 /*
                                  * We must insist that the index columns are all defined NOT
                                  * NULL otherwise duplicate NULLs could exist.  However, we
@@ -551,20 +581,41 @@ remove_useless_groupby_columns(PlannerInfo *root)
                                  * despite the NULL.
                                  */
                                 if (!index->nullsnotdistinct &&
-                                       !bms_is_member(index->indexkeys[i],
-                                                                  rel->notnullattnums))
+                                       !bms_is_member(indkey_attno, rel->notnullattnums))
+                               {
+                                       index_check_ok = false;
+                                       break;
+                               }
+
+                               /*
+                                * The index proves uniqueness only under its own opfamily and
+                                * collation.  Require some GROUP BY item on this column to
+                                * use a compatible eqop and collation, the same check
+                                * relation_has_unique_index_for() applies to join clauses.
+                                */
+                               foreach(lc2, groupbycols[relid])
+                               {
+                                       GroupByColInfo *info = (GroupByColInfo *) lfirst(lc2);
+
+                                       if (info->attno != indkey_attno)
+                                               continue;
+                                       if (list_member_oid(info->eq_opfamilies, indkey_opfamily) &&
+                                               collations_agree_on_equality(indkey_coll, info->coll))
+                                               break;
+                               }
+                               if (lc2 == NULL)
                                 {
-                                       nulls_check_ok = false;
+                                       index_check_ok = false;
                                         break;
                                 }
  
                                 ind_attnos =
                                         bms_add_member(ind_attnos,
-                                                                  index->indexkeys[i] -
+                                                                  indkey_attno -
                                                                    FirstLowInvalidHeapAttributeNumber);
                         }
  
-                       if (!nulls_check_ok)
+                       if (!index_check_ok)
                                 continue;
  
                         /*
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out

index ff80869fb3390ca2827f5cb5ad8541282a04fc86..fbda0e3bbc28771463b5c0414a4f3e0a471474cd 100644 (file)
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -1645,6 +1645,37 @@ explain (costs off) select y,z from t2 group by y,z;
     ->  Seq Scan on t2
  (3 rows)
  
+-- A unique index proves uniqueness only under its own opfamily.  When the
+-- GROUP BY's eqop comes from a different opfamily with looser equality,
+-- rows the index regards as distinct can collapse into one GROUP BY group,
+-- so the index is not usable for removing redundant columns.
+create type t_rec as (x numeric);
+create temp table t_opf (a t_rec not null, b text);
+create unique index on t_opf (a record_image_ops);
+-- (1.0) and (1.00) are bytewise distinct but logically equal as records;
+-- the index admits both, but GROUP BY a (default record_ops) would merge
+-- them, so b must be retained as a grouping key.
+insert into t_opf values (row(1.0)::t_rec, 'X'), (row(1.00)::t_rec, 'Y');
+explain (costs off)
+select a, b from t_opf group by a, b order by b;
+          QUERY PLAN           
+-------------------------------
+ Group
+   Group Key: b, a
+   ->  Sort
+         Sort Key: b, a
+         ->  Seq Scan on t_opf
+(5 rows)
+
+select a, b from t_opf group by a, b order by b;
+   a    | b 
+--------+---
+ (1.0)  | X
+ (1.00) | Y
+(2 rows)
+
+drop table t_opf;
+drop type t_rec;
  drop table t1 cascade;
  NOTICE:  drop cascades to table t1c
  drop table t2;
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out

index 9c4b8ccff78f5b8d0715dd4148f8e152ef220c11..04e2f6df03738106b0e2e32f6feb6c223eab1a89 100644 (file)
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -3321,6 +3321,38 @@ GROUP BY t1.id, t1.val;
  
  DROP TABLE eager_agg_t1;
  DROP TABLE eager_agg_t2;
+--
+-- A unique index can prove functional dependency for GROUP BY column
+-- removal only if its per-column collation agrees on equality with
+-- the GROUP BY column's collation.  An index built under a different
+-- (deterministic) collation would otherwise let remove_useless_groupby_columns
+-- drop other columns whose values still differ within a nondeterministic
+-- group.
+--
+CREATE TABLE groupby_collation_t (a text COLLATE case_insensitive NOT NULL, b text);
+INSERT INTO groupby_collation_t VALUES ('foo', 'X'), ('FOO', 'Y');
+CREATE UNIQUE INDEX ON groupby_collation_t (a COLLATE "C");
+-- Column b must NOT be dropped: under case_insensitive on a, 'foo' and
+-- 'FOO' would merge, but they have distinct b values.
+EXPLAIN (COSTS OFF)
+SELECT a, b FROM groupby_collation_t GROUP BY a, b ORDER BY a, b;
+                   QUERY PLAN                    
+-------------------------------------------------
+ Group
+   Group Key: a, b
+   ->  Sort
+         Sort Key: a COLLATE case_insensitive, b
+         ->  Seq Scan on groupby_collation_t
+(5 rows)
+
+SELECT a, b FROM groupby_collation_t GROUP BY a, b ORDER BY a, b;
+  a  | b 
+-----+---
+ foo | X
+ FOO | Y
+(2 rows)
+
+DROP TABLE groupby_collation_t;
  -- virtual generated columns
  CREATE TABLE t5 (
      a int,
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql

index 89bb83718e0c5877ae01ed3cdb60e04b84210959..580f364ba97c5cdd113312b6e93af6182a337b04 100644 (file)
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -577,6 +577,23 @@ alter table t2 alter column z drop not null;
  create unique index t2_z_uidx on t2(z) nulls not distinct;
  explain (costs off) select y,z from t2 group by y,z;
  
+-- A unique index proves uniqueness only under its own opfamily.  When the
+-- GROUP BY's eqop comes from a different opfamily with looser equality,
+-- rows the index regards as distinct can collapse into one GROUP BY group,
+-- so the index is not usable for removing redundant columns.
+create type t_rec as (x numeric);
+create temp table t_opf (a t_rec not null, b text);
+create unique index on t_opf (a record_image_ops);
+-- (1.0) and (1.00) are bytewise distinct but logically equal as records;
+-- the index admits both, but GROUP BY a (default record_ops) would merge
+-- them, so b must be retained as a grouping key.
+insert into t_opf values (row(1.0)::t_rec, 'X'), (row(1.00)::t_rec, 'Y');
+explain (costs off)
+select a, b from t_opf group by a, b order by b;
+select a, b from t_opf group by a, b order by b;
+drop table t_opf;
+drop type t_rec;
+
  drop table t1 cascade;
  drop table t2;
  drop table t3;
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql

index a8e34017e07f076772b58288e8a44b9056131595..18c47e6e05a0f52677d5563d9371223cef5eb996 100644 (file)
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -1219,6 +1219,26 @@ GROUP BY t1.id, t1.val;
  DROP TABLE eager_agg_t1;
  DROP TABLE eager_agg_t2;
  
+--
+-- A unique index can prove functional dependency for GROUP BY column
+-- removal only if its per-column collation agrees on equality with
+-- the GROUP BY column's collation.  An index built under a different
+-- (deterministic) collation would otherwise let remove_useless_groupby_columns
+-- drop other columns whose values still differ within a nondeterministic
+-- group.
+--
+CREATE TABLE groupby_collation_t (a text COLLATE case_insensitive NOT NULL, b text);
+INSERT INTO groupby_collation_t VALUES ('foo', 'X'), ('FOO', 'Y');
+CREATE UNIQUE INDEX ON groupby_collation_t (a COLLATE "C");
+
+-- Column b must NOT be dropped: under case_insensitive on a, 'foo' and
+-- 'FOO' would merge, but they have distinct b values.
+EXPLAIN (COSTS OFF)
+SELECT a, b FROM groupby_collation_t GROUP BY a, b ORDER BY a, b;
+SELECT a, b FROM groupby_collation_t GROUP BY a, b ORDER BY a, b;
+
+DROP TABLE groupby_collation_t;
+
  -- virtual generated columns
  CREATE TABLE t5 (
      a int,
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list

index 3ade7c08b6d11ae67f9f8307203c7d335876b281..06532bf715265a1ffeb6701ed6c38ba7f2ad00d0 100644 (file)
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1166,6 +1166,7 @@ GraphPattern
  GraphPropertyRef
  GraphTableParseState
  Group
+GroupByColInfo
  GroupByOrdering
  GroupClause
  GroupPath
author	Richard Guo <rguo@postgresql.org>
	Fri, 8 May 2026 03:45:51 +0000 (12:45 +0900)
committer	Richard Guo <rguo@postgresql.org>
	Fri, 8 May 2026 03:45:51 +0000 (12:45 +0900)
src/backend/optimizer/plan/initsplan.c		patch \| blob \| blame \| history
src/test/regress/expected/aggregates.out		patch \| blob \| blame \| history
src/test/regress/expected/collate.icu.utf8.out		patch \| blob \| blame \| history
src/test/regress/sql/aggregates.sql		patch \| blob \| blame \| history
src/test/regress/sql/collate.icu.utf8.sql		patch \| blob \| blame \| history
src/tools/pgindent/typedefs.list		patch \| blob \| blame \| history