From 3aee6283709f6a7d826b3fb5773cc4496788a5df Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Thu, 28 Aug 2025 13:49:20 -0400
Subject: [PATCH] Fix "variable not found in subplan target lists" in semijoin
 de-duplication.

One mechanism we have for implementing semi-joins is to de-duplicate
the output of the RHS and then treat the join as a plain inner join.
Initial construction of the join's SpecialJoinInfo identifies the
RHS columns that need to be de-duplicated, but later we may find that
some of those don't need to be handled explicitly, either because
they're known to be constant or because they are redundant with some
previous column.

Up to now, while sort-based de-duplication handled such cases well,
hash-based de-duplication didn't: we'd still hash on all of the
originally-identified columns.  This is probably not a very big
deal performance-wise, but in the wake of commit a3179ab69 it can
cause planner errors.  That happens when join elimination causes
recalculation of variables' attr_needed bitmapsets, and we decide
that a variable mentioned in a semijoin clause doesn't need to be
propagated up to the join level anymore.

There are a number of ways we could slice the blame for this, but the
only fix that doesn't result in pessimizing plans for loosely-related
cases is to be more careful about not hashing columns we don't
actually need to de-duplicate.  We can install that consideration
into create_unique_paths in master, or the predecessor code in
create_unique_path in v18, without much refactoring.

(As follow-up work, it might be a good idea to look at more-invasive
refactoring, in hopes of preventing other bugs in this area.  But
with v18 release so close, there's not time for that now, nor would
we be likely to want to put such refactoring into v18 anyway.)

Reported-by: Sergey Soloviev <sergey.soloviev@tantorlabs.ru>
Diagnosed-by: Richard Guo <guofenglinux@gmail.com>
Author: Tom Lane <tgl@sss.pgh.pa.us>
Reviewed-by: Richard Guo <guofenglinux@gmail.com>
Discussion: https://postgr.es/m/1fd1a421-4609-4d46-a1af-ab74d5de504a@tantorlabs.ru
Backpatch-through: 18
---
 src/backend/optimizer/util/pathnode.c    | 155 +++++++++++++++++++++--
 src/test/regress/expected/aggregates.out |  20 ---
 src/test/regress/expected/join.out       |  80 ++++++++++++
 src/test/regress/sql/aggregates.sql      |   9 --
 src/test/regress/sql/join.sql            |  33 +++++
 5 files changed, 254 insertions(+), 43 deletions(-)

diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index e0192d4a491..6d6e67f84e9 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -19,6 +19,7 @@
 #include "foreign/fdwapi.h"
 #include "miscadmin.h"
 #include "nodes/extensible.h"
+#include "nodes/makefuncs.h"
 #include "optimizer/appendinfo.h"
 #include "optimizer/clauses.h"
 #include "optimizer/cost.h"
@@ -27,7 +28,9 @@
 #include "optimizer/paths.h"
 #include "optimizer/planmain.h"
 #include "optimizer/tlist.h"
+#include "parser/parse_clause.h"
 #include "parser/parsetree.h"
+#include "utils/lsyscache.h"
 #include "utils/memutils.h"
 #include "utils/selfuncs.h"
 
@@ -1727,6 +1730,8 @@ UniquePath *
 create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
 				   SpecialJoinInfo *sjinfo)
 {
+	List	   *uniq_exprs;
+	List	   *in_operators;
 	UniquePath *pathnode;
 	Path		sort_path;		/* dummy for result of cost_sort */
 	Path		agg_path;		/* dummy for result of cost_agg */
@@ -1760,6 +1765,134 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
 	 */
 	oldcontext = MemoryContextSwitchTo(GetMemoryChunkContext(rel));
 
+	/*
+	 * First, identify the columns/expressions to be made unique along with
+	 * the associated equality operators.  We made lists of these when the
+	 * SpecialJoinInfo was created, but that was before constructing
+	 * EquivalenceClasses.  Some of the values to be unique-ified may now be
+	 * known redundant by the EquivalenceClass machinery (e.g., because they
+	 * have been equated to constants).  There is no need to compare such
+	 * values during unique-ification, and indeed we had better not try
+	 * because the Vars involved may not have propagated as high as the
+	 * semijoin's level.  We use make_pathkeys_for_sortclauses to detect such
+	 * cases, which is a tad inefficient but it doesn't seem worth building
+	 * specialized infrastructure for this.
+	 *
+	 * For a child rel, we can construct these lists from those of its parent.
+	 */
+	if (IS_OTHER_REL(rel))
+	{
+		UniquePath *parent_path = (UniquePath *) rel->top_parent->cheapest_unique_path;
+
+		Assert(parent_path && IsA(parent_path, UniquePath));
+		uniq_exprs = (List *)
+			adjust_appendrel_attrs_multilevel(root,
+											  (Node *) parent_path->uniq_exprs,
+											  rel,
+											  rel->top_parent);
+		in_operators = copyObject(parent_path->in_operators);
+	}
+	else
+	{
+		List	   *newtlist;
+		List	   *sortList;
+		ListCell   *lc1;
+		ListCell   *lc2;
+
+		uniq_exprs = in_operators = newtlist = sortList = NIL;
+		forboth(lc1, sjinfo->semi_rhs_exprs, lc2, sjinfo->semi_operators)
+		{
+			Expr	   *uniqexpr = lfirst(lc1);
+			Oid			in_oper = lfirst_oid(lc2);
+			Oid			sortop;
+
+			/*
+			 * Try to build an ORDER BY list to sort the input compatibly.  We
+			 * do this for each sortable clause even when the clauses are not
+			 * all sortable, so that we can detect clauses that are redundant
+			 * according to the pathkey machinery.
+			 */
+			sortop = get_ordering_op_for_equality_op(in_oper, false);
+			if (OidIsValid(sortop))
+			{
+				Oid			eqop;
+				TargetEntry *tle;
+				SortGroupClause *sortcl;
+				List	   *sortPathkeys;
+
+				/*
+				 * The Unique node will need equality operators.  Normally
+				 * these are the same as the IN clause operators, but if those
+				 * are cross-type operators then the equality operators are
+				 * the ones for the IN clause operators' RHS datatype.
+				 */
+				eqop = get_equality_op_for_ordering_op(sortop, NULL);
+				if (!OidIsValid(eqop))	/* shouldn't happen */
+					elog(ERROR, "could not find equality operator for ordering operator %u",
+						 sortop);
+
+				tle = makeTargetEntry((Expr *) uniqexpr,
+									  list_length(newtlist) + 1,
+									  NULL,
+									  false);
+				newtlist = lappend(newtlist, tle);
+
+				sortcl = makeNode(SortGroupClause);
+				sortcl->tleSortGroupRef = assignSortGroupRef(tle, newtlist);
+				sortcl->eqop = eqop;
+				sortcl->sortop = sortop;
+				sortcl->reverse_sort = false;
+				sortcl->nulls_first = false;
+				sortcl->hashable = false;	/* no need to make this accurate */
+				sortList = lappend(sortList, sortcl);
+
+				/*
+				 * At each step, convert the SortGroupClause list to pathkey
+				 * form. If the just-added SortGroupClause is redundant, the
+				 * result will be shorter than the SortGroupClause list.
+				 */
+				sortPathkeys = make_pathkeys_for_sortclauses(root, sortList,
+															 newtlist);
+				if (list_length(sortPathkeys) != list_length(sortList))
+				{
+					/* Drop the redundant SortGroupClause */
+					sortList = list_delete_last(sortList);
+					Assert(list_length(sortPathkeys) == list_length(sortList));
+					/* Undo tlist addition too */
+					newtlist = list_delete_last(newtlist);
+					/* Don't need this column */
+					continue;
+				}
+			}
+			else if (sjinfo->semi_can_btree)	/* shouldn't happen */
+				elog(ERROR, "could not find ordering operator for equality operator %u",
+					 in_oper);
+
+			/*
+			 * We need to include this column in the output list.
+			 *
+			 * Under GEQO and when planning child joins, the sjinfo might be
+			 * short-lived, so we'd better make copies of data structures we
+			 * extract from it.
+			 */
+			uniq_exprs = lappend(uniq_exprs, copyObject(uniqexpr));
+			in_operators = lappend_oid(in_operators, in_oper);
+		}
+
+		/*
+		 * It can happen that all the RHS columns are equated to constants.
+		 * We'd have to do something special to unique-ify in that case, and
+		 * it's such an unlikely-in-the-real-world case that it's not worth
+		 * the effort.  So just punt if we found no columns to unique-ify.
+		 */
+		if (uniq_exprs == NIL)
+		{
+			MemoryContextSwitchTo(oldcontext);
+			return NULL;
+		}
+	}
+
+	/* OK, build the path node */
 	pathnode = makeNode(UniquePath);
 
 	pathnode->path.pathtype = T_Unique;
@@ -1778,14 +1911,8 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
 	pathnode->path.pathkeys = NIL;
 
 	pathnode->subpath = subpath;
-
-	/*
-	 * Under GEQO and when planning child joins, the sjinfo might be
-	 * short-lived, so we'd better make copies of data structures we extract
-	 * from it.
-	 */
-	pathnode->in_operators = copyObject(sjinfo->semi_operators);
-	pathnode->uniq_exprs = copyObject(sjinfo->semi_rhs_exprs);
+	pathnode->in_operators = in_operators;
+	pathnode->uniq_exprs = uniq_exprs;
 
 	/*
 	 * If the input is a relation and it has a unique index that proves the
@@ -1795,8 +1922,8 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
 	 */
 	if (rel->rtekind == RTE_RELATION && sjinfo->semi_can_btree &&
 		relation_has_unique_index_for(root, rel, NIL,
-									  sjinfo->semi_rhs_exprs,
-									  sjinfo->semi_operators))
+									  uniq_exprs,
+									  in_operators))
 	{
 		pathnode->umethod = UNIQUE_PATH_NOOP;
 		pathnode->path.rows = rel->rows;
@@ -1829,13 +1956,13 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
 		{
 			List	   *sub_tlist_colnos;
 
-			sub_tlist_colnos = translate_sub_tlist(sjinfo->semi_rhs_exprs,
+			sub_tlist_colnos = translate_sub_tlist(uniq_exprs,
 												   rel->relid);
 
 			if (sub_tlist_colnos &&
 				query_is_distinct_for(rte->subquery,
 									  sub_tlist_colnos,
-									  sjinfo->semi_operators))
+									  in_operators))
 			{
 				pathnode->umethod = UNIQUE_PATH_NOOP;
 				pathnode->path.rows = rel->rows;
@@ -1855,11 +1982,11 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
 
 	/* Estimate number of output rows */
 	pathnode->path.rows = estimate_num_groups(root,
-											  sjinfo->semi_rhs_exprs,
+											  uniq_exprs,
 											  rel->rows,
 											  NULL,
 											  NULL);
-	numCols = list_length(sjinfo->semi_rhs_exprs);
+	numCols = list_length(uniq_exprs);
 
 	if (sjinfo->semi_can_btree)
 	{
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index 1f1ce2380af..4cfbe424603 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -3379,26 +3379,6 @@ select v||'a', case when v||'a' = 'aa' then 1 else 0 end, count(*)
  ba       |    0 |     1
 (2 rows)
 
--- Make sure that generation of HashAggregate for uniqification purposes
--- does not lead to array overflow due to unexpected duplicate hash keys
--- see CAFeeJoKKu0u+A_A9R9316djW-YW3-+Gtgvy3ju655qRHR3jtdA@mail.gmail.com
-set enable_memoize to off;
-explain (costs off)
-  select 1 from tenk1
-   where (hundred, thousand) in (select twothousand, twothousand from onek);
-                         QUERY PLAN                          
--------------------------------------------------------------
- Hash Join
-   Hash Cond: (tenk1.hundred = onek.twothousand)
-   ->  Seq Scan on tenk1
-         Filter: (hundred = thousand)
-   ->  Hash
-         ->  HashAggregate
-               Group Key: onek.twothousand, onek.twothousand
-               ->  Seq Scan on onek
-(8 rows)
-
-reset enable_memoize;
 --
 -- Hash Aggregation Spill tests
 --
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index 390aabfb34b..a30ff88eef8 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -3222,6 +3222,24 @@ where b.unique2 is null;
          ->  Index Only Scan using tenk1_unique2 on tenk1 b
 (5 rows)
 
+-- check that we avoid de-duplicating columns redundantly
+set enable_memoize to off;
+explain (costs off)
+select 1 from tenk1
+where (hundred, thousand) in (select twothousand, twothousand from onek);
+                   QUERY PLAN                    
+-------------------------------------------------
+ Hash Join
+   Hash Cond: (tenk1.hundred = onek.twothousand)
+   ->  Seq Scan on tenk1
+         Filter: (hundred = thousand)
+   ->  Hash
+         ->  HashAggregate
+               Group Key: onek.twothousand
+               ->  Seq Scan on onek
+(8 rows)
+
+reset enable_memoize;
 --
 -- regression test for bogus RTE_GROUP entries
 --
@@ -6500,6 +6518,68 @@ where t1.a = s.c;
 ----------
 (0 rows)
 
+rollback;
+-- check handling of semijoins after join removal: we must suppress
+-- unique-ification of known-constant values
+begin;
+create temp table t (a int unique, b int);
+insert into t values (1, 2);
+explain (verbose, costs off)
+select t1.a from t t1
+  left join t t2 on t1.a = t2.a
+       join t t3 on true
+where exists (select 1 from t t4
+                join t t5 on t4.b = t5.b
+                join t t6 on t5.b = t6.b
+              where t1.a = t4.a and t3.a = t5.a and t4.a = 1);
+                                     QUERY PLAN                                     
+------------------------------------------------------------------------------------
+ Nested Loop
+   Output: t1.a
+   Inner Unique: true
+   ->  Nested Loop
+         Output: t1.a, t5.a
+         ->  Index Only Scan using t_a_key on pg_temp.t t1
+               Output: t1.a
+               Index Cond: (t1.a = 1)
+         ->  HashAggregate
+               Output: t5.a
+               Group Key: t5.a
+               ->  Hash Join
+                     Output: t5.a
+                     Hash Cond: (t6.b = t4.b)
+                     ->  Seq Scan on pg_temp.t t6
+                           Output: t6.a, t6.b
+                     ->  Hash
+                           Output: t4.b, t5.b, t5.a
+                           ->  Hash Join
+                                 Output: t4.b, t5.b, t5.a
+                                 Inner Unique: true
+                                 Hash Cond: (t5.b = t4.b)
+                                 ->  Seq Scan on pg_temp.t t5
+                                       Output: t5.a, t5.b
+                                 ->  Hash
+                                       Output: t4.b, t4.a
+                                       ->  Index Scan using t_a_key on pg_temp.t t4
+                                             Output: t4.b, t4.a
+                                             Index Cond: (t4.a = 1)
+   ->  Index Only Scan using t_a_key on pg_temp.t t3
+         Output: t3.a
+         Index Cond: (t3.a = t5.a)
+(32 rows)
+
+select t1.a from t t1
+  left join t t2 on t1.a = t2.a
+       join t t3 on true
+where exists (select 1 from t t4
+                join t t5 on t4.b = t5.b
+                join t t6 on t5.b = t6.b
+              where t1.a = t4.a and t3.a = t5.a and t4.a = 1);
+ a 
+---
+ 1
+(1 row)
+
 rollback;
 -- test cases where we can remove a join, but not a PHV computed at it
 begin;
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index 277b4b198cc..79eca85c985 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -1505,15 +1505,6 @@ select v||'a', case when v||'a' = 'aa' then 1 else 0 end, count(*)
   from unnest(array['a','b']) u(v)
  group by v||'a' order by 1;
 
--- Make sure that generation of HashAggregate for uniqification purposes
--- does not lead to array overflow due to unexpected duplicate hash keys
--- see CAFeeJoKKu0u+A_A9R9316djW-YW3-+Gtgvy3ju655qRHR3jtdA@mail.gmail.com
-set enable_memoize to off;
-explain (costs off)
-  select 1 from tenk1
-   where (hundred, thousand) in (select twothousand, twothousand from onek);
-reset enable_memoize;
-
 --
 -- Hash Aggregation Spill tests
 --
diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql
index f6e7070db65..23e220afcd2 100644
--- a/src/test/regress/sql/join.sql
+++ b/src/test/regress/sql/join.sql
@@ -839,6 +839,13 @@ explain (costs off)
 select a.* from tenk1 a left join tenk1 b on a.unique1 = b.unique2
 where b.unique2 is null;
 
+-- check that we avoid de-duplicating columns redundantly
+set enable_memoize to off;
+explain (costs off)
+select 1 from tenk1
+where (hundred, thousand) in (select twothousand, twothousand from onek);
+reset enable_memoize;
+
 --
 -- regression test for bogus RTE_GROUP entries
 --
@@ -2420,6 +2427,32 @@ where t1.a = s.c;
 
 rollback;
 
+-- check handling of semijoins after join removal: we must suppress
+-- unique-ification of known-constant values
+begin;
+
+create temp table t (a int unique, b int);
+insert into t values (1, 2);
+
+explain (verbose, costs off)
+select t1.a from t t1
+  left join t t2 on t1.a = t2.a
+       join t t3 on true
+where exists (select 1 from t t4
+                join t t5 on t4.b = t5.b
+                join t t6 on t5.b = t6.b
+              where t1.a = t4.a and t3.a = t5.a and t4.a = 1);
+
+select t1.a from t t1
+  left join t t2 on t1.a = t2.a
+       join t t3 on true
+where exists (select 1 from t t4
+                join t t5 on t4.b = t5.b
+                join t t6 on t5.b = t6.b
+              where t1.a = t4.a and t3.a = t5.a and t4.a = 1);
+
+rollback;
+
 -- test cases where we can remove a join, but not a PHV computed at it
 begin;
 
-- 
2.47.3