Enable use of Memoize for ANTI joins

author Richard Guo <rguo@postgresql.org>

Thu, 3 Jul 2025 01:57:26 +0000 (10:57 +0900)

committer Richard Guo <rguo@postgresql.org>

Thu, 3 Jul 2025 01:57:26 +0000 (10:57 +0900)
author Richard Guo <rguo@postgresql.org>
Thu, 3 Jul 2025 01:57:26 +0000 (10:57 +0900)
committer Richard Guo <rguo@postgresql.org>
Thu, 3 Jul 2025 01:57:26 +0000 (10:57 +0900)
diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c

index 7aa8f5d799cac8652736004a1b09956b30b05ef0..ebedc5574ca9c32284c7be0667ac1520340d185b 100644 (file)
--- a/src/backend/optimizer/path/joinpath.c
+++ b/src/backend/optimizer/path/joinpath.c
@@ -154,13 +154,17 @@ add_paths_to_joinrel(PlannerInfo *root,
         /*
          * See if the inner relation is provably unique for this outer rel.
          *
-        * We have some special cases: for JOIN_SEMI and JOIN_ANTI, it doesn't
-        * matter since the executor can make the equivalent optimization anyway;
-        * we need not expend planner cycles on proofs.  For JOIN_UNIQUE_INNER, we
-        * must be considering a semijoin whose inner side is not provably unique
-        * (else reduce_unique_semijoins would've simplified it), so there's no
-        * point in calling innerrel_is_unique.  However, if the LHS covers all of
-        * the semijoin's min_lefthand, then it's appropriate to set inner_unique
+        * We have some special cases: for JOIN_SEMI, it doesn't matter since the
+        * executor can make the equivalent optimization anyway.  It also doesn't
+        * help enable use of Memoize, since a semijoin with a provably unique
+        * inner side should have been reduced to an inner join in that case.
+        * Therefore, we need not expend planner cycles on proofs.  (For
+        * JOIN_ANTI, although it doesn't help the executor for the same reason,
+        * it can benefit Memoize paths.)  For JOIN_UNIQUE_INNER, we must be
+        * considering a semijoin whose inner side is not provably unique (else
+        * reduce_unique_semijoins would've simplified it), so there's no point in
+        * calling innerrel_is_unique.  However, if the LHS covers all of the
+        * semijoin's min_lefthand, then it's appropriate to set inner_unique
          * because the path produced by create_unique_path will be unique relative
          * to the LHS.  (If we have an LHS that's only part of the min_lefthand,
          * that is *not* true.)  For JOIN_UNIQUE_OUTER, pass JOIN_INNER to avoid
@@ -169,12 +173,6 @@ add_paths_to_joinrel(PlannerInfo *root,
         switch (jointype)
         {
                 case JOIN_SEMI:
-               case JOIN_ANTI:
-
-                       /*
-                        * XXX it may be worth proving this to allow a Memoize to be
-                        * considered for Nested Loop Semi/Anti Joins.
-                        */
                         extra.inner_unique = false; /* well, unproven */
                         break;
                 case JOIN_UNIQUE_INNER:
@@ -715,16 +713,21 @@ get_memoize_path(PlannerInfo *root, RelOptInfo *innerrel,
                 return NULL;
  
         /*
-        * Currently we don't do this for SEMI and ANTI joins unless they're
-        * marked as inner_unique.  This is because nested loop SEMI/ANTI joins
-        * don't scan the inner node to completion, which will mean memoize cannot
-        * mark the cache entry as complete.
-        *
-        * XXX Currently we don't attempt to mark SEMI/ANTI joins as inner_unique
-        * = true.  Should we?  See add_paths_to_joinrel()
+        * Currently we don't do this for SEMI and ANTI joins, because nested loop
+        * SEMI/ANTI joins don't scan the inner node to completion, which means
+        * memoize cannot mark the cache entry as complete.  Nor can we mark the
+        * cache entry as complete after fetching the first inner tuple, because
+        * if that tuple and the current outer tuple don't satisfy the join
+        * clauses, a second inner tuple that satisfies the parameters would find
+        * the cache entry already marked as complete.  The only exception is when
+        * the inner relation is provably unique, as in that case, there won't be
+        * a second matching tuple and we can safely mark the cache entry as
+        * complete after fetching the first inner tuple.  Note that in such
+        * cases, the SEMI join should have been reduced to an inner join by
+        * reduce_unique_semijoins.
          */
-       if (!extra->inner_unique && (jointype == JOIN_SEMI ||
-                                                                jointype == JOIN_ANTI))
+       if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI) &&
+               !extra->inner_unique)
                 return NULL;
  
         /*
diff --git a/src/test/regress/expected/memoize.out b/src/test/regress/expected/memoize.out

index 38dfaf021c91d27b69fa9f33b15c77c660ea5bb5..150dc1b44cf62f922d0ed987a24524495342a4fa 100644 (file)
--- a/src/test/regress/expected/memoize.out
+++ b/src/test/regress/expected/memoize.out
@@ -25,6 +25,7 @@ begin
          ln := regexp_replace(ln, 'Heap Fetches: \d+', 'Heap Fetches: N');
          ln := regexp_replace(ln, 'loops=\d+', 'loops=N');
          ln := regexp_replace(ln, 'Index Searches: \d+', 'Index Searches: N');
+        ln := regexp_replace(ln, 'Memory: \d+kB', 'Memory: NkB');
          return next ln;
      end loop;
  end;
@@ -500,3 +501,62 @@ RESET max_parallel_workers_per_gather;
  RESET parallel_tuple_cost;
  RESET parallel_setup_cost;
  RESET min_parallel_table_scan_size;
+-- Ensure memoize works for ANTI joins
+CREATE TABLE tab_anti (a int, b boolean);
+INSERT INTO tab_anti SELECT i%3, false FROM generate_series(1,100)i;
+ANALYZE tab_anti;
+-- Ensure we get a Memoize plan for ANTI join
+SELECT explain_memoize('
+SELECT COUNT(*) FROM tab_anti t1 LEFT JOIN
+LATERAL (SELECT DISTINCT ON (a) a, b, t1.a AS x FROM tab_anti t2) t2
+ON t1.a+1 = t2.a
+WHERE t2.a IS NULL;', false);
+                                      explain_memoize                                       
+--------------------------------------------------------------------------------------------
+ Aggregate (actual rows=1.00 loops=N)
+   ->  Nested Loop Anti Join (actual rows=33.00 loops=N)
+         ->  Seq Scan on tab_anti t1 (actual rows=100.00 loops=N)
+         ->  Memoize (actual rows=0.67 loops=N)
+               Cache Key: (t1.a + 1), t1.a
+               Cache Mode: binary
+               Hits: 97  Misses: 3  Evictions: Zero  Overflows: 0  Memory Usage: NkB
+               ->  Subquery Scan on t2 (actual rows=0.67 loops=N)
+                     Filter: ((t1.a + 1) = t2.a)
+                     Rows Removed by Filter: 2
+                     ->  Unique (actual rows=2.67 loops=N)
+                           ->  Sort (actual rows=67.33 loops=N)
+                                 Sort Key: t2_1.a
+                                 Sort Method: quicksort  Memory: NkB
+                                 ->  Seq Scan on tab_anti t2_1 (actual rows=100.00 loops=N)
+(15 rows)
+
+-- And check we get the expected results.
+SELECT COUNT(*) FROM tab_anti t1 LEFT JOIN
+LATERAL (SELECT DISTINCT ON (a) a, b, t1.a AS x FROM tab_anti t2) t2
+ON t1.a+1 = t2.a
+WHERE t2.a IS NULL;
+ count 
+-------
+    33
+(1 row)
+
+-- Ensure we do not add memoize node for SEMI join
+EXPLAIN (COSTS OFF)
+SELECT * FROM tab_anti t1 WHERE t1.a IN
+ (SELECT a FROM tab_anti t2 WHERE t2.b IN
+  (SELECT t1.b FROM tab_anti t3 WHERE t2.a > 1 OFFSET 0));
+                   QUERY PLAN                    
+-------------------------------------------------
+ Nested Loop Semi Join
+   ->  Seq Scan on tab_anti t1
+   ->  Nested Loop Semi Join
+         Join Filter: (t1.a = t2.a)
+         ->  Seq Scan on tab_anti t2
+         ->  Subquery Scan on "ANY_subquery"
+               Filter: (t2.b = "ANY_subquery".b)
+               ->  Result
+                     One-Time Filter: (t2.a > 1)
+                     ->  Seq Scan on tab_anti t3
+(10 rows)
+
+DROP TABLE tab_anti;
diff --git a/src/test/regress/sql/memoize.sql b/src/test/regress/sql/memoize.sql

index c0d47fa875ad96ccec47ef8d518b2682c0cc2ded..8d1cdd6990c871b67e6c45dec87805d6bd458fea 100644 (file)
--- a/src/test/regress/sql/memoize.sql
+++ b/src/test/regress/sql/memoize.sql
@@ -26,6 +26,7 @@ begin
          ln := regexp_replace(ln, 'Heap Fetches: \d+', 'Heap Fetches: N');
          ln := regexp_replace(ln, 'loops=\d+', 'loops=N');
          ln := regexp_replace(ln, 'Index Searches: \d+', 'Index Searches: N');
+        ln := regexp_replace(ln, 'Memory: \d+kB', 'Memory: NkB');
          return next ln;
      end loop;
  end;
@@ -244,3 +245,29 @@ RESET max_parallel_workers_per_gather;
  RESET parallel_tuple_cost;
  RESET parallel_setup_cost;
  RESET min_parallel_table_scan_size;
+
+-- Ensure memoize works for ANTI joins
+CREATE TABLE tab_anti (a int, b boolean);
+INSERT INTO tab_anti SELECT i%3, false FROM generate_series(1,100)i;
+ANALYZE tab_anti;
+
+-- Ensure we get a Memoize plan for ANTI join
+SELECT explain_memoize('
+SELECT COUNT(*) FROM tab_anti t1 LEFT JOIN
+LATERAL (SELECT DISTINCT ON (a) a, b, t1.a AS x FROM tab_anti t2) t2
+ON t1.a+1 = t2.a
+WHERE t2.a IS NULL;', false);
+
+-- And check we get the expected results.
+SELECT COUNT(*) FROM tab_anti t1 LEFT JOIN
+LATERAL (SELECT DISTINCT ON (a) a, b, t1.a AS x FROM tab_anti t2) t2
+ON t1.a+1 = t2.a
+WHERE t2.a IS NULL;
+
+-- Ensure we do not add memoize node for SEMI join
+EXPLAIN (COSTS OFF)
+SELECT * FROM tab_anti t1 WHERE t1.a IN
+ (SELECT a FROM tab_anti t2 WHERE t2.b IN
+  (SELECT t1.b FROM tab_anti t3 WHERE t2.a > 1 OFFSET 0));
+
+DROP TABLE tab_anti;
author	Richard Guo <rguo@postgresql.org>
	Thu, 3 Jul 2025 01:57:26 +0000 (10:57 +0900)
committer	Richard Guo <rguo@postgresql.org>
	Thu, 3 Jul 2025 01:57:26 +0000 (10:57 +0900)
src/backend/optimizer/path/joinpath.c		patch \| blob \| blame \| history
src/test/regress/expected/memoize.out		patch \| blob \| blame \| history
src/test/regress/sql/memoize.sql		patch \| blob \| blame \| history