Discount the metapage when estimating number of index pages visited.

author Tom Lane <tgl@sss.pgh.pa.us>
Fri, 20 Mar 2026 18:50:53 +0000 (14:50 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Fri, 20 Mar 2026 18:50:53 +0000 (14:50 -0400)
diff --git a/contrib/bloom/blcost.c b/contrib/bloom/blcost.c

index 15b09048be76556099c30e8d01b99e6501e00ffd..5a733dc10ca53042483717d66cfeae298f2d8b58 100644 (file)
--- a/contrib/bloom/blcost.c
+++ b/contrib/bloom/blcost.c
@@ -30,6 +30,9 @@ blcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
         /* We have to visit all index tuples anyway */
         costs.numIndexTuples = index->tuples;
  
+       /* As in btcostestimate, count only the metapage as non-leaf */
+       costs.numNonLeafPages = 1;
+
         /* Use generic estimate */
         genericcostestimate(root, path, loop_count, &costs);
  
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c

index 86b55c9bb8b7a3f7670a36b5979c986f157cc5cf..53f85ccde015c0bcee9495a6457298ce1a15a0fa 100644 (file)
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -7388,6 +7388,11 @@ index_other_operands_eval_cost(PlannerInfo *root, List *indexquals)
         return qual_arg_cost;
  }
  
+/*
+ * Compute generic index access cost estimates.
+ *
+ * See struct GenericCosts in selfuncs.h for more info.
+ */
  void
  genericcostestimate(PlannerInfo *root,
                                         IndexPath *path,
@@ -7483,16 +7488,18 @@ genericcostestimate(PlannerInfo *root,
          * Estimate the number of index pages that will be retrieved.
          *
          * We use the simplistic method of taking a pro-rata fraction of the total
-        * number of index pages.  In effect, this counts only leaf pages and not
-        * any overhead such as index metapage or upper tree levels.
+        * number of index leaf pages.  We disregard any overhead such as index
+        * metapages or upper tree levels.
          *
          * In practice access to upper index levels is often nearly free because
          * those tend to stay in cache under load; moreover, the cost involved is
          * highly dependent on index type.  We therefore ignore such costs here
          * and leave it to the caller to add a suitable charge if needed.
          */
-       if (index->pages > 1 && index->tuples > 1)
-               numIndexPages = ceil(numIndexTuples * index->pages / index->tuples);
+       if (index->pages > costs->numNonLeafPages && index->tuples > 1)
+               numIndexPages =
+                       ceil(numIndexTuples * (index->pages - costs->numNonLeafPages)
+                                / index->tuples);
         else
                 numIndexPages = 1.0;
  
@@ -8083,9 +8090,18 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
  
         /*
          * Now do generic index cost estimation.
+        *
+        * While we expended effort to make realistic estimates of numIndexTuples
+        * and num_sa_scans, we are content to count only the btree metapage as
+        * non-leaf.  btree fanout is typically high enough that upper pages are
+        * few relative to leaf pages, so accounting for them would move the
+        * estimates at most a percent or two.  Given the uncertainty in just how
+        * many upper pages exist in a particular index, we'll skip trying to
+        * handle that.
          */
         costs.numIndexTuples = numIndexTuples;
         costs.num_sa_scans = num_sa_scans;
+       costs.numNonLeafPages = 1;
  
         genericcostestimate(root, path, loop_count, &costs);
  
@@ -8150,6 +8166,9 @@ hashcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
  {
         GenericCosts costs = {0};
  
+       /* As in btcostestimate, count only the metapage as non-leaf */
+       costs.numNonLeafPages = 1;
+
         genericcostestimate(root, path, loop_count, &costs);
  
         /*
@@ -8194,6 +8213,8 @@ gistcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
         GenericCosts costs = {0};
         Cost            descentCost;
  
+       /* GiST has no metapage, so we treat all pages as leaf pages */
+
         genericcostestimate(root, path, loop_count, &costs);
  
         /*
@@ -8249,6 +8270,9 @@ spgcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
         GenericCosts costs = {0};
         Cost            descentCost;
  
+       /* As in btcostestimate, count only the metapage as non-leaf */
+       costs.numNonLeafPages = 1;
+
         genericcostestimate(root, path, loop_count, &costs);
  
         /*
diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h

index 7195f9ea901f9ac0270f370d7d6570794c1a0cc7..8d9fff95a1915495987bd8021e41bf32d559f0a4 100644 (file)
--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@@ -122,6 +122,12 @@ typedef struct VariableStatData
   * Similarly, they can set num_sa_scans to some value >= 1 for an index AM
   * that doesn't necessarily perform exactly one primitive index scan per
   * distinct combination of ScalarArrayOp array elements.
+ * Similarly, they can set numNonLeafPages to some value >= 1 if they know
+ * how many index pages are not leaf pages.  (It's always good to count
+ * totally non-data-bearing pages such as metapages here, since accounting
+ * for the metapage can move cost estimates for a small index significantly.
+ * But upper pages in large indexes may be few enough relative to leaf pages
+ * that it's not worth trying to count them.)
   */
  typedef struct
  {
@@ -136,6 +142,7 @@ typedef struct
         double          numIndexTuples; /* number of leaf tuples visited */
         double          spc_random_page_cost;   /* relevant random_page_cost value */
         double          num_sa_scans;   /* # indexscans from ScalarArrayOpExprs */
+       BlockNumber numNonLeafPages;    /* # of index pages that are not leaves */
  } GenericCosts;
  
  /* Hooks for plugins to get control when we ask for stats */
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out

index 250b17d092f76b885dae3fbe10d21206cf6fdac8..84872c6f04ee8cf92255f372077267ff5ab7e7f4 100644 (file)
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -9811,12 +9811,14 @@ drop index j1_id2_idx;
  set enable_nestloop to 0;
  set enable_hashjoin to 0;
  set enable_sort to 0;
+-- we need additional data to get the partial indexes to be preferred
+insert into j1 select 2, i from generate_series(1, 100) i;
+insert into j2 select 1, i from generate_series(2, 100) i;
+analyze j1;
+analyze j2;
  -- create indexes that will be preferred over the PKs to perform the join
  create index j1_id1_idx on j1 (id1) where id1 % 1000 = 1;
  create index j2_id1_idx on j2 (id1) where id1 % 1000 = 1;
--- need an additional row in j2, if we want j2_id1_idx to be preferred
-insert into j2 values(1,2);
-analyze j2;
  explain (costs off) select * from j1
  inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2
  where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1;
diff --git a/src/test/regress/expected/memoize.out b/src/test/regress/expected/memoize.out

index 00c30b91459db39a4ef1e9f28c4e0a3ad93e2e41..218972dfab88d0eb43cde1b8ff3b465800f92259 100644 (file)
--- a/src/test/regress/expected/memoize.out
+++ b/src/test/regress/expected/memoize.out
@@ -262,6 +262,7 @@ CREATE INDEX flt_f_idx ON flt (f);
  INSERT INTO flt VALUES('-0.0'::float),('+0.0'::float);
  ANALYZE flt;
  SET enable_seqscan TO off;
+SET enable_material TO off;
  -- Ensure memoize operates in logical mode
  SELECT explain_memoize('
  SELECT * FROM flt f1 INNER JOIN flt f2 ON f1.f = f2.f;', false);
@@ -455,6 +456,7 @@ WHERE unique1 < 3
  (1 row)
  
  RESET enable_seqscan;
+RESET enable_material;
  RESET enable_mergejoin;
  RESET work_mem;
  RESET hash_mem_multiplier;
diff --git a/src/test/regress/expected/select.out b/src/test/regress/expected/select.out

index 2cdb78d1e9aab710bb951080827d3e36af2d960c..34f040beecca52e92720ef9e8945de5b7de70059 100644 (file)
--- a/src/test/regress/expected/select.out
+++ b/src/test/regress/expected/select.out
@@ -861,7 +861,6 @@ select unique2 from onek2 where unique2 = 11 and stringu1 < 'B';
        11
  (1 row)
  
-RESET enable_indexscan;
  -- check multi-index cases too
  explain (costs off)
  select unique1, unique2 from onek2
@@ -908,6 +907,20 @@ select unique1, unique2 from onek2
         0 |     998
  (2 rows)
  
+RESET enable_indexscan;
+-- onek2_u2_prtl should be preferred over this index, but we have to
+-- discount the metapage to arrive at that answer
+begin;
+create index onek2_index_full on onek2 (stringu1, unique2);
+explain (costs off)
+select unique2 from onek2
+  where stringu1 < 'B'::name;
+                  QUERY PLAN                  
+----------------------------------------------
+ Index Only Scan using onek2_u2_prtl on onek2
+(1 row)
+
+rollback;
  --
  -- Test some corner cases that have been known to confuse the planner
  --
diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql

index a81c5fd011fb8050ee8ac360b1528ecbed675425..30b479dda7c040dc39589012ffb6bb2f8f1b93d1 100644 (file)
--- a/src/test/regress/sql/join.sql
+++ b/src/test/regress/sql/join.sql
@@ -3723,14 +3723,16 @@ set enable_nestloop to 0;
  set enable_hashjoin to 0;
  set enable_sort to 0;
  
+-- we need additional data to get the partial indexes to be preferred
+insert into j1 select 2, i from generate_series(1, 100) i;
+insert into j2 select 1, i from generate_series(2, 100) i;
+analyze j1;
+analyze j2;
+
  -- create indexes that will be preferred over the PKs to perform the join
  create index j1_id1_idx on j1 (id1) where id1 % 1000 = 1;
  create index j2_id1_idx on j2 (id1) where id1 % 1000 = 1;
  
--- need an additional row in j2, if we want j2_id1_idx to be preferred
-insert into j2 values(1,2);
-analyze j2;
-
  explain (costs off) select * from j1
  inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2
  where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1;
diff --git a/src/test/regress/sql/memoize.sql b/src/test/regress/sql/memoize.sql

index 8d1cdd6990c871b67e6c45dec87805d6bd458fea..e39bbb653919f1222ae3f0014adc49c164b23449 100644 (file)
--- a/src/test/regress/sql/memoize.sql
+++ b/src/test/regress/sql/memoize.sql
@@ -139,6 +139,7 @@ INSERT INTO flt VALUES('-0.0'::float),('+0.0'::float);
  ANALYZE flt;
  
  SET enable_seqscan TO off;
+SET enable_material TO off;
  
  -- Ensure memoize operates in logical mode
  SELECT explain_memoize('
@@ -218,6 +219,7 @@ WHERE unique1 < 3
         WHERE t0.ten = t1.twenty AND t0.two <> t2.four OFFSET 0);
  
  RESET enable_seqscan;
+RESET enable_material;
  RESET enable_mergejoin;
  RESET work_mem;
  RESET hash_mem_multiplier;
diff --git a/src/test/regress/sql/select.sql b/src/test/regress/sql/select.sql

index 1d1bf2b9310aa8374186d7b62a4189270675b586..2dfe88d2054b0406c453df672189ef14029f0012 100644 (file)
--- a/src/test/regress/sql/select.sql
+++ b/src/test/regress/sql/select.sql
@@ -221,7 +221,6 @@ SET enable_indexscan TO off;
  explain (costs off)
  select unique2 from onek2 where unique2 = 11 and stringu1 < 'B';
  select unique2 from onek2 where unique2 = 11 and stringu1 < 'B';
-RESET enable_indexscan;
  -- check multi-index cases too
  explain (costs off)
  select unique1, unique2 from onek2
@@ -233,6 +232,16 @@ select unique1, unique2 from onek2
    where (unique2 = 11 and stringu1 < 'B') or unique1 = 0;
  select unique1, unique2 from onek2
    where (unique2 = 11 and stringu1 < 'B') or unique1 = 0;
+RESET enable_indexscan;
+
+-- onek2_u2_prtl should be preferred over this index, but we have to
+-- discount the metapage to arrive at that answer
+begin;
+create index onek2_index_full on onek2 (stringu1, unique2);
+explain (costs off)
+select unique2 from onek2
+  where stringu1 < 'B'::name;
+rollback;
  
  --
  -- Test some corner cases that have been known to confuse the planner
author	Tom Lane <tgl@sss.pgh.pa.us>
	Fri, 20 Mar 2026 18:50:53 +0000 (14:50 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Fri, 20 Mar 2026 18:50:53 +0000 (14:50 -0400)
contrib/bloom/blcost.c		patch | blob | blame | history
src/backend/utils/adt/selfuncs.c		patch | blob | blame | history
src/include/utils/selfuncs.h		patch | blob | blame | history
src/test/regress/expected/join.out		patch | blob | blame | history
src/test/regress/expected/memoize.out		patch | blob | blame | history
src/test/regress/expected/select.out		patch | blob | blame | history
src/test/regress/sql/join.sql		patch | blob | blame | history
src/test/regress/sql/memoize.sql		patch | blob | blame | history
src/test/regress/sql/select.sql		patch | blob | blame | history