Fix hash partition pruning with asymmetric partition sets.

author Tom Lane <tgl@sss.pgh.pa.us>
Thu, 28 Jan 2021 18:41:55 +0000 (13:41 -0500)
committer Tom Lane <tgl@sss.pgh.pa.us>
Thu, 28 Jan 2021 18:41:55 +0000 (13:41 -0500)

diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c

index e326888862cd249939f0e4bc813aba2a9affa6d8..93e2cdd0207c1ed10c1898d73029d7cf167d0af8 100644 (file)
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -1323,16 +1323,14 @@ get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull)
         {
                 case PARTITION_STRATEGY_HASH:
                         {
-                               int                     greatest_modulus;
                                 uint64          rowHash;
  
-                               greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
                                 rowHash = compute_partition_hash_value(key->partnatts,
                                                                                                            key->partsupfunc,
                                                                                                            key->partcollation,
                                                                                                            values, isnull);
  
-                               part_index = boundinfo->indexes[rowHash % greatest_modulus];
+                               part_index = boundinfo->indexes[rowHash % boundinfo->nindexes];
                         }
                         break;
  
diff --git a/src/backend/partitioning/partbounds.c b/src/backend/partitioning/partbounds.c

index cebd1c1dca571822a2c7a2251faccaa95cc8a156..895943a2e1571aaa6a5554582ee8b871232ad711 100644 (file)
--- a/src/backend/partitioning/partbounds.c
+++ b/src/backend/partitioning/partbounds.c
@@ -92,7 +92,6 @@ static int    partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc,
                                                                         Oid *partcollation,
                                                                         PartitionBoundInfo boundinfo,
                                                                         PartitionRangeBound *probe, bool *is_equal);
-static int     get_partition_bound_num_indexes(PartitionBoundInfo b);
  static Expr *make_partition_op_expr(PartitionKey key, int keynum,
                                                                         uint16 strategy, Expr *arg1, Expr *arg2);
  static Oid     get_partition_operator(PartitionKey key, int col,
@@ -266,6 +265,7 @@ create_hash_bounds(PartitionBoundSpec **boundspecs, int nparts,
  
         boundinfo->ndatums = ndatums;
         boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *));
+       boundinfo->nindexes = greatest_modulus;
         boundinfo->indexes = (int *) palloc(greatest_modulus * sizeof(int));
         for (i = 0; i < greatest_modulus; i++)
                 boundinfo->indexes[i] = -1;
@@ -398,6 +398,7 @@ create_list_bounds(PartitionBoundSpec **boundspecs, int nparts,
  
         boundinfo->ndatums = ndatums;
         boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *));
+       boundinfo->nindexes = ndatums;
         boundinfo->indexes = (int *) palloc(ndatums * sizeof(int));
  
         /*
@@ -593,8 +594,9 @@ create_range_bounds(PartitionBoundSpec **boundspecs, int nparts,
  
         /*
          * For range partitioning, an additional value of -1 is stored as the last
-        * element.
+        * element of the indexes[] array.
          */
+       boundinfo->nindexes = ndatums + 1;
         boundinfo->indexes = (int *) palloc((ndatums + 1) * sizeof(int));
  
         for (i = 0; i < ndatums; i++)
@@ -675,45 +677,41 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval,
         if (b1->ndatums != b2->ndatums)
                 return false;
  
+       if (b1->nindexes != b2->nindexes)
+               return false;
+
         if (b1->null_index != b2->null_index)
                 return false;
  
         if (b1->default_index != b2->default_index)
                 return false;
  
-       if (b1->strategy == PARTITION_STRATEGY_HASH)
+       /* For all partition strategies, the indexes[] arrays have to match */
+       for (i = 0; i < b1->nindexes; i++)
         {
-               int                     greatest_modulus = get_hash_partition_greatest_modulus(b1);
-
-               /*
-                * If two hash partitioned tables have different greatest moduli,
-                * their partition schemes don't match.
-                */
-               if (greatest_modulus != get_hash_partition_greatest_modulus(b2))
+               if (b1->indexes[i] != b2->indexes[i])
                         return false;
+       }
  
+       /* Finally, compare the datums[] arrays */
+       if (b1->strategy == PARTITION_STRATEGY_HASH)
+       {
                 /*
                  * We arrange the partitions in the ascending order of their moduli
                  * and remainders.  Also every modulus is factor of next larger
                  * modulus.  Therefore we can safely store index of a given partition
                  * in indexes array at remainder of that partition.  Also entries at
                  * (remainder + N * modulus) positions in indexes array are all same
-                * for (modulus, remainder) specification for any partition.  Thus
-                * datums array from both the given bounds are same, if and only if
-                * their indexes array will be same.  So, it suffices to compare
-                * indexes array.
-                */
-               for (i = 0; i < greatest_modulus; i++)
-                       if (b1->indexes[i] != b2->indexes[i])
-                               return false;
-
-#ifdef USE_ASSERT_CHECKING
-
-               /*
-                * Nonetheless make sure that the bounds are indeed same when the
+                * for (modulus, remainder) specification for any partition.  Thus the
+                * datums arrays from the given bounds are the same, if and only if
+                * their indexes arrays are the same.  So, it suffices to compare the
+                * indexes arrays.
+                *
+                * Nonetheless make sure that the bounds are indeed the same when the
                  * indexes match.  Hash partition bound stores modulus and remainder
                  * at b1->datums[i][0] and b1->datums[i][1] position respectively.
                  */
+#ifdef USE_ASSERT_CHECKING
                 for (i = 0; i < b1->ndatums; i++)
                         Assert((b1->datums[i][0] == b2->datums[i][0] &&
                                         b1->datums[i][1] == b2->datums[i][1]));
@@ -759,15 +757,7 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval,
                                                                   parttypbyval[j], parttyplen[j]))
                                         return false;
                         }
-
-                       if (b1->indexes[i] != b2->indexes[i])
-                               return false;
                 }
-
-               /* There are ndatums+1 indexes in case of range partitions */
-               if (b1->strategy == PARTITION_STRATEGY_RANGE &&
-                       b1->indexes[i] != b2->indexes[i])
-                       return false;
         }
         return true;
  }
@@ -783,17 +773,16 @@ partition_bounds_copy(PartitionBoundInfo src,
         PartitionBoundInfo dest;
         int                     i;
         int                     ndatums;
+       int                     nindexes;
         int                     partnatts;
-       int                     num_indexes;
  
         dest = (PartitionBoundInfo) palloc(sizeof(PartitionBoundInfoData));
  
         dest->strategy = src->strategy;
         ndatums = dest->ndatums = src->ndatums;
+       nindexes = dest->nindexes = src->nindexes;
         partnatts = key->partnatts;
  
-       num_indexes = get_partition_bound_num_indexes(src);
-
         /* List partitioned tables have only a single partition key. */
         Assert(key->strategy != PARTITION_STRATEGY_LIST || partnatts == 1);
  
@@ -851,8 +840,8 @@ partition_bounds_copy(PartitionBoundInfo src,
                 }
         }
  
-       dest->indexes = (int *) palloc(sizeof(int) * num_indexes);
-       memcpy(dest->indexes, src->indexes, sizeof(int) * num_indexes);
+       dest->indexes = (int *) palloc(sizeof(int) * nindexes);
+       memcpy(dest->indexes, src->indexes, sizeof(int) * nindexes);
  
         dest->null_index = src->null_index;
         dest->default_index = src->default_index;
@@ -1016,7 +1005,7 @@ check_new_partition_bound(char *relname, Relation parent,
                                                                 (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
                                                                  errmsg("every hash partition modulus must be a factor of the next larger modulus")));
  
-                                       greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
+                                       greatest_modulus = boundinfo->nindexes;
                                         remainder = spec->remainder;
  
                                         /*
@@ -1380,18 +1369,15 @@ check_default_partition_contents(Relation parent, Relation default_rel,
  /*
   * get_hash_partition_greatest_modulus
   *
- * Returns the greatest modulus of the hash partition bound. The greatest
- * modulus will be at the end of the datums array because hash partitions are
- * arranged in the ascending order of their moduli and remainders.
+ * Returns the greatest modulus of the hash partition bound.
+ * This is no longer used in the core code, but we keep it around
+ * in case external modules are using it.
   */
  int
  get_hash_partition_greatest_modulus(PartitionBoundInfo bound)
  {
         Assert(bound && bound->strategy == PARTITION_STRATEGY_HASH);
-       Assert(bound->datums && bound->ndatums > 0);
-       Assert(DatumGetInt32(bound->datums[bound->ndatums - 1][0]) > 0);
-
-       return DatumGetInt32(bound->datums[bound->ndatums - 1][0]);
+       return bound->nindexes;
  }
  
  /*
@@ -1788,46 +1774,6 @@ qsort_partition_rbound_cmp(const void *a, const void *b, void *arg)
                                                                 b1->lower, b2);
  }
  
-/*
- * get_partition_bound_num_indexes
- *
- * Returns the number of the entries in the partition bound indexes array.
- */
-static int
-get_partition_bound_num_indexes(PartitionBoundInfo bound)
-{
-       int                     num_indexes;
-
-       Assert(bound);
-
-       switch (bound->strategy)
-       {
-               case PARTITION_STRATEGY_HASH:
-
-                       /*
-                        * The number of the entries in the indexes array is same as the
-                        * greatest modulus.
-                        */
-                       num_indexes = get_hash_partition_greatest_modulus(bound);
-                       break;
-
-               case PARTITION_STRATEGY_LIST:
-                       num_indexes = bound->ndatums;
-                       break;
-
-               case PARTITION_STRATEGY_RANGE:
-                       /* Range partitioned table has an extra index. */
-                       num_indexes = bound->ndatums + 1;
-                       break;
-
-               default:
-                       elog(ERROR, "unexpected partition strategy: %d",
-                                (int) bound->strategy);
-       }
-
-       return num_indexes;
-}
-
  /*
   * get_partition_operator
   *
diff --git a/src/backend/partitioning/partprune.c b/src/backend/partitioning/partprune.c

index 040fc9b5be707a14575f8e19b44fc7daffa580d0..0f891b3d7453caf5393074cfc4023a20dcf56ea3 100644 (file)
--- a/src/backend/partitioning/partprune.c
+++ b/src/backend/partitioning/partprune.c
@@ -789,7 +789,10 @@ get_matching_partitions(PartitionPruneContext *context, List *pruning_steps)
         scan_default = final_result->scan_default;
         while ((i = bms_next_member(final_result->bound_offsets, i)) >= 0)
         {
-               int                     partindex = context->boundinfo->indexes[i];
+               int                     partindex;
+
+               Assert(i < context->boundinfo->nindexes);
+               partindex = context->boundinfo->indexes[i];
  
                 if (partindex < 0)
                 {
@@ -2518,20 +2521,19 @@ get_matching_hash_bounds(PartitionPruneContext *context,
                 for (i = 0; i < partnatts; i++)
                         isnull[i] = bms_is_member(i, nullkeys);
  
-               greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
                 rowHash = compute_partition_hash_value(partnatts, partsupfunc, partcollation,
                                                                                            values, isnull);
  
+               greatest_modulus = boundinfo->nindexes;
                 if (partindices[rowHash % greatest_modulus] >= 0)
                         result->bound_offsets =
                                 bms_make_singleton(rowHash % greatest_modulus);
         }
         else
         {
-               /* Getting here means at least one hash partition exists. */
-               Assert(boundinfo->ndatums > 0);
+               /* Report all valid offsets into the boundinfo->indexes array. */
                 result->bound_offsets = bms_add_range(NULL, 0,
-                                                                                         boundinfo->ndatums - 1);
+                                                                                         boundinfo->nindexes - 1);
         }
  
         /*
@@ -3392,30 +3394,20 @@ perform_pruning_combine_step(PartitionPruneContext *context,
                                                          PartitionPruneStepCombine *cstep,
                                                          PruneStepResult **step_results)
  {
-       ListCell   *lc1;
-       PruneStepResult *result = NULL;
+       PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult));
         bool            firststep;
+       ListCell   *lc1;
  
         /*
          * A combine step without any source steps is an indication to not perform
          * any partition pruning.  Return all datum indexes in that case.
          */
-       result = (PruneStepResult *) palloc0(sizeof(PruneStepResult));
-       if (list_length(cstep->source_stepids) == 0)
+       if (cstep->source_stepids == NIL)
         {
                 PartitionBoundInfo boundinfo = context->boundinfo;
-               int                     rangemax;
-
-               /*
-                * Add all valid offsets into the boundinfo->indexes array.  For range
-                * partitioning, boundinfo->indexes contains (boundinfo->ndatums + 1)
-                * valid entries; otherwise there are boundinfo->ndatums.
-                */
-               rangemax = context->strategy == PARTITION_STRATEGY_RANGE ?
-                       boundinfo->ndatums : boundinfo->ndatums - 1;
  
                 result->bound_offsets =
-                       bms_add_range(result->bound_offsets, 0, rangemax);
+                       bms_add_range(NULL, 0, boundinfo->nindexes - 1);
                 result->scan_default = partition_bound_has_default(boundinfo);
                 result->scan_null = partition_bound_accepts_nulls(boundinfo);
                 return result;
diff --git a/src/include/partitioning/partbounds.h b/src/include/partitioning/partbounds.h

index 0d0fd42b181584f2066182b24c4ed1f0249b17a7..fd06eb9009df4f4805384bfdfac8f418411cc9f8 100644 (file)
--- a/src/include/partitioning/partbounds.h
+++ b/src/include/partitioning/partbounds.h
@@ -30,7 +30,7 @@
   * In the case of range partitioning, ndatums will typically be far less than
   * 2 * nparts, because a partition's upper bound and the next partition's lower
   * bound are the same in most common cases, and we only store one of them (the
- * upper bound).  In case of hash partitioning, ndatums will be same as the
+ * upper bound).  In case of hash partitioning, ndatums will be the same as the
   * number of partitions.
   *
   * For range and list partitioned tables, datums is an array of datum-tuples
@@ -46,20 +46,26 @@
   * the partition key's operator classes and collations.
   *
   * In the case of list partitioning, the indexes array stores one entry for
- * every datum, which is the index of the partition that accepts a given datum.
- * In case of range partitioning, it stores one entry per distinct range
- * datum, which is the index of the partition for which a given datum
- * is an upper bound.  In the case of hash partitioning, the number of the
- * entries in the indexes array is same as the greatest modulus amongst all
- * partitions.  For a given partition key datum-tuple, the index of the
- * partition which would accept that datum-tuple would be given by the entry
- * pointed by remainder produced when hash value of the datum-tuple is divided
- * by the greatest modulus.
+ * each datum-array entry, which is the index of the partition that accepts
+ * rows matching that datum.  So nindexes == ndatums.
+ *
+ * In the case of range partitioning, the indexes array stores one entry per
+ * distinct range datum, which is the index of the partition for which that
+ * datum is an upper bound (or -1 for a "gap" that has no partition).  It is
+ * convenient to have an extra -1 entry representing values above the last
+ * range datum, so nindexes == ndatums + 1.
+ *
+ * In the case of hash partitioning, the number of entries in the indexes
+ * array is the same as the greatest modulus amongst all partitions (which
+ * is a multiple of all partition moduli), so nindexes == greatest modulus.
+ * The indexes array is indexed according to the hash key's remainder modulo
+ * the greatest modulus, and it contains either the partition index accepting
+ * that remainder, or -1 if there is no partition for that remainder.
   */
  typedef struct PartitionBoundInfoData
  {
         char            strategy;               /* hash, list or range? */
-       int                     ndatums;                /* Length of the datums following array */
+       int                     ndatums;                /* Length of the datums[] array */
         Datum     **datums;
         PartitionRangeDatumKind **kind; /* The kind of each range bound datum;
                                                                          * NULL for hash and list partitioned
@@ -69,6 +75,7 @@ typedef struct PartitionBoundInfoData
                                                                  * if there isn't one */
         int                     default_index;  /* Index of the default partition; -1 if there
                                                                  * isn't one */
+       int                     nindexes;               /* Length of the indexes[] array */
  } PartitionBoundInfoData;
  
  #define partition_bound_accepts_nulls(bi) ((bi)->null_index != -1)
diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out

index e9349d8cbb53fba7e88621f547098ed5f5202e3e..c1871fd8d80fcfa5c244824d7469b6deb68b6eae 100644 (file)
--- a/src/test/regress/expected/partition_prune.out
+++ b/src/test/regress/expected/partition_prune.out
@@ -1497,26 +1497,27 @@ drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, boolrangep, rp, coll_pru
  -- result on different machines.  See the definitions of
  -- part_part_test_int4_ops and part_test_text_ops in insert.sql.
  --
-create table hp (a int, b text) partition by hash (a part_test_int4_ops, b part_test_text_ops);
+create table hp (a int, b text, c int)
+  partition by hash (a part_test_int4_ops, b part_test_text_ops);
  create table hp0 partition of hp for values with (modulus 4, remainder 0);
  create table hp3 partition of hp for values with (modulus 4, remainder 3);
  create table hp1 partition of hp for values with (modulus 4, remainder 1);
  create table hp2 partition of hp for values with (modulus 4, remainder 2);
-insert into hp values (null, null);
-insert into hp values (1, null);
-insert into hp values (1, 'xxx');
-insert into hp values (null, 'xxx');
-insert into hp values (2, 'xxx');
-insert into hp values (1, 'abcde');
-select tableoid::regclass, * from hp order by 1;
- tableoid | a |   b   
-----------+---+-------
- hp0      |   | 
- hp0      | 1 | xxx
- hp3      | 2 | xxx
- hp1      | 1 | 
- hp2      |   | xxx
- hp2      | 1 | abcde
+insert into hp values (null, null, 0);
+insert into hp values (1, null, 1);
+insert into hp values (1, 'xxx', 2);
+insert into hp values (null, 'xxx', 3);
+insert into hp values (2, 'xxx', 4);
+insert into hp values (1, 'abcde', 5);
+select tableoid::regclass, * from hp order by c;
+ tableoid | a |   b   | c 
+----------+---+-------+---
+ hp0      |   |       | 0
+ hp1      | 1 |       | 1
+ hp0      | 1 | xxx   | 2
+ hp2      |   | xxx   | 3
+ hp3      | 2 | xxx   | 4
+ hp2      | 1 | abcde | 5
  (6 rows)
  
  -- partial keys won't prune, nor would non-equality conditions
@@ -1674,6 +1675,33 @@ explain (costs off) select * from hp where (a = 1 and b = 'abcde') or (a = 2 and
           Filter: (((a = 1) AND (b = 'abcde'::text)) OR ((a = 2) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL)))
  (7 rows)
  
+-- test pruning when not all the partitions exist
+drop table hp1;
+drop table hp3;
+explain (costs off) select * from hp where a = 1 and b = 'abcde';
+                 QUERY PLAN                  
+---------------------------------------------
+ Seq Scan on hp2
+   Filter: ((a = 1) AND (b = 'abcde'::text))
+(2 rows)
+
+explain (costs off) select * from hp where a = 1 and b = 'abcde' and
+  (c = 2 or c = 3);
+                              QUERY PLAN                              
+----------------------------------------------------------------------
+ Seq Scan on hp2
+   Filter: ((a = 1) AND (b = 'abcde'::text) AND ((c = 2) OR (c = 3)))
+(2 rows)
+
+drop table hp2;
+explain (costs off) select * from hp where a = 1 and b = 'abcde' and
+  (c = 2 or c = 3);
+        QUERY PLAN        
+--------------------------
+ Result
+   One-Time Filter: false
+(2 rows)
+
  drop table hp;
  --
  -- Test runtime partition pruning
diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql

index 9177b49fc98be5c6dd1f7368554b274e2cd9eaeb..14db0f52d3c13eacc8824f946745e4fff37126b9 100644 (file)
--- a/src/test/regress/sql/partition_prune.sql
+++ b/src/test/regress/sql/partition_prune.sql
@@ -291,19 +291,20 @@ drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, boolrangep, rp, coll_pru
  -- part_part_test_int4_ops and part_test_text_ops in insert.sql.
  --
  
-create table hp (a int, b text) partition by hash (a part_test_int4_ops, b part_test_text_ops);
+create table hp (a int, b text, c int)
+  partition by hash (a part_test_int4_ops, b part_test_text_ops);
  create table hp0 partition of hp for values with (modulus 4, remainder 0);
  create table hp3 partition of hp for values with (modulus 4, remainder 3);
  create table hp1 partition of hp for values with (modulus 4, remainder 1);
  create table hp2 partition of hp for values with (modulus 4, remainder 2);
  
-insert into hp values (null, null);
-insert into hp values (1, null);
-insert into hp values (1, 'xxx');
-insert into hp values (null, 'xxx');
-insert into hp values (2, 'xxx');
-insert into hp values (1, 'abcde');
-select tableoid::regclass, * from hp order by 1;
+insert into hp values (null, null, 0);
+insert into hp values (1, null, 1);
+insert into hp values (1, 'xxx', 2);
+insert into hp values (null, 'xxx', 3);
+insert into hp values (2, 'xxx', 4);
+insert into hp values (1, 'abcde', 5);
+select tableoid::regclass, * from hp order by c;
  
  -- partial keys won't prune, nor would non-equality conditions
  explain (costs off) select * from hp where a = 1;
@@ -324,6 +325,16 @@ explain (costs off) select * from hp where a = 2 and b = 'xxx';
  explain (costs off) select * from hp where a = 1 and b = 'abcde';
  explain (costs off) select * from hp where (a = 1 and b = 'abcde') or (a = 2 and b = 'xxx') or (a is null and b is null);
  
+-- test pruning when not all the partitions exist
+drop table hp1;
+drop table hp3;
+explain (costs off) select * from hp where a = 1 and b = 'abcde';
+explain (costs off) select * from hp where a = 1 and b = 'abcde' and
+  (c = 2 or c = 3);
+drop table hp2;
+explain (costs off) select * from hp where a = 1 and b = 'abcde' and
+  (c = 2 or c = 3);
+
  drop table hp;
  
  --
author	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 28 Jan 2021 18:41:55 +0000 (13:41 -0500)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 28 Jan 2021 18:41:55 +0000 (13:41 -0500)
Files changed:
src/backend/executor/execPartition.c
src/backend/partitioning/partbounds.c
src/backend/partitioning/partprune.c
src/include/partitioning/partbounds.h
src/test/regress/expected/partition_prune.out
src/test/regress/sql/partition_prune.sql