]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Fix duplicate arbiter detection during REINDEX CONCURRENTLY on partitions
authorÁlvaro Herrera <alvherre@kurilemu.de>
Wed, 28 Jan 2026 13:38:53 +0000 (14:38 +0100)
committerÁlvaro Herrera <alvherre@kurilemu.de>
Wed, 28 Jan 2026 13:38:53 +0000 (14:38 +0100)
Commit 90eae926a fixed ON CONFLICT handling during REINDEX CONCURRENTLY
on partitioned tables by treating unparented indexes as potential
arbiters.  However, there's a remaining race condition: when pg_inherits
records are swapped between consecutive calls to get_partition_ancestors(),
two different child indexes can appear to have the same parent, causing
duplicate entries in the arbiter list and triggering "invalid arbiter
index list" errors.

Note that this is not a new problem introduced by 90eae926a.  The same
error could occur before that commit in a slightly different scenario:
an index is selected during planning, then index_concurrently_swap()
commits, and a subsequent call to get_partition_ancestors() uses a new
catalog snapshot that sees zero ancestors for that index.

Fix by tracking which parent indexes have already been processed.  If a
subsequent call to get_partition_ancestors() returns a parent we've
already seen, treat that index as unparented instead, allowing it to be
matched via IsIndexCompatibleAsArbiter() like other concurrent reindex
scenarios.

Author: Mihail Nikalayeu <mihailnikalayeu@gmail.com>
Reported-by: Alexander Lakhin <exclusion@gmail.com>
Reviewed-by: Álvaro Herrera <alvherre@kurilemu.de>
Discussion: https://postgr.es/m/e5a8c1df-04e5-4343-85ef-5df2a7e3d90c@gmail.com

src/backend/executor/execPartition.c
src/test/modules/test_misc/t/010_index_concurrently_upsert.pl

index 55bdf5c483532db2f024746ac9d4ca5ba7bd5703..d13e786cf134388a6a965977a0974b747d819253 100644 (file)
@@ -28,6 +28,7 @@
 #include "partitioning/partprune.h"
 #include "rewrite/rewriteManip.h"
 #include "utils/acl.h"
+#include "utils/injection_point.h"
 #include "utils/lsyscache.h"
 #include "utils/partcache.h"
 #include "utils/rls.h"
@@ -761,7 +762,8 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate,
                if (rootResultRelInfo->ri_onConflictArbiterIndexes != NIL)
                {
                        List       *unparented_idxs = NIL,
-                                          *arbiters_listidxs = NIL;
+                                          *arbiters_listidxs = NIL,
+                                          *ancestors_seen = NIL;
 
                        for (int listidx = 0; listidx < leaf_part_rri->ri_NumIndices; listidx++)
                        {
@@ -775,17 +777,32 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate,
                                 * in case REINDEX CONCURRENTLY is working on one of the
                                 * arbiters.
                                 *
-                                * XXX get_partition_ancestors is slow: it scans pg_inherits
-                                * each time.  Consider a syscache or some other way to cache?
+                                * However, if two indexes appear to have the same parent,
+                                * treat the second of these as if it had no parent.  This
+                                * sounds counterintuitive, but it can happen if a transaction
+                                * running REINDEX CONCURRENTLY commits right between those
+                                * two indexes are checked by another process in this loop.
+                                * This will have the effect of also treating that second
+                                * index as arbiter.
+                                *
+                                * XXX get_partition_ancestors scans pg_inherits, which is not
+                                * only slow, but also means the catalog snapshot can get
+                                * invalidated each time through the loop (cf.
+                                * GetNonHistoricCatalogSnapshot).  Consider a syscache or
+                                * some other way to cache?
                                 */
                                indexoid = RelationGetRelid(leaf_part_rri->ri_IndexRelationDescs[listidx]);
                                ancestors = get_partition_ancestors(indexoid);
-                               if (ancestors != NIL)
+                               INJECTION_POINT("exec-init-partition-after-get-partition-ancestors", NULL);
+
+                               if (ancestors != NIL &&
+                                       !list_member_oid(ancestors_seen, linitial_oid(ancestors)))
                                {
                                        foreach_oid(parent_idx, rootResultRelInfo->ri_onConflictArbiterIndexes)
                                        {
                                                if (list_member_oid(ancestors, parent_idx))
                                                {
+                                                       ancestors_seen = lappend_oid(ancestors_seen, linitial_oid(ancestors));
                                                        arbiterIndexes = lappend_oid(arbiterIndexes, indexoid);
                                                        arbiters_listidxs = lappend_int(arbiters_listidxs, listidx);
                                                        break;
@@ -794,6 +811,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate,
                                }
                                else
                                        unparented_idxs = lappend_int(unparented_idxs, listidx);
+
                                list_free(ancestors);
                        }
 
@@ -812,16 +830,16 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate,
                                foreach_int(unparented_i, unparented_idxs)
                                {
                                        Relation        unparented_rel;
-                                       IndexInfo  *unparenred_ii;
+                                       IndexInfo  *unparented_ii;
 
                                        unparented_rel = leaf_part_rri->ri_IndexRelationDescs[unparented_i];
-                                       unparenred_ii = leaf_part_rri->ri_IndexRelationInfo[unparented_i];
+                                       unparented_ii = leaf_part_rri->ri_IndexRelationInfo[unparented_i];
 
                                        Assert(!list_member_oid(arbiterIndexes,
                                                                                        unparented_rel->rd_index->indexrelid));
 
                                        /* Ignore indexes not ready */
-                                       if (!unparenred_ii->ii_ReadyForInserts)
+                                       if (!unparented_ii->ii_ReadyForInserts)
                                                continue;
 
                                        foreach_int(arbiter_i, arbiters_listidxs)
@@ -839,7 +857,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate,
                                                if (IsIndexCompatibleAsArbiter(arbiter_rel,
                                                                                                           arbiter_ii,
                                                                                                           unparented_rel,
-                                                                                                          unparenred_ii))
+                                                                                                          unparented_ii))
                                                {
                                                        arbiterIndexes = lappend_oid(arbiterIndexes,
                                                                                                                 unparented_rel->rd_index->indexrelid);
@@ -851,6 +869,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate,
                        }
                        list_free(unparented_idxs);
                        list_free(arbiters_listidxs);
+                       list_free(ancestors_seen);
                }
 
                /*
index 26682ebc55a6f391c7c53447c2597f959991daa5..3bdb632887e55df392c54dc97d6a74c1fda28c84 100644 (file)
@@ -602,6 +602,51 @@ clean_safe_quit_ok($s1, $s2, $s3);
 
 $node->safe_psql('postgres', 'TRUNCATE TABLE test.tblparted');
 
+############################################################################
+note('Test: REINDEX on partitioned table, cache inval between two get_partition_ancestors');
+
+$s1 = $node->background_psql('postgres', on_error_stop => 0);
+$s2 = $node->background_psql('postgres', on_error_stop => 0);
+$s3 = $node->background_psql('postgres', on_error_stop => 0);
+
+$s1->query_safe(
+       q[
+SELECT injection_points_set_local();
+SELECT injection_points_attach('exec-init-partition-after-get-partition-ancestors', 'wait');
+]);
+
+$s2->query_safe(
+       q[
+SELECT injection_points_set_local();
+SELECT injection_points_attach('reindex-relation-concurrently-before-swap', 'wait');
+]);
+
+$s2->query_until(
+       qr/starting_reindex/, q[
+\echo starting_reindex
+REINDEX INDEX CONCURRENTLY test.tbl_partition_pkey;
+]);
+
+ok_injection_point($node, 'reindex-relation-concurrently-before-swap');
+
+$s1->query_until(
+       qr/starting_upsert_s1/, q[
+\echo starting_upsert_s1
+INSERT INTO test.tblparted VALUES (13, now()) ON CONFLICT (i) DO UPDATE SET updated_at = now();
+]);
+
+ok_injection_point($node,
+       'exec-init-partition-after-get-partition-ancestors');
+
+wakeup_injection_point($node, 'reindex-relation-concurrently-before-swap');
+
+wakeup_injection_point($node,
+       'exec-init-partition-after-get-partition-ancestors');
+
+clean_safe_quit_ok($s1, $s2, $s3);
+
+$node->safe_psql('postgres', 'TRUNCATE TABLE test.tblparted');
+
 ############################################################################
 note('Test: CREATE INDEX CONCURRENTLY + UPSERT');
 # Uses invalidate-catalog-snapshot-end to test catalog invalidation