From: Álvaro Herrera Date: Wed, 28 Jan 2026 13:38:53 +0000 (+0100) Subject: Fix duplicate arbiter detection during REINDEX CONCURRENTLY on partitions X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=e6d6e32f4240bb967460aabdd2db51181cff242f;p=thirdparty%2Fpostgresql.git Fix duplicate arbiter detection during REINDEX CONCURRENTLY on partitions Commit 90eae926a fixed ON CONFLICT handling during REINDEX CONCURRENTLY on partitioned tables by treating unparented indexes as potential arbiters. However, there's a remaining race condition: when pg_inherits records are swapped between consecutive calls to get_partition_ancestors(), two different child indexes can appear to have the same parent, causing duplicate entries in the arbiter list and triggering "invalid arbiter index list" errors. Note that this is not a new problem introduced by 90eae926a. The same error could occur before that commit in a slightly different scenario: an index is selected during planning, then index_concurrently_swap() commits, and a subsequent call to get_partition_ancestors() uses a new catalog snapshot that sees zero ancestors for that index. Fix by tracking which parent indexes have already been processed. If a subsequent call to get_partition_ancestors() returns a parent we've already seen, treat that index as unparented instead, allowing it to be matched via IsIndexCompatibleAsArbiter() like other concurrent reindex scenarios. Author: Mihail Nikalayeu Reported-by: Alexander Lakhin Reviewed-by: Álvaro Herrera Discussion: https://postgr.es/m/e5a8c1df-04e5-4343-85ef-5df2a7e3d90c@gmail.com --- diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index 55bdf5c4835..d13e786cf13 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -28,6 +28,7 @@ #include "partitioning/partprune.h" #include "rewrite/rewriteManip.h" #include "utils/acl.h" +#include "utils/injection_point.h" #include "utils/lsyscache.h" #include "utils/partcache.h" #include "utils/rls.h" @@ -761,7 +762,8 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate, if (rootResultRelInfo->ri_onConflictArbiterIndexes != NIL) { List *unparented_idxs = NIL, - *arbiters_listidxs = NIL; + *arbiters_listidxs = NIL, + *ancestors_seen = NIL; for (int listidx = 0; listidx < leaf_part_rri->ri_NumIndices; listidx++) { @@ -775,17 +777,32 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate, * in case REINDEX CONCURRENTLY is working on one of the * arbiters. * - * XXX get_partition_ancestors is slow: it scans pg_inherits - * each time. Consider a syscache or some other way to cache? + * However, if two indexes appear to have the same parent, + * treat the second of these as if it had no parent. This + * sounds counterintuitive, but it can happen if a transaction + * running REINDEX CONCURRENTLY commits right between those + * two indexes are checked by another process in this loop. + * This will have the effect of also treating that second + * index as arbiter. + * + * XXX get_partition_ancestors scans pg_inherits, which is not + * only slow, but also means the catalog snapshot can get + * invalidated each time through the loop (cf. + * GetNonHistoricCatalogSnapshot). Consider a syscache or + * some other way to cache? */ indexoid = RelationGetRelid(leaf_part_rri->ri_IndexRelationDescs[listidx]); ancestors = get_partition_ancestors(indexoid); - if (ancestors != NIL) + INJECTION_POINT("exec-init-partition-after-get-partition-ancestors", NULL); + + if (ancestors != NIL && + !list_member_oid(ancestors_seen, linitial_oid(ancestors))) { foreach_oid(parent_idx, rootResultRelInfo->ri_onConflictArbiterIndexes) { if (list_member_oid(ancestors, parent_idx)) { + ancestors_seen = lappend_oid(ancestors_seen, linitial_oid(ancestors)); arbiterIndexes = lappend_oid(arbiterIndexes, indexoid); arbiters_listidxs = lappend_int(arbiters_listidxs, listidx); break; @@ -794,6 +811,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate, } else unparented_idxs = lappend_int(unparented_idxs, listidx); + list_free(ancestors); } @@ -812,16 +830,16 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate, foreach_int(unparented_i, unparented_idxs) { Relation unparented_rel; - IndexInfo *unparenred_ii; + IndexInfo *unparented_ii; unparented_rel = leaf_part_rri->ri_IndexRelationDescs[unparented_i]; - unparenred_ii = leaf_part_rri->ri_IndexRelationInfo[unparented_i]; + unparented_ii = leaf_part_rri->ri_IndexRelationInfo[unparented_i]; Assert(!list_member_oid(arbiterIndexes, unparented_rel->rd_index->indexrelid)); /* Ignore indexes not ready */ - if (!unparenred_ii->ii_ReadyForInserts) + if (!unparented_ii->ii_ReadyForInserts) continue; foreach_int(arbiter_i, arbiters_listidxs) @@ -839,7 +857,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate, if (IsIndexCompatibleAsArbiter(arbiter_rel, arbiter_ii, unparented_rel, - unparenred_ii)) + unparented_ii)) { arbiterIndexes = lappend_oid(arbiterIndexes, unparented_rel->rd_index->indexrelid); @@ -851,6 +869,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate, } list_free(unparented_idxs); list_free(arbiters_listidxs); + list_free(ancestors_seen); } /* diff --git a/src/test/modules/test_misc/t/010_index_concurrently_upsert.pl b/src/test/modules/test_misc/t/010_index_concurrently_upsert.pl index 26682ebc55a..3bdb632887e 100644 --- a/src/test/modules/test_misc/t/010_index_concurrently_upsert.pl +++ b/src/test/modules/test_misc/t/010_index_concurrently_upsert.pl @@ -602,6 +602,51 @@ clean_safe_quit_ok($s1, $s2, $s3); $node->safe_psql('postgres', 'TRUNCATE TABLE test.tblparted'); +############################################################################ +note('Test: REINDEX on partitioned table, cache inval between two get_partition_ancestors'); + +$s1 = $node->background_psql('postgres', on_error_stop => 0); +$s2 = $node->background_psql('postgres', on_error_stop => 0); +$s3 = $node->background_psql('postgres', on_error_stop => 0); + +$s1->query_safe( + q[ +SELECT injection_points_set_local(); +SELECT injection_points_attach('exec-init-partition-after-get-partition-ancestors', 'wait'); +]); + +$s2->query_safe( + q[ +SELECT injection_points_set_local(); +SELECT injection_points_attach('reindex-relation-concurrently-before-swap', 'wait'); +]); + +$s2->query_until( + qr/starting_reindex/, q[ +\echo starting_reindex +REINDEX INDEX CONCURRENTLY test.tbl_partition_pkey; +]); + +ok_injection_point($node, 'reindex-relation-concurrently-before-swap'); + +$s1->query_until( + qr/starting_upsert_s1/, q[ +\echo starting_upsert_s1 +INSERT INTO test.tblparted VALUES (13, now()) ON CONFLICT (i) DO UPDATE SET updated_at = now(); +]); + +ok_injection_point($node, + 'exec-init-partition-after-get-partition-ancestors'); + +wakeup_injection_point($node, 'reindex-relation-concurrently-before-swap'); + +wakeup_injection_point($node, + 'exec-init-partition-after-get-partition-ancestors'); + +clean_safe_quit_ok($s1, $s2, $s3); + +$node->safe_psql('postgres', 'TRUNCATE TABLE test.tblparted'); + ############################################################################ note('Test: CREATE INDEX CONCURRENTLY + UPSERT'); # Uses invalidate-catalog-snapshot-end to test catalog invalidation