A deadlock can occur when the DDL command and the apply worker acquire
catalog locks in different orders while dropping replication origins.
The issue is rare in PG16 and higher branches because, in most cases, the
tablesync worker performs the origin drop in those branches, and its
locking sequence does not conflict with DDL operations.
This patch ensures consistent lock acquisition to prevent such deadlocks.
As per buildfarm.
Reported-by: Alexander Lakhin <exclusion@gmail.com>
Author: Ajin Cherian <itsajin@gmail.com>
Reviewed-by: Hayato Kuroda <kuroda.hayato@fujitsu.com>
Reviewed-by: vignesh C <vignesh21@gmail.com>
Reviewed-by: Amit Kapila <amit.kapila16@gmail.com>
Backpatch-through: 14, where it was introduced
Discussion: https://postgr.es/m/
bab95e12-6cc5-4ebb-80a8-
3e41956aa297@gmail.com
* Update the state of a subscription table.
*/
void
-UpdateSubscriptionRelState(Oid subid, Oid relid, char state,
- XLogRecPtr sublsn)
+UpdateSubscriptionRelStateEx(Oid subid, Oid relid, char state,
+ XLogRecPtr sublsn, bool already_locked)
{
Relation rel;
HeapTuple tup;
Datum values[Natts_pg_subscription_rel];
bool replaces[Natts_pg_subscription_rel];
- LockSharedObject(SubscriptionRelationId, subid, 0, AccessShareLock);
+ if (already_locked)
+ {
+#ifdef USE_ASSERT_CHECKING
+ LOCKTAG tag;
- rel = table_open(SubscriptionRelRelationId, RowExclusiveLock);
+ Assert(CheckRelationOidLockedByMe(SubscriptionRelRelationId,
+ RowExclusiveLock, true));
+ SET_LOCKTAG_OBJECT(tag, InvalidOid, SubscriptionRelationId, subid, 0);
+ Assert(LockHeldByMe(&tag, AccessShareLock));
+#endif
+
+ rel = table_open(SubscriptionRelRelationId, NoLock);
+ }
+ else
+ {
+ LockSharedObject(SubscriptionRelationId, subid, 0, AccessShareLock);
+ rel = table_open(SubscriptionRelRelationId, RowExclusiveLock);
+ }
/* Try finding existing mapping. */
tup = SearchSysCacheCopy2(SUBSCRIPTIONRELMAP,
table_close(rel, NoLock);
}
+/*
+ * Update the state of a subscription table.
+ */
+void
+UpdateSubscriptionRelState(Oid subid, Oid relid, char state,
+ XLogRecPtr sublsn)
+{
+ UpdateSubscriptionRelStateEx(subid, relid, state, sublsn, false);
+}
+
/*
* Get state of subscription table.
*
static HTAB *last_start_times = NULL;
ListCell *lc;
bool started_tx = false;
+ Relation rel = NULL;
Assert(!IsTransactionState());
* refresh for the subscription where we remove the table
* state and its origin and by this time the origin might be
* already removed. So passing missing_ok = true.
+ *
+ * Lock the subscription and origin in the same order as we
+ * are doing during DDL commands to avoid deadlocks. See
+ * AlterSubscription_refresh.
*/
+ LockSharedObject(SubscriptionRelationId, MyLogicalRepWorker->subid,
+ 0, AccessShareLock);
+ if (!rel)
+ rel = table_open(SubscriptionRelRelationId, RowExclusiveLock);
+
ReplicationOriginNameForTablesync(MyLogicalRepWorker->subid,
rstate->relid,
originname,
/*
* Update the state to READY only after the origin cleanup.
*/
- UpdateSubscriptionRelState(MyLogicalRepWorker->subid,
- rstate->relid, rstate->state,
- rstate->lsn);
+ UpdateSubscriptionRelStateEx(MyLogicalRepWorker->subid,
+ rstate->relid, rstate->state,
+ rstate->lsn, true);
}
}
else
* This is required to avoid any undetected deadlocks
* due to any existing lock as deadlock detector won't
* be able to detect the waits on the latch.
+ *
+ * Also close any tables prior to the commit.
*/
+ if (rel)
+ {
+ table_close(rel, NoLock);
+ rel = NULL;
+ }
CommitTransactionCommand();
pgstat_report_stat(false);
}
}
}
+ /* Close table if opened */
+ if (rel)
+ table_close(rel, NoLock);
+
if (started_tx)
{
CommitTransactionCommand();
XLogRecPtr sublsn);
extern void UpdateSubscriptionRelState(Oid subid, Oid relid, char state,
XLogRecPtr sublsn);
+extern void UpdateSubscriptionRelStateEx(Oid subid, Oid relid, char state,
+ XLogRecPtr sublsn, bool already_locked);
extern char GetSubscriptionRelState(Oid subid, Oid relid, XLogRecPtr *sublsn);
extern void RemoveSubscriptionRel(Oid subid, Oid relid);