]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Fix a deadlock during ALTER SUBSCRIPTION ... DROP PUBLICATION.
authorAmit Kapila <akapila@postgresql.org>
Fri, 1 Aug 2025 07:16:30 +0000 (07:16 +0000)
committerAmit Kapila <akapila@postgresql.org>
Fri, 1 Aug 2025 07:16:30 +0000 (07:16 +0000)
A deadlock can occur when the DDL command and the apply worker acquire
catalog locks in different orders while dropping replication origins.

The issue is rare in PG16 and higher branches because, in most cases, the
tablesync worker performs the origin drop in those branches, and its
locking sequence does not conflict with DDL operations.

This patch ensures consistent lock acquisition to prevent such deadlocks.

As per buildfarm.

Reported-by: Alexander Lakhin <exclusion@gmail.com>
Author: Ajin Cherian <itsajin@gmail.com>
Reviewed-by: Hayato Kuroda <kuroda.hayato@fujitsu.com>
Reviewed-by: vignesh C <vignesh21@gmail.com>
Reviewed-by: Amit Kapila <amit.kapila16@gmail.com>
Backpatch-through: 14, where it was introduced
Discussion: https://postgr.es/m/bab95e12-6cc5-4ebb-80a8-3e41956aa297@gmail.com

src/backend/catalog/pg_subscription.c
src/backend/replication/logical/tablesync.c
src/include/catalog/pg_subscription_rel.h

index 29fc4218cd4bb736e945d170cb719606a4a199e9..941817c82711346726b225ff29f1239ada6ef914 100644 (file)
@@ -281,8 +281,8 @@ AddSubscriptionRelState(Oid subid, Oid relid, char state,
  * Update the state of a subscription table.
  */
 void
-UpdateSubscriptionRelState(Oid subid, Oid relid, char state,
-                                                  XLogRecPtr sublsn)
+UpdateSubscriptionRelStateEx(Oid subid, Oid relid, char state,
+                                                        XLogRecPtr sublsn, bool already_locked)
 {
        Relation        rel;
        HeapTuple       tup;
@@ -290,9 +290,24 @@ UpdateSubscriptionRelState(Oid subid, Oid relid, char state,
        Datum           values[Natts_pg_subscription_rel];
        bool            replaces[Natts_pg_subscription_rel];
 
-       LockSharedObject(SubscriptionRelationId, subid, 0, AccessShareLock);
+       if (already_locked)
+       {
+#ifdef USE_ASSERT_CHECKING
+               LOCKTAG         tag;
 
-       rel = table_open(SubscriptionRelRelationId, RowExclusiveLock);
+               Assert(CheckRelationOidLockedByMe(SubscriptionRelRelationId,
+                                                                                 RowExclusiveLock, true));
+               SET_LOCKTAG_OBJECT(tag, InvalidOid, SubscriptionRelationId, subid, 0);
+               Assert(LockHeldByMe(&tag, AccessShareLock));
+#endif
+
+               rel = table_open(SubscriptionRelRelationId, NoLock);
+       }
+       else
+       {
+               LockSharedObject(SubscriptionRelationId, subid, 0, AccessShareLock);
+               rel = table_open(SubscriptionRelRelationId, RowExclusiveLock);
+       }
 
        /* Try finding existing mapping. */
        tup = SearchSysCacheCopy2(SUBSCRIPTIONRELMAP,
@@ -326,6 +341,16 @@ UpdateSubscriptionRelState(Oid subid, Oid relid, char state,
        table_close(rel, NoLock);
 }
 
+/*
+ * Update the state of a subscription table.
+ */
+void
+UpdateSubscriptionRelState(Oid subid, Oid relid, char state,
+                                                  XLogRecPtr sublsn)
+{
+       UpdateSubscriptionRelStateEx(subid, relid, state, sublsn, false);
+}
+
 /*
  * Get state of subscription table.
  *
index 26b71dee6728a99f62ffe65d424233296e344592..3e9de50ff12121b68876f8f8dedf2d90963296ef 100644 (file)
@@ -366,6 +366,7 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
        static HTAB *last_start_times = NULL;
        ListCell   *lc;
        bool            started_tx = false;
+       Relation        rel = NULL;
 
        Assert(!IsTransactionState());
 
@@ -463,7 +464,16 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
                                 * refresh for the subscription where we remove the table
                                 * state and its origin and by this time the origin might be
                                 * already removed. So passing missing_ok = true.
+                                *
+                                * Lock the subscription and origin in the same order as we
+                                * are doing during DDL commands to avoid deadlocks. See
+                                * AlterSubscription_refresh.
                                 */
+                               LockSharedObject(SubscriptionRelationId, MyLogicalRepWorker->subid,
+                                                                0, AccessShareLock);
+                               if (!rel)
+                                       rel = table_open(SubscriptionRelRelationId, RowExclusiveLock);
+
                                ReplicationOriginNameForTablesync(MyLogicalRepWorker->subid,
                                                                                                  rstate->relid,
                                                                                                  originname,
@@ -473,9 +483,9 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
                                /*
                                 * Update the state to READY only after the origin cleanup.
                                 */
-                               UpdateSubscriptionRelState(MyLogicalRepWorker->subid,
-                                                                                  rstate->relid, rstate->state,
-                                                                                  rstate->lsn);
+                               UpdateSubscriptionRelStateEx(MyLogicalRepWorker->subid,
+                                                                                        rstate->relid, rstate->state,
+                                                                                        rstate->lsn, true);
                        }
                }
                else
@@ -526,7 +536,14 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
                                                 * This is required to avoid any undetected deadlocks
                                                 * due to any existing lock as deadlock detector won't
                                                 * be able to detect the waits on the latch.
+                                                *
+                                                * Also close any tables prior to the commit.
                                                 */
+                                               if (rel)
+                                               {
+                                                       table_close(rel, NoLock);
+                                                       rel = NULL;
+                                               }
                                                CommitTransactionCommand();
                                                pgstat_report_stat(false);
                                        }
@@ -586,6 +603,10 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
                }
        }
 
+       /* Close table if opened */
+       if (rel)
+               table_close(rel, NoLock);
+
        if (started_tx)
        {
                CommitTransactionCommand();
index ed94f57baa1376ae38f2b606320fcb30c100d258..e0a1a4b8b4a8f52f6eb9e79c125922c7b49c184b 100644 (file)
@@ -85,6 +85,8 @@ extern void AddSubscriptionRelState(Oid subid, Oid relid, char state,
                                                                        XLogRecPtr sublsn);
 extern void UpdateSubscriptionRelState(Oid subid, Oid relid, char state,
                                                                           XLogRecPtr sublsn);
+extern void UpdateSubscriptionRelStateEx(Oid subid, Oid relid, char state,
+                                                                                XLogRecPtr sublsn, bool already_locked);
 extern char GetSubscriptionRelState(Oid subid, Oid relid, XLogRecPtr *sublsn);
 extern void RemoveSubscriptionRel(Oid subid, Oid relid);