]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Fix a deadlock during ALTER SUBSCRIPTION ... DROP PUBLICATION.
authorAmit Kapila <akapila@postgresql.org>
Fri, 1 Aug 2025 07:46:22 +0000 (07:46 +0000)
committerAmit Kapila <akapila@postgresql.org>
Fri, 1 Aug 2025 07:46:22 +0000 (07:46 +0000)
A deadlock can occur when the DDL command and the apply worker acquire
catalog locks in different orders while dropping replication origins.

The issue is rare in PG16 and higher branches because, in most cases, the
tablesync worker performs the origin drop in those branches, and its
locking sequence does not conflict with DDL operations.

This patch ensures consistent lock acquisition to prevent such deadlocks.

As per buildfarm.

Reported-by: Alexander Lakhin <exclusion@gmail.com>
Author: Ajin Cherian <itsajin@gmail.com>
Reviewed-by: Hayato Kuroda <kuroda.hayato@fujitsu.com>
Reviewed-by: vignesh C <vignesh21@gmail.com>
Reviewed-by: Amit Kapila <amit.kapila16@gmail.com>
Backpatch-through: 14, where it was introduced
Discussion: https://postgr.es/m/bab95e12-6cc5-4ebb-80a8-3e41956aa297@gmail.com

src/backend/catalog/pg_subscription.c
src/backend/replication/logical/tablesync.c
src/include/catalog/pg_subscription_rel.h

index 1395032413e3db6ebabbd652a1b6d601b4f5ae95..c0abc8bc8f3b84f601105677cfbef985f42e6c07 100644 (file)
@@ -319,7 +319,7 @@ AddSubscriptionRelState(Oid subid, Oid relid, char state,
  */
 void
 UpdateSubscriptionRelState(Oid subid, Oid relid, char state,
-                                                  XLogRecPtr sublsn)
+                                                  XLogRecPtr sublsn, bool already_locked)
 {
        Relation        rel;
        HeapTuple       tup;
@@ -327,9 +327,24 @@ UpdateSubscriptionRelState(Oid subid, Oid relid, char state,
        Datum           values[Natts_pg_subscription_rel];
        bool            replaces[Natts_pg_subscription_rel];
 
-       LockSharedObject(SubscriptionRelationId, subid, 0, AccessShareLock);
+       if (already_locked)
+       {
+#ifdef USE_ASSERT_CHECKING
+               LOCKTAG         tag;
 
-       rel = table_open(SubscriptionRelRelationId, RowExclusiveLock);
+               Assert(CheckRelationOidLockedByMe(SubscriptionRelRelationId,
+                                                                                 RowExclusiveLock, true));
+               SET_LOCKTAG_OBJECT(tag, InvalidOid, SubscriptionRelationId, subid, 0);
+               Assert(LockHeldByMe(&tag, AccessShareLock, true));
+#endif
+
+               rel = table_open(SubscriptionRelRelationId, NoLock);
+       }
+       else
+       {
+               LockSharedObject(SubscriptionRelationId, subid, 0, AccessShareLock);
+               rel = table_open(SubscriptionRelRelationId, RowExclusiveLock);
+       }
 
        /* Try finding existing mapping. */
        tup = SearchSysCacheCopy2(SUBSCRIPTIONRELMAP,
index c90f23ee5b0b233c29bed1fcc0554e507baa3293..9ea7b2cfc128eac591f4a6593c004a07b535ffa2 100644 (file)
@@ -316,7 +316,8 @@ process_syncing_tables_for_sync(XLogRecPtr current_lsn)
                UpdateSubscriptionRelState(MyLogicalRepWorker->subid,
                                                                   MyLogicalRepWorker->relid,
                                                                   MyLogicalRepWorker->relstate,
-                                                                  MyLogicalRepWorker->relstate_lsn);
+                                                                  MyLogicalRepWorker->relstate_lsn,
+                                                                  false);
 
                /*
                 * End streaming so that LogRepWorkerWalRcvConn can be used to drop
@@ -425,6 +426,7 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
        ListCell   *lc;
        bool            started_tx = false;
        bool            should_exit = false;
+       Relation        rel = NULL;
 
        Assert(!IsTransactionState());
 
@@ -492,7 +494,17 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
                                 * worker to remove the origin tracking as if there is any
                                 * error while dropping we won't restart it to drop the
                                 * origin. So passing missing_ok = true.
+                                *
+                                * Lock the subscription and origin in the same order as we
+                                * are doing during DDL commands to avoid deadlocks. See
+                                * AlterSubscription_refresh.
                                 */
+                               LockSharedObject(SubscriptionRelationId, MyLogicalRepWorker->subid,
+                                                                0, AccessShareLock);
+
+                               if (!rel)
+                                       rel = table_open(SubscriptionRelRelationId, RowExclusiveLock);
+
                                ReplicationOriginNameForLogicalRep(MyLogicalRepWorker->subid,
                                                                                                   rstate->relid,
                                                                                                   originname,
@@ -504,7 +516,7 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
                                 */
                                UpdateSubscriptionRelState(MyLogicalRepWorker->subid,
                                                                                   rstate->relid, rstate->state,
-                                                                                  rstate->lsn);
+                                                                                  rstate->lsn, true);
                        }
                }
                else
@@ -555,7 +567,14 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
                                                 * This is required to avoid any undetected deadlocks
                                                 * due to any existing lock as deadlock detector won't
                                                 * be able to detect the waits on the latch.
+                                                *
+                                                * Also close any tables prior to the commit.
                                                 */
+                                               if (rel)
+                                               {
+                                                       table_close(rel, NoLock);
+                                                       rel = NULL;
+                                               }
                                                CommitTransactionCommand();
                                                pgstat_report_stat(false);
                                        }
@@ -622,6 +641,11 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
                }
        }
 
+       /* Close table if opened */
+       if (rel)
+               table_close(rel, NoLock);
+
+
        if (started_tx)
        {
                /*
@@ -1413,7 +1437,8 @@ LogicalRepSyncTableStart(XLogRecPtr *origin_startpos)
        UpdateSubscriptionRelState(MyLogicalRepWorker->subid,
                                                           MyLogicalRepWorker->relid,
                                                           MyLogicalRepWorker->relstate,
-                                                          MyLogicalRepWorker->relstate_lsn);
+                                                          MyLogicalRepWorker->relstate_lsn,
+                                                          false);
        CommitTransactionCommand();
        pgstat_report_stat(true);
 
@@ -1546,7 +1571,8 @@ LogicalRepSyncTableStart(XLogRecPtr *origin_startpos)
        UpdateSubscriptionRelState(MyLogicalRepWorker->subid,
                                                           MyLogicalRepWorker->relid,
                                                           SUBREL_STATE_FINISHEDCOPY,
-                                                          MyLogicalRepWorker->relstate_lsn);
+                                                          MyLogicalRepWorker->relstate_lsn,
+                                                          false);
 
        CommitTransactionCommand();
 
index c91797c869c24611dc76eaca79f724698fc8a527..f458447a0e5fba3cb2927075ac7fcfabe43ae34b 100644 (file)
@@ -85,7 +85,7 @@ typedef struct SubscriptionRelState
 extern void AddSubscriptionRelState(Oid subid, Oid relid, char state,
                                                                        XLogRecPtr sublsn, bool retain_lock);
 extern void UpdateSubscriptionRelState(Oid subid, Oid relid, char state,
-                                                                          XLogRecPtr sublsn);
+                                                                          XLogRecPtr sublsn, bool already_locked);
 extern char GetSubscriptionRelState(Oid subid, Oid relid, XLogRecPtr *sublsn);
 extern void RemoveSubscriptionRel(Oid subid, Oid relid);