git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Prevent concurrent SimpleLruTruncate() for any given SLRU.
author Noah Misch <noah@leadboat.com>
Sat, 15 Aug 2020 17:15:53 +0000 (10:15 -0700)
committer Noah Misch <noah@leadboat.com>
Sat, 15 Aug 2020 17:15:57 +0000 (10:15 -0700)
The SimpleLruTruncate() header comment states the new coding rule.  To
achieve this, add locktype "frozenid" and two LWLocks.  This closes a
rare opportunity for data loss, which manifested as "apparent
wraparound" or "could not access status of transaction" errors.  Data
loss is more likely in pg_multixact, due to released branches' thin
margin between multiStopLimit and multiWrapLimit.  If a user's physical
replication primary logged ":  apparent wraparound" messages, the user
should rebuild standbys of that primary regardless of symptoms.  At less
risk is a cluster having emitted "not accepting commands" errors or
"must be vacuumed" warnings at some point.  One can test a cluster for
this data loss by running VACUUM FREEZE in every database.  Back-patch
to 9.5 (all supported versions).

Discussion: https://postgr.es/m/20190218073103.GA1434723@rfd.leadboat.com

doc/src/sgml/catalogs.sgml
doc/src/sgml/monitoring.sgml
src/backend/access/transam/slru.c
src/backend/access/transam/subtrans.c
src/backend/commands/async.c
src/backend/commands/vacuum.c
src/backend/storage/lmgr/lmgr.c
src/backend/storage/lmgr/lwlocknames.txt
src/backend/utils/adt/lockfuncs.c
src/include/storage/lmgr.h
src/include/storage/lock.h

index fdad2cfc613f7e3dc5e42d0a27cffa37f167a3ba..849ba347e2335611e10a78ddc10e10c9205e8db2 100644 (file)
@@ -9005,7 +9005,8 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
    and general database objects (identified by class OID and object OID,
    in the same way as in <structname>pg_description</structname> or
    <structname>pg_depend</structname>).  Also, the right to extend a
-   relation is represented as a separate lockable object.
+   relation is represented as a separate lockable object, as is the right to
+   update <structname>pg_database</structname>.<structfield>datfrozenxid</structfield>.
    Also, <quote>advisory</quote> locks can be taken on numbers that have
    user-defined meanings.
   </para>
@@ -9031,6 +9032,7 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
        Type of the lockable object:
        <literal>relation</literal>,
        <literal>extend</literal>,
+       <literal>frozenid</literal>,
        <literal>page</literal>,
        <literal>tuple</literal>,
        <literal>transactionid</literal>,
index 67282e82327a54e124bd1267bf259208be9505a6..622c73d37a5b8b7a382efab9f1475d59a06d4903 100644 (file)
@@ -849,7 +849,7 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
 
       <tbody>
        <row>
-        <entry morerows="64"><literal>LWLock</literal></entry>
+        <entry morerows="66"><literal>LWLock</literal></entry>
         <entry><literal>ShmemIndexLock</literal></entry>
         <entry>Waiting to find or allocate space in shared memory.</entry>
        </row>
@@ -1047,6 +1047,16 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
          <entry>Waiting to execute <function>txid_status</function> or update
          the oldest transaction id available to it.</entry>
         </row>
+        <row>
+         <entry><literal>WrapLimitsVacuumLock</literal></entry>
+         <entry>Waiting to update limits on transaction id and multixact
+         consumption.</entry>
+        </row>
+        <row>
+         <entry><literal>NotifyQueueTailLock</literal></entry>
+         <entry>Waiting to update limit on notification message
+         storage.</entry>
+        </row>
         <row>
          <entry><literal>clog</literal></entry>
          <entry>Waiting for I/O on a clog (transaction status) buffer.</entry>
@@ -1132,7 +1142,7 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
          counters during Parallel Hash plan execution.</entry>
         </row>
         <row>
-         <entry morerows="9"><literal>Lock</literal></entry>
+         <entry morerows="10"><literal>Lock</literal></entry>
          <entry><literal>relation</literal></entry>
          <entry>Waiting to acquire a lock on a relation.</entry>
         </row>
@@ -1140,6 +1150,12 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
          <entry><literal>extend</literal></entry>
          <entry>Waiting to extend a relation.</entry>
         </row>
+        <row>
+         <entry><literal>frozenid</literal></entry>
+         <entry>Waiting to
+         update <structname>pg_database</structname>.<structfield>datfrozenxid</structfield>
+         and <structname>pg_database</structname>.<structfield>datminmxid</structfield>.</entry>
+        </row>
         <row>
          <entry><literal>page</literal></entry>
          <entry>Waiting to acquire a lock on page of a relation.</entry>
index fad5d363e32ff9d636371ea74f71b1808d9b8ab3..67387979bdbbcc85f66f303715c1c9f076057dec 100644 (file)
@@ -1163,6 +1163,14 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied)
 
 /*
  * Remove all segments before the one holding the passed page number
+ *
+ * All SLRUs prevent concurrent calls to this function, either with an LWLock
+ * or by calling it only as part of a checkpoint.  Mutual exclusion must begin
+ * before computing cutoffPage.  Mutual exclusion must end after any limit
+ * update that would permit other backends to write fresh data into the
+ * segment immediately preceding the one containing cutoffPage.  Otherwise,
+ * when the SLRU is quite full, SimpleLruTruncate() might delete that segment
+ * after it has accrued freshly-written data.
  */
 void
 SimpleLruTruncate(SlruCtl ctl, int cutoffPage)
index 4faa21f5aef88d3280a7348815470816e4302097..ef63b6d98a4b841377f0fce08bd5997d52d572e5 100644 (file)
@@ -347,8 +347,8 @@ ExtendSUBTRANS(TransactionId newestXact)
 /*
  * Remove all SUBTRANS segments before the one holding the passed transaction ID
  *
- * This is normally called during checkpoint, with oldestXact being the
- * oldest TransactionXmin of any running transaction.
+ * oldestXact is the oldest TransactionXmin of any running transaction.  This
+ * is called only during checkpoint.
  */
 void
 TruncateSUBTRANS(TransactionId oldestXact)
index 4cd1f4b95c37d3d82c7b0f9e3fbea049e766fb8e..8ef0aad808e8f95ac9de74d6aef9d001ccd1b09d 100644 (file)
@@ -224,19 +224,22 @@ typedef struct QueueBackendStatus
 /*
  * Shared memory state for LISTEN/NOTIFY (excluding its SLRU stuff)
  *
- * The AsyncQueueControl structure is protected by the AsyncQueueLock.
+ * The AsyncQueueControl structure is protected by the AsyncQueueLock and
+ * NotifyQueueTailLock.
  *
- * When holding the lock in SHARED mode, backends may only inspect their own
- * entries as well as the head and tail pointers. Consequently we can allow a
- * backend to update its own record while holding only SHARED lock (since no
- * other backend will inspect it).
+ * When holding AsyncQueueLock in SHARED mode, backends may only inspect their
+ * own entries as well as the head and tail pointers. Consequently we can
+ * allow a backend to update its own record while holding only SHARED lock
+ * (since no other backend will inspect it).
  *
- * When holding the lock in EXCLUSIVE mode, backends can inspect the entries
- * of other backends and also change the head and tail pointers.
+ * When holding AsyncQueueLock in EXCLUSIVE mode, backends can inspect the
+ * entries of other backends and also change the head pointer. When holding
+ * both AsyncQueueLock and NotifyQueueTailLock in EXCLUSIVE mode, backends can
+ * change the tail pointer.
  *
  * AsyncCtlLock is used as the control lock for the pg_notify SLRU buffers.
- * In order to avoid deadlocks, whenever we need both locks, we always first
- * get AsyncQueueLock and then AsyncCtlLock.
+ * In order to avoid deadlocks, whenever we need multiple locks, we first get
+ * NotifyQueueTailLock, then AsyncQueueLock, and lastly AsyncCtlLock.
  *
  * Each backend uses the backend[] array entry with index equal to its
  * BackendId (which can range from 1 to MaxBackends).  We rely on this to make
@@ -2013,6 +2016,10 @@ asyncQueueAdvanceTail(void)
        int                     newtailpage;
        int                     boundary;
 
+       /* Restrict task to one backend per cluster; see SimpleLruTruncate(). */
+       LWLockAcquire(NotifyQueueTailLock, LW_EXCLUSIVE);
+
+       /* Compute the new tail. */
        LWLockAcquire(AsyncQueueLock, LW_EXCLUSIVE);
        min = QUEUE_HEAD;
        for (i = 1; i <= MaxBackends; i++)
@@ -2021,7 +2028,6 @@ asyncQueueAdvanceTail(void)
                        min = QUEUE_POS_MIN(min, QUEUE_BACKEND_POS(i));
        }
        oldtailpage = QUEUE_POS_PAGE(QUEUE_TAIL);
-       QUEUE_TAIL = min;
        LWLockRelease(AsyncQueueLock);
 
        /*
@@ -2041,6 +2047,17 @@ asyncQueueAdvanceTail(void)
                 */
                SimpleLruTruncate(AsyncCtl, newtailpage);
        }
+
+       /*
+        * Advertise the new tail.  This changes asyncQueueIsFull()'s verdict for
+        * the segment immediately prior to the new tail, allowing fresh data into
+        * that segment.
+        */
+       LWLockAcquire(AsyncQueueLock, LW_EXCLUSIVE);
+       QUEUE_TAIL = min;
+       LWLockRelease(AsyncQueueLock);
+
+       LWLockRelease(NotifyQueueTailLock);
 }
 
 /*
index 70fa5e114bdeb80a45bacbcc5133fdff830abe06..b45661672e377b9cde5c6efc98919e14964915c1 100644 (file)
@@ -1017,6 +1017,14 @@ vac_update_datfrozenxid(void)
        bool            bogus = false;
        bool            dirty = false;
 
+       /*
+        * Restrict this task to one backend per database.  This avoids race
+        * conditions that would move datfrozenxid or datminmxid backward.  It
+        * avoids calling vac_truncate_clog() with a datfrozenxid preceding a
+        * datfrozenxid passed to an earlier vac_truncate_clog() call.
+        */
+       LockDatabaseFrozenIds(ExclusiveLock);
+
        /*
         * Initialize the "min" calculation with GetOldestXmin, which is a
         * reasonable approximation to the minimum relfrozenxid for not-yet-
@@ -1181,6 +1189,9 @@ vac_truncate_clog(TransactionId frozenXID,
        bool            bogus = false;
        bool            frozenAlreadyWrapped = false;
 
+       /* Restrict task to one backend per cluster; see SimpleLruTruncate(). */
+       LWLockAcquire(WrapLimitsVacuumLock, LW_EXCLUSIVE);
+
        /* init oldest datoids to sync with my frozenXID/minMulti values */
        oldestxid_datoid = MyDatabaseId;
        minmulti_datoid = MyDatabaseId;
@@ -1290,6 +1301,8 @@ vac_truncate_clog(TransactionId frozenXID,
         */
        SetTransactionIdLimit(frozenXID, oldestxid_datoid);
        SetMultiXactIdLimit(minMulti, minmulti_datoid, false);
+
+       LWLockRelease(WrapLimitsVacuumLock);
 }
 
 
index dc0a4396388ec7bd6d9bdf3c5a46df577667c9ce..044b0189e7e1987c292c6f80108cffa7ab57f885 100644 (file)
@@ -412,6 +412,21 @@ UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
        LockRelease(&tag, lockmode, false);
 }
 
+/*
+ *             LockDatabaseFrozenIds
+ *
+ * This allows one backend per database to execute vac_update_datfrozenxid().
+ */
+void
+LockDatabaseFrozenIds(LOCKMODE lockmode)
+{
+       LOCKTAG         tag;
+
+       SET_LOCKTAG_DATABASE_FROZEN_IDS(tag, MyDatabaseId);
+
+       (void) LockAcquire(&tag, lockmode, false, false);
+}
+
 /*
  *             LockPage
  *
@@ -1015,6 +1030,11 @@ DescribeLockTag(StringInfo buf, const LOCKTAG *tag)
                                                         tag->locktag_field2,
                                                         tag->locktag_field1);
                        break;
+               case LOCKTAG_DATABASE_FROZEN_IDS:
+                       appendStringInfo(buf,
+                                                        _("pg_database.datfrozenxid of database %u"),
+                                                        tag->locktag_field1);
+                       break;
                case LOCKTAG_PAGE:
                        appendStringInfo(buf,
                                                         _("page %u of relation %u of database %u"),
index e6025ecedb3ba34e7579a4115510ed294d4f8f6d..04a1786d372f1366715e9754ba6a6b68307a7985 100644 (file)
@@ -50,3 +50,5 @@ OldSnapshotTimeMapLock                                42
 BackendRandomLock                                      43
 LogicalRepWorkerLock                           44
 CLogTruncationLock                                     45
+WrapLimitsVacuumLock                           46
+NotifyQueueTailLock                                    47
index 66c09a1f31607c6c9dbad73e546b10a88339466f..01c66a2538b35d882db8f3675cc07e3b12cbd4ff 100644 (file)
@@ -26,6 +26,7 @@
 const char *const LockTagTypeNames[] = {
        "relation",
        "extend",
+       "frozenid",
        "page",
        "tuple",
        "transactionid",
@@ -245,6 +246,17 @@ pg_lock_status(PG_FUNCTION_ARGS)
                                nulls[8] = true;
                                nulls[9] = true;
                                break;
+                       case LOCKTAG_DATABASE_FROZEN_IDS:
+                               values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1);
+                               nulls[2] = true;
+                               nulls[3] = true;
+                               nulls[4] = true;
+                               nulls[5] = true;
+                               nulls[6] = true;
+                               nulls[7] = true;
+                               nulls[8] = true;
+                               nulls[9] = true;
+                               break;
                        case LOCKTAG_PAGE:
                                values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1);
                                values[2] = ObjectIdGetDatum(instance->locktag.locktag_field2);
index a217de971667d31f6fbea725eb962403295e2727..13667584ea08fcf4a9b7bf318cd01284f492addb 100644 (file)
@@ -57,6 +57,9 @@ extern bool ConditionalLockRelationForExtension(Relation relation,
                                                                        LOCKMODE lockmode);
 extern int     RelationExtensionLockWaiterCount(Relation relation);
 
+/* Lock to recompute pg_database.datfrozenxid in the current database */
+extern void LockDatabaseFrozenIds(LOCKMODE lockmode);
+
 /* Lock a page (currently only used within indexes) */
 extern void LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);
 extern bool ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);
index ff4df7fec513c11ff048aa7c977ea040f56284b5..6b56fa587434341880edeb02f7594fc54c699333 100644 (file)
@@ -141,6 +141,8 @@ typedef enum LockTagType
        /* ID info for a relation is DB OID + REL OID; DB OID = 0 if shared */
        LOCKTAG_RELATION_EXTEND,        /* the right to extend a relation */
        /* same ID info as RELATION */
+       LOCKTAG_DATABASE_FROZEN_IDS,    /* pg_database.datfrozenxid */
+       /* ID info for frozen IDs is DB OID */
        LOCKTAG_PAGE,                           /* one page of a relation */
        /* ID info for a page is RELATION info + BlockNumber */
        LOCKTAG_TUPLE,                          /* one physical tuple */
@@ -206,6 +208,14 @@ typedef struct LOCKTAG
         (locktag).locktag_type = LOCKTAG_RELATION_EXTEND, \
         (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
 
+#define SET_LOCKTAG_DATABASE_FROZEN_IDS(locktag,dboid) \
+       ((locktag).locktag_field1 = (dboid), \
+        (locktag).locktag_field2 = 0, \
+        (locktag).locktag_field3 = 0, \
+        (locktag).locktag_field4 = 0, \
+        (locktag).locktag_type = LOCKTAG_DATABASE_FROZEN_IDS, \
+        (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
+
 #define SET_LOCKTAG_PAGE(locktag,dboid,reloid,blocknum) \
        ((locktag).locktag_field1 = (dboid), \
         (locktag).locktag_field2 = (reloid), \