From: Noah Misch Date: Sat, 15 Aug 2020 17:15:53 +0000 (-0700) Subject: Prevent concurrent SimpleLruTruncate() for any given SLRU. X-Git-Tag: REL9_5_24~85 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=3fbbf64ceecbd0fe675850070ea1e5d77c401c4f;p=thirdparty%2Fpostgresql.git Prevent concurrent SimpleLruTruncate() for any given SLRU. The SimpleLruTruncate() header comment states the new coding rule. To achieve this, add locktype "frozenid" and two LWLocks. This closes a rare opportunity for data loss, which manifested as "apparent wraparound" or "could not access status of transaction" errors. Data loss is more likely in pg_multixact, due to released branches' thin margin between multiStopLimit and multiWrapLimit. If a user's physical replication primary logged ": apparent wraparound" messages, the user should rebuild standbys of that primary regardless of symptoms. At less risk is a cluster having emitted "not accepting commands" errors or "must be vacuumed" warnings at some point. One can test a cluster for this data loss by running VACUUM FREEZE in every database. Back-patch to 9.5 (all supported versions). Discussion: https://postgr.es/m/20190218073103.GA1434723@rfd.leadboat.com --- diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index ac4347f1286..e7af4bae65d 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -7971,7 +7971,8 @@ and general database objects (identified by class OID and object OID, in the same way as in pg_description or pg_depend). Also, the right to extend a - relation is represented as a separate lockable object. + relation is represented as a separate lockable object, as is the right to + update pg_database.datfrozenxid. Also, advisory locks can be taken on numbers that have user-defined meanings. @@ -7997,6 +7998,7 @@ Type of the lockable object: relation, extend, + frozenid, page, tuple, transactionid, diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 217182f8cd6..680a28f198d 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -1137,6 +1137,14 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied) /* * Remove all segments before the one holding the passed page number + * + * All SLRUs prevent concurrent calls to this function, either with an LWLock + * or by calling it only as part of a checkpoint. Mutual exclusion must begin + * before computing cutoffPage. Mutual exclusion must end after any limit + * update that would permit other backends to write fresh data into the + * segment immediately preceding the one containing cutoffPage. Otherwise, + * when the SLRU is quite full, SimpleLruTruncate() might delete that segment + * after it has accrued freshly-written data. */ void SimpleLruTruncate(SlruCtl ctl, int cutoffPage) diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index dde72b7d2e1..6702350e124 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -334,8 +334,8 @@ ExtendSUBTRANS(TransactionId newestXact) /* * Remove all SUBTRANS segments before the one holding the passed transaction ID * - * This is normally called during checkpoint, with oldestXact being the - * oldest TransactionXmin of any running transaction. + * oldestXact is the oldest TransactionXmin of any running transaction. This + * is called only during checkpoint. */ void TruncateSUBTRANS(TransactionId oldestXact) diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c index 7bf946b9095..dfa9b62a606 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -223,19 +223,22 @@ typedef struct QueueBackendStatus /* * Shared memory state for LISTEN/NOTIFY (excluding its SLRU stuff) * - * The AsyncQueueControl structure is protected by the AsyncQueueLock. + * The AsyncQueueControl structure is protected by the AsyncQueueLock and + * NotifyQueueTailLock. * - * When holding the lock in SHARED mode, backends may only inspect their own - * entries as well as the head and tail pointers. Consequently we can allow a - * backend to update its own record while holding only SHARED lock (since no - * other backend will inspect it). + * When holding AsyncQueueLock in SHARED mode, backends may only inspect their + * own entries as well as the head and tail pointers. Consequently we can + * allow a backend to update its own record while holding only SHARED lock + * (since no other backend will inspect it). * - * When holding the lock in EXCLUSIVE mode, backends can inspect the entries - * of other backends and also change the head and tail pointers. + * When holding AsyncQueueLock in EXCLUSIVE mode, backends can inspect the + * entries of other backends and also change the head pointer. When holding + * both AsyncQueueLock and NotifyQueueTailLock in EXCLUSIVE mode, backends can + * change the tail pointer. * * AsyncCtlLock is used as the control lock for the pg_notify SLRU buffers. - * In order to avoid deadlocks, whenever we need both locks, we always first - * get AsyncQueueLock and then AsyncCtlLock. + * In order to avoid deadlocks, whenever we need multiple locks, we first get + * NotifyQueueTailLock, then AsyncQueueLock, and lastly AsyncCtlLock. * * Each backend uses the backend[] array entry with index equal to its * BackendId (which can range from 1 to MaxBackends). We rely on this to make @@ -1982,6 +1985,10 @@ asyncQueueAdvanceTail(void) int newtailpage; int boundary; + /* Restrict task to one backend per cluster; see SimpleLruTruncate(). */ + LWLockAcquire(NotifyQueueTailLock, LW_EXCLUSIVE); + + /* Compute the new tail. */ LWLockAcquire(AsyncQueueLock, LW_EXCLUSIVE); min = QUEUE_HEAD; for (i = 1; i <= MaxBackends; i++) @@ -1990,7 +1997,6 @@ asyncQueueAdvanceTail(void) min = QUEUE_POS_MIN(min, QUEUE_BACKEND_POS(i)); } oldtailpage = QUEUE_POS_PAGE(QUEUE_TAIL); - QUEUE_TAIL = min; LWLockRelease(AsyncQueueLock); /* @@ -2010,6 +2016,17 @@ asyncQueueAdvanceTail(void) */ SimpleLruTruncate(AsyncCtl, newtailpage); } + + /* + * Advertise the new tail. This changes asyncQueueIsFull()'s verdict for + * the segment immediately prior to the new tail, allowing fresh data into + * that segment. + */ + LWLockAcquire(AsyncQueueLock, LW_EXCLUSIVE); + QUEUE_TAIL = min; + LWLockRelease(AsyncQueueLock); + + LWLockRelease(NotifyQueueTailLock); } /* diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index ebc76eaa207..43199cd5ffa 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -883,6 +883,14 @@ vac_update_datfrozenxid(void) bool bogus = false; bool dirty = false; + /* + * Restrict this task to one backend per database. This avoids race + * conditions that would move datfrozenxid or datminmxid backward. It + * avoids calling vac_truncate_clog() with a datfrozenxid preceding a + * datfrozenxid passed to an earlier vac_truncate_clog() call. + */ + LockDatabaseFrozenIds(ExclusiveLock); + /* * Initialize the "min" calculation with GetOldestXmin, which is a * reasonable approximation to the minimum relfrozenxid for not-yet- @@ -1047,6 +1055,9 @@ vac_truncate_clog(TransactionId frozenXID, bool bogus = false; bool frozenAlreadyWrapped = false; + /* Restrict task to one backend per cluster; see SimpleLruTruncate(). */ + LWLockAcquire(WrapLimitsVacuumLock, LW_EXCLUSIVE); + /* init oldest datoids to sync with my frozenXID/minMulti values */ oldestxid_datoid = MyDatabaseId; minmulti_datoid = MyDatabaseId; @@ -1148,6 +1159,8 @@ vac_truncate_clog(TransactionId frozenXID, SetTransactionIdLimit(frozenXID, oldestxid_datoid); SetMultiXactIdLimit(minMulti, minmulti_datoid); AdvanceOldestCommitTsXid(frozenXID); + + LWLockRelease(WrapLimitsVacuumLock); } diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c index 1e9093b941f..ba5f9175d8b 100644 --- a/src/backend/storage/lmgr/lmgr.c +++ b/src/backend/storage/lmgr/lmgr.c @@ -377,6 +377,21 @@ UnlockRelationForExtension(Relation relation, LOCKMODE lockmode) LockRelease(&tag, lockmode, false); } +/* + * LockDatabaseFrozenIds + * + * This allows one backend per database to execute vac_update_datfrozenxid(). + */ +void +LockDatabaseFrozenIds(LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_DATABASE_FROZEN_IDS(tag, MyDatabaseId); + + (void) LockAcquire(&tag, lockmode, false, false); +} + /* * LockPage * @@ -980,6 +995,11 @@ DescribeLockTag(StringInfo buf, const LOCKTAG *tag) tag->locktag_field2, tag->locktag_field1); break; + case LOCKTAG_DATABASE_FROZEN_IDS: + appendStringInfo(buf, + _("pg_database.datfrozenxid of database %u"), + tag->locktag_field1); + break; case LOCKTAG_PAGE: appendStringInfo(buf, _("page %u of relation %u of database %u"), diff --git a/src/backend/utils/adt/lockfuncs.c b/src/backend/utils/adt/lockfuncs.c index 1705ff0d118..430e8c8703d 100644 --- a/src/backend/utils/adt/lockfuncs.c +++ b/src/backend/utils/adt/lockfuncs.c @@ -25,6 +25,7 @@ static const char *const LockTagTypeNames[] = { "relation", "extend", + "frozenid", "page", "tuple", "transactionid", @@ -244,6 +245,17 @@ pg_lock_status(PG_FUNCTION_ARGS) nulls[8] = true; nulls[9] = true; break; + case LOCKTAG_DATABASE_FROZEN_IDS: + values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1); + nulls[2] = true; + nulls[3] = true; + nulls[4] = true; + nulls[5] = true; + nulls[6] = true; + nulls[7] = true; + nulls[8] = true; + nulls[9] = true; + break; case LOCKTAG_PAGE: values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1); values[2] = ObjectIdGetDatum(instance->locktag.locktag_field2); diff --git a/src/include/storage/lmgr.h b/src/include/storage/lmgr.h index e7ccce22942..72d112ee44e 100644 --- a/src/include/storage/lmgr.h +++ b/src/include/storage/lmgr.h @@ -54,6 +54,9 @@ extern void UnlockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode); extern void LockRelationForExtension(Relation relation, LOCKMODE lockmode); extern void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode); +/* Lock to recompute pg_database.datfrozenxid in the current database */ +extern void LockDatabaseFrozenIds(LOCKMODE lockmode); + /* Lock a page (currently only used within indexes) */ extern void LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode); extern bool ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode); diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h index c19a934b09a..9019ae8dcd6 100644 --- a/src/include/storage/lock.h +++ b/src/include/storage/lock.h @@ -168,6 +168,8 @@ typedef enum LockTagType /* ID info for a relation is DB OID + REL OID; DB OID = 0 if shared */ LOCKTAG_RELATION_EXTEND, /* the right to extend a relation */ /* same ID info as RELATION */ + LOCKTAG_DATABASE_FROZEN_IDS, /* pg_database.datfrozenxid */ + /* ID info for frozen IDs is DB OID */ LOCKTAG_PAGE, /* one page of a relation */ /* ID info for a page is RELATION info + BlockNumber */ LOCKTAG_TUPLE, /* one physical tuple */ @@ -231,6 +233,14 @@ typedef struct LOCKTAG (locktag).locktag_type = LOCKTAG_RELATION_EXTEND, \ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) +#define SET_LOCKTAG_DATABASE_FROZEN_IDS(locktag,dboid) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = 0, \ + (locktag).locktag_field3 = 0, \ + (locktag).locktag_field4 = 0, \ + (locktag).locktag_type = LOCKTAG_DATABASE_FROZEN_IDS, \ + (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) + #define SET_LOCKTAG_PAGE(locktag,dboid,reloid,blocknum) \ ((locktag).locktag_field1 = (dboid), \ (locktag).locktag_field2 = (reloid), \ diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index e360fc0393c..228ddf898d3 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -136,7 +136,9 @@ extern PGDLLIMPORT LWLockPadded *MainLWLockArray; #define CommitTsLock (&MainLWLockArray[39].lock) #define ReplicationOriginLock (&MainLWLockArray[40].lock) #define MultiXactTruncationLock (&MainLWLockArray[41].lock) -#define NUM_INDIVIDUAL_LWLOCKS 42 +#define WrapLimitsVacuumLock (&MainLWLockArray[46].lock) +#define NotifyQueueTailLock (&MainLWLockArray[47].lock) +#define NUM_INDIVIDUAL_LWLOCKS 48 /* * It's a bit odd to declare NUM_BUFFER_PARTITIONS and NUM_LOCK_PARTITIONS