From 819dc118c0f6cd1fc08f0e807702b4bc0b0d8733 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Wed, 8 Oct 2025 12:57:52 -0400 Subject: [PATCH] Improve ReadRecentBuffer() scalability While testing a new potential use for ReadRecentBuffer(), Andres reported that it scales badly when called concurrently for the same buffer by many backends. Instead of a naive (but wrong) coding with PinBuffer(), it used the spinlock, so that it could be careful to pin only if the buffer was valid and holding the expected block, to avoid breaking invariants in eg GetVictimBuffer(). Unfortunately that made it less scalable than PinBuffer(), which uses compare-exchange instead. We can fix that by giving PinBuffer() a new skip_if_not_valid mode that doesn't pin invalid buffers. It might occasionally skip when it shouldn't due to the unlocked read of the header flags, but that's unlikely and perfectly acceptable for an opportunistic optimisation routine, and it can only succeed when it really should due to the compare-exchange loop. Note that this fixes ReadRecentBuffer()'s failure to bump the usage count. While this could be seen as a bug, there currently aren't cases affected by this in core, so it doesn't seem worth backpatching that portion. Author: Thomas Munro Reported-by: Andres Freund Reviewed-by: Andres Freund Reviewed-by: Matthias van de Meent Discussion: https://postgr.es/m/20230627020546.t6z4tntmj7wmjrfh%40awork3.anarazel.de Discussion: https://postgr.es/m/fvfmkr5kk4nyex56ejgxj3uzi63isfxovp2biecb4bspbjrze7@az2pljabhnff --- src/backend/storage/buffer/bufmgr.c | 64 +++++++++++++---------------- 1 file changed, 28 insertions(+), 36 deletions(-) diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index fe470de63f2..10232a333d8 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -512,7 +512,8 @@ static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by); -static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy); +static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy, + bool skip_if_not_valid); static void PinBuffer_Locked(BufferDesc *buf); static void UnpinBuffer(BufferDesc *buf); static void UnpinBufferNoOwner(BufferDesc *buf); @@ -685,7 +686,6 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN BufferDesc *bufHdr; BufferTag tag; uint32 buf_state; - bool have_private_ref; Assert(BufferIsValid(recent_buffer)); @@ -713,38 +713,24 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN else { bufHdr = GetBufferDescriptor(recent_buffer - 1); - have_private_ref = GetPrivateRefCount(recent_buffer) > 0; /* - * Do we already have this buffer pinned with a private reference? If - * so, it must be valid and it is safe to check the tag without - * locking. If not, we have to lock the header first and then check. + * Is it still valid and holding the right tag? We do an unlocked tag + * comparison first, to make it unlikely that we'll increment the + * usage counter of the wrong buffer, if someone calls us with a very + * out of date recent_buffer. Then we'll check it again if we get the + * pin. */ - if (have_private_ref) - buf_state = pg_atomic_read_u32(&bufHdr->state); - else - buf_state = LockBufHdr(bufHdr); - - if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag)) + if (BufferTagsEqual(&tag, &bufHdr->tag) && + PinBuffer(bufHdr, NULL, true)) { - /* - * It's now safe to pin the buffer. We can't pin first and ask - * questions later, because it might confuse code paths like - * InvalidateBuffer() if we pinned a random non-matching buffer. - */ - if (have_private_ref) - PinBuffer(bufHdr, NULL); /* bump pin count */ - else - PinBuffer_Locked(bufHdr); /* pin for first time */ - - pgBufferUsage.shared_blks_hit++; - - return true; + if (BufferTagsEqual(&tag, &bufHdr->tag)) + { + pgBufferUsage.shared_blks_hit++; + return true; + } + UnpinBuffer(bufHdr); } - - /* If we locked the header above, now unlock. */ - if (!have_private_ref) - UnlockBufHdr(bufHdr, buf_state); } return false; @@ -2036,7 +2022,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, */ buf = GetBufferDescriptor(existing_buf_id); - valid = PinBuffer(buf, strategy); + valid = PinBuffer(buf, strategy, false); /* Can release the mapping lock as soon as we've pinned it */ LWLockRelease(newPartitionLock); @@ -2098,7 +2084,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, existing_buf_hdr = GetBufferDescriptor(existing_buf_id); - valid = PinBuffer(existing_buf_hdr, strategy); + valid = PinBuffer(existing_buf_hdr, strategy, false); /* Can release the mapping lock as soon as we've pinned it */ LWLockRelease(newPartitionLock); @@ -2736,7 +2722,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, * Pin the existing buffer before releasing the partition lock, * preventing it from being evicted. */ - valid = PinBuffer(existing_hdr, strategy); + valid = PinBuffer(existing_hdr, strategy, false); LWLockRelease(partition_lock); UnpinBuffer(victim_buf_hdr); @@ -3035,10 +3021,13 @@ ReleaseAndReadBuffer(Buffer buffer, * must have been done already. * * Returns true if buffer is BM_VALID, else false. This provision allows - * some callers to avoid an extra spinlock cycle. + * some callers to avoid an extra spinlock cycle. If skip_if_not_valid is + * true, then a false return value also indicates that the buffer was + * (recently) invalid and has not been pinned. */ static bool -PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy) +PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy, + bool skip_if_not_valid) { Buffer b = BufferDescriptorGetBuffer(buf); bool result; @@ -3054,11 +3043,12 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy) uint32 buf_state; uint32 old_buf_state; - ref = NewPrivateRefCountEntry(b); - old_buf_state = pg_atomic_read_u32(&buf->state); for (;;) { + if (unlikely(skip_if_not_valid && !(old_buf_state & BM_VALID))) + return false; + if (old_buf_state & BM_LOCKED) old_buf_state = WaitBufHdrUnlocked(buf); @@ -3088,6 +3078,8 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy) { result = (buf_state & BM_VALID) != 0; + ref = NewPrivateRefCountEntry(b); + /* * Assume that we acquired a buffer pin for the purposes of * Valgrind buffer client checks (even in !result case) to -- 2.47.3