]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
bufmgr: Don't copy pages while writing out
authorAndres Freund <andres@anarazel.de>
Fri, 27 Mar 2026 19:27:04 +0000 (15:27 -0400)
committerAndres Freund <andres@anarazel.de>
Fri, 27 Mar 2026 19:56:29 +0000 (15:56 -0400)
After the series of preceding commits introducing and using
BufferBeginSetHintBits()/BufferSetHintBits16(), hint bits are not set anymore
while IO is going on. Therefore we do not need to copy pages while they are
being written out anymore.

For the same reason XLogSaveBufferForHint() now does not need to operate on a
copy of the page anymore, but can instead use the normal XLogRegisterBuffer()
mechanism. For that the assertions and comments to XLogRegisterBuffer() had to
be updated to allow share-exclusive locked buffers to be registered.

Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/5ubipyssiju5twkb7zgqwdr7q2vhpkpmuelxfpanetlk6ofnop@hvxb4g2amb2d

src/backend/access/hash/hashpage.c
src/backend/access/transam/xloginsert.c
src/backend/storage/buffer/bufmgr.c
src/backend/storage/buffer/localbuf.c
src/backend/storage/page/bufpage.c
src/backend/storage/smgr/bulk_write.c
src/include/storage/bufpage.h
src/test/modules/test_aio/test_aio.c

index 263bc73f11eb0e292a722b0b50b0b208393b8baa..8099b0d021f05dcd3ef82c1ba4276e77efc6c5f2 100644 (file)
@@ -1031,7 +1031,7 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
                                        zerobuf.data,
                                        true);
 
-       PageSetChecksumInplace(page, lastblock);
+       PageSetChecksum(page, lastblock);
        smgrextend(RelationGetSmgr(rel), MAIN_FORKNUM, lastblock, zerobuf.data,
                           false);
 
index e4a819efeeb88a5bec648ce407c2d46e1cddae9c..f2e10b82b7d3e742036f749a049d1f8ed82a9d48 100644 (file)
@@ -252,9 +252,9 @@ XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
        Assert(begininsert_called);
 
        /*
-        * Ordinarily, buffer should be exclusive-locked and marked dirty before
-        * we get here, otherwise we could end up violating one of the rules in
-        * access/transam/README.
+        * Ordinarily, the buffer should be exclusive-locked (or share-exclusive
+        * in case of hint bits) and marked dirty before we get here, otherwise we
+        * could end up violating one of the rules in access/transam/README.
         *
         * Some callers intentionally register a clean page and never update that
         * page's LSN; in that case they can pass the flag REGBUF_NO_CHANGE to
@@ -262,8 +262,11 @@ XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
         */
 #ifdef USE_ASSERT_CHECKING
        if (!(flags & REGBUF_NO_CHANGE))
-               Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE) &&
-                          BufferIsDirty(buffer));
+       {
+               Assert(BufferIsDirty(buffer));
+               Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE) ||
+                          BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_SHARE_EXCLUSIVE));
+       }
 #endif
 
        if (block_id >= max_registered_block_id)
@@ -1123,12 +1126,6 @@ XLogCheckBufferNeedsBackup(Buffer buffer)
  * buffer. The buffer already needs to have been marked dirty by
  * MarkBufferDirtyHint().
  *
- * We can't use the plain backup block mechanism since that relies on the
- * Buffer being exclusively locked. Since some modifications (setting LSN, hint
- * bits) are allowed in a sharelocked buffer that can lead to wal checksum
- * failures. So instead we copy the page and insert the copied data as normal
- * record data.
- *
  * We only need to do something if page has not yet been full page written in
  * this checkpoint round. The LSN of the inserted wal record is returned if we
  * had to write, InvalidXLogRecPtr otherwise.
@@ -1164,37 +1161,13 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
        if (lsn <= RedoRecPtr)
        {
                int                     flags = 0;
-               PGAlignedBlock copied_buffer;
-               char       *origdata = (char *) BufferGetBlock(buffer);
-               RelFileLocator rlocator;
-               ForkNumber      forkno;
-               BlockNumber blkno;
-
-               /*
-                * Copy buffer so we don't have to worry about concurrent hint bit or
-                * lsn updates. We assume pd_lower/upper cannot be changed without an
-                * exclusive lock, so the contents bkp are not racy.
-                */
-               if (buffer_std)
-               {
-                       /* Assume we can omit data between pd_lower and pd_upper */
-                       Page            page = BufferGetPage(buffer);
-                       uint16          lower = ((PageHeader) page)->pd_lower;
-                       uint16          upper = ((PageHeader) page)->pd_upper;
-
-                       memcpy(copied_buffer.data, origdata, lower);
-                       memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper);
-               }
-               else
-                       memcpy(copied_buffer.data, origdata, BLCKSZ);
 
                XLogBeginInsert();
 
                if (buffer_std)
                        flags |= REGBUF_STANDARD;
 
-               BufferGetTag(buffer, &rlocator, &forkno, &blkno);
-               XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data, flags);
+               XLogRegisterBuffer(0, buffer, flags);
 
                recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI_FOR_HINT);
        }
index e212f6110f2033f9f63a9d8b58706fbdf512c2c8..f4f07c9a1c08446b26e51ee75d8a9283dfca1f9e 100644 (file)
@@ -4440,7 +4440,6 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
        ErrorContextCallback errcallback;
        instr_time      io_start;
        Block           bufBlock;
-       char       *bufToWrite;
 
        Assert(BufferLockHeldByMeInMode(buf, BUFFER_LOCK_EXCLUSIVE) ||
                   BufferLockHeldByMeInMode(buf, BUFFER_LOCK_SHARE_EXCLUSIVE));
@@ -4503,22 +4502,15 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
         */
        bufBlock = BufHdrGetBlock(buf);
 
-       /*
-        * Update page checksum if desired.  Since we have only shared lock on the
-        * buffer, other processes might be updating hint bits in it, so we must
-        * copy the page to private storage if we do checksumming.
-        */
-       bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
+       /* Update page checksum if desired. */
+       PageSetChecksum((Page) bufBlock, buf->tag.blockNum);
 
        io_start = pgstat_prepare_io_time(track_io_timing);
 
-       /*
-        * bufToWrite is either the shared buffer or a copy, as appropriate.
-        */
        smgrwrite(reln,
                          BufTagGetForkNum(&buf->tag),
                          buf->tag.blockNum,
-                         bufToWrite,
+                         bufBlock,
                          false);
 
        /*
index 404c6bccbdd7e3742120876cde644ef576ebc88b..b69398c637567d3f5ac69205aa5c04079a8ae96d 100644 (file)
@@ -199,7 +199,7 @@ FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
                reln = smgropen(BufTagGetRelFileLocator(&bufHdr->tag),
                                                MyProcNumber);
 
-       PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
+       PageSetChecksum(localpage, bufHdr->tag.blockNum);
 
        io_start = pgstat_prepare_io_time(track_io_timing);
 
index de85911e3ac5f1144d84e416b336e19c3c03e31a..56f1f7ae9fc312cb412e68f64d064a97a404fde0 100644 (file)
@@ -1492,53 +1492,20 @@ PageIndexTupleOverwrite(Page page, OffsetNumber offnum,
 
 
 /*
- * Set checksum for a page in shared buffers.
+ * Set checksum on a page.
  *
- * If checksums are disabled, or if the page is not initialized, just return
- * the input.  Otherwise, we must make a copy of the page before calculating
- * the checksum, to prevent concurrent modifications (e.g. setting hint bits)
- * from making the final checksum invalid.  It doesn't matter if we include or
- * exclude hints during the copy, as long as we write a valid page and
- * associated checksum.
+ * If the page is in shared buffers, it needs to be locked in at least
+ * share-exclusive mode.
  *
- * Returns a pointer to the block-sized data that needs to be written. Uses
- * statically-allocated memory, so the caller must immediately write the
- * returned page and not refer to it again.
- */
-char *
-PageSetChecksumCopy(Page page, BlockNumber blkno)
-{
-       static char *pageCopy = NULL;
-
-       /* If we don't need a checksum, just return the passed-in data */
-       if (PageIsNew(page) || !DataChecksumsEnabled())
-               return page;
-
-       /*
-        * We allocate the copy space once and use it over on each subsequent
-        * call.  The point of palloc'ing here, rather than having a static char
-        * array, is first to ensure adequate alignment for the checksumming code
-        * and second to avoid wasting space in processes that never call this.
-        */
-       if (pageCopy == NULL)
-               pageCopy = MemoryContextAllocAligned(TopMemoryContext,
-                                                                                        BLCKSZ,
-                                                                                        PG_IO_ALIGN_SIZE,
-                                                                                        0);
-
-       memcpy(pageCopy, page, BLCKSZ);
-       ((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno);
-       return pageCopy;
-}
-
-/*
- * Set checksum for a page in private memory.
+ * If checksums are disabled, or if the page is not initialized, just
+ * return. Otherwise compute and set the checksum.
  *
- * This must only be used when we know that no other process can be modifying
- * the page buffer.
+ * In the past this needed to be done on a copy of the page, due to the
+ * possibility of e.g., hint bits being set concurrently. However, this is not
+ * necessary anymore as hint bits won't be set while IO is going on.
  */
 void
-PageSetChecksumInplace(Page page, BlockNumber blkno)
+PageSetChecksum(Page page, BlockNumber blkno)
 {
        /* If we don't need a checksum, just return */
        if (PageIsNew(page) || !DataChecksumsEnabled())
index 36b28824ec88af5968fd75207f501f88a25bd16b..f3c24082a69b499d2638ef03f1372ddeb0c8e89f 100644 (file)
@@ -279,7 +279,7 @@ smgr_bulk_flush(BulkWriteState *bulkstate)
                BlockNumber blkno = pending_writes[i].blkno;
                Page            page = pending_writes[i].buf->data;
 
-               PageSetChecksumInplace(page, blkno);
+               PageSetChecksum(page, blkno);
 
                if (blkno >= bulkstate->relsize)
                {
index 3de58ba43123e056120b629d46025fa7031c66c2..e5267b93fe635868e5235a3ecf5d94ece5d6e1a7 100644 (file)
@@ -537,7 +537,6 @@ extern void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems);
 extern void PageIndexTupleDeleteNoCompact(Page page, OffsetNumber offnum);
 extern bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum,
                                                                        const void *newtup, Size newsize);
-extern char *PageSetChecksumCopy(Page page, BlockNumber blkno);
-extern void PageSetChecksumInplace(Page page, BlockNumber blkno);
+extern void PageSetChecksum(Page page, BlockNumber blkno);
 
 #endif                                                 /* BUFPAGE_H */
index b1aa8af9ec008710ab7843c9491fd4ab2b284088..2ae4a559fab0a797994fc26cfa2efe83f7f3209e 100644 (file)
@@ -288,7 +288,7 @@ modify_rel_block(PG_FUNCTION_ARGS)
        }
        else
        {
-               PageSetChecksumInplace(page, blkno);
+               PageSetChecksum(page, blkno);
        }
 
        smgrwrite(RelationGetSmgr(rel),