WAL-log inplace update before revealing it to other sessions.

author Noah Misch <noah@leadboat.com>

Wed, 17 Dec 2025 00:13:54 +0000 (16:13 -0800)

committer Noah Misch <noah@leadboat.com>

Wed, 17 Dec 2025 00:13:55 +0000 (16:13 -0800)
author Noah Misch <noah@leadboat.com>
Wed, 17 Dec 2025 00:13:54 +0000 (16:13 -0800)
committer Noah Misch <noah@leadboat.com>
Wed, 17 Dec 2025 00:13:55 +0000 (16:13 -0800)
diff --git a/src/backend/access/heap/README.tuplock b/src/backend/access/heap/README.tuplock

index ad835ff4820afef2c453c712c4b2381655ee6d04..16f7d78b7d232c60b7ca31d52142d4f6527e3c2a 100644 (file)
--- a/src/backend/access/heap/README.tuplock
+++ b/src/backend/access/heap/README.tuplock
@@ -198,9 +198,7 @@ Inplace updates create an exception to the rule that tuple data won't change
  under a reader holding a pin.  A reader of a heap_fetch() result tuple may
  witness a torn read.  Current inplace-updated fields are aligned and are no
  wider than four bytes, and current readers don't need consistency across
-fields.  Hence, they get by with just fetching each field once.  XXX such a
-caller may also read a value that has not reached WAL; see
-systable_inplace_update_finish().
+fields.  Hence, they get by with just fetching each field once.
  
  During logical decoding, caches reflect an inplace update no later than the
  next XLOG_XACT_INVALIDATIONS.  That record witnesses the end of a command.
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c

index b9326eed3ffde7e6a7d6c1073adbe8dd4d542d87..d977df4cec8b36548ddf0cc6d14d4feecf893760 100644 (file)
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -6288,6 +6288,8 @@ heap_inplace_update_and_unlock(Relation relation,
         HeapTupleHeader htup = oldtup->t_data;
         uint32          oldlen;
         uint32          newlen;
+       char       *dst;
+       char       *src;
  
         Assert(ItemPointerEquals(&oldtup->t_self, &tuple->t_self));
         oldlen = oldtup->t_len - htup->t_hoff;
@@ -6295,6 +6297,9 @@ heap_inplace_update_and_unlock(Relation relation,
         if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
                 elog(ERROR, "wrong tuple length");
  
+       dst = (char *) htup + htup->t_hoff;
+       src = (char *) tuple->t_data + tuple->t_data->t_hoff;
+
         /*
          * Unlink relcache init files as needed.  If unlinking, acquire
          * RelCacheInitLock until after associated invalidations.  By doing this
@@ -6305,15 +6310,15 @@ heap_inplace_update_and_unlock(Relation relation,
          */
         PreInplace_Inval();
  
-       /* NO EREPORT(ERROR) from here till changes are logged */
-       START_CRIT_SECTION();
-
-       memcpy((char *) htup + htup->t_hoff,
-                  (char *) tuple->t_data + tuple->t_data->t_hoff,
-                  newlen);
-
         /*----------
-        * XXX A crash here can allow datfrozenxid() to get ahead of relfrozenxid:
+        * NO EREPORT(ERROR) from here till changes are complete
+        *
+        * Our buffer lock won't stop a reader having already pinned and checked
+        * visibility for this tuple.  Hence, we write WAL first, then mutate the
+        * buffer.  Like in MarkBufferDirtyHint() or RecordTransactionCommit(),
+        * checkpoint delay makes that acceptable.  With the usual order of
+        * changes, a crash after memcpy() and before XLogInsert() could allow
+        * datfrozenxid to overtake relfrozenxid:
          *
          * ["D" is a VACUUM (ONLY_DATABASE_STATS)]
          * ["R" is a VACUUM tbl]
@@ -6323,14 +6328,36 @@ heap_inplace_update_and_unlock(Relation relation,
          * D: raise pg_database.datfrozenxid, XLogInsert(), finish
          * [crash]
          * [recovery restores datfrozenxid w/o relfrozenxid]
-        */
-
-       MarkBufferDirty(buffer);
+        *
+        * Mimic MarkBufferDirtyHint() subroutine XLogSaveBufferForHint().
+        * Specifically, use DELAY_CHKPT_START, and copy the buffer to the stack.
+        * The stack copy facilitates a FPI of the post-mutation block before we
+        * accept other sessions seeing it.  DELAY_CHKPT_START allows us to
+        * XLogInsert() before MarkBufferDirty().  Since XLogSaveBufferForHint()
+        * can operate under BUFFER_LOCK_SHARED, it can't avoid DELAY_CHKPT_START.
+        * This function, however, likely could avoid it with the following order
+        * of operations: MarkBufferDirty(), XLogInsert(), memcpy().  Opt to use
+        * DELAY_CHKPT_START here, too, as a way to have fewer distinct code
+        * patterns to analyze.  Inplace update isn't so frequent that it should
+        * pursue the small optimization of skipping DELAY_CHKPT_START.
+        */
+       Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
+       START_CRIT_SECTION();
+       MyProc->delayChkptFlags |= DELAY_CHKPT_START;
  
         /* XLOG stuff */
         if (RelationNeedsWAL(relation))
         {
                 xl_heap_inplace xlrec;
+               PGAlignedBlock copied_buffer;
+               char       *origdata = (char *) BufferGetBlock(buffer);
+               Page            page = BufferGetPage(buffer);
+               uint16          lower = ((PageHeader) page)->pd_lower;
+               uint16          upper = ((PageHeader) page)->pd_upper;
+               uintptr_t       dst_offset_in_block;
+               RelFileLocator rlocator;
+               ForkNumber      forkno;
+               BlockNumber blkno;
                 XLogRecPtr      recptr;
  
                 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
@@ -6338,16 +6365,28 @@ heap_inplace_update_and_unlock(Relation relation,
                 XLogBeginInsert();
                 XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
  
-               XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
-               XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
+               /* register block matching what buffer will look like after changes */
+               memcpy(copied_buffer.data, origdata, lower);
+               memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper);
+               dst_offset_in_block = dst - origdata;
+               memcpy(copied_buffer.data + dst_offset_in_block, src, newlen);
+               BufferGetTag(buffer, &rlocator, &forkno, &blkno);
+               Assert(forkno == MAIN_FORKNUM);
+               XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data,
+                                                 REGBUF_STANDARD);
+               XLogRegisterBufData(0, src, newlen);
  
                 /* inplace updates aren't decoded atm, don't log the origin */
  
                 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
  
-               PageSetLSN(BufferGetPage(buffer), recptr);
+               PageSetLSN(page, recptr);
         }
  
+       memcpy(dst, src, newlen);
+
+       MarkBufferDirty(buffer);
+
         LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  
         /*
@@ -6356,6 +6395,7 @@ heap_inplace_update_and_unlock(Relation relation,
          */
         AtInplace_Inval();
  
+       MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
         END_CRIT_SECTION();
         UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock);
  
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h

index ea3ca9c48a676e9c667dbf2d13606c2c5126c29d..e16ab4a2e583524eec350db3a585ed9900bb9a5f 100644 (file)
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -109,10 +109,10 @@ struct XidCache
   * is inserted prior to the new redo point, the corresponding data changes will
   * also be flushed to disk before the checkpoint can complete. (In the
   * extremely common case where the data being modified is in shared buffers
- * and we acquire an exclusive content lock on the relevant buffers before
- * writing WAL, this mechanism is not needed, because phase 2 will block
- * until we release the content lock and then flush the modified data to
- * disk.)
+ * and we acquire an exclusive content lock and MarkBufferDirty() on the
+ * relevant buffers before writing WAL, this mechanism is not needed, because
+ * phase 2 will block until we release the content lock and then flush the
+ * modified data to disk.  See transam/README and SyncOneBuffer().)
   *
   * Setting DELAY_CHKPT_COMPLETE prevents the system from moving from phase 2
   * to phase 3. This is useful if we are performing a WAL-logged operation that
author	Noah Misch <noah@leadboat.com>
	Wed, 17 Dec 2025 00:13:54 +0000 (16:13 -0800)
committer	Noah Misch <noah@leadboat.com>
	Wed, 17 Dec 2025 00:13:55 +0000 (16:13 -0800)
src/backend/access/heap/README.tuplock		patch \| blob \| blame \| history
src/backend/access/heap/heapam.c		patch \| blob \| blame \| history
src/include/storage/proc.h		patch \| blob \| blame \| history