* Write a backup block if needed when we are setting a hint. Note that
* this may be called for a variety of page types, not just heaps.
*
- * Callable while holding just share lock on the buffer content.
+ * Callable while holding just a share-exclusive lock on the buffer
+ * content. That suffices to prevent concurrent modifications of the
+ * buffer. The buffer already needs to have been marked dirty by
+ * MarkBufferDirtyHint().
*
* We can't use the plain backup block mechanism since that relies on the
* Buffer being exclusively locked. Since some modifications (setting LSN, hint
XLogRecPtr lsn;
XLogRecPtr RedoRecPtr;
- /*
- * Ensure no checkpoint can change our view of RedoRecPtr.
- */
- Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) != 0);
+ /* this also verifies that we hold an appropriate lock */
+ Assert(BufferIsDirty(buffer));
/*
- * Update RedoRecPtr so that we can make the right decision
+ * Update RedoRecPtr so that we can make the right decision. It's possible
+ * that a new checkpoint will start just after GetRedoRecPtr(), but that
+ * is ok, as the buffer is already dirty, ensuring that any BufferSync()
+ * started after the buffer was marked dirty cannot complete without
+ * flushing this buffer. If a checkpoint started between marking the
+ * buffer dirty and this check, we will emit an unnecessary WAL record (as
+ * the buffer will be written out as part of the checkpoint), but the
+ * window for that is not big.
*/
RedoRecPtr = GetRedoRecPtr();
if (unlikely(!(lockstate & BM_DIRTY)))
{
XLogRecPtr lsn = InvalidXLogRecPtr;
- bool delayChkptFlags = false;
+ bool wal_log = false;
uint64 buf_state;
/*
RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
return;
- /*
- * If the block is already dirty because we either made a change
- * or set a hint already, then we don't need to write a full page
- * image. Note that aggressive cleaning of blocks dirtied by hint
- * bit setting would increase the call rate. Bulk setting of hint
- * bits would reduce the call rate...
- *
- * We must issue the WAL record before we mark the buffer dirty.
- * Otherwise we might write the page before we write the WAL. That
- * causes a race condition, since a checkpoint might occur between
- * writing the WAL record and marking the buffer dirty. We solve
- * that with a kluge, but one that is already in use during
- * transaction commit to prevent race conditions. Basically, we
- * simply prevent the checkpoint WAL record from being written
- * until we have marked the buffer dirty. We don't start the
- * checkpoint flush until we have marked dirty, so our checkpoint
- * must flush the change to disk successfully or the checkpoint
- * never gets written, so crash recovery will fix.
- *
- * It's possible we may enter here without an xid, so it is
- * essential that CreateCheckPoint waits for virtual transactions
- * rather than full transactionids.
- */
- Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
- MyProc->delayChkptFlags |= DELAY_CHKPT_START;
- delayChkptFlags = true;
- lsn = XLogSaveBufferForHint(buffer, buffer_std);
+ wal_log = true;
}
+ /*
+ * We must mark the page dirty before we emit the WAL record, as per
+ * the usual rules, to ensure that BufferSync()/SyncOneBuffer() try to
+ * flush the buffer, even if we haven't inserted the WAL record yet.
+ * As we hold at least a share-exclusive lock, checkpoints will wait
+ * for this backend to be done with the buffer before continuing. If
+ * we did it the other way round, a checkpoint could start between
+ * writing the WAL record and marking the buffer dirty.
+ */
buf_state = LockBufHdr(bufHdr);
/*
*/
Assert(!(buf_state & BM_DIRTY));
Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+ UnlockBufHdrExt(bufHdr, buf_state,
+ BM_DIRTY,
+ 0, 0);
+
+ /*
+ * If the block is already dirty because we either made a change or
+ * set a hint already, then we don't need to write a full page image.
+ * Note that aggressive cleaning of blocks dirtied by hint bit setting
+ * would increase the call rate. Bulk setting of hint bits would
+ * reduce the call rate...
+ */
+ if (wal_log)
+ lsn = XLogSaveBufferForHint(buffer, buffer_std);
if (XLogRecPtrIsValid(lsn))
{
* checksum here. That will happen when the page is written
* sometime later in this checkpoint cycle.
*/
+ buf_state = LockBufHdr(bufHdr);
PageSetLSN(page, lsn);
+ UnlockBufHdr(bufHdr);
}
- UnlockBufHdrExt(bufHdr, buf_state,
- BM_DIRTY,
- 0, 0);
-
- if (delayChkptFlags)
- MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
-
pgBufferUsage.shared_blks_dirtied++;
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageDirty;