]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Fix the torn-page hazard for PITR base backups by forcing full page writes
authorTom Lane <tgl@sss.pgh.pa.us>
Mon, 17 Apr 2006 18:55:05 +0000 (18:55 +0000)
committerTom Lane <tgl@sss.pgh.pa.us>
Mon, 17 Apr 2006 18:55:05 +0000 (18:55 +0000)
to occur between pg_start_backup() and pg_stop_backup(), even if the GUC
setting full_page_writes is OFF.  Per discussion, doing this in combination
with the already-existing checkpoint during pg_start_backup() should ensure
safety against partial page updates being included in the backup.  We do
not have to force full page writes to occur during normal PITR operation,
as I had first feared.

src/backend/access/transam/xlog.c

index 0daee527b597123cdbe90e4580f2376cc34c81c3..f21407c50751e5e0c7e66bb403a1b715914ba4ea 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.235 2006/04/14 20:27:24 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.236 2006/04/17 18:55:05 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -344,6 +344,7 @@ typedef struct XLogCtlInsert
        XLogPageHeader currpage;        /* points to header of block in cache */
        char       *currpos;            /* current insertion point in cache */
        XLogRecPtr      RedoRecPtr;             /* current redo point for insertions */
+       bool            forcePageWrites;        /* forcing full-page writes for PITR? */
 } XLogCtlInsert;
 
 /*
@@ -466,7 +467,7 @@ static void exitArchiveRecovery(TimeLineID endTLI,
                                        uint32 endLogId, uint32 endLogSeg);
 static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
 
-static bool XLogCheckBuffer(XLogRecData *rdata,
+static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
                                XLogRecPtr *lsn, BkpBlock *bkpb);
 static bool AdvanceXLInsertBuffer(void);
 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
@@ -544,6 +545,7 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
        unsigned        i;
        XLogwrtRqst LogwrtRqst;
        bool            updrqst;
+       bool            doPageWrites;
        bool            no_tran = (rmid == RM_XLOG_ID) ? true : false;
 
        if (info & XLR_INFO_MASK)
@@ -591,6 +593,14 @@ begin:;
                dtbuf_bkp[i] = false;
        }
 
+       /*
+        * Decide if we need to do full-page writes in this XLOG record: true if
+        * full_page_writes is on or we have a PITR request for it.  Since we
+        * don't yet have the insert lock, forcePageWrites could change under us,
+        * but we'll recheck it once we have the lock.
+        */
+       doPageWrites = fullPageWrites || Insert->forcePageWrites;
+
        INIT_CRC32(rdata_crc);
        len = 0;
        for (rdt = rdata;;)
@@ -622,7 +632,8 @@ begin:;
                                {
                                        /* OK, put it in this slot */
                                        dtbuf[i] = rdt->buffer;
-                                       if (XLogCheckBuffer(rdt, &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
+                                       if (XLogCheckBuffer(rdt, doPageWrites,
+                                                                               &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
                                        {
                                                dtbuf_bkp[i] = true;
                                                rdt->data = NULL;
@@ -735,30 +746,51 @@ begin:;
         * Check to see if my RedoRecPtr is out of date.  If so, may have to go
         * back and recompute everything.  This can only happen just after a
         * checkpoint, so it's better to be slow in this case and fast otherwise.
+        *
+        * If we aren't doing full-page writes then RedoRecPtr doesn't actually
+        * affect the contents of the XLOG record, so we'll update our local
+        * copy but not force a recomputation.
         */
        if (!XLByteEQ(RedoRecPtr, Insert->RedoRecPtr))
        {
                Assert(XLByteLT(RedoRecPtr, Insert->RedoRecPtr));
                RedoRecPtr = Insert->RedoRecPtr;
 
-               for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
+               if (doPageWrites)
                {
-                       if (dtbuf[i] == InvalidBuffer)
-                               continue;
-                       if (dtbuf_bkp[i] == false &&
-                               XLByteLE(dtbuf_lsn[i], RedoRecPtr))
+                       for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
                        {
-                               /*
-                                * Oops, this buffer now needs to be backed up, but we didn't
-                                * think so above.      Start over.
-                                */
-                               LWLockRelease(WALInsertLock);
-                               END_CRIT_SECTION();
-                               goto begin;
+                               if (dtbuf[i] == InvalidBuffer)
+                                       continue;
+                               if (dtbuf_bkp[i] == false &&
+                                       XLByteLE(dtbuf_lsn[i], RedoRecPtr))
+                               {
+                                       /*
+                                        * Oops, this buffer now needs to be backed up, but we
+                                        * didn't think so above.  Start over.
+                                        */
+                                       LWLockRelease(WALInsertLock);
+                                       END_CRIT_SECTION();
+                                       goto begin;
+                               }
                        }
                }
        }
 
+       /*
+        * Also check to see if forcePageWrites was just turned on; if we
+        * weren't already doing full-page writes then go back and recompute.
+        * (If it was just turned off, we could recompute the record without
+        * full pages, but we choose not to bother.)
+        */
+       if (Insert->forcePageWrites && !doPageWrites)
+       {
+               /* Oops, must redo it with full-page data */
+               LWLockRelease(WALInsertLock);
+               END_CRIT_SECTION();
+               goto begin;
+       }
+
        /*
         * Make additional rdata chain entries for the backup blocks, so that we
         * don't need to special-case them in the write loop.  Note that we have
@@ -966,7 +998,7 @@ begin:;
  * save the buffer's LSN at *lsn.
  */
 static bool
-XLogCheckBuffer(XLogRecData *rdata,
+XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
                                XLogRecPtr *lsn, BkpBlock *bkpb)
 {
        PageHeader      page;
@@ -980,7 +1012,7 @@ XLogCheckBuffer(XLogRecData *rdata,
         */
        *lsn = page->pd_lsn;
 
-       if (fullPageWrites &&
+       if (doPageWrites &&
                XLByteLE(page->pd_lsn, RedoRecPtr))
        {
                /*
@@ -5651,76 +5683,120 @@ pg_start_backup(PG_FUNCTION_ARGS)
                                                                                                 PointerGetDatum(backupid)));
 
        /*
-        * Force a CHECKPOINT.  This is not strictly necessary, but it seems like
-        * a good idea to minimize the amount of past WAL needed to use the
-        * backup.      Also, this guarantees that two successive backup runs will
-        * have different checkpoint positions and hence different history file
-        * names, even if nothing happened in between.
+        * Mark backup active in shared memory.  We must do full-page WAL writes
+        * during an on-line backup even if not doing so at other times, because
+        * it's quite possible for the backup dump to obtain a "torn" (partially
+        * written) copy of a database page if it reads the page concurrently
+        * with our write to the same page.  This can be fixed as long as the
+        * first write to the page in the WAL sequence is a full-page write.
+        * Hence, we turn on forcePageWrites and then force a CHECKPOINT, to
+        * ensure there are no dirty pages in shared memory that might get
+        * dumped while the backup is in progress without having a corresponding
+        * WAL record.  (Once the backup is complete, we need not force full-page
+        * writes anymore, since we expect that any pages not modified during
+        * the backup interval must have been correctly captured by the backup.)
+        *
+        * We must hold WALInsertLock to change the value of forcePageWrites,
+        * to ensure adequate interlocking against XLogInsert().
         */
-       RequestCheckpoint(true, false);
+       LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
+       if (XLogCtl->Insert.forcePageWrites)
+       {
+               LWLockRelease(WALInsertLock);
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("a backup is already in progress"),
+                                errhint("Run pg_stop_backup() and try again.")));
+       }
+       XLogCtl->Insert.forcePageWrites = true;
+       LWLockRelease(WALInsertLock);
 
-       /*
-        * Now we need to fetch the checkpoint record location, and also its REDO
-        * pointer.  The oldest point in WAL that would be needed to restore
-        * starting from the checkpoint is precisely the REDO pointer.
-        */
-       LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-       checkpointloc = ControlFile->checkPoint;
-       startpoint = ControlFile->checkPointCopy.redo;
-       LWLockRelease(ControlFileLock);
+       /* Use a TRY block to ensure we release forcePageWrites if fail below */
+       PG_TRY();
+       {
+               /*
+                * Force a CHECKPOINT.  Aside from being necessary to prevent torn
+                * page problems, this guarantees that two successive backup runs will
+                * have different checkpoint positions and hence different history
+                * file names, even if nothing happened in between.
+                */
+               RequestCheckpoint(true, false);
 
-       XLByteToSeg(startpoint, _logId, _logSeg);
-       XLogFileName(xlogfilename, ThisTimeLineID, _logId, _logSeg);
+               /*
+                * Now we need to fetch the checkpoint record location, and also its
+                * REDO pointer.  The oldest point in WAL that would be needed to
+                * restore starting from the checkpoint is precisely the REDO pointer.
+                */
+               LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+               checkpointloc = ControlFile->checkPoint;
+               startpoint = ControlFile->checkPointCopy.redo;
+               LWLockRelease(ControlFileLock);
 
-       /*
-        * We deliberately use strftime/localtime not the src/timezone functions,
-        * so that backup labels will consistently be recorded in the same
-        * timezone regardless of TimeZone setting.  This matches elog.c's
-        * practice.
-        */
-       stamp_time = time(NULL);
-       strftime(strfbuf, sizeof(strfbuf),
-                        "%Y-%m-%d %H:%M:%S %Z",
-                        localtime(&stamp_time));
+               XLByteToSeg(startpoint, _logId, _logSeg);
+               XLogFileName(xlogfilename, ThisTimeLineID, _logId, _logSeg);
 
-       /*
-        * Check for existing backup label --- implies a backup is already running
-        */
-       if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
-       {
-               if (errno != ENOENT)
+               /*
+                * We deliberately use strftime/localtime not the src/timezone
+                * functions, so that backup labels will consistently be recorded in
+                * the same timezone regardless of TimeZone setting.  This matches
+                * elog.c's practice.
+                */
+               stamp_time = time(NULL);
+               strftime(strfbuf, sizeof(strfbuf),
+                                "%Y-%m-%d %H:%M:%S %Z",
+                                localtime(&stamp_time));
+
+               /*
+                * Check for existing backup label --- implies a backup is already
+                * running.  (XXX given that we checked forcePageWrites above, maybe
+                * it would be OK to just unlink any such label file?)
+                */
+               if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
+               {
+                       if (errno != ENOENT)
+                               ereport(ERROR,
+                                               (errcode_for_file_access(),
+                                                errmsg("could not stat file \"%s\": %m",
+                                                               BACKUP_LABEL_FILE)));
+               }
+               else
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                        errmsg("a backup is already in progress"),
+                                        errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
+                                                        BACKUP_LABEL_FILE)));
+
+               /*
+                * Okay, write the file
+                */
+               fp = AllocateFile(BACKUP_LABEL_FILE, "w");
+               if (!fp)
                        ereport(ERROR,
                                        (errcode_for_file_access(),
-                                        errmsg("could not stat file \"%s\": %m",
+                                        errmsg("could not create file \"%s\": %m",
+                                                       BACKUP_LABEL_FILE)));
+               fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
+                               startpoint.xlogid, startpoint.xrecoff, xlogfilename);
+               fprintf(fp, "CHECKPOINT LOCATION: %X/%X\n",
+                               checkpointloc.xlogid, checkpointloc.xrecoff);
+               fprintf(fp, "START TIME: %s\n", strfbuf);
+               fprintf(fp, "LABEL: %s\n", backupidstr);
+               if (fflush(fp) || ferror(fp) || FreeFile(fp))
+                       ereport(ERROR,
+                                       (errcode_for_file_access(),
+                                        errmsg("could not write file \"%s\": %m",
                                                        BACKUP_LABEL_FILE)));
        }
-       else
-               ereport(ERROR,
-                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-                                errmsg("a backup is already in progress"),
-                                errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
-                                                BACKUP_LABEL_FILE)));
+       PG_CATCH();
+       {
+               /* Turn off forcePageWrites on failure */
+               LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
+               XLogCtl->Insert.forcePageWrites = false;
+               LWLockRelease(WALInsertLock);
 
-       /*
-        * Okay, write the file
-        */
-       fp = AllocateFile(BACKUP_LABEL_FILE, "w");
-       if (!fp)
-               ereport(ERROR,
-                               (errcode_for_file_access(),
-                                errmsg("could not create file \"%s\": %m",
-                                               BACKUP_LABEL_FILE)));
-       fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
-                       startpoint.xlogid, startpoint.xrecoff, xlogfilename);
-       fprintf(fp, "CHECKPOINT LOCATION: %X/%X\n",
-                       checkpointloc.xlogid, checkpointloc.xrecoff);
-       fprintf(fp, "START TIME: %s\n", strfbuf);
-       fprintf(fp, "LABEL: %s\n", backupidstr);
-       if (fflush(fp) || ferror(fp) || FreeFile(fp))
-               ereport(ERROR,
-                               (errcode_for_file_access(),
-                                errmsg("could not write file \"%s\": %m",
-                                               BACKUP_LABEL_FILE)));
+               PG_RE_THROW();
+       }
+       PG_END_TRY();
 
        /*
         * We're done.  As a convenience, return the starting WAL offset.
@@ -5766,10 +5842,12 @@ pg_stop_backup(PG_FUNCTION_ARGS)
 
        /*
         * Get the current end-of-WAL position; it will be unsafe to use this dump
-        * to restore to a point in advance of this time.
+        * to restore to a point in advance of this time.  We can also clear
+        * forcePageWrites here.
         */
        LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
        INSERT_RECPTR(stoppoint, Insert, Insert->curridx);
+       XLogCtl->Insert.forcePageWrites = false;
        LWLockRelease(WALInsertLock);
 
        XLByteToSeg(stoppoint, _logId, _logSeg);