]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Fast promote mode skips checkpoint at end of recovery.
authorSimon Riggs <simon@2ndQuadrant.com>
Tue, 29 Jan 2013 00:06:15 +0000 (00:06 +0000)
committerSimon Riggs <simon@2ndQuadrant.com>
Tue, 29 Jan 2013 00:06:15 +0000 (00:06 +0000)
pg_ctl promote -m fast skips the checkpoint at the end of recovery so that we
can achieve very fast failover when the apply delay is low. A new WAL record,
XLOG_END_OF_RECOVERY, is written so that downstream log readers can switch
timelines correctly. When the synchronous end-of-recovery checkpoint is skipped,
we request a normal spread checkpoint instead, so that the window of re-recovery
stays small.

Simon Riggs and Kyotaro Horiguchi, with input from Fujii Masao.
Review by Heikki Linnakangas

src/backend/access/rmgrdesc/xlogdesc.c
src/backend/access/transam/xlog.c
src/bin/pg_ctl/pg_ctl.c
src/include/access/xlog_internal.h
src/include/catalog/pg_control.h

index 506b208c9cfa117b91983d481eb83b0393a79107..69012985161dcadd15cd6bd4d730cfc6e2fc9dd1 100644 (file)
@@ -18,6 +18,7 @@
 #include "access/xlog_internal.h"
 #include "catalog/pg_control.h"
 #include "utils/guc.h"
+#include "utils/timestamp.h"
 
 /*
  * GUC support
@@ -119,6 +120,15 @@ xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
                memcpy(&fpw, rec, sizeof(bool));
                appendStringInfo(buf, "full_page_writes: %s", fpw ? "true" : "false");
        }
+       else if (info == XLOG_END_OF_RECOVERY)
+       {
+               xl_end_of_recovery xlrec;
+
+               memcpy(&xlrec, rec, sizeof(xl_end_of_recovery));
+               appendStringInfo(buf, "end_of_recovery: tli %u; time %s",
+                                                xlrec.ThisTimeLineID,
+                                                timestamptz_to_str(xlrec.end_time));
+       }
        else
                appendStringInfo(buf, "UNKNOWN");
 }
index cf2f6e70cff9e5fa0bf608183c22746d360e45ad..bcd379dca73253b8c786de1b1fc0f0fcf44a2d18 100644 (file)
@@ -66,6 +66,7 @@
 #define RECOVERY_COMMAND_FILE  "recovery.conf"
 #define RECOVERY_COMMAND_DONE  "recovery.done"
 #define PROMOTE_SIGNAL_FILE "promote"
+#define FAST_PROMOTE_SIGNAL_FILE "fast_promote"
 
 
 /* User-settable parameters */
@@ -210,6 +211,9 @@ bool StandbyMode = false;
 static char *PrimaryConnInfo = NULL;
 static char *TriggerFile = NULL;
 
+/* whether request for fast promotion has been made yet */
+static bool fast_promote = false;
+
 /* if recoveryStopsHere returns true, it saves actual stop xid/time/name here */
 static TransactionId recoveryStopXid;
 static TimestampTz recoveryStopTime;
@@ -611,6 +615,7 @@ static void CheckRequiredParameterValues(void);
 static void XLogReportParameters(void);
 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI);
 static void LocalSetXLogInsertAllowed(void);
+static void CreateEndOfRecoveryRecord(void);
 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
 
@@ -642,7 +647,7 @@ static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
                   int emode, bool fetching_ckpt);
 static void CheckRecoveryConsistency(void);
 static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
-                                        XLogRecPtr RecPtr, int whichChkpt);
+                                        XLogRecPtr RecPtr, int whichChkpti, bool report);
 static bool rescanLatestTimeLine(void);
 static void WriteControlFile(void);
 static void ReadControlFile(void);
@@ -4848,7 +4853,7 @@ StartupXLOG(void)
                 * When a backup_label file is present, we want to roll forward from
                 * the checkpoint it identifies, rather than using pg_control.
                 */
-               record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0);
+               record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
                if (record != NULL)
                {
                        memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
@@ -4890,7 +4895,7 @@ StartupXLOG(void)
                 */
                checkPointLoc = ControlFile->checkPoint;
                RedoStartLSN = ControlFile->checkPointCopy.redo;
-               record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1);
+               record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
                if (record != NULL)
                {
                        ereport(DEBUG1,
@@ -4909,7 +4914,7 @@ StartupXLOG(void)
                else
                {
                        checkPointLoc = ControlFile->prevCheckPoint;
-                       record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2);
+                       record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
                        if (record != NULL)
                        {
                                ereport(LOG,
@@ -5393,22 +5398,33 @@ StartupXLOG(void)
                                }
 
                                /*
-                                * Before replaying this record, check if it is a shutdown
-                                * checkpoint record that causes the current timeline to
-                                * change. The checkpoint record is already considered to be
-                                * part of the new timeline, so we update ThisTimeLineID
-                                * before replaying it. That's important so that replayEndTLI,
-                                * which is recorded as the minimum recovery point's TLI if
+                                * Before replaying this record, check if this record
+                                * causes the current timeline to change. The record is
+                                * already considered to be part of the new timeline,
+                                * so we update ThisTimeLineID before replaying it.
+                                * That's important so that replayEndTLI, which is
+                                * recorded as the minimum recovery point's TLI if
                                 * recovery stops after this record, is set correctly.
                                 */
-                               if (record->xl_rmid == RM_XLOG_ID &&
-                                       (record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN)
+                               if (record->xl_rmid == RM_XLOG_ID)
                                {
-                                       CheckPoint      checkPoint;
-                                       TimeLineID      newTLI;
+                                       TimeLineID      newTLI = ThisTimeLineID;
+                                       uint8           info = record->xl_info & ~XLR_INFO_MASK;
+
+                                       if (info == XLOG_CHECKPOINT_SHUTDOWN)
+                                       {
+                                               CheckPoint      checkPoint;
+
+                                               memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
+                                               newTLI = checkPoint.ThisTimeLineID;
+                                       }
+                                       else if (info == XLOG_END_OF_RECOVERY)
+                                       {
+                                               xl_end_of_recovery      xlrec;
 
-                                       memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
-                                       newTLI = checkPoint.ThisTimeLineID;
+                                               memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
+                                               newTLI = xlrec.ThisTimeLineID;
+                                       }
 
                                        if (newTLI != ThisTimeLineID)
                                        {
@@ -5729,9 +5745,36 @@ StartupXLOG(void)
                 * allows some extra error checking in xlog_redo.
                 */
                if (bgwriterLaunched)
-                       RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
-                                                         CHECKPOINT_IMMEDIATE |
-                                                         CHECKPOINT_WAIT);
+               {
+                       bool    checkpoint_wait = true;
+
+                       /*
+                        * If we've been explicitly promoted with fast option,
+                        * end of recovery without a checkpoint if possible.
+                        */
+                       if (fast_promote)
+                       {
+                               checkPointLoc = ControlFile->prevCheckPoint;
+                               record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, false);
+                               if (record != NULL)
+                               {
+                                       checkpoint_wait = false;
+                                       CreateEndOfRecoveryRecord();
+                               }
+                       }
+
+                       /*
+                        * In most cases we will wait for a full checkpoint to complete.
+                        *
+                        * If not, issue a normal, non-immediate checkpoint but don't wait.
+                        */
+                       if (checkpoint_wait)
+                               RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
+                                                                       CHECKPOINT_IMMEDIATE |
+                                                                       CHECKPOINT_WAIT);
+                       else
+                               RequestCheckpoint(0);   /* No flags */
+               }
                else
                        CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
 
@@ -6060,12 +6103,15 @@ LocalSetXLogInsertAllowed(void)
  */
 static XLogRecord *
 ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
-                                        int whichChkpt)
+                                        int whichChkpt, bool report)
 {
        XLogRecord *record;
 
        if (!XRecOffIsValid(RecPtr))
        {
+               if (!report)
+                       return NULL;
+
                switch (whichChkpt)
                {
                        case 1:
@@ -6088,6 +6134,9 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
 
        if (record == NULL)
        {
+               if (!report)
+                       return NULL;
+
                switch (whichChkpt)
                {
                        case 1:
@@ -6882,6 +6931,44 @@ CreateCheckPoint(int flags)
        LWLockRelease(CheckpointLock);
 }
 
+/*
+ * Mark the end of recovery in WAL without running a full checkpoint.
+ *
+ * Inserts a single XLOG_END_OF_RECOVERY record that carries the current
+ * timeline ID and the time at which the record was built. Raises ERROR if
+ * called when recovery is not in progress.
+ *
+ * We can expect that a restartpoint is likely to be in progress as we
+ * do this, though we are unwilling to wait for it to complete. So be
+ * careful to avoid taking the CheckpointLock anywhere here.
+ *
+ * CreateRestartPoint() allows for the case where recovery may end before
+ * the restartpoint completes so there is no concern of concurrent behaviour.
+ */
+static void
+CreateEndOfRecoveryRecord(void)
+{
+       xl_end_of_recovery      xlrec;
+       XLogRecData                     rdata;
+
+       /* sanity check */
+       if (!RecoveryInProgress())
+               elog(ERROR, "can only be used to end recovery");
+
+       /*
+        * end_time is a TimestampTz and is later formatted with
+        * timestamptz_to_str(), so fill it from GetCurrentTimestamp() rather
+        * than from a time_t returned by time(NULL).
+        */
+       xlrec.end_time = GetCurrentTimestamp();
+       xlrec.ThisTimeLineID = ThisTimeLineID;
+
+       LocalSetXLogInsertAllowed();
+
+       START_CRIT_SECTION();
+
+       /* The record consists of one data chunk with no associated buffer */
+       rdata.data = (char *) &xlrec;
+       rdata.len = sizeof(xl_end_of_recovery);
+       rdata.buffer = InvalidBuffer;
+       rdata.next = NULL;
+
+       (void) XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY, &rdata);
+
+       END_CRIT_SECTION();
+
+       LocalXLogInsertAllowed = -1;            /* return to "check" state */
+}
+
+
 /*
  * Flush all data in shared memory to disk, and fsync
  *
@@ -7613,6 +7700,27 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
 
                RecoveryRestartPoint(&checkPoint);
        }
+       else if (info == XLOG_END_OF_RECOVERY)
+       {
+               xl_end_of_recovery xlrec;
+
+               /* Copy the payload out of the record before inspecting it */
+               memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
+
+               /*
+                * For Hot Standby, we could treat this like a Shutdown Checkpoint,
+                * but this case is rarer and harder to test, so the benefit doesn't
+                * outweigh the potential extra cost of maintenance.
+                */
+
+               /*
+                * We should've already switched to the new TLI before replaying this
+                * record.
+                */
+               if (xlrec.ThisTimeLineID != ThisTimeLineID)
+                       ereport(PANIC,
+                                       (errmsg("unexpected timeline ID %u (should be %u) in end-of-recovery record",
+                                                       xlrec.ThisTimeLineID, ThisTimeLineID)));
+       }
        else if (info == XLOG_NOOP)
        {
                /* nothing to do here */
@@ -9405,8 +9513,39 @@ CheckForStandbyTrigger(void)
 
        if (IsPromoteTriggered())
        {
-               ereport(LOG,
+               /*
+                * In 9.1 and 9.2 the postmaster unlinked the promote file
+                * inside the signal handler. We now leave the file in place
+                * and let the Startup process do the unlink. This allows
+                * Startup to know whether we're doing fast or normal
+                * promotion. Fast promotion takes precedence.
+                */
+               if (stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
+               {
+                       unlink(FAST_PROMOTE_SIGNAL_FILE);
+                       unlink(PROMOTE_SIGNAL_FILE);
+                       fast_promote = true;
+               }
+               else if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
+               {
+                       unlink(PROMOTE_SIGNAL_FILE);
+                       fast_promote = false;
+               }
+
+               /*
+                * We only look for fast promote via the pg_ctl promote option.
+                * It would be possible to extend trigger file support for the
+                * fast promotion option but that wouldn't be backwards compatible
+                * anyway and we're looking to focus further work on the promote
+                * option as the right way to signal end of recovery.
+                */
+               if (fast_promote)
+                       ereport(LOG,
+                               (errmsg("received fast promote request")));
+               else
+                       ereport(LOG,
                                (errmsg("received promote request")));
+
                ResetPromoteTriggered();
                triggered = true;
                return true;
@@ -9435,15 +9574,10 @@ CheckPromoteSignal(void)
 {
        struct stat stat_buf;
 
-       if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
-       {
-               /*
-                * Since we are in a signal handler, it's not safe to elog. We
-                * silently ignore any error from unlink.
-                */
-               unlink(PROMOTE_SIGNAL_FILE);
+       if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
+               stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
                return true;
-       }
+
        return false;
 }
 
index e412d71dcff8c556653569a973469a6fc48cec9e..e086b1244cc1f3552bc9c123c7aed17982246b3b 100644 (file)
@@ -1136,6 +1136,15 @@ do_promote(void)
                exit(1);
        }
 
+       /*
+        * Use two different kinds of promotion file so we can understand
+        * the difference between smart and fast promotion.
+        */
+       if (shutdown_mode >= FAST_MODE)
+               snprintf(promote_file, MAXPGPATH, "%s/fast_promote", pg_data);
+       else
+               snprintf(promote_file, MAXPGPATH, "%s/promote", pg_data);
+
        if ((prmfile = fopen(promote_file, "w")) == NULL)
        {
                write_stderr(_("%s: could not create promote signal file \"%s\": %s\n"),
@@ -1799,7 +1808,7 @@ do_help(void)
                         "                 [-o \"OPTIONS\"]\n"), progname);
        printf(_("  %s reload  [-D DATADIR] [-s]\n"), progname);
        printf(_("  %s status  [-D DATADIR]\n"), progname);
-       printf(_("  %s promote [-D DATADIR] [-s]\n"), progname);
+       printf(_("  %s promote [-D DATADIR] [-s] [-m PROMOTION-MODE]\n"), progname);
        printf(_("  %s kill    SIGNALNAME PID\n"), progname);
 #if defined(WIN32) || defined(__CYGWIN__)
        printf(_("  %s register   [-N SERVICENAME] [-U USERNAME] [-P PASSWORD] [-D DATADIR]\n"
@@ -1828,7 +1837,7 @@ do_help(void)
        printf(_("  -o OPTIONS             command line options to pass to postgres\n"
         "                         (PostgreSQL server executable) or initdb\n"));
        printf(_("  -p PATH-TO-POSTGRES    normally not necessary\n"));
-       printf(_("\nOptions for stop or restart:\n"));
+       printf(_("\nOptions for stop, restart or promote:\n"));
        printf(_("  -m, --mode=MODE        MODE can be \"smart\", \"fast\", or \"immediate\"\n"));
 
        printf(_("\nShutdown modes are:\n"));
@@ -1836,6 +1845,10 @@ do_help(void)
        printf(_("  fast        quit directly, with proper shutdown\n"));
        printf(_("  immediate   quit without complete shutdown; will lead to recovery on restart\n"));
 
+       printf(_("\nPromotion modes are:\n"));
+       printf(_("  smart       promote after performing a checkpoint\n"));
+       printf(_("  fast        promote quickly without waiting for checkpoint completion\n"));
+
        printf(_("\nAllowed signal names for kill:\n"));
        printf("  ABRT HUP INT QUIT TERM USR1 USR2\n");
 
@@ -2271,7 +2284,6 @@ main(int argc, char **argv)
                snprintf(pid_file, MAXPGPATH, "%s/postmaster.pid", pg_data);
                snprintf(backup_file, MAXPGPATH, "%s/backup_label", pg_data);
                snprintf(recovery_file, MAXPGPATH, "%s/recovery.conf", pg_data);
-               snprintf(promote_file, MAXPGPATH, "%s/promote", pg_data);
        }
 
        switch (ctl_command)
index 43e1e60f9bf82aa971938b4dd9bc3946885fe974..ce9957e618f7768352ad2e66416be3d33041a49e 100644 (file)
@@ -217,6 +217,12 @@ typedef struct xl_restore_point
        char            rp_name[MAXFNAMELEN];
 } xl_restore_point;
 
+/* End of recovery mark, logged as XLOG_END_OF_RECOVERY when we don't do an END_OF_RECOVERY checkpoint */
+typedef struct xl_end_of_recovery
+{
+       TimestampTz end_time;   /* time recorded when the record was created */
+       TimeLineID      ThisTimeLineID; /* timeline ID at record creation; replay switches to it */
+} xl_end_of_recovery;
 
 /*
  * XLogRecord is defined in xlog.h, but we avoid #including that to keep
index e4a9abe7bc55d21b83a08143c0b5caf3e828b7f5..ec8cea7c86e749c3bde19063664a64bdd56f77ed 100644 (file)
@@ -64,6 +64,7 @@ typedef struct CheckPoint
 #define XLOG_PARAMETER_CHANGE                  0x60
 #define XLOG_RESTORE_POINT                             0x70
 #define XLOG_FPW_CHANGE                                0x80
+#define XLOG_END_OF_RECOVERY                   0x90
 
 
 /*