]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Allow Hot Standby to begin from a shutdown checkpoint.
authorHeikki Linnakangas <heikki.linnakangas@iki.fi>
Tue, 13 Apr 2010 14:17:46 +0000 (14:17 +0000)
committerHeikki Linnakangas <heikki.linnakangas@iki.fi>
Tue, 13 Apr 2010 14:17:46 +0000 (14:17 +0000)
Patch by Simon Riggs & me

src/backend/access/transam/twophase.c
src/backend/access/transam/xlog.c
src/include/access/twophase.h

index b1bf2c4f26052ade81df863a947700e459783a74..faafc7e5c18c7d717affac2f1ee1e59d9e488063 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *             $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.59 2010/02/26 02:00:34 momjian Exp $
+ *             $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.60 2010/04/13 14:17:46 heikki Exp $
  *
  * NOTES
  *             Each global transaction is associated with a global transaction
@@ -1718,6 +1718,89 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p)
        return result;
 }
 
+/*
+ * StandbyRecoverPreparedTransactions
+ *
+ * Scan the pg_twophase directory and setup all the required information to
+ * allow standby queries to treat prepared transactions as still active.
+ * This is never called at the end of recovery - we use
+ * RecoverPreparedTransactions() at that point.
+ *
+ * Currently we simply call SubTransSetParent() for any subxids of prepared
+ * transactions. If overwriteOK is true, it's OK if some XIDs have already
+ * been marked in pg_subtrans.
+ *
+ * Directory entries whose name is not exactly eight upper-case hex digits
+ * are skipped.  A state file is removed with a WARNING, and otherwise
+ * ignored, if its transaction has already committed or aborted, if it
+ * cannot be read, or if the XID in its header does not match its file name.
+ *
+ * The buffer returned by ReadTwoPhaseFile() is released once each file has
+ * been processed, so nothing accumulates across files or across calls.
+ */
+void
+StandbyRecoverPreparedTransactions(bool overwriteOK)
+{
+       DIR                *cldir;
+       struct dirent *clde;
+
+       cldir = AllocateDir(TWOPHASE_DIR);
+       while ((clde = ReadDir(cldir, TWOPHASE_DIR)) != NULL)
+       {
+               /* State files are named after the XID: eight hex digits */
+               if (strlen(clde->d_name) == 8 &&
+                       strspn(clde->d_name, "0123456789ABCDEF") == 8)
+               {
+                       TransactionId xid;
+                       char       *buf;
+                       TwoPhaseFileHeader *hdr;
+                       TransactionId *subxids;
+                       int                     i;
+
+                       xid = (TransactionId) strtoul(clde->d_name, NULL, 16);
+
+                       /* Already processed? */
+                       if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
+                       {
+                               ereport(WARNING,
+                                               (errmsg("removing stale two-phase state file \"%s\"",
+                                                               clde->d_name)));
+                               RemoveTwoPhaseFile(xid, true);
+                               continue;
+                       }
+
+                       /* Read and validate file */
+                       buf = ReadTwoPhaseFile(xid, true);
+                       if (buf == NULL)
+                       {
+                               ereport(WARNING,
+                                         (errmsg("removing corrupt two-phase state file \"%s\"",
+                                                         clde->d_name)));
+                               RemoveTwoPhaseFile(xid, true);
+                               continue;
+                       }
+
+                       /* Deconstruct header */
+                       hdr = (TwoPhaseFileHeader *) buf;
+                       if (!TransactionIdEquals(hdr->xid, xid))
+                       {
+                               ereport(WARNING,
+                                         (errmsg("removing corrupt two-phase state file \"%s\"",
+                                                         clde->d_name)));
+                               RemoveTwoPhaseFile(xid, true);
+                               pfree(buf);
+                               continue;
+                       }
+
+                       /*
+                        * Examine subtransaction XIDs ... they should all follow main
+                        * XID.  The subxid array starts right after the (aligned)
+                        * file header.
+                        */
+                       subxids = (TransactionId *)
+                               (buf + MAXALIGN(sizeof(TwoPhaseFileHeader)));
+                       for (i = 0; i < hdr->nsubxacts; i++)
+                       {
+                               TransactionId subxid = subxids[i];
+
+                               Assert(TransactionIdFollows(subxid, xid));
+                               SubTransSetParent(xid, subxid, overwriteOK);
+                       }
+
+                       /*
+                        * Done with this file.  hdr and subxids point into buf, so
+                        * it can only be released here; without this we would leak
+                        * one buffer per prepared transaction on every call.
+                        */
+                       pfree(buf);
+               }
+       }
+       FreeDir(cldir);
+}
+
 /*
  * RecoverPreparedTransactions
  *
index 379c6f11750b26ea4feff7f777d333668b302ca9..5fd4b870bef3d9254afe4cdecf847b94229ce4c2 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.393 2010/04/12 10:40:42 heikki Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.394 2010/04/13 14:17:46 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -496,6 +496,7 @@ static TimeLineID lastPageTLI = 0;
 static XLogRecPtr minRecoveryPoint;            /* local copy of
                                                                                 * ControlFile->minRecoveryPoint */
 static bool updateMinRecoveryPoint = true;
+static bool reachedMinRecoveryPoint = false;
 
 static bool InRedo = false;
 
@@ -551,6 +552,7 @@ static void ValidateXLOGDirectoryStructure(void);
 static void CleanupBackupHistory(void);
 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
 static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt);
+static void CheckRecoveryConsistency(void);
 static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
 static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
 static List *readTimeLineHistory(TimeLineID targetTLI);
@@ -5591,7 +5593,6 @@ StartupXLOG(void)
        uint32          freespace;
        TransactionId oldestActiveXID;
        bool            bgwriterLaunched = false;
-       bool            backendsAllowed = false;
 
        /*
         * Read control file and check XLOG status looks valid.
@@ -5838,6 +5839,8 @@ StartupXLOG(void)
        if (InRecovery)
        {
                int                     rmid;
+               /* use volatile pointer to prevent code rearrangement */
+               volatile XLogCtlData *xlogctl = XLogCtl;
 
                /*
                 * Update pg_control to show that we are recovering and to show the
@@ -5930,6 +5933,33 @@ StartupXLOG(void)
                        StartupMultiXact();
 
                        ProcArrayInitRecoveryInfo(oldestActiveXID);
+
+                       /*
+                        * If we're beginning at a shutdown checkpoint, we know that
+                        * nothing was running on the master at this point. So fake-up
+                        * an empty running-xacts record and use that here and now.
+                        * Recover additional standby state for prepared transactions.
+                        */
+                       if (wasShutdown)
+                       {
+                               RunningTransactionsData running;
+
+                               /*
+                                * Construct a RunningTransactions snapshot representing a shut
+                                * down server, with only prepared transactions still alive.
+                                * We're never overflowed at this point because all subxids
+                                * are listed with their parent prepared transactions.
+                                */
+                               running.xcnt = nxids;
+                               running.subxid_overflow = false;
+                               running.nextXid = checkPoint.nextXid;
+                               running.oldestRunningXid = oldestActiveXID;
+                               running.xids = xids;
+
+                               ProcArrayApplyRecoveryInfo(&running);
+
+                               StandbyRecoverPreparedTransactions(false);
+                       }
                }
 
                /* Initialize resource managers */
@@ -5939,6 +5969,46 @@ StartupXLOG(void)
                                RmgrTable[rmid].rm_startup();
                }
 
+               /*
+                * Initialize shared replayEndRecPtr and recoveryLastRecPtr.
+                *
+                * This is slightly confusing if we're starting from an online
+                * checkpoint; we've just read and replayed the checkpoint record,
+                * but we're going to start replay from its redo pointer, which
+                * precedes the location of the checkpoint record itself. So even
+                * though the last record we've replayed is indeed ReadRecPtr, we
+                * haven't replayed all the preceding records yet. That's OK for
+                * the current use of these variables.
+                */
+               SpinLockAcquire(&xlogctl->info_lck);
+               xlogctl->replayEndRecPtr = ReadRecPtr;
+               xlogctl->recoveryLastRecPtr = ReadRecPtr;
+               SpinLockRelease(&xlogctl->info_lck);
+
+               /*
+                * Let postmaster know we've started redo now, so that it can
+                * launch bgwriter to perform restartpoints.  We don't bother
+                * during crash recovery as restartpoints can only be performed
+                * during archive recovery.  And we'd like to keep crash recovery
+                * simple, to avoid introducing bugs that could prevent you from
+                * recovering after crash.
+                *
+                * After this point, we can no longer assume that we're the only
+                * process in addition to postmaster!  Also, fsync requests are
+                * subsequently to be handled by the bgwriter, not locally.
+                */
+               if (InArchiveRecovery && IsUnderPostmaster)
+               {
+                       SetForwardFsyncRequests();
+                       SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
+                       bgwriterLaunched = true;
+               }
+
+               /*
+                * Allow read-only connections immediately if we're consistent already.
+                */
+               CheckRecoveryConsistency();
+
                /*
                 * Find the first record that logically follows the checkpoint --- it
                 * might physically precede it, though.
@@ -5958,43 +6028,14 @@ StartupXLOG(void)
                {
                        bool            recoveryContinue = true;
                        bool            recoveryApply = true;
-                       bool            reachedMinRecoveryPoint = false;
                        ErrorContextCallback errcontext;
 
-                       /* use volatile pointer to prevent code rearrangement */
-                       volatile XLogCtlData *xlogctl = XLogCtl;
-
-                       /* initialize shared replayEndRecPtr and recoveryLastRecPtr */
-                       SpinLockAcquire(&xlogctl->info_lck);
-                       xlogctl->replayEndRecPtr = ReadRecPtr;
-                       xlogctl->recoveryLastRecPtr = ReadRecPtr;
-                       SpinLockRelease(&xlogctl->info_lck);
-
                        InRedo = true;
 
                        ereport(LOG,
                                        (errmsg("redo starts at %X/%X",
                                                        ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
 
-                       /*
-                        * Let postmaster know we've started redo now, so that it can
-                        * launch bgwriter to perform restartpoints.  We don't bother
-                        * during crash recovery as restartpoints can only be performed
-                        * during archive recovery.  And we'd like to keep crash recovery
-                        * simple, to avoid introducing bugs that could you from
-                        * recovering after crash.
-                        *
-                        * After this point, we can no longer assume that we're the only
-                        * process in addition to postmaster!  Also, fsync requests are
-                        * subsequently to be handled by the bgwriter, not locally.
-                        */
-                       if (InArchiveRecovery && IsUnderPostmaster)
-                       {
-                               SetForwardFsyncRequests();
-                               SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
-                               bgwriterLaunched = true;
-                       }
-
                        /*
                         * main redo apply loop
                         */
@@ -6024,32 +6065,8 @@ StartupXLOG(void)
                                /* Handle interrupt signals of startup process */
                                HandleStartupProcInterrupts();
 
-                               /*
-                                * Have we passed our safe starting point?
-                                */
-                               if (!reachedMinRecoveryPoint &&
-                                       XLByteLE(minRecoveryPoint, EndRecPtr) &&
-                                       XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
-                               {
-                                       reachedMinRecoveryPoint = true;
-                                       ereport(LOG,
-                                               (errmsg("consistent recovery state reached at %X/%X",
-                                                               EndRecPtr.xlogid, EndRecPtr.xrecoff)));
-                               }
-
-                               /*
-                                * Have we got a valid starting snapshot that will allow
-                                * queries to be run? If so, we can tell postmaster that the
-                                * database is consistent now, enabling connections.
-                                */
-                               if (standbyState == STANDBY_SNAPSHOT_READY &&
-                                       !backendsAllowed &&
-                                       reachedMinRecoveryPoint &&
-                                       IsUnderPostmaster)
-                               {
-                                       backendsAllowed = true;
-                                       SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
-                               }
+                               /* Allow read-only connections if we're consistent now */
+                               CheckRecoveryConsistency();
 
                                /*
                                 * Have we reached our recovery target?
@@ -6398,6 +6415,44 @@ StartupXLOG(void)
        }
 }
 
+/*
+ * CheckRecoveryConsistency
+ *
+ * Works in two steps:
+ *
+ * 1. If consistency has not been noted yet, see whether EndRecPtr has
+ *    reached minRecoveryPoint while no backup start point is recorded in
+ *    the control file.  If so, remember that and log it.
+ *
+ * 2. Once consistency has been reached and a standby snapshot is ready,
+ *    signal the postmaster (at most once) so that it can start accepting
+ *    read-only connections.  Nothing is sent when not running under a
+ *    postmaster.
+ */
+static void
+CheckRecoveryConsistency(void)
+{
+       /* set once PMSIGNAL_RECOVERY_CONSISTENT has been sent */
+       static bool             signalSent = false;
+
+       /* Step 1: have we passed our safe starting point? */
+       if (!reachedMinRecoveryPoint)
+       {
+               if (XLByteLE(minRecoveryPoint, EndRecPtr) &&
+                       XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
+               {
+                       reachedMinRecoveryPoint = true;
+                       ereport(LOG,
+                                       (errmsg("consistent recovery state reached at %X/%X",
+                                                       EndRecPtr.xlogid, EndRecPtr.xrecoff)));
+               }
+       }
+
+       /* Step 2: nothing more to do if already signalled or not consistent */
+       if (signalSent || !reachedMinRecoveryPoint)
+               return;
+
+       /* Queries need a valid starting snapshot, and someone to signal */
+       if (standbyState != STANDBY_SNAPSHOT_READY || !IsUnderPostmaster)
+               return;
+
+       signalSent = true;
+       SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
+}
+
 /*
  * Is the system still in recovery?
  *
@@ -7657,13 +7712,36 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
                if (standbyState != STANDBY_DISABLED)
                        CheckRequiredParameterValues(checkPoint);
 
+               /*
+                * If we see a shutdown checkpoint, we know that nothing was
+                * running on the master at this point. So fake-up an empty
+                * running-xacts record and use that here and now. Recover
+                * additional standby state for prepared transactions.
+                */
                if (standbyState >= STANDBY_INITIALIZED)
                {
+                       TransactionId *xids;
+                       int                     nxids;
+                       TransactionId oldestActiveXID;
+                       RunningTransactionsData running;
+
+                       oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
+
                        /*
-                        * Remove stale transactions, if any.
+                        * Construct a RunningTransactions snapshot representing a shut
+                        * down server, with only prepared transactions still alive.
+                        * We're never overflowed at this point because all subxids
+                        * are listed with their parent prepared transactions.
                         */
-                       ExpireOldKnownAssignedTransactionIds(checkPoint.nextXid);
-                       StandbyReleaseOldLocks(checkPoint.nextXid);
+                       running.xcnt = nxids;
+                       running.subxid_overflow = false;
+                       running.nextXid = checkPoint.nextXid;
+                       running.oldestRunningXid = oldestActiveXID;
+                       running.xids = xids;
+
+                       ProcArrayApplyRecoveryInfo(&running);
+
+                       StandbyRecoverPreparedTransactions(true);
                }
 
                /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
index 61b92244fb972b3cfcdd913b8ce92ebd35f28dcf..ea3c9966c734f6639077c17f0715e48502eba5e6 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/twophase.h,v 1.14 2010/01/02 16:58:00 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/twophase.h,v 1.15 2010/04/13 14:17:46 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -44,6 +44,7 @@ extern bool StandbyTransactionIdIsPrepared(TransactionId xid);
 
 extern TransactionId PrescanPreparedTransactions(TransactionId **xids_p,
                                                        int *nxids_p);
+extern void StandbyRecoverPreparedTransactions(bool overwriteOK);
 extern void RecoverPreparedTransactions(void);
 
 extern void RecreateTwoPhaseFile(TransactionId xid, void *content, int len);