#ifndef SQLITE_OMIT_DISKIO
#include "sqliteInt.h"
+/*
+******************** NOTES ON THE DESIGN OF THE PAGER ************************
+**
+** Within this comment block, a page is deemed to have been synced
+** automatically as soon as it is written when PRAGMA synchronous=OFF.
+** Otherwise, the page is not synced until the xSync method of the VFS
+** is called successfully on the file containing the page.
+**
+** Definition: A page of the database file is said to be "overwriteable" if
+** one or more of the following are true about the page:
+**
+** (a) The original content of the page as it was at the beginning of
+** the transaction has been written into the rollback journal and
+** synced.
+**
+** (b) The page was a freelist leaf page at the start of the transaction.
+**
+** (c) The page number is greater than the largest page that existed in
+** the database file at the start of the transaction.
+**
+** (1) A page of the database file is never overwritten unless one of the
+** following are true:
+**
+** (a) The page and all other pages on the same sector are overwriteable.
+**
+** (b) The atomic page write optimization is enabled, and the entire
+** transaction other than the update of the transaction sequence
+** number consists of a single page change.
+**
+** (2) The content of a page written into the rollback journal exactly matches
+** both the content in the database when the rollback journal was written
+** and the content in the database at the beginning of the current
+** transaction.
+**
+** (3) Writes to the database file are an integer multiple of the page size
+** in length and are aligned to a page boundary.
+**
+** (4) Reads from the database file are either aligned on a page boundary and
+** an integer multiple of the page size in length or are taken from the
+** first 100 bytes of the database file.
+**
+** (5) All writes to the database file are synced prior to the rollback journal
+** being deleted, truncated, or zeroed.
+**
+** (6) If a master journal file is used, then all writes to the database file
+** are synced prior to the master journal being deleted.
+**
+** Definition: Two databases (or the same database at two points it time)
+** are said to be "logically equivalent" if they give the same answer to
+** all queries. Note in particular the the content of freelist leaf
+** pages can be changed arbitarily without effecting the logical equivalence
+** of the database.
+**
+** (7) At any time, if any subset, including the empty set and the total set,
+** of the unsynced changes to a rollback journal are removed and the
+** journal is rolled back, the resulting database file will be logical
+** equivalent to the database file at the beginning of the transaction.
+**
+** (8) When a transaction is rolled back, the xTruncate method of the VFS
+** is called to restore the database file to the same size it was at
+** the beginning of the transaction. (In some VFSes, the xTruncate
+** method is a no-op, but that does not change the fact the SQLite will
+** invoke it.)
+**
+** (9) Whenever the database file is modified, at least one bit in the range
+** of bytes from 24 through 39 inclusive will be changed prior to releasing
+** the EXCLUSIVE lock.
+**
+** (10) The pattern of bits in bytes 24 through 39 shall not repeat in less
+** than one billion transactions.
+**
+** (11) A database file is well-formed at the beginning and at the conclusion
+** of every transaction.
+**
+** (12) An EXCLUSIVE lock is held on the database file when writing to
+** the database file.
+**
+** (13) A SHARED lock is held on the database file while reading any
+** content out of the database file.
+*/
+
/*
** Macros for troubleshooting. Normally turned off
*/
**
** journalStarted
**
-** This flag is set whenever the the main journal is synced.
+** This flag is set whenever the the main journal is opened and
+** initialized
**
** The point of this flag is that it must be set after the
** first journal header in a journal file has been synced to disk.
**
** doNotSync
**
-** This variable is set and cleared by sqlite3PagerWrite().
+** When enabled, cache spills are prohibited and the journal file cannot
+** be synced. This variable is set and cleared by sqlite3PagerWrite()
+** in order to prevent a journal sync from happening in between the
+** journalling of two pages on the same sector.
**
** needSync
**
sqlite3_file *sjfd; /* File descriptor for sub-journal */
i64 journalOff; /* Current write offset in the journal file */
i64 journalHdr; /* Byte offset to previous journal header */
+ i64 journalSizeLimit; /* Size limit for persistent journal files */
PagerSavepoint *aSavepoint; /* Array of active savepoints */
int nSavepoint; /* Number of elements in aSavepoint[] */
char dbFileVers[16]; /* Changes whenever database file changes */
void *pCodec; /* First argument to xCodec... methods */
#endif
char *pTmpSpace; /* Pager.pageSize bytes of space for tmp use */
- i64 journalSizeLimit; /* Size limit for persistent journal files */
PCache *pPCache; /* Pointer to page cache object */
sqlite3_backup *pBackup; /* Pointer to list of ongoing backup processes */
};
for(nWrite=0; rc==SQLITE_OK&&nWrite<JOURNAL_HDR_SZ(pPager); nWrite+=nHeader){
IOTRACE(("JHDR %p %lld %d\n", pPager, pPager->journalHdr, nHeader))
rc = sqlite3OsWrite(pPager->jfd, zHeader, nHeader, pPager->journalOff);
+ assert( pPager->journalHdr <= pPager->journalOff );
pPager->journalOff += nHeader;
}
}
pPager->setMaster = 1;
assert( isOpen(pPager->jfd) );
+ assert( pPager->journalHdr <= pPager->journalOff );
/* Calculate the length in bytes and the checksum of zMaster */
for(nMaster=0; zMaster[nMaster]; nMaster++){
*/
static int pager_playback_one_page(
Pager *pPager, /* The pager being played back */
- int isMainJrnl, /* 1 -> main journal. 0 -> sub-journal. */
- int isUnsync, /* True if reading from unsynced main journal */
i64 *pOffset, /* Offset of record to playback */
- int isSavepnt, /* True for a savepoint rollback */
- Bitvec *pDone /* Bitvec of pages already played back */
+ Bitvec *pDone, /* Bitvec of pages already played back */
+ int isMainJrnl, /* 1 -> main journal. 0 -> sub-journal. */
+ int isSavepnt /* True for a savepoint rollback */
){
int rc;
PgHdr *pPg; /* An existing page in the cache */
u32 cksum; /* Checksum used for sanity checking */
char *aData; /* Temporary storage for the page */
sqlite3_file *jfd; /* The file descriptor for the journal file */
+ int isSynced; /* True if journal page is synced */
assert( (isMainJrnl&~1)==0 ); /* isMainJrnl is 0 or 1 */
assert( (isSavepnt&~1)==0 ); /* isSavepnt is 0 or 1 */
PAGERID(pPager), pgno, pager_datahash(pPager->pageSize, (u8*)aData),
(isMainJrnl?"main-journal":"sub-journal")
));
+ if( isMainJrnl ){
+ isSynced = pPager->noSync || (*pOffset <= pPager->journalHdr);
+ }else{
+ isSynced = (pPg==0 || 0==(pPg->flags & PGHDR_NEED_SYNC));
+ }
if( (pPager->state>=PAGER_EXCLUSIVE)
- && (!isSavepnt || pPg==0 || 0==(pPg->flags&PGHDR_NEED_SYNC))
&& isOpen(pPager->fd)
- && !isUnsync
+ && isSynced
){
i64 ofst = (pgno-1)*(i64)pPager->pageSize;
testcase( !isSavepnt && pPg!=0 && (pPg->flags&PGHDR_NEED_SYNC)!=0 );
/* If the contents of this page were just restored from the main
** journal file, then its content must be as they were when the
** transaction was first opened. In this case we can mark the page
- ** as clean, since there will be no need to write it out to the.
+ ** as clean, since there will be no need to write it out to the
+ ** database.
**
** There is one exception to this rule. If the page is being rolled
** back as part of a savepoint (or statement) rollback from an
** occurs.
*/
while( 1 ){
- int isUnsync = 0;
-
/* Read the next journal header from the journal file. If there are
** not enough bytes left in the journal file for a complete header, or
** it is corrupted, then a process must of failed while writing it.
if( nRec==0 && !isHot &&
pPager->journalHdr+JOURNAL_HDR_SZ(pPager)==pPager->journalOff ){
nRec = (int)((szJ - pPager->journalOff) / JOURNAL_PG_SZ(pPager));
- isUnsync = 1;
}
/* If this is the first header read from the journal, truncate the
pager_reset(pPager);
needPagerReset = 0;
}
- rc = pager_playback_one_page(pPager,1,isUnsync,&pPager->journalOff,0,0);
+ rc = pager_playback_one_page(pPager,&pPager->journalOff,0,1,0);
if( rc!=SQLITE_OK ){
if( rc==SQLITE_DONE ){
rc = SQLITE_OK;
iHdrOff = pSavepoint->iHdrOffset ? pSavepoint->iHdrOffset : szJ;
pPager->journalOff = pSavepoint->iOffset;
while( rc==SQLITE_OK && pPager->journalOff<iHdrOff ){
- rc = pager_playback_one_page(pPager, 1, 0, &pPager->journalOff, 1, pDone);
+ rc = pager_playback_one_page(pPager, &pPager->journalOff, pDone, 1, 1);
}
assert( rc!=SQLITE_DONE );
}else{
nJRec = (u32)((szJ - pPager->journalOff)/JOURNAL_PG_SZ(pPager));
}
for(ii=0; rc==SQLITE_OK && ii<nJRec && pPager->journalOff<szJ; ii++){
- rc = pager_playback_one_page(pPager, 1, 0, &pPager->journalOff, 1, pDone);
+ rc = pager_playback_one_page(pPager, &pPager->journalOff, pDone, 1, 1);
}
assert( rc!=SQLITE_DONE );
}
i64 offset = pSavepoint->iSubRec*(4+pPager->pageSize);
for(ii=pSavepoint->iSubRec; rc==SQLITE_OK && ii<pPager->nSubRec; ii++){
assert( offset==ii*(4+pPager->pageSize) );
- rc = pager_playback_one_page(pPager, 0, 0, &offset, 1, pDone);
+ rc = pager_playback_one_page(pPager, &offset, pDone, 0, 1);
}
assert( rc!=SQLITE_DONE );
}
** mode, then the journal file may at this point actually be larger
** than Pager.journalOff bytes. If the next thing in the journal
** file happens to be a journal-header (written as part of the
- ** previous connections transaction), and a crash or power-failure
+ ** previous connection's transaction), and a crash or power-failure
** occurs after nRec is updated but before this connection writes
** anything else to the journal file (or commits/rolls back its
** transaction), then SQLite may become confused when doing the
*/
pPager->needSync = 0;
pPager->journalStarted = 1;
+ pPager->journalHdr = pPager->journalOff;
sqlite3PcacheClearSyncFlags(pPager->pPCache);
}
goto failed;
}
- /* TODO: Why are these cleared here? Is it necessary? */
+ /* Reset the journal status fields to indicates that we have no
+ ** rollback journal at this time. */
pPager->journalStarted = 0;
pPager->journalOff = 0;
pPager->setMaster = 0;
pPager->journalHdr = 0;
+
+ /* Make sure the journal file has been synced to disk. */
/* Playback and delete the journal. Drop the database write
** lock and reacquire the read lock. Purge the cache before
** playing back the hot-journal so that we don't end up with
- ** an inconsistent cache.
+ ** an inconsistent cache. Sync the hot journal before playing
+ ** it back since the process that crashed and left the hot journal
+ ** probably did not sync it and we are required to always sync
+ ** the journal before playing it back.
*/
if( isOpen(pPager->jfd) ){
- rc = pager_playback(pPager, 1);
+ if( !pPager->noSync ){
+ rc = sqlite3OsSync(pPager->jfd, SQLITE_SYNC_NORMAL);
+ }
+ if( rc==SQLITE_OK ){
+ rc = sqlite3OsFileSize(pPager->jfd, &pPager->journalHdr);
+ }
+ if( rc==SQLITE_OK ){
+ rc = pager_playback(pPager, 1);
+ }
if( rc!=SQLITE_OK ){
rc = pager_error(pPager, rc);
goto failed;
** a) When reading a free-list leaf page from the database, and
**
** b) When a savepoint is being rolled back and we need to load
-** a new page into the cache to populate with the data read
+** a new page into the cache to be filled with the data read
** from the savepoint journal.
**
** If noContent is true, then the data returned is zeroed instead of
** contains the database locks. The following assert verifies
** that we do not. */
assert( pPg->pgno!=PAGER_MJ_PGNO(pPager) );
+
+ assert( pPager->journalHdr <= pPager->journalOff );
CODEC2(pPager, pData, pPg->pgno, 7, return SQLITE_NOMEM, pData2);
cksum = pager_cksum(pPager, (u8*)pData2);
rc = write32bits(pPager->jfd, pPager->journalOff, pPg->pgno);