** file simultaneously, or one process from reading the database while
** another is writing.
**
-** @(#) $Id: pager.c,v 1.67 2003/01/12 18:02:18 drh Exp $
+** @(#) $Id: pager.c,v 1.68 2003/01/16 13:42:43 drh Exp $
*/
#include "os.h" /* Must be first to enable large file support */
#include "sqliteInt.h"
#include <assert.h>
#include <string.h>
+/*
+** Macros for troubleshooting. Normally turned off
+*/
+#if 0
+static Pager *mainPager = 0;
+#define SET_PAGER(X) if( mainPager==0 ) mainPager = (X)
+#define CLR_PAGER(X) if( mainPager==(X) ) mainPager = 0
+#define TRACE1(X) if( pPager==mainPager ) fprintf(stderr,X)
+#define TRACE2(X,Y) if( pPager==mainPager ) fprintf(stderr,X,Y)
+#define TRACE3(X,Y,Z) if( pPager==mainPager ) fprintf(stderr,X,Y,Z)
+#else
+#define SET_PAGER(X)
+#define CLR_PAGER(X)
+#define TRACE1(X)
+#define TRACE2(X,Y)
+#define TRACE3(X,Y,Z)
+#endif
+
+
/*
** The page cache as a whole is always in one of the following
** states:
u8 inJournal; /* TRUE if has been written to journal */
u8 inCkpt; /* TRUE if written to the checkpoint journal */
u8 dirty; /* TRUE if we need to write back changes */
+ u8 needSync; /* Sync journal before writing this page */
u8 alwaysRollback; /* Disable dont_rollback() for this page */
/* SQLITE_PAGE_SIZE bytes of page data follow this header */
/* Pager.nExtra bytes of local data follow the page data */
int origDbSize; /* dbSize before the current change */
int ckptSize; /* Size of database (in pages) at ckpt_begin() */
off_t ckptJSize; /* Size of journal at ckpt_begin() */
+#ifndef NDEBUG
+ off_t syncJSize; /* Size of journal at last fsync() call */
+#endif
int ckptNRec; /* Number of records in the checkpoint journal */
int nExtra; /* Add this many bytes to each in-memory page */
void (*xDestructor)(void*); /* Call this routine when freeing pages */
int mxPage; /* Maximum number of pages to hold in cache */
int nHit, nMiss, nOvfl; /* Cache hits, missing, and LRU overflows */
u8 journalOpen; /* True if journal file descriptors is valid */
+ u8 journalStarted; /* True if initial magic of journal is synced */
u8 useJournal; /* Do not use a rollback journal on this file */
u8 ckptOpen; /* True if the checkpoint journal is open */
u8 ckptInUse; /* True we are in a checkpoint */
for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
pPg->inJournal = 0;
pPg->dirty = 0;
+ pPg->needSync = 0;
}
}else{
assert( pPager->dirtyFile==0 || pPager->useJournal==0 );
** at the same time, if there is one.
*/
pPg = pager_lookup(pPager, pgRec.pgno);
+ if( pPg==0 || pPg->needSync==0 ){
+ TRACE2("PLAYBACK %d\n", pgRec.pgno);
+ sqliteOsSeek(&pPager->fd, (pgRec.pgno-1)*(off_t)SQLITE_PAGE_SIZE);
+ rc = sqliteOsWrite(&pPager->fd, pgRec.aData, SQLITE_PAGE_SIZE);
+ }
if( pPg ){
memcpy(PGHDR_TO_DATA(pPg), pgRec.aData, SQLITE_PAGE_SIZE);
memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra);
- }
- rc = sqliteOsSeek(&pPager->fd, (pgRec.pgno-1)*(off_t)SQLITE_PAGE_SIZE);
- if( rc==SQLITE_OK ){
- rc = sqliteOsWrite(&pPager->fd, pgRec.aData, SQLITE_PAGE_SIZE);
+ pPg->dirty = 0;
+ pPg->needSync = 0;
}
return rc;
}
if( rc!=SQLITE_OK ) break;
}
+
end_playback:
+#if !defined(NDEBUG) && defined(SQLITE_TEST)
+ /* For pages that were never written into the journal, restore the
+ ** memory copy from the original database file.
+ **
+ ** This is code is used during testing only. It is necessary to
+ ** compensate for the sqliteOsTruncate() call inside
+ ** sqlitepager_rollback().
+ */
+ if( rc==SQLITE_OK ){
+ PgHdr *pPg;
+ for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
+ if( (int)pPg->pgno <= pPager->origDbSize ){
+ sqliteOsSeek(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)(pPg->pgno-1));
+ rc = sqliteOsRead(&pPager->fd, PGHDR_TO_DATA(pPg), SQLITE_PAGE_SIZE);
+ if( rc ) break;
+ }else{
+ memset(PGHDR_TO_DATA(pPg), 0, SQLITE_PAGE_SIZE);
+ }
+ memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra);
+ pPg->needSync = 0;
+ pPg->dirty = 0;
+ }
+ }
+#endif
if( rc!=SQLITE_OK ){
pager_unwritelock(pPager);
pPager->errMask |= PAGER_ERR_CORRUPT;
sqliteFree(zFullPathname);
return SQLITE_NOMEM;
}
+ SET_PAGER(pPager);
pPager->zFilename = (char*)&pPager[1];
pPager->zJournal = &pPager->zFilename[nameLen+1];
strcpy(pPager->zFilename, zFullPathname);
** sqliteOsDelete(pPager->zFilename);
** }
*/
+ CLR_PAGER(pPager);
sqliteFree(pPager);
return SQLITE_OK;
}
*/
static int syncAllPages(Pager *pPager){
PgHdr *pPg;
- Pgno lastPgno = 0;
int rc = SQLITE_OK;
/* Sync the journal before modifying the main database
*/
if( pPager->needSync ){
if( !pPager->tempFile ){
+ assert( pPager->journalOpen );
+ assert( !pPager->noSync );
+ TRACE1("SYNC\n");
rc = sqliteOsSync(&pPager->jfd);
if( rc!=0 ) return rc;
+#ifndef NDEBUG
+ rc = sqliteOsFileSize(&pPager->jfd, &pPager->syncJSize);
+ if( rc!=0 ) return rc;
+#endif
+ pPager->journalStarted = 1;
}
pPager->needSync = 0;
}
- /* Write all dirty free pages to the disk in the order that they
- ** appear on the disk. We have experimented with sorting the pages
- ** by page numbers so that they are written in order, but that does
- ** not appear to improve performance.
+ /* Erase the needSync flag from every page.
*/
- for(pPg=pPager->pFirst; pPg; pPg=pPg->pNextFree){
- if( pPg->dirty ){
- if( lastPgno==0 || pPg->pgno!=lastPgno+1 ){
- sqliteOsSeek(&pPager->fd, (pPg->pgno-1)*(off_t)SQLITE_PAGE_SIZE);
- }
- rc = sqliteOsWrite(&pPager->fd, PGHDR_TO_DATA(pPg), SQLITE_PAGE_SIZE);
- if( rc!=SQLITE_OK ) break;
- pPg->dirty = 0;
- lastPgno = pPg->pgno;
- }
+ for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
+ pPg->needSync = 0;
}
+
return rc;
}
return SQLITE_BUSY;
}
pPager->journalOpen = 1;
+ pPager->journalStarted = 0;
/* Playback and delete the journal. Drop the database write
** lock and reacquire the read lock.
pPager->pAll = pPg;
pPager->nPage++;
}else{
- /* Recycle an older page. First locate the page to be recycled.
- ** Try to find one that is not dirty and is near the head of
- ** of the free list */
+ /* Find a page to recycle. Try to locate a page that does not
+ ** require us to do an fsync() on the journal.
+ */
pPg = pPager->pFirst;
- while( pPg && pPg->dirty ){
+ while( pPg && pPg->needSync ){
pPg = pPg->pNextFree;
}
- /* If we could not find a page that has not been used recently
- ** and which is not dirty, then sync the journal and write all
- ** dirty free pages into the database file, thus making them
- ** clean pages and available for recycling.
- **
- ** We have to sync the journal before writing a page to the main
- ** database. But syncing is a very slow operation. So after a
- ** sync, it is best to write everything we can back to the main
- ** database to minimize the risk of having to sync again in the
- ** near future. That is why we write all dirty pages after a
- ** sync.
+ /* If we could not find a page that does not require an fsync()
+ ** on the journal file then fsync the journal file. This is a
+ ** very slow operation, so we work hard to avoid it. But sometimes
+ ** it can't be helped.
*/
if( pPg==0 ){
int rc = syncAllPages(pPager);
pPg = pPager->pFirst;
}
assert( pPg->nRef==0 );
+
+ /* Write the page to the database file if it is dirty.
+ */
+ if( pPg->dirty ){
+ assert( pPg->needSync==0 );
+ TRACE2("SAVE %d\n", pPg->pgno);
+ sqliteOsSeek(&pPager->fd, (pPg->pgno-1)*(off_t)SQLITE_PAGE_SIZE);
+ rc = sqliteOsWrite(&pPager->fd, PGHDR_TO_DATA(pPg), SQLITE_PAGE_SIZE);
+ if( rc!=SQLITE_OK ){
+ sqlitepager_rollback(pPager);
+ *ppPage = 0;
+ return SQLITE_IOERR;
+ }
+ pPg->dirty = 0;
+ }
assert( pPg->dirty==0 );
- /* If the page we are recyclying is marked as alwaysRollback, then
+ /* If the page we are recycling is marked as alwaysRollback, then
** set the global alwaysRollback flag, thus disabling the
** sqlite_dont_rollback() optimization for the rest of this transaction.
** It is necessary to do this because the page marked alwaysRollback
pPg->pgno = pgno;
if( pPager->aInJournal && (int)pgno<=pPager->origDbSize ){
sqliteCheckMemory(pPager->aInJournal, pgno/8);
+ assert( pPager->journalOpen );
pPg->inJournal = (pPager->aInJournal[pgno/8] & (1<<(pgno&7)))!=0;
+ pPg->needSync = 0;
}else{
pPg->inJournal = 0;
+ pPg->needSync = 0;
}
if( pPager->aInCkpt && (int)pgno<=pPager->ckptSize
&& (pPager->aInCkpt[pgno/8] & (1<<(pgno&7)))!=0 ){
return SQLITE_CANTOPEN;
}
pPager->journalOpen = 1;
+ pPager->journalStarted = 0;
pPager->needSync = 0;
pPager->alwaysRollback = 0;
sqlitepager_pagecount(pPager);
rc = SQLITE_FULL;
}
}
+#ifndef NDEBUG
+ pPager->syncJSize = 0;
+#endif
return rc;
}
}
pPager->state = SQLITE_WRITELOCK;
pPager->dirtyFile = 0;
+ TRACE1("TRANSACTION\n");
if( pPager->useJournal && !pPager->tempFile ){
rc = pager_open_journal(pPager);
}
** main database file. Write the current page to the transaction
** journal if it is not there already.
*/
- if( !pPg->inJournal && pPager->useJournal
- && (int)pPg->pgno <= pPager->origDbSize ){
- rc = write32bits(&pPager->jfd, pPg->pgno);
- if( rc==SQLITE_OK ){
- rc = sqliteOsWrite(&pPager->jfd, pData, SQLITE_PAGE_SIZE);
- }
- if( rc!=SQLITE_OK ){
- sqlitepager_rollback(pPager);
- pPager->errMask |= PAGER_ERR_FULL;
- return rc;
+ if( !pPg->inJournal && pPager->useJournal ){
+ if( (int)pPg->pgno <= pPager->origDbSize ){
+ rc = write32bits(&pPager->jfd, pPg->pgno);
+ if( rc==SQLITE_OK ){
+ rc = sqliteOsWrite(&pPager->jfd, pData, SQLITE_PAGE_SIZE);
+ }
+ if( rc!=SQLITE_OK ){
+ sqlitepager_rollback(pPager);
+ pPager->errMask |= PAGER_ERR_FULL;
+ return rc;
+ }
+ assert( pPager->aInJournal!=0 );
+ pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7);
+ pPg->needSync = !pPager->noSync;
+ pPg->inJournal = 1;
+ if( pPager->ckptInUse ){
+ pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
+ page_add_to_ckpt_list(pPg);
+ }
+ TRACE3("JOURNAL %d %d\n", pPg->pgno, pPg->needSync);
+ }else{
+ pPg->needSync = !pPager->journalStarted && !pPager->noSync;
+ TRACE3("APPEND %d %d\n", pPg->pgno, pPg->needSync);
}
- assert( pPager->aInJournal!=0 );
- pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7);
- pPager->needSync = !pPager->noSync;
- pPg->inJournal = 1;
- if( pPager->ckptInUse ){
- pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
- page_add_to_ckpt_list(pPg);
+ if( pPg->needSync ){
+ pPager->needSync = 1;
}
}
** corruption during the next transaction.
*/
}else{
+ TRACE2("DONT_WRITE %d\n", pgno);
pPg->dirty = 0;
}
}
pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
page_add_to_ckpt_list(pPg);
}
+ TRACE2("DONT_ROLLBACK %d\n", pPg->pgno);
}
if( pPager->ckptInUse && !pPg->inCkpt && (int)pPg->pgno<=pPager->ckptSize ){
assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize );
int sqlitepager_commit(Pager *pPager){
int rc;
PgHdr *pPg;
+ int dbChanged;
if( pPager->errMask==PAGER_ERR_FULL ){
rc = sqlitepager_rollback(pPager);
if( pPager->state!=SQLITE_WRITELOCK ){
return SQLITE_ERROR;
}
+ TRACE1("COMMIT\n");
if( pPager->dirtyFile==0 ){
/* Exit early (without doing the time-consuming sqliteOsSync() calls)
** if there have been no changes to the database file. */
return rc;
}
assert( pPager->journalOpen );
+ if( !pPager->journalStarted && !pPager->noSync ) pPager->needSync = 1;
+ assert( pPager->dirtyFile || !pPager->needSync );
if( pPager->needSync && sqliteOsSync(&pPager->jfd)!=SQLITE_OK ){
goto commit_abort;
}
+ dbChanged = 0;
for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
if( pPg->dirty==0 ) continue;
- rc = sqliteOsSeek(&pPager->fd, (pPg->pgno-1)*(off_t)SQLITE_PAGE_SIZE);
- if( rc!=SQLITE_OK ) goto commit_abort;
+ TRACE2("COMMIT-PAGE %d\n", pPg->pgno);
+ sqliteOsSeek(&pPager->fd, (pPg->pgno-1)*(off_t)SQLITE_PAGE_SIZE);
rc = sqliteOsWrite(&pPager->fd, PGHDR_TO_DATA(pPg), SQLITE_PAGE_SIZE);
if( rc!=SQLITE_OK ) goto commit_abort;
+ dbChanged = 1;
}
- if( !pPager->noSync && sqliteOsSync(&pPager->fd)!=SQLITE_OK ){
+ if( dbChanged && !pPager->noSync && sqliteOsSync(&pPager->fd)!=SQLITE_OK ){
goto commit_abort;
}
rc = pager_unwritelock(pPager);
*/
int sqlitepager_rollback(Pager *pPager){
int rc;
+ TRACE1("ROLLBACK\n");
if( !pPager->dirtyFile || !pPager->journalOpen ){
rc = pager_unwritelock(pPager);
pPager->dbSize = -1;
return rc;
}
+
+#if defined(SQLITE_TEST) && !defined(NDEBUG)
+ /* Truncate the journal to the size it was at the conclusion of the
+ ** last sqliteOsSync() call. This is really an error check. If the
+ ** rollback still works, it means that the rollback would have also
+ ** worked if it had occurred after an OS crash or unexpected power
+ ** loss.
+ */
+ if( pPager->syncJSize<sizeof(aJournalMagic)+sizeof(Pgno) ){
+ pPager->syncJSize = sizeof(aJournalMagic)+sizeof(Pgno);
+ }
+ TRACE2("TRUNCATE JOURNAL %lld\n", pPager->syncJSize);
+ rc = sqliteOsTruncate(&pPager->jfd, pPager->syncJSize);
+ if( rc ) return rc;
+#endif
+
if( pPager->errMask!=0 && pPager->errMask!=PAGER_ERR_FULL ){
if( pPager->state>=SQLITE_WRITELOCK ){
pager_playback(pPager);