From: danielk1977 Date: Tue, 20 Jan 2009 17:06:27 +0000 (+0000) Subject: Big change to make pager.c easier to follow. Unused variables removed, comments impro... X-Git-Tag: version-3.6.15~553 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=bea2a94850e3c1030e020c01251160eb5c083f9a;p=thirdparty%2Fsqlite.git Big change to make pager.c easier to follow. Unused variables removed, comments improved, etc. (CVS 6197) FossilOrigin-Name: 12f7346c13c180ed73d7a2e3b590be457360254a --- diff --git a/manifest b/manifest index 034d13a9a0..28e376bb88 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C When\snot\scompiling\sfor\san\sEBCDIC\ssystem,\suse\sbuilt-in\salternatives\sto\sthe\stolowe\nr(),\stoupper()\sand\sother\sctype.h\slibrary\sfunctions.\sTicket\s#3597.\s(CVS\s6196) -D 2009-01-20T16:53:40 +C Big\schange\sto\smake\spager.c\seasier\sto\sfollow.\sUnused\svariables\sremoved,\scomments\simproved,\setc.\s(CVS\s6197) +D 2009-01-20T17:06:27 F Makefile.arm-wince-mingw32ce-gcc fcd5e9cd67fe88836360bb4f9ef4cb7f8e2fb5a0 F Makefile.in 6619a1b72de7ada2bb7be97862913e27c6f5e339 F Makefile.linux-gcc d53183f4aa6a9192d249731c90dbdffbd2c68654 @@ -101,11 +101,11 @@ F src/alter.c 0ec29744c36c6e976596ce38c16289ebc5dc94db F src/analyze.c c86fd6a1425b22b3a46ce72ad403e4280026364f F src/attach.c 1c35f95da3c62d19de75b44cfefd12c81c1791b3 F src/auth.c c8b2ab5c8bad4bd90ed7c294694f48269162c627 -F src/bitvec.c 1da48f915129b476cae0cfd7a6ed0eb0d03a9bdb +F src/bitvec.c 44f7059ac1f874d364b34af31b9617e52223ba75 F src/btmutex.c 63c5cc4ad5715690767ffcb741e185d7bc35ec1a -F src/btree.c e183a4357e8f5081fd04c757e006ef6fdaaa79a1 +F src/btree.c 01b3af74d9e756901f21401ca1a3d0bb8b2b9368 F src/btree.h 4f141cf748d2ee7c6d7fc175f64f87a45cd44113 -F src/btreeInt.h 790482376263bbc5862ae7124354a26c70695369 +F src/btreeInt.h 44bcbfe387ba99a3a9f2527bd12fa1bb8bc574b3 F src/build.c c8bf5dcef4d5889bc57eecdb8b3dba178e5e06a8 F src/callback.c bee8949d619b1b7b1e4dfac8a19c5116ae1dd12a F src/complete.c 
cb14e06dbe79dee031031f0d9e686ff306afe07c @@ -119,7 +119,7 @@ F src/hash.c 5824e6ff7ba78cd34c8d6cd724367713583e5b55 F src/hash.h 28f38ebb1006a5beedcb013bcdfe31befe7437ae F src/hwtime.h 4a1d45f4cae1f402ea19686acf24acf4f0cb53cb F src/insert.c f6db1e6f43aae337e64a755208abb6ff124edc19 -F src/journal.c cffd2cd214e58c0e99c3ff632b3bee6c7cbb260e +F src/journal.c e00df0c0da8413ab6e1bb7d7cab5665d4a9000d0 F src/legacy.c 8b3b95d48d202614946d7ce7256e7ba898905c3b F src/loadext.c 3f96631089fc4f3871a67f02f2e4fc7ea4d51edc F src/main.c da5ff4dfbb59896af5f7b4afaef48d2fb7d08a7d @@ -142,11 +142,11 @@ F src/os_common.h 24525d8b7bce66c374dfc1810a6c9043f3359b60 F src/os_os2.c bed77dc26e3a95ce4a204936b9a1ca6fe612fcc5 F src/os_unix.c c74255cc1fcddc38fc3cc1dcf70c2325d3c2948e F src/os_win.c 496e3ceb499aedc63622a89ef76f7af2dd902709 -F src/pager.c bf917e869ef44b8e277e87f5b5ac980a1ee727ef -F src/pager.h 3345547d4b5b4db323f50d855d91a01837c7f2de +F src/pager.c 52fdfe6d6326a646152a26f1db94f4563fd6db39 +F src/pager.h 95b9802f6a014cc55523006f2b893a37b2dc16e3 F src/parse.y b214295a91e985c42adb6bfd3ad1c56c47828e8d -F src/pcache.c a3c729f4bb3464fab27617ab7411916e0cded2bf -F src/pcache.h 00adba50e5b90414a40f2c63e5272c152c523373 +F src/pcache.c 48fbfc2208a2734c578b3223fcd4fea7d84f442b +F src/pcache.h 9b927ccc5a538e31b4c3bc7eec4f976db42a1324 F src/pcache1.c c0aa84ff69ea759fa944dbee9167a2463ab7c322 F src/pragma.c 04c13c79fd559d769f5bcb3aa661b32d484b1e7b F src/prepare.c 9ec504ddd4a8e34e5fb502033312da6a78f5f76a @@ -158,13 +158,13 @@ F src/select.c ae72b604e47092521c4d9ae54e1b1cbeb872a747 F src/shell.c 0d801ef653fd73d17161afebaab898a58ec3524b F src/sqlite.h.in 6cd2489e40fe97ba58c60044a4ced377e08b6d09 F src/sqlite3ext.h 1db7d63ab5de4b3e6b83dd03d1a4e64fef6d2a17 -F src/sqliteInt.h 8ddded6a804d2055ae860e8f514c545a49840006 +F src/sqliteInt.h 086886697368982dd7469689cd6e96c94d091008 F src/sqliteLimit.h ffe93f5a0c4e7bd13e70cd7bf84cfb5c3465f45d F src/status.c 237b193efae0cf6ac3f0817a208de6c6c6ef6d76 F src/table.c 
332ab0ea691e63862e2a8bdfe2c0617ee61062a3 F src/tclsqlite.c 7d77c3899d0244804d2773c9157e783788627762 F src/test1.c 58c0026d8764635efe8e7e7cea61e41faecef597 -F src/test2.c b0e977810ea3df8adaa2a2898ef4f6813875e840 +F src/test2.c 9689e7d3b7791da8c03f9acd1ea801802cb83c17 F src/test3.c 88a246b56b824275300e6c899634fbac1dc94b14 F src/test4.c f79ab52d27ff49b784b631a42e2ccd52cfd5c84c F src/test5.c 162a1cea2105a2c460a3f39fa6919617b562a288 @@ -202,7 +202,7 @@ F src/vdbe.c e14b330e7b9b27dd9eb1595f47aedea30acfd521 F src/vdbe.h 03516f28bf5aca00a53c4dccd6c313f96adb94f6 F src/vdbeInt.h 5530e45fc64c1572f123aca384096e1b84cf834b F src/vdbeapi.c 85c33cfbfa56249cbe627831610afafba754477d -F src/vdbeaux.c a5db2fe6da4a49848f123af681888ba4cee6e7b4 +F src/vdbeaux.c 30c1bbc1d2876c5bbe84d52dab9980ed032bca98 F src/vdbeblob.c b0dcebfafedcf9c0addc7901ad98f6f986c08935 F src/vdbemem.c 06311d5c46fd703fe95ff81bd0a1b72ba0341311 F src/vtab.c e39e011d7443a8d574b1b9cde207a35522e6df43 @@ -697,7 +697,7 @@ F tool/speedtest16.c c8a9c793df96db7e4933f0852abb7a03d48f2e81 F tool/speedtest2.tcl ee2149167303ba8e95af97873c575c3e0fab58ff F tool/speedtest8.c 2902c46588c40b55661e471d7a86e4dd71a18224 F tool/speedtest8inst1.c 293327bc76823f473684d589a8160bde1f52c14e -P 7035c35dbef72b7b4d00201a65734a6d93dd0d3e -R 290a11e6c2265b9e59ca104eb003d6fa +P 1041abd6784d283bebf646c54e93599522f7889d +R 4bde501bdb15fe167fc73edd2c133657 U danielk1977 -Z 3b48670f3dab91065d2f211cd03723d9 +Z e2751323f1f34d69687246315d80831a diff --git a/manifest.uuid b/manifest.uuid index f3a35aa040..99eacc3832 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -1041abd6784d283bebf646c54e93599522f7889d \ No newline at end of file +12f7346c13c180ed73d7a2e3b590be457360254a \ No newline at end of file diff --git a/src/bitvec.c b/src/bitvec.c index f631359cdd..39513ffda0 100644 --- a/src/bitvec.c +++ b/src/bitvec.c @@ -34,7 +34,7 @@ ** start of a transaction, and is thus usually less than a few thousand, ** but can be as large as 2 billion for a 
really big database. ** -** @(#) $Id: bitvec.c,v 1.12 2009/01/16 16:23:38 danielk1977 Exp $ +** @(#) $Id: bitvec.c,v 1.13 2009/01/20 17:06:27 danielk1977 Exp $ */ #include "sqliteInt.h" @@ -275,6 +275,14 @@ void sqlite3BitvecDestroy(Bitvec *p){ sqlite3_free(p); } +/* +** Return the value of the iSize parameter specified when Bitvec *p +** was created. +*/ +u32 sqlite3BitvecSize(Bitvec *p){ + return p->iSize; +} + #ifndef SQLITE_OMIT_BUILTIN_TEST /* ** Let V[] be an array of unsigned characters sufficient to hold diff --git a/src/btree.c b/src/btree.c index 3732dc6b62..173e4f95c6 100644 --- a/src/btree.c +++ b/src/btree.c @@ -9,7 +9,7 @@ ** May you share freely, never taking more than you give. ** ************************************************************************* -** $Id: btree.c,v 1.560 2009/01/16 16:23:38 danielk1977 Exp $ +** $Id: btree.c,v 1.561 2009/01/20 17:06:27 danielk1977 Exp $ ** ** This file implements a external (disk-based) database using BTrees. ** See the header comment on "btreeInt.h" for additional information. @@ -282,6 +282,80 @@ static void invalidateAllOverflowCache(BtShared *pBt){ #define invalidateAllOverflowCache(x) #endif +/* +** Set bit pgno of the BtShared.pHasContent bitvec. This is called +** when a page that previously contained data becomes a free-list leaf +** page. +** +** The BtShared.pHasContent bitvec exists to work around an obscure +** bug caused by the interaction of two useful IO optimizations surrounding +** free-list leaf pages: +** +** 1) When all data is deleted from a page and the page becomes +** a free-list leaf page, the page is not written to the database +** (as free-list leaf pages contain no meaningful data). Sometimes +** such a page is not even journalled (as it will not be modified, +** why bother journalling it?). +** +** 2) When a free-list leaf page is reused, its content is not read +** from the database or written to the journal file (why should it +** be, if it is not at all meaningful?). 
+** +** By themselves, these optimizations work fine and provide a handy +** performance boost to bulk delete or insert operations. However, if +** a page is moved to the free-list and then reused within the same +** transaction, a problem comes up. If the page is not journalled when +** it is moved to the free-list and it is also not journalled when it +** is extracted from the free-list and reused, then the original data +** may be lost. In the event of a rollback, it may not be possible +** to restore the database to its original configuration. +** +** The solution is the BtShared.pHasContent bitvec. Whenever a page is +** moved to become a free-list leaf page, the corresponding bit is +** set in the bitvec. Whenever a leaf page is extracted from the free-list, +** optimization 2 above is omitted if the corresponding bit is already +** set in BtShared.pHasContent. The contents of the bitvec are cleared +** at the end of every transaction. +*/ +static int btreeSetHasContent(BtShared *pBt, Pgno pgno){ + int rc = SQLITE_OK; + if( !pBt->pHasContent ){ + int nPage; + rc = sqlite3PagerPagecount(pBt->pPager, &nPage); + if( rc==SQLITE_OK ){ + pBt->pHasContent = sqlite3BitvecCreate((u32)nPage); + if( !pBt->pHasContent ){ + rc = SQLITE_NOMEM; + } + } + } + if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){ + rc = sqlite3BitvecSet(pBt->pHasContent, pgno); + } + return rc; +} + +/* +** Query the BtShared.pHasContent vector. +** +** This function is called when a free-list leaf page is removed from the +** free-list for reuse. It returns false if it is safe to retrieve the +** page from the pager layer with the 'no-content' flag set. True otherwise. +*/ +static int btreeGetHasContent(BtShared *pBt, Pgno pgno){ + Bitvec *p = pBt->pHasContent; + return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno))); +} + +/* +** Clear (destroy) the BtShared.pHasContent bitvec. This should be +** invoked at the conclusion of each write-transaction.
+*/ +static void btreeClearHasContent(BtShared *pBt){ + sqlite3BitvecDestroy(pBt->pHasContent); + pBt->pHasContent = 0; +} + /* ** Save the current cursor position in the variables BtCursor.nKey ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK. @@ -1100,6 +1174,21 @@ int sqlite3BtreeGetPage( return SQLITE_OK; } +/* +** Retrieve a page from the pager cache. If the requested page is not +** already in the pager cache return NULL. Initialize the MemPage.pBt and +** MemPage.aData elements if needed. +*/ +static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){ + DbPage *pDbPage; + assert( sqlite3_mutex_held(pBt->mutex) ); + pDbPage = sqlite3PagerLookup(pBt->pPager, pgno); + if( pDbPage ){ + return btreePageFromDbPage(pDbPage, pgno, pBt); + } + return 0; +} + /* ** Return the size of the database file in pages. If there is any kind of ** error, return ((unsigned int)-1). @@ -1124,7 +1213,6 @@ static int getAndInitPage( MemPage **ppPage /* Write the page pointer here */ ){ int rc; - DbPage *pDbPage; MemPage *pPage; assert( sqlite3_mutex_held(pBt->mutex) ); @@ -1137,10 +1225,9 @@ static int getAndInitPage( ** pagerPagecount() to make sure pgno is within limits, which results ** in a measureable performance improvements. */ - pDbPage = sqlite3PagerLookup(pBt->pPager, pgno); - if( pDbPage ){ + *ppPage = pPage = btreePageLookup(pBt, pgno); + if( pPage ){ /* Page is already in cache */ - *ppPage = pPage = btreePageFromDbPage(pDbPage, pgno, pBt); rc = SQLITE_OK; }else{ /* Page not in cache. Acquire it. 
*/ @@ -2013,7 +2100,7 @@ int sqlite3BtreeBeginTrans(Btree *p, int wrflag){ if( pBt->readOnly ){ rc = SQLITE_READONLY; }else{ - rc = sqlite3PagerBegin(pBt->pPage1->pDbPage, wrflag>1); + rc = sqlite3PagerBegin(pBt->pPager, wrflag>1); if( rc==SQLITE_OK ){ rc = newDatabase(pBt); } @@ -2372,7 +2459,7 @@ int sqlite3BtreeIncrVacuum(Btree *p){ rc = SQLITE_DONE; }else{ invalidateAllOverflowCache(pBt); - rc = incrVacuumStep(pBt, 0, sqlite3PagerImageSize(pBt->pPager)); + rc = incrVacuumStep(pBt, 0, pagerPagecount(pBt)); } sqlite3BtreeLeave(p); return rc; @@ -2540,6 +2627,7 @@ int sqlite3BtreeCommitPhaseTwo(Btree *p){ /* Set the handles current transaction state to TRANS_NONE and unlock ** the pager if this call closed the only read or write transaction. */ + btreeClearHasContent(pBt); p->inTrans = TRANS_NONE; unlockBtreeIfUnused(pBt); @@ -2675,6 +2763,7 @@ int sqlite3BtreeRollback(Btree *p){ } } + btreeClearHasContent(pBt); p->inTrans = TRANS_NONE; pBt->inStmt = 0; unlockBtreeIfUnused(pBt); @@ -3082,34 +3171,29 @@ int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){ ** ** If an error occurs an SQLite error code is returned. Otherwise: ** -** Unless pPgnoNext is NULL, the page number of the next overflow -** page in the linked list is written to *pPgnoNext. If page ovfl -** is the last page in its linked list, *pPgnoNext is set to zero. +** The page number of the next overflow page in the linked list is +** written to *pPgnoNext. If page ovfl is the last page in its linked +** list, *pPgnoNext is set to zero. ** -** If ppPage is not NULL, *ppPage is set to the MemPage* handle -** for page ovfl. The underlying pager page may have been requested -** with the noContent flag set, so the page data accessable via -** this handle may not be trusted. +** If ppPage is not NULL, and a reference to the MemPage object corresponding +** to page number pOvfl was obtained, then *ppPage is set to point to that +** reference. 
It is the responsibility of the caller to call releasePage() +** on *ppPage to free the reference. If no reference was obtained (because +** the pointer-map was used to obtain the value for *pPgnoNext), then +** *ppPage is set to zero. */ static int getOverflowPage( BtShared *pBt, Pgno ovfl, /* Overflow page */ - MemPage **ppPage, /* OUT: MemPage handle */ + MemPage **ppPage, /* OUT: MemPage handle (may be NULL) */ Pgno *pPgnoNext /* OUT: Next overflow page number */ ){ Pgno next = 0; + MemPage *pPage = 0; int rc = SQLITE_OK; assert( sqlite3_mutex_held(pBt->mutex) ); - /* One of these must not be NULL. Otherwise, why call this function? */ - assert(ppPage || pPgnoNext); - - /* If pPgnoNext is NULL, then this function is being called to obtain - ** a MemPage* reference only. No page-data is required in this case. - */ - if( !pPgnoNext ){ - return sqlite3BtreeGetPage(pBt, ovfl, ppPage, 1); - } + assert(pPgnoNext); #ifndef SQLITE_OMIT_AUTOVACUUM /* Try to find the next page in the overflow list using the @@ -3129,34 +3213,29 @@ static int getOverflowPage( if( iGuess<=pagerPagecount(pBt) ){ rc = ptrmapGet(pBt, iGuess, &eType, &pgno); - if( rc!=SQLITE_OK ){ - return rc; - } - if( eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){ + if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){ next = iGuess; + rc = SQLITE_DONE; } } } #endif - if( next==0 || ppPage ){ - MemPage *pPage = 0; - - rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, next!=0); + if( rc==SQLITE_OK ){ + rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, 0); assert(rc==SQLITE_OK || pPage==0); if( next==0 && rc==SQLITE_OK ){ next = get4byte(pPage->aData); } - - if( ppPage ){ - *ppPage = pPage; - }else{ - releasePage(pPage); - } } - *pPgnoNext = next; - return rc; + *pPgnoNext = next; + if( ppPage ){ + *ppPage = pPage; + }else{ + releasePage(pPage); + } + return (rc==SQLITE_DONE ? 
SQLITE_OK : rc); } /* @@ -4265,6 +4344,7 @@ static int allocateBtreePage( iPage = get4byte(&aData[8+closest*4]); if( !searchList || iPage==nearby ){ + int noContent; Pgno nPage; *pPgno = iPage; nPage = pagerPagecount(pBt); @@ -4281,9 +4361,9 @@ static int allocateBtreePage( } put4byte(&aData[4], k-1); assert( sqlite3PagerIswriteable(pTrunk->pDbPage) ); - rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 1); + noContent = !btreeGetHasContent(pBt, *pPgno); + rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, noContent); if( rc==SQLITE_OK ){ - sqlite3PagerDontRollback((*ppPage)->pDbPage); rc = sqlite3PagerWrite((*ppPage)->pDbPage); if( rc!=SQLITE_OK ){ releasePage(*ppPage); @@ -4301,6 +4381,10 @@ static int allocateBtreePage( int nPage = pagerPagecount(pBt); *pPgno = nPage + 1; + if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ + (*pPgno)++; + } + #ifndef SQLITE_OMIT_AUTOVACUUM if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){ /* If *pPgno refers to a pointer-map page, allocate two new pages @@ -4340,32 +4424,51 @@ end_allocate_page: } /* -** Add a page of the database file to the freelist. +** This function is used to add page iPage to the database file free-list. +** It is assumed that the page is not already a part of the free-list. +** +** The value passed as the second argument to this function is optional. +** If the caller happens to have a pointer to the MemPage object +** corresponding to page iPage handy, it may pass it as the second value. +** Otherwise, it may pass NULL. ** -** sqlite3PagerUnref() is NOT called for pPage. +** If a pointer to a MemPage object is passed as the second argument, +** its reference count is not altered by this function. 
*/ -static int freePage(MemPage *pPage){ - BtShared *pBt = pPage->pBt; - MemPage *pPage1 = pBt->pPage1; - int rc, n, k; +static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){ + MemPage *pTrunk = 0; /* Free-list trunk page */ + Pgno iTrunk = 0; /* Page number of free-list trunk page */ + MemPage *pPage1 = pBt->pPage1; /* Local reference to page 1 */ + MemPage *pPage; /* Page being freed. May be NULL. */ + int rc; /* Return Code */ + int nFree; /* Initial number of pages on free-list */ - /* Prepare the page for freeing */ - assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - assert( pPage->pgno>1 ); - pPage->isInit = 0; + assert( sqlite3_mutex_held(pBt->mutex) ); + assert( iPage>1 ); + assert( !pMemPage || pMemPage->pgno==iPage ); + + if( pMemPage ){ + pPage = pMemPage; + sqlite3PagerRef(pPage->pDbPage); + }else{ + pPage = btreePageLookup(pBt, iPage); + } /* Increment the free page count on pPage1 */ rc = sqlite3PagerWrite(pPage1->pDbPage); - if( rc ) return rc; - n = get4byte(&pPage1->aData[36]); - put4byte(&pPage1->aData[36], n+1); + if( rc ) goto freepage_out; + nFree = get4byte(&pPage1->aData[36]); + put4byte(&pPage1->aData[36], nFree+1); #ifdef SQLITE_SECURE_DELETE /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then ** always fully overwrite deleted information with zeros. */ - rc = sqlite3PagerWrite(pPage->pDbPage); - if( rc ) return rc; + if( (!pPage && (rc = sqlite3BtreeGetPage(pBt, iPage, &pPage, 0))) + || (rc = sqlite3PagerWrite(pPage->pDbPage)) + ){ + goto freepage_out; + } memset(pPage->aData, 0, pPage->pBt->pageSize); #endif @@ -4373,27 +4476,34 @@ static int freePage(MemPage *pPage){ ** to indicate that the page is free. 
*/ if( ISAUTOVACUUM ){ - rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0); - if( rc ) return rc; + rc = ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0); + if( rc ) goto freepage_out; } - if( n==0 ){ - /* This is the first free page */ - rc = sqlite3PagerWrite(pPage->pDbPage); - if( rc ) return rc; - memset(pPage->aData, 0, 8); - put4byte(&pPage1->aData[32], pPage->pgno); - TRACE(("FREE-PAGE: %d first\n", pPage->pgno)); - }else{ - /* Other free pages already exist. Retrive the first trunk page - ** of the freelist and find out how many leaves it has. */ - MemPage *pTrunk; - rc = sqlite3BtreeGetPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0); - if( rc ) return rc; - k = get4byte(&pTrunk->aData[4]); - if( k>=pBt->usableSize/4 - 8 ){ - /* The trunk is full. Turn the page being freed into a new - ** trunk page with no leaves. + /* Now manipulate the actual database free-list structure. There are two + ** possibilities. If the free-list is currently empty, or if the first + ** trunk page in the free-list is full, then this page will become a + ** new free-list trunk page. Otherwise, it will become a leaf of the + ** first trunk page in the current free-list. This block tests if it + ** is possible to add the page as a new free-list leaf. + */ + if( nFree!=0 ){ + int nLeaf; /* Initial number of leaf cells on trunk page */ + + iTrunk = get4byte(&pPage1->aData[32]); + rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0); + if( rc!=SQLITE_OK ){ + goto freepage_out; + } + + nLeaf = get4byte(&pTrunk->aData[4]); + if( nLeaf<0 ){ + rc = SQLITE_CORRUPT_BKPT; + goto freepage_out; + } + if( nLeaf<pBt->usableSize/4 - 8 ){ + /* In this case there is room on the trunk page to insert the page + ** being freed as a new leaf. 
** ** Note that the trunk page is not really full until it contains ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have @@ -4406,32 +4516,49 @@ static int freePage(MemPage *pPage){ ** to 3.6.0 or later) we should consider fixing the conditional above ** to read "usableSize/4-2" instead of "usableSize/4-8". */ - rc = sqlite3PagerWrite(pPage->pDbPage); - if( rc==SQLITE_OK ){ - put4byte(pPage->aData, pTrunk->pgno); - put4byte(&pPage->aData[4], 0); - put4byte(&pPage1->aData[32], pPage->pgno); - TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", - pPage->pgno, pTrunk->pgno)); - } - }else if( k<0 ){ - rc = SQLITE_CORRUPT; - }else{ - /* Add the newly freed page as a leaf on the current trunk */ rc = sqlite3PagerWrite(pTrunk->pDbPage); if( rc==SQLITE_OK ){ - put4byte(&pTrunk->aData[4], k+1); - put4byte(&pTrunk->aData[8+k*4], pPage->pgno); + put4byte(&pTrunk->aData[4], nLeaf+1); + put4byte(&pTrunk->aData[8+nLeaf*4], iPage); #ifndef SQLITE_SECURE_DELETE - rc = sqlite3PagerDontWrite(pPage->pDbPage); + if( pPage ){ + sqlite3PagerDontWrite(pPage->pDbPage); + } #endif + rc = btreeSetHasContent(pBt, iPage); } TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno)); + goto freepage_out; } - releasePage(pTrunk); } + + /* If control flows to this point, then it was not possible to add the + ** page being freed as a leaf page of the first trunk in the free-list. + ** Possibly because the free-list is empty, or possibly because the + ** first trunk in the free-list is full. Either way, the page being freed + ** will become the new first trunk page in the free-list. 
+ */ + if( (!pPage && (rc = sqlite3BtreeGetPage(pBt, iPage, &pPage, 0))) + || (rc = sqlite3PagerWrite(pPage->pDbPage)) + ){ + goto freepage_out; + } + put4byte(pPage->aData, iTrunk); + put4byte(&pPage->aData[4], 0); + put4byte(&pPage1->aData[32], iPage); + TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk)); + +freepage_out: + if( pPage ){ + pPage->isInit = 0; + } + releasePage(pPage); + releasePage(pTrunk); return rc; } +static int freePage(MemPage *pPage){ + return freePage2(pPage->pBt, pPage, pPage->pgno); +} /* ** Free any overflow pages associated with the given Cell. @@ -4454,16 +4581,21 @@ static int clearCell(MemPage *pPage, unsigned char *pCell){ nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize; assert( ovflPgno==0 || nOvfl>0 ); while( nOvfl-- ){ - MemPage *pOvfl; + Pgno iNext; + MemPage *pOvfl = 0; if( ovflPgno==0 || ovflPgno>pagerPagecount(pBt) ){ return SQLITE_CORRUPT_BKPT; } - - rc = getOverflowPage(pBt, ovflPgno, &pOvfl, (nOvfl==0)?0:&ovflPgno); - if( rc ) return rc; - rc = freePage(pOvfl); - sqlite3PagerUnref(pOvfl->pDbPage); + if( nOvfl ){ + rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext); + if( rc ) return rc; + } + rc = freePage2(pBt, pOvfl, ovflPgno); + if( pOvfl ){ + sqlite3PagerUnref(pOvfl->pDbPage); + } if( rc ) return rc; + ovflPgno = iNext; } return SQLITE_OK; } @@ -6229,11 +6361,6 @@ static int btreeCreateTable(Btree *p, int *piTable, int flags){ } assert( eType!=PTRMAP_ROOTPAGE ); assert( eType!=PTRMAP_FREEPAGE ); - rc = sqlite3PagerWrite(pRoot->pDbPage); - if( rc!=SQLITE_OK ){ - releasePage(pRoot); - return rc; - } rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0); releasePage(pRoot); @@ -7094,17 +7221,6 @@ const char *sqlite3BtreeGetFilename(Btree *p){ return sqlite3PagerFilename(p->pBt->pPager); } -/* -** Return the pathname of the directory that contains the database file. 
-** -** The pager directory name is invariant as long as the pager is -** open so it is safe to access without the BtShared mutex. -*/ -const char *sqlite3BtreeGetDirname(Btree *p){ - assert( p->pBt->pPager!=0 ); - return sqlite3PagerDirname(p->pBt->pPager); -} - /* ** Return the pathname of the journal file for this database. The return ** value of this routine is the same regardless of whether the journal file @@ -7190,7 +7306,7 @@ static int btreeCopyFile(Btree *pTo, Btree *pFrom){ ** page is still on the rollback journal, though. And that is the ** whole point of this block: to put pages on the rollback journal. */ - rc = sqlite3PagerDontWrite(pDbPage); + sqlite3PagerDontWrite(pDbPage); } sqlite3PagerUnref(pDbPage); } diff --git a/src/btreeInt.h b/src/btreeInt.h index b605453c55..2a97f9ee41 100644 --- a/src/btreeInt.h +++ b/src/btreeInt.h @@ -9,7 +9,7 @@ ** May you share freely, never taking more than you give. ** ************************************************************************* -** $Id: btreeInt.h,v 1.40 2009/01/16 16:23:38 danielk1977 Exp $ +** $Id: btreeInt.h,v 1.41 2009/01/20 17:06:27 danielk1977 Exp $ ** ** This file implements a external (disk-based) database using BTrees. ** For a detailed discussion of BTrees, refer to @@ -204,10 +204,6 @@ ** * zero or more pages numbers of leaves */ #include "sqliteInt.h" -#include "pager.h" -#include "btree.h" -#include "os.h" -#include /* Round up a number to the next larger multiple of 8. This is used ** to force 8-byte alignment on 64-bit architectures. 
@@ -383,6 +379,7 @@ struct BtShared { void *pSchema; /* Pointer to space allocated by sqlite3BtreeSchema() */ void (*xFreeSchema)(void*); /* Destructor for BtShared.pSchema */ sqlite3_mutex *mutex; /* Non-recursive mutex required to access this struct */ + Bitvec *pHasContent; /* Set of pages moved to free-list this transaction */ #ifndef SQLITE_OMIT_SHARED_CACHE int nRef; /* Number of references to this structure */ BtShared *pNext; /* Next on a list of sharable BtShared structs */ @@ -490,18 +487,10 @@ struct BtCursor { #define CURSOR_REQUIRESEEK 2 #define CURSOR_FAULT 3 -/* The database page the PENDING_BYTE occupies. This page is never used. -** TODO: This macro is very similary to PAGER_MJ_PGNO() in pager.c. They -** should possibly be consolidated (presumably in pager.h). -** -** If disk I/O is omitted (meaning that the database is stored purely -** in memory) then there is no pending byte. +/* +** The database page the PENDING_BYTE occupies. This page is never used. */ -#ifdef SQLITE_OMIT_DISKIO -# define PENDING_BYTE_PAGE(pBt) 0x7fffffff -#else -# define PENDING_BYTE_PAGE(pBt) ((Pgno)((PENDING_BYTE/(pBt)->pageSize)+1)) -#endif +# define PENDING_BYTE_PAGE(pBt) PAGER_MJ_PGNO(pBt) /* ** A linked list of the following structures is stored at BtShared.pLock. diff --git a/src/journal.c b/src/journal.c index bdc36bcaec..9466e69db1 100644 --- a/src/journal.c +++ b/src/journal.c @@ -10,7 +10,7 @@ ** ************************************************************************* ** -** @(#) $Id: journal.c,v 1.8 2008/05/01 18:01:47 drh Exp $ +** @(#) $Id: journal.c,v 1.9 2009/01/20 17:06:27 danielk1977 Exp $ */ #ifdef SQLITE_ENABLE_ATOMIC_WRITE @@ -28,7 +28,7 @@ ** ** 1) The in-memory representation grows too large for the allocated ** buffer, or -** 2) The xSync() method is called. +** 2) The sqlite3JournalCreate() function is called. 
*/ #include "sqliteInt.h" @@ -95,8 +95,9 @@ static int jrnlRead( JournalFile *p = (JournalFile *)pJfd; if( p->pReal ){ rc = sqlite3OsRead(p->pReal, zBuf, iAmt, iOfst); + }else if( (iAmt+iOfst)>p->iSize ){ + rc = SQLITE_IOERR_SHORT_READ; }else{ - assert( iAmt+iOfst<=p->iSize ); memcpy(zBuf, &p->zBuf[iOfst], iAmt); } return rc; diff --git a/src/pager.c b/src/pager.c index aafdb648ca..93570b53e2 100644 --- a/src/pager.c +++ b/src/pager.c @@ -18,7 +18,7 @@ ** file simultaneously, or one process from reading the database while ** another is writing. ** -** @(#) $Id: pager.c,v 1.555 2009/01/17 16:59:41 danielk1977 Exp $ +** @(#) $Id: pager.c,v 1.556 2009/01/20 17:06:27 danielk1977 Exp $ */ #ifndef SQLITE_OMIT_DISKIO #include "sqliteInt.h" @@ -149,73 +149,151 @@ struct PagerSavepoint { /* ** A open page cache is an instance of the following structure. ** -** Pager.errCode may be set to SQLITE_IOERR, SQLITE_CORRUPT, or -** or SQLITE_FULL. Once one of the first three errors occurs, it persists -** and is returned as the result of every major pager API call. The -** SQLITE_FULL return code is slightly different. It persists only until the -** next successful rollback is performed on the pager cache. Also, -** SQLITE_FULL does not affect the sqlite3PagerGet() and sqlite3PagerLookup() -** APIs, they may still be used successfully. -** -** Managing the size of the database file in pages is a little complicated. -** The variable Pager.dbSize contains the number of pages that the database -** image currently contains. As the database image grows or shrinks this -** variable is updated. The variable Pager.dbFileSize contains the number -** of pages in the database file. This may be different from Pager.dbSize -** if some pages have been appended to the database image but not yet written -** out from the cache to the actual file on disk. Or if the image has been -** truncated by an incremental-vacuum operation. 
The Pager.dbOrigSize variable -** contains the number of pages in the database image when the current -** transaction was opened. The contents of all three of these variables is -** only guaranteed to be correct if the boolean Pager.dbSizeValid is true. +** errCode +** +** Pager.errCode may be set to SQLITE_IOERR, SQLITE_CORRUPT, or +** or SQLITE_FULL. Once one of the first three errors occurs, it persists +** and is returned as the result of every major pager API call. The +** SQLITE_FULL return code is slightly different. It persists only until the +** next successful rollback is performed on the pager cache. Also, +** SQLITE_FULL does not affect the sqlite3PagerGet() and sqlite3PagerLookup() +** APIs, they may still be used successfully. +** +** dbSizeValid, dbSize, dbOrigSize, dbFileSize +** +** Managing the size of the database file in pages is a little complicated. +** The variable Pager.dbSize contains the number of pages that the database +** image currently contains. As the database image grows or shrinks this +** variable is updated. The variable Pager.dbFileSize contains the number +** of pages in the database file. This may be different from Pager.dbSize +** if some pages have been appended to the database image but not yet written +** out from the cache to the actual file on disk. Or if the image has been +** truncated by an incremental-vacuum operation. The Pager.dbOrigSize variable +** contains the number of pages in the database image when the current +** transaction was opened. The contents of all three of these variables is +** only guaranteed to be correct if the boolean Pager.dbSizeValid is true. +** +** TODO: Under what conditions is dbSizeValid set? Cleared? +** +** changeCountDone +** +** This boolean variable is used to make sure that the change-counter +** (the 4-byte header field at byte offset 24 of the database file) is +** not updated more often than necessary. 
+** +** It is set to true when the change-counter field is updated, which +** can only happen if an exclusive lock is held on the database file. +** It is cleared (set to false) whenever an exclusive lock is +** relinquished on the database file. Each time a transaction is committed, +** the changeCountDone flag is inspected. If it is true, the work of +** updating the change-counter is omitted for the current transaction. +** +** This mechanism means that when running in exclusive mode, a connection +** need only update the change-counter once, for the first transaction +** committed. +** +** dbModified +** +** The dbModified flag is set whenever a database page is dirtied. +** It is cleared at the end of each transaction. +** +** It is used when committing or otherwise ending a transaction. If +** the dbModified flag is clear then less work has to be done. +** +** journalStarted +** +** This flag is set whenever the main journal is synced. +** +** The point of this flag is that it must be set after the +** first journal header in a journal file has been synced to disk. +** After this has happened, new pages appended to the database +** do not need the PGHDR_NEED_SYNC flag set, as they do not need +** to wait for a journal sync before they can be written out to +** the database file (see function pager_write()). +** +** setMaster +** +** This variable is used to ensure that the master journal file name +** (if any) is only written into the journal file once. +** +** When committing a transaction, the master journal file name (if any) +** may be written into the journal file while the pager is still in +** PAGER_RESERVED state (see CommitPhaseOne() for the action). It +** then attempts to upgrade to an exclusive lock. If this attempt +** fails, then SQLITE_BUSY may be returned to the user and the user +** may attempt to commit the transaction again later (calling +** CommitPhaseOne() again). 
This flag is used to ensure that the +** master journal name is only written to the journal file the first +** time CommitPhaseOne() is called. +** +** doNotSync +** +** This variable is set and cleared by sqlite3PagerWrite(). +** +** needSync +** +** TODO: It might be easier to set this variable in writeJournalHdr() +** and writeMasterJournal() only. Change its meaning to "unsynced data +** has been written to the journal". */ struct Pager { sqlite3_vfs *pVfs; /* OS functions to use for IO */ - u8 journalOpen; /* True if journal file descriptors is valid */ - u8 journalStarted; /* True if header of journal is synced */ + u8 exclusiveMode; /* Boolean. True if locking_mode==EXCLUSIVE */ + u8 journalMode; /* On of the PAGER_JOURNALMODE_* values */ u8 useJournal; /* Use a rollback journal on this file */ u8 noReadlock; /* Do not bother to obtain readlocks */ u8 noSync; /* Do not sync the journal if true */ u8 fullSync; /* Do extra syncs of the journal for robustness */ u8 sync_flags; /* One of SYNC_NORMAL or SYNC_FULL */ - u8 state; /* PAGER_UNLOCK, _SHARED, _RESERVED, etc. */ u8 tempFile; /* zFilename is a temporary file */ u8 readOnly; /* True for a read-only database */ - u8 needSync; /* True if an fsync() is needed on the journal */ - u8 dirtyCache; /* True if cached pages have changed */ u8 memDb; /* True to inhibit all file I/O */ - u8 setMaster; /* True if a m-j name has been written to jrnl */ - u8 doNotSync; /* Boolean. While true, do not spill the cache */ - u8 exclusiveMode; /* Boolean. True if locking_mode==EXCLUSIVE */ - u8 journalMode; /* On of the PAGER_JOURNALMODE_* values */ + + /* The following block contains those class members that are dynamically + ** modified during normal operations. The other variables in this structure + ** are either constant throughout the lifetime of the pager, or else + ** used to store configuration parameters that affect the way the pager + ** operates. 
+ ** + ** The 'state' variable is described in more detail along with the + ** descriptions of the values it may take - PAGER_UNLOCK etc. Many of the + ** other variables in this block are described in the comment directly + ** above this class definition. + */ + u8 state; /* PAGER_UNLOCK, _SHARED, _RESERVED, etc. */ u8 dbModified; /* True if there are any changes to the Db */ + u8 needSync; /* True if an fsync() is needed on the journal */ + u8 journalStarted; /* True if header of journal is synced */ u8 changeCountDone; /* Set after incrementing the change-counter */ + u8 setMaster; /* True if a m-j name has been written to jrnl */ + u8 doNotSync; /* Boolean. While true, do not spill the cache */ u8 dbSizeValid; /* Set when dbSize is correct */ Pgno dbSize; /* Number of pages in the database */ Pgno dbOrigSize; /* dbSize before the current transaction */ Pgno dbFileSize; /* Number of pages in the database file */ - u32 vfsFlags; /* Flags for sqlite3_vfs.xOpen() */ int errCode; /* One of several kinds of errors */ - int nRec; /* Number of pages written to the journal */ + int nRec; /* Pages journalled since last j-header written */ u32 cksumInit; /* Quasi-random value added to every checksum */ - int stmtNRec; /* Number of records in stmt subjournal */ + u32 nSubRec; /* Number of records written to sub-journal */ + Bitvec *pInJournal; /* One bit for each page in the database file */ + sqlite3_file *fd; /* File descriptor for database */ + sqlite3_file *jfd; /* File descriptor for main journal */ + sqlite3_file *sjfd; /* File descriptor for sub-journal */ + i64 journalOff; /* Current write offset in the journal file */ + i64 journalHdr; /* Byte offset to previous journal header */ + PagerSavepoint *aSavepoint; /* Array of active savepoints */ + int nSavepoint; /* Number of elements in aSavepoint[] */ + char dbFileVers[16]; /* Changes whenever database file changes */ + u32 sectorSize; /* Assumed sector size during rollback */ + int nExtra; /* Add this many bytes to 
each in-memory page */ + u32 vfsFlags; /* Flags for sqlite3_vfs.xOpen() */ int pageSize; /* Number of bytes in a page */ - int nPage; /* Total number of in-memory pages */ - int mxPage; /* Maximum number of pages to hold in cache */ Pgno mxPgno; /* Maximum allowed size of the database */ - Bitvec *pInJournal; /* One bit for each page in the database file */ - Bitvec *pAlwaysRollback; /* One bit for each page marked always-rollback */ char *zFilename; /* Name of the database file */ char *zJournal; /* Name of the journal file */ - char *zDirectory; /* Directory hold database and journal files */ - sqlite3_file *fd, *jfd; /* File descriptors for database and journal */ - sqlite3_file *sjfd; /* File descriptor for the sub-journal*/ int (*xBusyHandler)(void*); /* Function to call when busy */ void *pBusyHandlerArg; /* Context argument for xBusyHandler */ - i64 journalOff; /* Current byte offset in the journal file */ - i64 journalHdr; /* Byte offset to previous journal header */ - u32 sectorSize; /* Assumed sector size during rollback */ #ifdef SQLITE_TEST int nHit, nMiss; /* Cache hits and missing */ int nRead, nWrite; /* Database pages read/written */ @@ -226,11 +304,8 @@ struct Pager { void *pCodecArg; /* First argument to xCodec() */ #endif char *pTmpSpace; /* Pager.pageSize bytes of space for tmp use */ - char dbFileVers[16]; /* Changes whenever database file changes */ i64 journalSizeLimit; /* Size limit for persistent journal files */ PCache *pPCache; /* Pointer to page cache object */ - PagerSavepoint *aSavepoint; /* Array of active savepoints */ - int nSavepoint; /* Number of elements in aSavepoint[] */ }; /* @@ -277,15 +352,14 @@ static const unsigned char aJournalMagic[] = { }; /* -** The size of the header and of each page in the journal is determined -** by the following macros. +** The size of the of each page record in the journal is given by +** the following macro. 
*/ #define JOURNAL_PG_SZ(pPager) ((pPager->pageSize) + 8) /* -** The journal header size for this pager. In the future, this could be -** set to some value read from the disk controller. The important -** characteristic is that it is the same size as a disk sector. +** The journal header size for this pager. This is usually the same +** size as a single disk sector. See also setSectorSize(). */ #define JOURNAL_HDR_SZ(pPager) (pPager->sectorSize) @@ -302,19 +376,27 @@ static const unsigned char aJournalMagic[] = { #endif /* -** Page number PAGER_MJ_PGNO is never used in an SQLite database (it is -** reserved for working around a windows/posix incompatibility). It is -** used in the journal to signify that the remainder of the journal file -** is devoted to storing a master journal name - there are no more pages to -** roll back. See comments for function writeMasterJournal() for details. +** The maximum legal page number is (2^31 - 1). */ -/* #define PAGER_MJ_PGNO(x) (PENDING_BYTE/((x)->pageSize)) */ -#define PAGER_MJ_PGNO(x) ((Pgno)((PENDING_BYTE/((x)->pageSize))+1)) +#define PAGER_MAX_PGNO 2147483647 +#ifndef NDEBUG /* -** The maximum legal page number is (2^31 - 1). +** Usage: +** +** assert( assert_pager_state(pPager) ); */ -#define PAGER_MAX_PGNO 2147483647 +static int assert_pager_state(Pager *pPager){ + + /* A temp-file is always in PAGER_EXCLUSIVE or PAGER_SYNCED state. */ + assert( pPager->tempFile==0 || pPager->state>=PAGER_EXCLUSIVE ); + + /* The changeCountDone flag is always set for temp-files */ + assert( pPager->tempFile==0 || pPager->changeCountDone ); + + return 1; +} +#endif /* ** Return true if it is necessary to write page *pPg into the sub-journal. @@ -376,11 +458,25 @@ static int write32bits(sqlite3_file *fd, i64 offset, u32 val){ return sqlite3OsWrite(fd, ac, 4, offset); } +/* +** The argument to this macro is a file descriptor (type sqlite3_file*). +** Return 0 if it is not open, or non-zero (but not 1) if it is. 
+** +** This is so that expressions can be written as: +** +** if( isOpen(pPager->jfd) ){ ... +** +** instead of +** +** if( pPager->jfd->pMethods ){ ... +*/ +#define isOpen(pFd) ((pFd)->pMethods) + /* ** If file pFd is open, call sqlite3OsUnlock() on it. */ static int osUnlock(sqlite3_file *pFd, int eLock){ - if( !pFd->pMethods ){ + if( !isOpen(pFd) ){ return SQLITE_OK; } return sqlite3OsUnlock(pFd, eLock); @@ -395,77 +491,37 @@ static int osUnlock(sqlite3_file *pFd, int eLock){ ** (b) the value returned by OsSectorSize() is less than or equal ** to the page size. ** +** The optimization is also always enabled for temporary files. It is +** an error to call this function if pPager is opened on an in-memory +** database. +** ** If the optimization cannot be used, 0 is returned. If it can be used, ** then the value returned is the size of the journal file when it ** contains rollback data for exactly one page. */ #ifdef SQLITE_ENABLE_ATOMIC_WRITE static int jrnlBufferSize(Pager *pPager){ - int dc; /* Device characteristics */ - int nSector; /* Sector size */ - int szPage; /* Page size */ - sqlite3_file *fd = pPager->fd; + assert( !MEMDB ); + if( !pPager->tempFile ){ + int dc; /* Device characteristics */ + int nSector; /* Sector size */ + int szPage; /* Page size */ - if( fd->pMethods ){ - dc = sqlite3OsDeviceCharacteristics(fd); + assert( isOpen(pPager->fd) ); + dc = sqlite3OsDeviceCharacteristics(pPager->fd); nSector = pPager->sectorSize; szPage = pPager->pageSize; - } - - assert(SQLITE_IOCAP_ATOMIC512==(512>>8)); - assert(SQLITE_IOCAP_ATOMIC64K==(65536>>8)); - - if( !fd->pMethods || - (dc & (SQLITE_IOCAP_ATOMIC|(szPage>>8)) && nSector<=szPage) ){ - return JOURNAL_HDR_SZ(pPager) + JOURNAL_PG_SZ(pPager); - } - return 0; -} -#endif -/* -** This function should be called when an error occurs within the pager -** code. The first argument is a pointer to the pager structure, the -** second the error-code about to be returned by a pager API function. 
-** The value returned is a copy of the second argument to this function. -** -** If the second argument is SQLITE_IOERR, SQLITE_CORRUPT, or SQLITE_FULL -** the error becomes persistent. Until the persisten error is cleared, -** subsequent API calls on this Pager will immediately return the same -** error code. -** -** A persistent error indicates that the contents of the pager-cache -** cannot be trusted. This state can be cleared by completely discarding -** the contents of the pager-cache. If a transaction was active when -** the persistent error occured, then the rollback journal may need -** to be replayed. -*/ -static void pager_unlock(Pager *pPager); -static int pager_error(Pager *pPager, int rc){ - int rc2 = rc & 0xff; - assert( - pPager->errCode==SQLITE_FULL || - pPager->errCode==SQLITE_OK || - (pPager->errCode & 0xff)==SQLITE_IOERR - ); - if( - rc2==SQLITE_FULL || - rc2==SQLITE_IOERR || - rc2==SQLITE_CORRUPT - ){ - pPager->errCode = rc; - if( pPager->state==PAGER_UNLOCK - && sqlite3PcacheRefCount(pPager->pPCache)==0 - ){ - /* If the pager is already unlocked, call pager_unlock() now to - ** clear the error state and ensure that the pager-cache is - ** completely empty. - */ - pager_unlock(pPager); + assert(SQLITE_IOCAP_ATOMIC512==(512>>8)); + assert(SQLITE_IOCAP_ATOMIC64K==(65536>>8)); + if( 0==(dc&(SQLITE_IOCAP_ATOMIC|(szPage>>8)) || nSector>szPage) ){ + return 0; } } - return rc; + + return JOURNAL_HDR_SZ(pPager) + JOURNAL_PG_SZ(pPager); } +#endif /* ** If SQLITE_CHECK_PAGES is defined then we do some sanity checking @@ -511,8 +567,10 @@ static void checkPage(PgHdr *pPg){ /* ** When this is called the journal file for pager pPager must be open. -** The master journal file name is read from the end of the file and -** written into memory supplied by the caller. +** This function attempts to read a master journal file name from the +** end of the file and, if successful, copies it into memory supplied +** by the caller. 
See comments above writeMasterJournal() for the format +** used to store a master journal file name at the end of a journal file. ** ** zMaster must point to a buffer of at least nMaster bytes allocated by ** the caller. This should be sqlite3_vfs.mxPathname+1 (to ensure there is @@ -521,70 +579,68 @@ static void checkPage(PgHdr *pPg){ ** nul-terminator), then this is handled as if no master journal name ** were present in the journal. ** -** If no master journal file name is present zMaster[0] is set to 0 and -** SQLITE_OK returned. +** If a master journal file name is present at the end of the journal +** file, then it is copied into the buffer pointed to by zMaster. A +** nul-terminator byte is appended to the buffer following the master +** journal file name. +** +** If it is determined that no master journal file name is present +** zMaster[0] is set to 0 and SQLITE_OK returned. +** +** If an error occurs while reading from the journal file, an SQLite +** error code is returned. */ static int readMasterJournal(sqlite3_file *pJrnl, char *zMaster, u32 nMaster){ - int rc; - u32 len; - i64 szJ; - u32 cksum; - u32 u; /* Unsigned loop counter */ - unsigned char aMagic[8]; /* A buffer to hold the magic header */ - + int rc; /* Return code */ + u32 len; /* Length in bytes of master journal name */ + i64 szJ; /* Total size in bytes of journal file pJrnl */ + u32 cksum; /* MJ checksum value read from journal */ + u32 u; /* Unsigned loop counter */ + unsigned char aMagic[8]; /* A buffer to hold the magic header */ zMaster[0] = '\0'; - rc = sqlite3OsFileSize(pJrnl, &szJ); - if( rc!=SQLITE_OK || szJ<16 ) return rc; - - rc = read32bits(pJrnl, szJ-16, &len); - if( rc!=SQLITE_OK ) return rc; - - if( len>=nMaster ){ - return SQLITE_OK; - } - - rc = read32bits(pJrnl, szJ-12, &cksum); - if( rc!=SQLITE_OK ) return rc; - - rc = sqlite3OsRead(pJrnl, aMagic, 8, szJ-8); - if( rc!=SQLITE_OK || memcmp(aMagic, aJournalMagic, 8) ) return rc; - - rc = sqlite3OsRead(pJrnl, zMaster, len, 
szJ-16-len); - if( rc!=SQLITE_OK ){ + if( SQLITE_OK!=(rc = sqlite3OsFileSize(pJrnl, &szJ)) + || szJ<16 + || SQLITE_OK!=(rc = read32bits(pJrnl, szJ-16, &len)) + || len>=nMaster + || SQLITE_OK!=(rc = read32bits(pJrnl, szJ-12, &cksum)) + || SQLITE_OK!=(rc = sqlite3OsRead(pJrnl, aMagic, 8, szJ-8)) + || memcmp(aMagic, aJournalMagic, 8) + || SQLITE_OK!=(rc = sqlite3OsRead(pJrnl, zMaster, len, szJ-16-len)) + ){ return rc; } - zMaster[len] = '\0'; /* See if the checksum matches the master journal name */ for(u=0; ujournalOff, assuming a sector +** size of pPager->sectorSize bytes. ** ** i.e for a sector size of 512: ** -** Input Offset Output Offset -** --------------------------------------- -** 0 0 -** 512 512 -** 100 512 -** 2000 2048 +** Pager.journalOff Return value +** --------------------------------------- +** 0 0 +** 512 512 +** 100 512 +** 2000 2048 ** */ static i64 journalHdrOffset(Pager *pPager){ @@ -598,26 +654,39 @@ static i64 journalHdrOffset(Pager *pPager){ assert( (offset-c)journalOff = journalHdrOffset(pPager); -} /* -** Write zeros over the header of the journal file. This has the -** effect of invalidating the journal file and committing the -** transaction. +** The journal file must be open when this function is called. +** +** This function is a no-op if the journal file has not been written to +** within the current transaction (i.e. if Pager.journalOff==0). +** +** If doTruncate is non-zero or the Pager.journalSizeLimit variable is +** set to 0, then truncate the journal file to zero bytes in size. Otherwise, +** zero the 28-byte header at the start of the journal file. In either case, +** if the pager is not in no-sync mode, sync the journal file immediately +** after writing or truncating it. 
+** +** If Pager.journalSizeLimit is set to a positive, non-zero value, and +** following the truncation or zeroing described above the size of the +** journal file in bytes is larger than this value, then truncate the +** journal file to Pager.journalSizeLimit bytes. The journal file does +** not need to be synced following this operation. +** +** If an IO error occurs, abandon processing and return the IO error code. +** Otherwise, return SQLITE_OK. */ static int zeroJournalHdr(Pager *pPager, int doTruncate){ - int rc = SQLITE_OK; - static const char zeroHdr[28] = {0}; - + int rc = SQLITE_OK; /* Return code */ + assert( isOpen(pPager->jfd) ); if( pPager->journalOff ){ - i64 iLimit = pPager->journalSizeLimit; + const i64 iLimit = pPager->journalSizeLimit; /* Local cache of jsl */ IOTRACE(("JZEROHDR %p\n", pPager)) if( doTruncate || iLimit==0 ){ rc = sqlite3OsTruncate(pPager->jfd, 0); }else{ + static const char zeroHdr[28] = {0}; rc = sqlite3OsWrite(pPager->jfd, zeroHdr, sizeof(zeroHdr), 0); } if( rc==SQLITE_OK && !pPager->noSync ){ @@ -657,19 +726,21 @@ static int zeroJournalHdr(Pager *pPager, int doTruncate){ ** Followed by (JOURNAL_HDR_SZ - 28) bytes of unused space. */ static int writeJournalHdr(Pager *pPager){ - int rc = SQLITE_OK; - char *zHeader = pPager->pTmpSpace; - u32 nHeader = pPager->pageSize; - u32 nWrite; - int ii; + int rc = SQLITE_OK; /* Return code */ + char *zHeader = pPager->pTmpSpace; /* Temporary space used to build header */ + u32 nHeader = pPager->pageSize; /* Size of buffer pointed to by zHeader */ + u32 nWrite; /* Bytes of header sector written */ + int ii; /* Loop counter */ + + assert( isOpen(pPager->jfd) ); /* Journal file must be open. */ if( nHeader>JOURNAL_HDR_SZ(pPager) ){ nHeader = JOURNAL_HDR_SZ(pPager); } - /* If there are active savepoints and any of them were created since the - ** most recent journal header was written, update the PagerSavepoint.iHdrOff - ** fields now. 
+ /* If there are active savepoints and any of them were created + ** since the most recent journal header was written, update the + ** PagerSavepoint.iHdrOffset fields now. */ for(ii=0; iinSavepoint; ii++){ if( pPager->aSavepoint[ii].iHdrOffset==0 ){ @@ -677,9 +748,7 @@ static int writeJournalHdr(Pager *pPager){ } } - seekJournalHdr(pPager); - pPager->journalHdr = pPager->journalOff; - + pPager->journalHdr = pPager->journalOff = journalHdrOffset(pPager); memcpy(zHeader, aJournalMagic, sizeof(aJournalMagic)); /* @@ -702,7 +771,7 @@ static int writeJournalHdr(Pager *pPager){ ** * When the SQLITE_IOCAP_SAFE_APPEND flag is set. This guarantees ** that garbage data is never appended to the journal file. */ - assert(pPager->fd->pMethods||pPager->noSync); + assert( isOpen(pPager->fd) || pPager->noSync ); if( (pPager->noSync) || (pPager->journalMode==PAGER_JOURNALMODE_MEMORY) || (sqlite3OsDeviceCharacteristics(pPager->fd)&SQLITE_IOCAP_SAFE_APPEND) ){ @@ -719,19 +788,34 @@ static int writeJournalHdr(Pager *pPager){ /* The assumed sector size for this process */ put32bits(&zHeader[sizeof(aJournalMagic)+12], pPager->sectorSize); + /* The page size */ + put32bits(&zHeader[sizeof(aJournalMagic)+16], pPager->pageSize); + /* Initializing the tail of the buffer is not necessary. Everything ** works find if the following memset() is omitted. But initializing ** the memory prevents valgrind from complaining, so we are willing to ** take the performance hit. */ - memset(&zHeader[sizeof(aJournalMagic)+16], 0, - nHeader-(sizeof(aJournalMagic)+16)); - - if( pPager->journalHdr==0 ){ - /* The page size */ - put32bits(&zHeader[sizeof(aJournalMagic)+16], pPager->pageSize); - } - + memset(&zHeader[sizeof(aJournalMagic)+20], 0, + nHeader-(sizeof(aJournalMagic)+20)); + + /* In theory, it is only necessary to write the 28 bytes that the + ** journal header consumes to the journal file here. 
Then increment the + ** Pager.journalOff variable by JOURNAL_HDR_SZ so that the next + ** record is written to the following sector (leaving a gap in the file + ** that will be implicitly filled in by the OS). + ** + ** However it has been discovered that on some systems this pattern can + ** be significantly slower than contiguously writing data to the file, + ** even if that means explicitly writing data to the block of + ** (JOURNAL_HDR_SZ - 28) bytes that will not be used. So that is what + ** is done. + ** + ** The loop is required here in case the sector-size is larger than the + ** database page size. Since the zHeader buffer is only Pager.pageSize + ** bytes in size, more than one call to sqlite3OsWrite() may be required + ** to populate the entire journal header sector. + */ for(nWrite=0; rc==SQLITE_OK&&nWritejournalHdr, nHeader)) rc = sqlite3OsWrite(pPager->jfd, zHeader, nHeader, pPager->journalOff); @@ -745,93 +829,114 @@ static int writeJournalHdr(Pager *pPager){ ** The journal file must be open when this is called. A journal header file ** (JOURNAL_HDR_SZ bytes) is read from the current location in the journal ** file. The current location in the journal file is given by -** pPager->journalOff. See comments above function writeJournalHdr() for +** pPager->journalOff. See comments above function writeJournalHdr() for ** a description of the journal header format. ** -** If the header is read successfully, *nRec is set to the number of -** page records following this header and *dbSize is set to the size of the +** If the header is read successfully, *pNRec is set to the number of +** page records following this header and *pDbSize is set to the size of the ** database before the transaction began, in pages. Also, pPager->cksumInit ** is set to the value read from the journal header. SQLITE_OK is returned ** in this case. ** ** If the journal header file appears to be corrupted, SQLITE_DONE is -** returned and *nRec and *dbSize are undefined. 
If JOURNAL_HDR_SZ bytes +** returned and *pNRec and *PDbSize are undefined. If JOURNAL_HDR_SZ bytes ** cannot be read from the journal file an error code is returned. */ static int readJournalHdr( - Pager *pPager, - i64 journalSize, - u32 *pNRec, - u32 *pDbSize + Pager *pPager, /* Pager object */ + i64 journalSize, /* Size of the open journal file in bytes */ + u32 *pNRec, /* OUT: Value read from the nRec field */ + u32 *pDbSize /* OUT: Value of original database size field */ ){ - int rc; - unsigned char aMagic[8]; /* A buffer to hold the magic header */ - i64 jrnlOff; - u32 iPageSize; - u32 iSectorSize; + int rc; /* Return code */ + unsigned char aMagic[8]; /* A buffer to hold the magic header */ + i64 iHdrOff; /* Offset of journal header being read */ + + assert( isOpen(pPager->jfd) ); /* Journal file must be open. */ - seekJournalHdr(pPager); + /* Advance Pager.journalOff to the start of the next sector. If the + ** journal file is too small for there to be a header stored at this + ** point, return SQLITE_DONE. + */ + pPager->journalOff = journalHdrOffset(pPager); if( pPager->journalOff+JOURNAL_HDR_SZ(pPager) > journalSize ){ return SQLITE_DONE; } - jrnlOff = pPager->journalOff; - - rc = sqlite3OsRead(pPager->jfd, aMagic, sizeof(aMagic), jrnlOff); - if( rc ) return rc; - jrnlOff += sizeof(aMagic); + iHdrOff = pPager->journalOff; + /* Read in the first 8 bytes of the journal header. If they do not match + ** the magic string found at the start of each journal header, return + ** SQLITE_DONE. If an IO error occurs, return an error code. Otherwise, + ** proceed. 
+ */ + rc = sqlite3OsRead(pPager->jfd, aMagic, sizeof(aMagic), iHdrOff); + if( rc ){ + return rc; + } if( memcmp(aMagic, aJournalMagic, sizeof(aMagic))!=0 ){ return SQLITE_DONE; } - rc = read32bits(pPager->jfd, jrnlOff, pNRec); - if( rc ) return rc; - - rc = read32bits(pPager->jfd, jrnlOff+4, &pPager->cksumInit); - if( rc ) return rc; - - rc = read32bits(pPager->jfd, jrnlOff+8, pDbSize); - if( rc ) return rc; + /* Read the first three 32-bit fields of the journal header: The nRec + ** field, the checksum-initializer and the database size at the start + ** of the transaction. Return an error code if anything goes wrong. + */ + if( SQLITE_OK!=(rc = read32bits(pPager->jfd, iHdrOff+8, pNRec)) + || SQLITE_OK!=(rc = read32bits(pPager->jfd, iHdrOff+12, &pPager->cksumInit)) + || SQLITE_OK!=(rc = read32bits(pPager->jfd, iHdrOff+16, pDbSize)) + ){ + return rc; + } if( pPager->journalOff==0 ){ - rc = read32bits(pPager->jfd, jrnlOff+16, &iPageSize); - if( rc ) return rc; + u32 iPageSize; /* Page-size field of journal header */ + u32 iSectorSize; /* Sector-size field of journal header */ + u16 iPageSize16; /* Copy of iPageSize in 16-bit variable */ - if( iPageSize<512 - || iPageSize>SQLITE_MAX_PAGE_SIZE - || ((iPageSize-1)&iPageSize)!=0 + /* Read the page-size and sector-size journal header fields. */ + if( SQLITE_OK!=(rc = read32bits(pPager->jfd, iHdrOff+20, &iSectorSize)) + || SQLITE_OK!=(rc = read32bits(pPager->jfd, iHdrOff+24, &iPageSize)) ){ - /* If the page-size in the journal-header is invalid, then the process - ** that wrote the journal-header must have crashed before the header - ** was synced. In this case stop reading the journal file here. + return rc; + } + + /* Check that the values read from the page-size and sector-size fields + ** are within range. To be 'in range', both values need to be a power + ** of two greater than or equal to 512, and not greater than their + ** respective compile time maximum limits. 
+ */ + if( iPageSize<512 || iSectorSize<512 + || iPageSize>SQLITE_MAX_PAGE_SIZE || iSectorSize>MAX_SECTOR_SIZE + || ((iPageSize-1)&iPageSize)!=0 || ((iSectorSize-1)&iSectorSize)!=0 + ){ + /* If the either the page-size or sector-size in the journal-header is + ** invalid, then the process that wrote the journal-header must have + ** crashed before the header was synced. In this case stop reading + ** the journal file here. */ - rc = SQLITE_DONE; - }else{ - u16 pagesize = (u16)iPageSize; - rc = sqlite3PagerSetPagesize(pPager, &pagesize); - assert( rc!=SQLITE_OK || pagesize==(u16)iPageSize ); + return SQLITE_DONE; } - if( rc ) return rc; - + + /* Update the page-size to match the value read from the journal. + ** Use a testcase() macro to make sure that malloc failure within + ** PagerSetPagesize() is tested. + */ + iPageSize16 = (u16)iPageSize; + rc = sqlite3PagerSetPagesize(pPager, &iPageSize16); + testcase( rc!=SQLITE_OK ); + assert( rc!=SQLITE_OK || iPageSize16==(u16)iPageSize ); + /* Update the assumed sector-size to match the value used by ** the process that created this journal. If this journal was ** created by a process other than this one, then this routine ** is being called from within pager_playback(). The local value ** of Pager.sectorSize is restored at the end of that routine. */ - rc = read32bits(pPager->jfd, jrnlOff+12, &iSectorSize); - if( rc ) return rc; - if( (iSectorSize&(iSectorSize-1)) - || iSectorSize<512 - || iSectorSize>MAX_SECTOR_SIZE - ){ - return SQLITE_DONE; - } pPager->sectorSize = iSectorSize; } pPager->journalOff += JOURNAL_HDR_SZ(pPager); - return SQLITE_OK; + return rc; } @@ -842,34 +947,37 @@ static int readJournalHdr( ** journal file descriptor is advanced to the next sector boundary before ** anything is written. The format is: ** -** + 4 bytes: PAGER_MJ_PGNO. -** + N bytes: length of master journal name. -** + 4 bytes: N -** + 4 bytes: Master journal name checksum. -** + 8 bytes: aJournalMagic[]. +** + 4 bytes: PAGER_MJ_PGNO. 
+** + N bytes: Master journal filename in utf-8. +** + 4 bytes: N (length of master journal name in bytes, no nul-terminator). +** + 4 bytes: Master journal name checksum. +** + 8 bytes: aJournalMagic[]. ** ** The master journal page checksum is the sum of the bytes in the master -** journal name. +** journal name, where each byte is interpreted as a signed 8-bit integer. ** ** If zMaster is a NULL pointer (occurs for a single database transaction), ** this call is a no-op. */ static int writeMasterJournal(Pager *pPager, const char *zMaster){ - int rc; - int len; - int i; - i64 jrnlOff; - i64 jrnlSize; - u32 cksum = 0; - char zBuf[sizeof(aJournalMagic)+2*4]; - - if( !zMaster || pPager->setMaster ) return SQLITE_OK; - if( pPager->journalMode==PAGER_JOURNALMODE_MEMORY ) return SQLITE_OK; + int rc; /* Return code */ + int nMaster; /* Length of string zMaster */ + i64 iHdrOff; /* Offset of header in journal file */ + i64 jrnlSize; /* Size of journal file on disk */ + u32 cksum = 0; /* Checksum of string zMaster */ + + if( !zMaster || pPager->setMaster + || pPager->journalMode==PAGER_JOURNALMODE_MEMORY + || pPager->journalMode==PAGER_JOURNALMODE_OFF + ){ + return SQLITE_OK; + } pPager->setMaster = 1; + assert( isOpen(pPager->jfd) ); - len = sqlite3Strlen30(zMaster); - for(i=0; ifullSync ){ - seekJournalHdr(pPager); + pPager->journalOff = journalHdrOffset(pPager); } - jrnlOff = pPager->journalOff; - pPager->journalOff += (len+20); - - rc = write32bits(pPager->jfd, jrnlOff, PAGER_MJ_PGNO(pPager)); - if( rc!=SQLITE_OK ) return rc; - jrnlOff += 4; + iHdrOff = pPager->journalOff; - rc = sqlite3OsWrite(pPager->jfd, zMaster, len, jrnlOff); - if( rc!=SQLITE_OK ) return rc; - jrnlOff += len; - - put32bits(zBuf, len); - put32bits(&zBuf[4], cksum); - memcpy(&zBuf[8], aJournalMagic, sizeof(aJournalMagic)); - rc = sqlite3OsWrite(pPager->jfd, zBuf, 8+sizeof(aJournalMagic), jrnlOff); - jrnlOff += 8+sizeof(aJournalMagic); + /* Write the master journal data to the end of the journal 
file. If + ** an error occurs, return the error code to the caller. + */ + if( (rc = write32bits(pPager->jfd, iHdrOff, PAGER_MJ_PGNO(pPager))) + || (rc = sqlite3OsWrite(pPager->jfd, zMaster, nMaster, iHdrOff+4)) + || (rc = write32bits(pPager->jfd, iHdrOff+4+nMaster, nMaster)) + || (rc = write32bits(pPager->jfd, iHdrOff+4+nMaster+4, cksum)) + || (rc = sqlite3OsWrite(pPager->jfd, aJournalMagic, 8, iHdrOff+4+nMaster+8)) + ){ + return rc; + } + pPager->journalOff += (nMaster+20); pPager->needSync = !pPager->noSync; /* If the pager is in peristent-journal mode, then the physical @@ -907,34 +1013,37 @@ static int writeMasterJournal(Pager *pPager, const char *zMaster){ ** Easiest thing to do in this scenario is to truncate the journal ** file to the required size. */ - if( (rc==SQLITE_OK) - && (rc = sqlite3OsFileSize(pPager->jfd, &jrnlSize))==SQLITE_OK - && jrnlSize>jrnlOff + if( SQLITE_OK==(rc = sqlite3OsFileSize(pPager->jfd, &jrnlSize)) + && jrnlSize>pPager->journalOff ){ - rc = sqlite3OsTruncate(pPager->jfd, jrnlOff); + rc = sqlite3OsTruncate(pPager->jfd, pPager->journalOff); } return rc; } /* -** Find a page in the hash table given its page number. Return -** a pointer to the page or NULL if not found. +** Find a page in the hash table given its page number. Return +** a pointer to the page or NULL if the requested page is not +** already in memory. */ static PgHdr *pager_lookup(Pager *pPager, Pgno pgno){ - PgHdr *p; - sqlite3PcacheFetch(pPager->pPCache, pgno, 0, &p); + PgHdr *p; /* Return value */ + + /* It is not possible for a call to PcacheFetch() with createFlag==0 to + ** fail, since no attempt to allocate dynamic memory will be made. + */ + (void)sqlite3PcacheFetch(pPager->pPCache, pgno, 0, &p); return p; } /* -** Clear the in-memory cache. This routine -** sets the state of the pager back to what it was when it was first -** opened. Any outstanding pages are invalidated and subsequent attempts -** to access those pages will likely result in a coredump. 
+** Unless the pager is in error-state, discard all in-memory pages. If +** the pager is in error-state, then this call is a no-op. */ static void pager_reset(Pager *pPager){ - if( pPager->errCode ) return; - sqlite3PcacheClear(pPager->pPCache); + if( SQLITE_OK==pPager->errCode ){ + sqlite3PcacheClear(pPager->pPCache); + } } /* @@ -942,8 +1051,8 @@ static void pager_reset(Pager *pPager){ ** Pager.aSavepoint and Pager.nSavepoint to zero. Close the sub-journal ** if it is open and the pager is not in exclusive mode. */ -static void releaseAllSavepoint(Pager *pPager){ - int ii; +static void releaseAllSavepoints(Pager *pPager){ + int ii; /* Iterator for looping through Pager.aSavepoint */ for(ii=0; iinSavepoint; ii++){ sqlite3BitvecDestroy(pPager->aSavepoint[ii].pInSavepoint); } @@ -953,12 +1062,13 @@ static void releaseAllSavepoint(Pager *pPager){ sqlite3_free(pPager->aSavepoint); pPager->aSavepoint = 0; pPager->nSavepoint = 0; - pPager->stmtNRec = 0; + pPager->nSubRec = 0; } /* -** Set the bit number pgno in the PagerSavepoint.pInSavepoint bitvecs of -** all open savepoints. +** Set the bit number pgno in the PagerSavepoint.pInSavepoint +** bitvecs of all open savepoints. Return SQLITE_OK if successful +** or SQLITE_NOMEM if a malloc failure occurs. */ static int addToSavepointBitvecs(Pager *pPager, Pgno pgno){ int ii; /* Loop counter */ @@ -968,6 +1078,7 @@ static int addToSavepointBitvecs(Pager *pPager, Pgno pgno){ PagerSavepoint *p = &pPager->aSavepoint[ii]; if( pgno<=p->nOrig ){ rc |= sqlite3BitvecSet(p->pInSavepoint, pgno); + testcase( rc==SQLITE_NOMEM ); assert( rc==SQLITE_OK || rc==SQLITE_NOMEM ); } } @@ -975,7 +1086,8 @@ static int addToSavepointBitvecs(Pager *pPager, Pgno pgno){ } /* -** Unlock the database file. +** Unlock the database file. This function is a no-op if the pager +** is in exclusive mode. ** ** If the pager is currently in error state, discard the contents of ** the cache and reset the Pager structure internal state. 
If there is @@ -985,24 +1097,28 @@ static int addToSavepointBitvecs(Pager *pPager, Pgno pgno){ */ static void pager_unlock(Pager *pPager){ if( !pPager->exclusiveMode ){ - int rc; + int rc; /* Return code */ /* Always close the journal file when dropping the database lock. ** Otherwise, another connection with journal_mode=delete might ** delete the file out from under us. */ - if( pPager->journalOpen ){ - sqlite3OsClose(pPager->jfd); - pPager->journalOpen = 0; - sqlite3BitvecDestroy(pPager->pInJournal); - pPager->pInJournal = 0; - sqlite3BitvecDestroy(pPager->pAlwaysRollback); - pPager->pAlwaysRollback = 0; - } + sqlite3OsClose(pPager->jfd); + sqlite3BitvecDestroy(pPager->pInJournal); + pPager->pInJournal = 0; + releaseAllSavepoints(pPager); - rc = osUnlock(pPager->fd, NO_LOCK); - if( rc ) pPager->errCode = rc; + /* If the file is unlocked, somebody else might change it. The + ** values stored in Pager.dbSize etc. might become invalid if + ** this happens. TODO: Really, this doesn't need to be cleared + ** until the change-counter check fails in pagerSharedLock(). + */ pPager->dbSizeValid = 0; + + rc = osUnlock(pPager->fd, NO_LOCK); + if( rc ){ + pPager->errCode = rc; + } IOTRACE(("UNLOCK %p\n", pPager)) /* If Pager.errCode is set, the contents of the pager cache cannot be @@ -1010,62 +1126,168 @@ static void pager_unlock(Pager *pPager){ ** cache can be discarded and the error code safely cleared. */ if( pPager->errCode ){ - if( rc==SQLITE_OK ) pPager->errCode = SQLITE_OK; + if( rc==SQLITE_OK ){ + pPager->errCode = SQLITE_OK; + } pager_reset(pPager); - releaseAllSavepoint(pPager); - pPager->journalOff = 0; - pPager->journalStarted = 0; - pPager->dbOrigSize = 0; } - pPager->state = PAGER_UNLOCK; pPager->changeCountDone = 0; + pPager->state = PAGER_UNLOCK; } } /* -** Execute a rollback if a transaction is active and unlock the -** database file. If the pager has already entered the error state, -** do not attempt the rollback. 
+** This function should be called when an IOERR, CORRUPT or FULL error +** may have occured. The first argument is a pointer to the pager +** structure, the second the error-code about to be returned by a pager +** API function. The value returned is a copy of the second argument +** to this function. +** +** If the second argument is SQLITE_IOERR, SQLITE_CORRUPT, or SQLITE_FULL +** the error becomes persistent. Until the persisten error is cleared, +** subsequent API calls on this Pager will immediately return the same +** error code. +** +** A persistent error indicates that the contents of the pager-cache +** cannot be trusted. This state can be cleared by completely discarding +** the contents of the pager-cache. If a transaction was active when +** the persistent error occured, then the rollback journal may need +** to be replayed to restore the contents of the database file (as if +** it were a hot-journal). */ -static void pagerUnlockAndRollback(Pager *p){ - if( p->errCode==SQLITE_OK && p->state>=PAGER_RESERVED ){ - sqlite3BeginBenignMalloc(); - sqlite3PagerRollback(p); - sqlite3EndBenignMalloc(); +static int pager_error(Pager *pPager, int rc){ + int rc2 = rc & 0xff; + assert( + pPager->errCode==SQLITE_FULL || + pPager->errCode==SQLITE_OK || + (pPager->errCode & 0xff)==SQLITE_IOERR + ); + if( + rc2==SQLITE_FULL || + rc2==SQLITE_IOERR || + rc2==SQLITE_CORRUPT + ){ + pPager->errCode = rc; + if( pPager->state==PAGER_UNLOCK + && sqlite3PcacheRefCount(pPager->pPCache)==0 + ){ + /* If the pager is already unlocked, call pager_unlock() now to + ** clear the error state and ensure that the pager-cache is + ** completely empty. + */ + pager_unlock(pPager); + } } - pager_unlock(p); + return rc; } /* -** This routine ends a transaction. A transaction is ended by either -** a COMMIT or a ROLLBACK. -** -** When this routine is called, the pager has the journal file open and -** a RESERVED or EXCLUSIVE lock on the database. 
This routine will release -** the database lock and acquires a SHARED lock in its place if that is -** the appropriate thing to do. Release locks usually is appropriate, -** unless we are in exclusive access mode or unless this is a -** COMMIT AND BEGIN or ROLLBACK AND BEGIN operation. +** Execute a rollback if a transaction is active and unlock the +** database file. ** -** The journal file is either deleted or truncated. +** If the pager has already entered the error state, do not attempt +** the rollback at this time. Instead, pager_unlock() is called. The +** call to pager_unlock() will discard all in-memory pages, unlock +** the database file and clear the error state. If this means that +** there is a hot-journal left in the file-system, the next connection +** to obtain a shared lock on the pager (which may be this one) will +** roll it back. ** -** TODO: Consider keeping the journal file open for temporary databases. -** This might give a performance improvement on windows where opening -** a file is an expensive operation. +** If the pager has not already entered the error state, but an IO or +** malloc error occurs during a rollback, then this will itself cause +** the pager to enter the error state. Which will be cleared by the +** call to pager_unlock(), as described above. */ -static int pager_end_transaction(Pager *pPager, int hasMaster){ - int rc = SQLITE_OK; - int rc2 = SQLITE_OK; +static void pagerUnlockAndRollback(Pager *pPager){ + if( pPager->errCode==SQLITE_OK && pPager->state>=PAGER_RESERVED ){ + sqlite3BeginBenignMalloc(); + sqlite3PagerRollback(pPager); + sqlite3EndBenignMalloc(); + } + pager_unlock(pPager); +} + +/* +** This routine ends a transaction. A transaction is usually ended by +** either a COMMIT or a ROLLBACK operation. This routine may be called +** after rollback of a hot-journal, or if an error occurs while opening +** the journal file or writing the very first journal-header of a +** database transaction. 
+** +** If the pager is in PAGER_SHARED or PAGER_UNLOCK state when this +** routine is called, it is a no-op (returns SQLITE_OK). +** +** Otherwise, any active savepoints are released. +** +** If the journal file is open, then it is "finalized". Once a journal +** file has been finalized it is not possible to use it to roll back a +** transaction. Nor will it be considered to be a hot-journal by this +** or any other database connection. Exactly how a journal is finalized +** depends on whether or not the pager is running in exclusive mode and +** the current journal-mode (Pager.journalMode value), as follows: +** +** journalMode==MEMORY +** Journal file descriptor is simply closed. This destroys an +** in-memory journal. +** +** journalMode==TRUNCATE +** Journal file is truncated to zero bytes in size. +** +** journalMode==PERSIST +** The first 28 bytes of the journal file are zeroed. This invalidates +** the first journal header in the file, and hence the entire journal +** file. An invalid journal file cannot be rolled back. +** +** journalMode==DELETE +** The journal file is closed and deleted using sqlite3OsDelete(). +** +** If the pager is running in exclusive mode, this method of finalizing +** the journal file is never used. Instead, if the journalMode is +** DELETE and the pager is in exclusive mode, the method described under +** journalMode==PERSIST is used instead. +** +** After the journal is finalized, if running in non-exclusive mode, the +** pager moves to PAGER_SHARED state (and downgrades the lock on the +** database file accordingly). +** +** If the pager is running in exclusive mode and is in PAGER_SYNCED state, +** it moves to PAGER_EXCLUSIVE. No locks are downgraded when running in +** exclusive mode. +** +** SQLITE_OK is returned if no error occurs. If an error occurs during +** any of the IO operations to finalize the journal file or unlock the +** database then the IO error code is returned to the user. 
If the +** operation to finalize the journal file fails, then the code still +** tries to unlock the database file if not in exclusive mode. If the +** unlock operation fails as well, then the first error code related +** to the first error encountered (the journal finalization one) is +** returned. +*/ +static int pager_end_transaction(Pager *pPager, int hasMaster){ + int rc = SQLITE_OK; /* Error code from journal finalization operation */ + int rc2 = SQLITE_OK; /* Error code from db file unlock operation */ + if( pPager->statejournalOpen ){ + releaseAllSavepoints(pPager); + + assert( isOpen(pPager->jfd) || pPager->pInJournal==0 ); + if( isOpen(pPager->jfd) ){ + + /* TODO: There's a problem here if a journal-file was opened in MEMORY + ** mode and then the journal-mode is changed to TRUNCATE or PERSIST + ** during the transaction. This code should be changed to assume + ** that the journal mode has not changed since the transaction was + ** started. And the sqlite3PagerJournalMode() function should be + ** changed to make sure that this is the case too. + */ + + /* Finalize the journal file. 
*/ if( pPager->journalMode==PAGER_JOURNALMODE_MEMORY ){ int isMemoryJournal = sqlite3IsMemJournal(pPager->jfd); sqlite3OsClose(pPager->jfd); - pPager->journalOpen = 0; if( !isMemoryJournal ){ rc = sqlite3OsDelete(pPager->pVfs, pPager->zJournal, 0); } @@ -1083,23 +1305,19 @@ static int pager_end_transaction(Pager *pPager, int hasMaster){ }else{ assert( pPager->journalMode==PAGER_JOURNALMODE_DELETE || rc ); sqlite3OsClose(pPager->jfd); - pPager->journalOpen = 0; if( rc==SQLITE_OK && !pPager->tempFile ){ rc = sqlite3OsDelete(pPager->pVfs, pPager->zJournal, 0); } } - sqlite3BitvecDestroy(pPager->pInJournal); - pPager->pInJournal = 0; - sqlite3BitvecDestroy(pPager->pAlwaysRollback); - pPager->pAlwaysRollback = 0; + #ifdef SQLITE_CHECK_PAGES sqlite3PcacheIterateDirty(pPager->pPCache, pager_set_pagehash); #endif + sqlite3PcacheCleanAll(pPager->pPCache); - pPager->dirtyCache = 0; + sqlite3BitvecDestroy(pPager->pInJournal); + pPager->pInJournal = 0; pPager->nRec = 0; - }else{ - assert( pPager->pInJournal==0 ); } if( !pPager->exclusiveMode ){ @@ -1109,42 +1327,43 @@ static int pager_end_transaction(Pager *pPager, int hasMaster){ }else if( pPager->state==PAGER_SYNCED ){ pPager->state = PAGER_EXCLUSIVE; } - pPager->dbOrigSize = 0; pPager->setMaster = 0; pPager->needSync = 0; - /* lruListSetFirstSynced(pPager); */ + pPager->dbModified = 0; + + /* TODO: Is this optimal? Why is the db size invalidated here + ** when the database file is not unlocked? */ + pPager->dbOrigSize = 0; sqlite3PcacheTruncate(pPager->pPCache, pPager->dbSize); if( !MEMDB ){ pPager->dbSizeValid = 0; } - pPager->dbModified = 0; return (rc==SQLITE_OK?rc2:rc); } /* -** Compute and return a checksum for the page of data. +** Parameter aData must point to a buffer of pPager->pageSize bytes +** of data. Compute and return a checksum based ont the contents of the +** page of data and the current value of pPager->cksumInit. ** -** This is not a real checksum. 
It is really just the sum of the -** random initial value and the page number. We experimented with -** a checksum of the entire data, but that was found to be too slow. +** This is not a real checksum. It is really just the sum of the +** random initial value (pPager->cksumInit) and every 200th byte +** of the page data, starting with byte offset (pPager->pageSize%200). +** Each byte is interpreted as an 8-bit unsigned integer. ** -** Note that the page number is stored at the beginning of data and -** the checksum is stored at the end. This is important. If journal -** corruption occurs due to a power failure, the most likely scenario -** is that one end or the other of the record will be changed. It is -** much less likely that the two ends of the journal record will be +** Changing the formula used to compute this checksum results in an +** incompatible journal file format. +** +** If journal corruption occurs due to a power failure, the most likely +** scenario is that one end or the other of the record will be changed. +** It is much less likely that the two ends of the journal record will be ** correct and the middle be corrupt. Thus, this "checksum" scheme, ** though fast and simple, catches the mostly likely kind of corruption. -** -** FIX ME: Consider adding every 200th (or so) byte of the data to the -** checksum. That way if a single page spans 3 or more disk sectors and -** only the middle sector is corrupt, we will still have a reasonable -** chance of failing the checksum and thus detecting the problem. 
*/ static u32 pager_cksum(Pager *pPager, const u8 *aData){ - u32 cksum = pPager->cksumInit; - int i = pPager->pageSize-200; + u32 cksum = pPager->cksumInit; /* Checksum value to return */ + int i = pPager->pageSize-200; /* Loop counter */ while( i>0 ){ cksum += aData[i]; i -= 200; @@ -1155,18 +1374,40 @@ static u32 pager_cksum(Pager *pPager, const u8 *aData){ /* ** Read a single page from either the journal file (if isMainJrnl==1) or ** from the sub-journal (if isMainJrnl==0) and playback that page. -** The page begins at offset *pOffset into the file. The *pOffset +** The page begins at offset *pOffset into the file. The *pOffset ** value is increased to the start of the next page in the journal. ** ** The isMainJrnl flag is true if this is the main rollback journal and ** false for the statement journal. The main rollback journal uses ** checksums - the statement journal does not. ** +** If the page number of the page record read from the (sub-)journal file +** is greater than the current value of Pager.dbSize, then playback is +** skipped and SQLITE_OK is returned. +** ** If pDone is not NULL, then it is a record of pages that have already ** been played back. If the page at *pOffset has already been played back ** (if the corresponding pDone bit is set) then skip the playback. ** Make sure the pDone bit corresponding to the *pOffset page is set ** prior to returning. +** +** If the page record is successfully read from the (sub-)journal file +** and played back, then SQLITE_OK is returned. If an IO error occurs +** while reading the record from the (sub-)journal file or while writing +** to the database file, then the IO error code is returned. If data +** is successfully read from the (sub-)journal file but appears to be +** corrupted, SQLITE_DONE is returned. 
Data is considered corrupted in +** two circumstances: +** +** * If the record page-number is illegal (0 or PAGER_MJ_PGNO), or +** * If the record is being rolled back from the main journal file +** and the checksum field does not match the record content. +** +** Neither of these two scenarios are possible during a savepoint rollback. +** +** If this is a savepoint rollback, then memory may have to be dynamically +** allocated by this function. If this is the case and an allocation fails, +** SQLITE_NOMEM is returned. */ static int pager_playback_one_page( Pager *pPager, /* The pager being played back */ @@ -1190,8 +1431,10 @@ static int pager_playback_one_page( aData = (u8*)pPager->pTmpSpace; assert( aData ); /* Temp storage must have already been allocated */ + /* Read the page number and page data from the journal or sub-journal + ** file. Return an error code to the caller if an IO error occurs. + */ jfd = isMainJrnl ? pPager->jfd : pPager->sjfd; - rc = read32bits(jfd, *pOffset, &pgno); if( rc!=SQLITE_OK ) return rc; rc = sqlite3OsRead(jfd, aData, pPager->pageSize, (*pOffset)+4); @@ -1204,6 +1447,7 @@ static int pager_playback_one_page( ** detect this invalid data (with high probability) and ignore it. 
*/ if( pgno==0 || pgno==PAGER_MJ_PGNO(pPager) ){ + assert( !isSavepnt ); return SQLITE_DONE; } if( pgno>(Pgno)pPager->dbSize || sqlite3BitvecTest(pDone, pgno) ){ @@ -1216,6 +1460,7 @@ static int pager_playback_one_page( return SQLITE_DONE; } } + if( pDone && (rc = sqlite3BitvecSet(pDone, pgno)) ){ return rc; } @@ -1261,7 +1506,7 @@ static int pager_playback_one_page( )); if( (pPager->state>=PAGER_EXCLUSIVE) && (pPg==0 || 0==(pPg->flags&PGHDR_NEED_SYNC)) - && (pPager->fd->pMethods) + && isOpen(pPager->fd) ){ i64 ofst = (pgno-1)*(i64)pPager->pageSize; rc = sqlite3OsWrite(pPager->fd, aData, pPager->pageSize, ofst); @@ -1305,7 +1550,7 @@ static int pager_playback_one_page( if( pPager->xReiniter ){ pPager->xReiniter(pPg); } - if( isMainJrnl && (!isSavepnt || pPager->journalOff<=pPager->journalHdr) ){ + if( isMainJrnl && (!isSavepnt || *pOffset<=pPager->journalHdr) ){ /* If the contents of this page were just restored from the main ** journal file, then its content must be as they were when the ** transaction was first opened. In this case we can mark the page @@ -1391,34 +1636,60 @@ static int pagerNextJournalPageIsValid(Pager *pPager){ ** Argument zMaster may point to Pager.pTmpSpace. So that buffer is not ** available for use within this function. ** +** When a master journal file is created, it is populated with the names +** of all of its child journals, one after another, formatted as utf-8 +** encoded text. The end of each child journal file is marked with a +** nul-terminator byte (0x00). i.e. the entire contents of a master journal +** file for a transaction involving two databases might be: ** -** The master journal file contains the names of all child journals. -** To tell if a master journal can be deleted, check to each of the -** children. If all children are either missing or do not refer to -** a different master journal, then this master journal can be deleted. 
+** "/home/bill/a.db-journal\x00/home/bill/b.db-journal\x00" +** +** A master journal file may only be deleted once all of its child +** journals have been rolled back. +** +** This function reads the contents of the master-journal file into +** memory and loops through each of the child journal names. For +** each child journal, it checks if: +** +** * if the child journal exists, and if so +** * if the child journal contains a reference to master journal +** file zMaster +** +** If a child journal can be found that matches both of the criteria +** above, this function returns without doing anything. Otherwise, if +** no such child journal can be found, file zMaster is deleted from +** the file-system using sqlite3OsDelete(). +** +** If an IO error within this function, an error code is returned. This +** function allocates memory by calling sqlite3Malloc(). If an allocation +** fails, SQLITE_NOMEM is returned. Otherwise, if no IO or malloc errors +** occur, SQLITE_OK is returned. +** +** TODO: This function allocates a single block of memory to load +** the entire contents of the master journal file. This could be +** a couple of kilobytes or so - potentially larger than the page +** size. */ static int pager_delmaster(Pager *pPager, const char *zMaster){ sqlite3_vfs *pVfs = pPager->pVfs; - int rc; - int master_open = 0; - sqlite3_file *pMaster; - sqlite3_file *pJournal; + int rc; /* Return code */ + sqlite3_file *pMaster; /* Malloc'd master-journal file descriptor */ + sqlite3_file *pJournal; /* Malloc'd child-journal file descriptor */ char *zMasterJournal = 0; /* Contents of master journal file */ i64 nMasterJournal; /* Size of master journal file */ - /* Open the master journal file exclusively in case some other process - ** is running this routine also. Not that it makes too much difference. + /* Allocate space for both the pJournal and pMaster file descriptors. + ** If successful, open the master journal file for reading. 
*/ - pMaster = (sqlite3_file *)sqlite3Malloc(pVfs->szOsFile * 2); + pMaster = (sqlite3_file *)sqlite3MallocZero(pVfs->szOsFile * 2); pJournal = (sqlite3_file *)(((u8 *)pMaster) + pVfs->szOsFile); if( !pMaster ){ rc = SQLITE_NOMEM; }else{ - int flags = (SQLITE_OPEN_READONLY|SQLITE_OPEN_MASTER_JOURNAL); + const int flags = (SQLITE_OPEN_READONLY|SQLITE_OPEN_MASTER_JOURNAL); rc = sqlite3OsOpen(pVfs, zMaster, pMaster, flags, 0); } if( rc!=SQLITE_OK ) goto delmaster_out; - master_open = 1; rc = sqlite3OsFileSize(pMaster, &nMasterJournal); if( rc!=SQLITE_OK ) goto delmaster_out; @@ -1426,7 +1697,7 @@ static int pager_delmaster(Pager *pPager, const char *zMaster){ if( nMasterJournal>0 ){ char *zJournal; char *zMasterPtr = 0; - int nMasterPtr = pPager->pVfs->mxPathname+1; + int nMasterPtr = pVfs->mxPathname+1; /* Load the entire master journal file into space obtained from ** sqlite3_malloc() and pointed to by zMasterJournal. @@ -1481,8 +1752,9 @@ delmaster_out: if( zMasterJournal ){ sqlite3_free(zMasterJournal); } - if( master_open ){ + if( pMaster ){ sqlite3OsClose(pMaster); + assert( !isOpen(pJournal) ); } sqlite3_free(pMaster); return rc; @@ -1490,23 +1762,30 @@ delmaster_out: /* -** If the main database file is open and an exclusive lock is held, -** truncate the main file of the given pager to the specified number -** of pages. -** -** It might might be the case that the file on disk is smaller than nPage. -** This can happen, for example, if we are in the middle of a transaction -** which has extended the file size and the new pages are still all held -** in cache, then an INSERT or UPDATE does a statement rollback. Some -** operating system implementations can get confused if you try to -** truncate a file to some size that is larger than it currently is, -** so detect this case and write a single zero byte to the end of the new -** file instead. +** This function is used to change the actual size of the database +** file in the file-system. 
This only happens when committing a transaction, +** or rolling back a transaction (including rolling back a hot-journal). +** +** If the main database file is not open, or an exclusive lock is not +** held, this function is a no-op. Otherwise, the size of the file is +** changed to nPage pages (nPage*pPager->pageSize bytes). If the file +** on disk is currently larger than nPage pages, then use the VFS +** xTruncate() method to truncate it. +** +** Or, it might might be the case that the file on disk is smaller than +** nPage pages. Some operating system implementations can get confused if +** you try to truncate a file to some size that is larger than it +** currently is, so detect this case and write a single zero byte to +** the end of the new file instead. +** +** If successful, return SQLITE_OK. If an IO error occurs while modifying +** the database file, return the error code to the caller. */ static int pager_truncate(Pager *pPager, Pgno nPage){ int rc = SQLITE_OK; - if( pPager->state>=PAGER_EXCLUSIVE && pPager->fd->pMethods ){ + if( pPager->state>=PAGER_EXCLUSIVE && isOpen(pPager->fd) ){ i64 currentSize, newSize; + /* TODO: Is it safe to use Pager.dbFileSize here? */ rc = sqlite3OsFileSize(pPager->fd, &currentSize); newSize = pPager->pageSize*(i64)nPage; if( rc==SQLITE_OK && currentSize!=newSize ){ @@ -1524,16 +1803,25 @@ static int pager_truncate(Pager *pPager, Pgno nPage){ } /* -** Set the sectorSize for the given pager. +** Set the value of the Pager.sectorSize variable for the given +** pager based on the value returned by the xSectorSize method +** of the open database file. The sector size will be used used +** to determine the size and alignment of journal header and +** master journal pointers within created journal files. ** -** The sector size is at least as big as the sector size reported -** by sqlite3OsSectorSize(). The minimum sector size is 512. +** For temporary files the effective sector size is always 512 bytes.
+** +** Otherwise, for non-temporary files, the effective sector size is +** the value returned by the xSectorSize() method rounded up to 512 if +** it is less than 512, or rounded down to MAX_SECTOR_SIZE if it +** is greater than MAX_SECTOR_SIZE. */ static void setSectorSize(Pager *pPager){ - assert(pPager->fd->pMethods||pPager->tempFile); + assert( isOpen(pPager->fd) || pPager->tempFile ); + if( !pPager->tempFile ){ /* Sector size doesn't matter for temporary files. Also, the file - ** may not have been opened yet, in whcih case the OsSectorSize() + ** may not have been opened yet, in which case the OsSectorSize() ** call will segfault. */ pPager->sectorSize = sqlite3OsSectorSize(pPager->fd); @@ -1542,6 +1830,7 @@ static void setSectorSize(Pager *pPager){ pPager->sectorSize = 512; } if( pPager->sectorSize>MAX_SECTOR_SIZE ){ + assert( MAX_SECTOR_SIZE>=512 ); pPager->sectorSize = MAX_SECTOR_SIZE; } } @@ -1615,7 +1904,7 @@ static int pager_playback(Pager *pPager, int isHot){ /* Figure out how many records are in the journal. Abort early if ** the journal is empty. */ - assert( pPager->journalOpen ); + assert( isOpen(pPager->jfd) ); rc = sqlite3OsFileSize(pPager->jfd, &szJ); if( rc!=SQLITE_OK || szJ==0 ){ goto end_playback; @@ -1625,6 +1914,12 @@ static int pager_playback(Pager *pPager, int isHot){ ** If a master journal file name is specified, but the file is not ** present on disk, then the journal is not hot and does not need to be ** played back. + ** + ** TODO: Technically the following is an error because it assumes that + ** buffer Pager.pTmpSpace is (mxPathname+1) bytes or larger. i.e. that + ** (pPager->pageSize >= pPager->pVfs->mxPathname+1). Using os_unix.c, + ** mxPathname is 512, which is the same as the minimum allowable value + ** for pageSize. 
*/ zMaster = pPager->pTmpSpace; rc = readMasterJournal(pPager->jfd, zMaster, pPager->pVfs->mxPathname+1); @@ -1637,8 +1932,10 @@ static int pager_playback(Pager *pPager, int isHot){ } pPager->journalOff = 0; - /* This loop terminates either when the readJournalHdr() call returns - ** SQLITE_DONE or an IO error occurs. */ + /* This loop terminates either when a readJournalHdr() or + ** pager_playback_one_page() call returns SQLITE_DONE or an IO error + ** occurs. + */ while( 1 ){ /* Read the next journal header from the journal file. If there are @@ -1699,7 +1996,8 @@ static int pager_playback(Pager *pPager, int isHot){ pPager->dbSize = mxPg; } - /* Copy original pages out of the journal and back into the database file. + /* Copy original pages out of the journal and back into the + ** database file and/or page cache. */ for(u=0; ujournalOff, 0, 0); @@ -1742,20 +2040,23 @@ end_playback: ** problems for other processes at some point in the future. So, just ** in case this has happened, clear the changeCountDone flag now. */ - pPager->changeCountDone = 0; + pPager->changeCountDone = pPager->tempFile; if( rc==SQLITE_OK ){ zMaster = pPager->pTmpSpace; rc = readMasterJournal(pPager->jfd, zMaster, pPager->pVfs->mxPathname+1); + testcase( rc!=SQLITE_OK ); } if( rc==SQLITE_OK ){ rc = pager_end_transaction(pPager, zMaster[0]!='\0'); + testcase( rc!=SQLITE_OK ); } if( rc==SQLITE_OK && zMaster[0] && res ){ /* If there was a master journal and this routine will return success, ** see if it is possible to delete the master journal. */ rc = pager_delmaster(pPager, zMaster); + testcase( rc!=SQLITE_OK ); } /* The Pager.sectorSize variable may have been updated while rolling @@ -1767,19 +2068,49 @@ end_playback: } /* -** Playback savepoint pSavepoint. Or, if pSavepoint==NULL, then playback -** the entire master journal file. +** Playback savepoint pSavepoint. Or, if pSavepoint==NULL, then playback +** the entire master journal file. 
The case pSavepoint==NULL occurs when +** a ROLLBACK TO command is invoked on a SAVEPOINT that is a transaction +** savepoint. +** +** When pSavepoint is not NULL (meaning a non-transaction savepoint is +** being rolled back), then the rollback consists of up to three stages, +** performed in the order specified: +** +** * Pages are played back from the main journal starting at byte +** offset PagerSavepoint.iOffset and continuing to +** PagerSavepoint.iHdrOffset, or to the end of the main journal +** file if PagerSavepoint.iHdrOffset is zero. ** -** The case pSavepoint==NULL occurs when a ROLLBACK TO command is invoked -** on a SAVEPOINT that is a transaction savepoint. +** * If PagerSavepoint.iHdrOffset is not zero, then pages are played +** back starting from the journal header immediately following +** PagerSavepoint.iHdrOffset to the end of the main journal file. +** +** * Pages are then played back from the sub-journal file, starting +** with the PagerSavepoint.iSubRec and continuing to the end of +** the journal file. +** +** Throughout the rollback process, each time a page is rolled back, the +** corresponding bit is set in a bitvec structure (variable pDone in the +** implementation below). This is used to ensure that a page is only +** rolled back the first time it is encountered in either journal. +** +** If pSavepoint is NULL, then pages are only played back from the main +** journal file. There is no need for a bitvec in this case. +** +** In either case, before playback commences the Pager.dbSize variable +** is reset to the value that it held at the start of the savepoint +** (or transaction). No page with a page-number greater than this value +** is played back. If one is encountered it is simply skipped. 
*/ static int pagerPlaybackSavepoint(Pager *pPager, PagerSavepoint *pSavepoint){ i64 szJ; /* Effective size of the main journal */ i64 iHdrOff; /* End of first segment of main-journal records */ - Pgno ii; /* Loop counter */ int rc = SQLITE_OK; /* Return code */ Bitvec *pDone = 0; /* Bitvec to ensure pages played back only once */ + assert( pPager->state>=PAGER_SHARED ); + /* Allocate a bitvec to use to store the set of pages rolled back */ if( pSavepoint ){ pDone = sqlite3BitvecCreate(pSavepoint->nOrig); @@ -1788,11 +2119,10 @@ static int pagerPlaybackSavepoint(Pager *pPager, PagerSavepoint *pSavepoint){ } } - /* Truncate the database back to the size it was before the - ** savepoint being reverted was opened. + /* Set the database size back to the value it was before the savepoint + ** being reverted was opened. */ pPager->dbSize = pSavepoint ? pSavepoint->nOrig : pPager->dbOrigSize; - assert( pPager->state>=PAGER_SHARED ); /* Use pPager->journalOff as the effective size of the main rollback ** journal. The actual file might be larger than this in @@ -1813,8 +2143,8 @@ static int pagerPlaybackSavepoint(Pager *pPager, PagerSavepoint *pSavepoint){ pPager->journalOff = pSavepoint->iOffset; while( rc==SQLITE_OK && pPager->journalOffjournalOff, 1, pDone); - assert( rc!=SQLITE_DONE ); } + assert( rc!=SQLITE_DONE ); }else{ pPager->journalOff = 0; } @@ -1825,6 +2155,7 @@ static int pagerPlaybackSavepoint(Pager *pPager, PagerSavepoint *pSavepoint){ ** continue adding pages rolled back to pDone. */ while( rc==SQLITE_OK && pPager->journalOffjournalOffjournalOff, 1, pDone); - assert( rc!=SQLITE_DONE ); } + assert( rc!=SQLITE_DONE ); } assert( rc!=SQLITE_OK || pPager->journalOff==szJ ); @@ -1857,12 +2188,13 @@ static int pagerPlaybackSavepoint(Pager *pPager, PagerSavepoint *pSavepoint){ ** will be skipped. Out-of-range pages are also skipped. 
*/ if( pSavepoint ){ + u32 ii; /* Loop counter */ i64 offset = pSavepoint->iSubRec*(4+pPager->pageSize); - for(ii=pSavepoint->iSubRec; rc==SQLITE_OK&&ii<(u32)pPager->stmtNRec; ii++){ - assert( offset == ii*(4+pPager->pageSize) ); + for(ii=pSavepoint->iSubRec; rc==SQLITE_OK && ii<pPager->nSubRec; ii++){ + assert( offset==ii*(4+pPager->pageSize) ); rc = pager_playback_one_page(pPager, 0, &offset, 1, pDone); - assert( rc!=SQLITE_DONE ); } + assert( rc!=SQLITE_DONE ); } sqlite3BitvecDestroy(pDone); @@ -1924,18 +2256,26 @@ int sqlite3_opentemp_count = 0; #endif /* -** Open a temporary file. +** Open a temporary file. +** +** Write the file descriptor into *pFile. Return SQLITE_OK on success +** or some other error code if we fail. The OS will automatically +** delete the temporary file when it is closed. +** +** The flags passed to the VFS layer xOpen() call are those specified +** by parameter vfsFlags ORed with the following: ** -** Write the file descriptor into *fd. Return SQLITE_OK on success or some -** other error code if we fail. The OS will automatically delete the temporary -** file when it is closed.
+** SQLITE_OPEN_READWRITE +** SQLITE_OPEN_CREATE +** SQLITE_OPEN_EXCLUSIVE +** SQLITE_OPEN_DELETEONCLOSE */ -static int sqlite3PagerOpentemp( +static int pagerOpentemp( Pager *pPager, /* The pager object */ sqlite3_file *pFile, /* Write the file descriptor here */ int vfsFlags /* Flags passed through to the VFS */ ){ - int rc; + int rc; /* Return code */ #ifdef SQLITE_TEST sqlite3_opentemp_count++; /* Used for testing and analysis only */ @@ -1944,272 +2284,80 @@ static int sqlite3PagerOpentemp( vfsFlags |= SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE | SQLITE_OPEN_EXCLUSIVE | SQLITE_OPEN_DELETEONCLOSE; rc = sqlite3OsOpen(pPager->pVfs, 0, pFile, vfsFlags, 0); - assert( rc!=SQLITE_OK || pFile->pMethods ); + assert( rc!=SQLITE_OK || isOpen(pFile) ); return rc; } -static int pagerStress(void *,PgHdr *); - /* -** Create a new page cache and put a pointer to the page cache in *ppPager. -** The file to be cached need not exist. The file is not locked until -** the first call to sqlite3PagerGet() and is only held open until the -** last page is released using sqlite3PagerUnref(). +** Set the busy handler function. ** -** If zFilename is NULL then a randomly-named temporary file is created -** and used as the file to be cached. The file will be deleted -** automatically when it is closed. +** The pager invokes the busy-handler if sqlite3OsLock() returns +** SQLITE_BUSY when trying to upgrade from no-lock to a SHARED lock, +** or when trying to upgrade from a RESERVED lock to an EXCLUSIVE +** lock. It does *not* invoke the busy handler when upgrading from +** SHARED to RESERVED, or when upgrading from SHARED to EXCLUSIVE +** (which occurs during hot-journal rollback). Summary: ** -** If zFilename is ":memory:" then all information is held in cache. -** It is never written to disk. This can be used to implement an -** in-memory database. 
-*/ -int sqlite3PagerOpen( - sqlite3_vfs *pVfs, /* The virtual file system to use */ - Pager **ppPager, /* Return the Pager structure here */ - const char *zFilename, /* Name of the database file to open */ - int nExtra, /* Extra bytes append to each in-memory page */ - int flags, /* flags controlling this file */ - int vfsFlags /* flags passed through to sqlite3_vfs.xOpen() */ -){ - u8 *pPtr; - Pager *pPager = 0; - int rc = SQLITE_OK; - int i; - int tempFile = 0; - int memDb = 0; - int readOnly = 0; - int useJournal = (flags & PAGER_OMIT_JOURNAL)==0; - int noReadlock = (flags & PAGER_NO_READLOCK)!=0; - int journalFileSize; - int pcacheSize = sqlite3PcacheSize(); - int szPageDflt = SQLITE_DEFAULT_PAGE_SIZE; - char *zPathname = 0; - int nPathname = 0; - - if( sqlite3JournalSize(pVfs)>sqlite3MemJournalSize() ){ - journalFileSize = sqlite3JournalSize(pVfs); - }else{ - journalFileSize = sqlite3MemJournalSize(); - } - - /* The default return is a NULL pointer */ - *ppPager = 0; - - /* Compute and store the full pathname in an allocated buffer pointed - ** to by zPathname, length nPathname. Or, if this is a temporary file, - ** leave both nPathname and zPathname set to 0. 
- */ - if( zFilename && zFilename[0] ){ - nPathname = pVfs->mxPathname+1; - zPathname = sqlite3Malloc(nPathname*2); - if( zPathname==0 ){ - return SQLITE_NOMEM; - } -#ifndef SQLITE_OMIT_MEMORYDB - if( strcmp(zFilename,":memory:")==0 ){ - memDb = 1; - zPathname[0] = 0; - }else -#endif - { - rc = sqlite3OsFullPathname(pVfs, zFilename, nPathname, zPathname); - } - if( rc!=SQLITE_OK ){ - sqlite3_free(zPathname); - return rc; - } - nPathname = sqlite3Strlen30(zPathname); - } - - /* Allocate memory for the pager structure */ - pPager = sqlite3MallocZero( - sizeof(*pPager) + /* Pager structure */ - pcacheSize + /* PCache object */ - journalFileSize + /* The journal file structure */ - pVfs->szOsFile + /* The main db file */ - journalFileSize * 2 + /* The two journal files */ - 3*nPathname + 40 /* zFilename, zDirectory, zJournal */ - ); - if( !pPager ){ - sqlite3_free(zPathname); - return SQLITE_NOMEM; - } - pPager->pPCache = (PCache *)&pPager[1]; - pPtr = ((u8 *)&pPager[1]) + pcacheSize; - pPager->vfsFlags = vfsFlags; - pPager->fd = (sqlite3_file*)&pPtr[pVfs->szOsFile*0]; - pPager->sjfd = (sqlite3_file*)&pPtr[pVfs->szOsFile]; - pPager->jfd = (sqlite3_file*)&pPtr[pVfs->szOsFile+journalFileSize]; - pPager->zFilename = (char*)&pPtr[pVfs->szOsFile+2*journalFileSize]; - pPager->zDirectory = &pPager->zFilename[nPathname+1]; - pPager->zJournal = &pPager->zDirectory[nPathname+1]; - pPager->pVfs = pVfs; - if( zPathname ){ - memcpy(pPager->zFilename, zPathname, nPathname+1); - sqlite3_free(zPathname); - } - - /* Open the pager file. - */ - if( zFilename && zFilename[0] && !memDb ){ - if( nPathname>(pVfs->mxPathname - (int)sizeof("-journal")) ){ - rc = SQLITE_CANTOPEN; - }else{ - int fout = 0; - rc = sqlite3OsOpen(pVfs, pPager->zFilename, pPager->fd, - pPager->vfsFlags, &fout); - readOnly = (fout&SQLITE_OPEN_READONLY); - - /* If the file was successfully opened for read/write access, - ** choose a default page size in case we have to create the - ** database file. 
The default page size is the maximum of: - ** - ** + SQLITE_DEFAULT_PAGE_SIZE, - ** + The value returned by sqlite3OsSectorSize() - ** + The largest page size that can be written atomically. - */ - if( rc==SQLITE_OK && !readOnly ){ - setSectorSize(pPager); - if( szPageDflt<pPager->sectorSize ){ - szPageDflt = pPager->sectorSize; - } -#ifdef SQLITE_ENABLE_ATOMIC_WRITE - { - int iDc = sqlite3OsDeviceCharacteristics(pPager->fd); - int ii; - assert(SQLITE_IOCAP_ATOMIC512==(512>>8)); - assert(SQLITE_IOCAP_ATOMIC64K==(65536>>8)); - assert(SQLITE_MAX_DEFAULT_PAGE_SIZE<=65536); - for(ii=szPageDflt; ii<=SQLITE_MAX_DEFAULT_PAGE_SIZE; ii=ii*2){ - if( iDc&(SQLITE_IOCAP_ATOMIC|(ii>>8)) ) szPageDflt = ii; - } - } -#endif - if( szPageDflt>SQLITE_MAX_DEFAULT_PAGE_SIZE ){ - szPageDflt = SQLITE_MAX_DEFAULT_PAGE_SIZE; - } - } - } - }else{ - /* If a temporary file is requested, it is not opened immediately. - ** In this case we accept the default page size and delay actually - ** opening the file until the first call to OsWrite(). - ** - ** This branch is also run for an in-memory database. An in-memory - ** database is the same as a temp-file that is never written out to - ** disk and uses an in-memory rollback journal. - */ - tempFile = 1; - pPager->state = PAGER_EXCLUSIVE; - } - - if( pPager && rc==SQLITE_OK ){ - pPager->pTmpSpace = sqlite3PageMalloc(szPageDflt); - } - - /* If an error occured in either of the blocks above. - ** Free the Pager structure and close the file. - ** Since the pager is not allocated there is no need to set - ** any Pager.errMask variables.
- */ - if( !pPager || !pPager->pTmpSpace ){ - sqlite3OsClose(pPager->fd); - sqlite3_free(pPager); - return ((rc==SQLITE_OK)?SQLITE_NOMEM:rc); - } - nExtra = FORCE_ALIGNMENT(nExtra); - sqlite3PcacheOpen(szPageDflt, nExtra, !memDb, - !memDb?pagerStress:0, (void *)pPager, pPager->pPCache); - - PAGERTRACE(("OPEN %d %s\n", FILEHANDLEID(pPager->fd), pPager->zFilename)); - IOTRACE(("OPEN %p %s\n", pPager, pPager->zFilename)) - - /* Fill in Pager.zDirectory[] */ - memcpy(pPager->zDirectory, pPager->zFilename, nPathname+1); - for(i=sqlite3Strlen30(pPager->zDirectory); - i>0 && pPager->zDirectory[i-1]!='/'; i--){} - if( i>0 ) pPager->zDirectory[i-1] = 0; - - /* Fill in Pager.zJournal[] */ - if( zPathname ){ - memcpy(pPager->zJournal, pPager->zFilename, nPathname); - memcpy(&pPager->zJournal[nPathname], "-journal", 9); - }else{ - pPager->zJournal = 0; - } - - /* pPager->journalOpen = 0; */ - pPager->useJournal = (u8)useJournal; - pPager->noReadlock = (noReadlock && readOnly) ?1:0; - /* pPager->stmtOpen = 0; */ - /* pPager->stmtInUse = 0; */ - /* pPager->nRef = 0; */ - pPager->dbSizeValid = (u8)memDb; - pPager->pageSize = szPageDflt; - /* pPager->stmtSize = 0; */ - /* pPager->stmtJSize = 0; */ - /* pPager->nPage = 0; */ - pPager->mxPage = 100; - pPager->mxPgno = SQLITE_MAX_PAGE_COUNT; - /* pPager->state = PAGER_UNLOCK; */ - assert( pPager->state == (tempFile ? 
PAGER_EXCLUSIVE : PAGER_UNLOCK) ); - /* pPager->errMask = 0; */ - pPager->tempFile = (u8)tempFile; - assert( tempFile==PAGER_LOCKINGMODE_NORMAL - || tempFile==PAGER_LOCKINGMODE_EXCLUSIVE ); - assert( PAGER_LOCKINGMODE_EXCLUSIVE==1 ); - pPager->exclusiveMode = (u8)tempFile; - pPager->memDb = (u8)memDb; - pPager->readOnly = (u8)readOnly; - /* pPager->needSync = 0; */ - pPager->noSync = (pPager->tempFile || !useJournal) ?1:0; - pPager->fullSync = pPager->noSync ?0:1; - pPager->sync_flags = SQLITE_SYNC_NORMAL; - /* pPager->pFirst = 0; */ - /* pPager->pFirstSynced = 0; */ - /* pPager->pLast = 0; */ - pPager->nExtra = nExtra; - pPager->journalSizeLimit = SQLITE_DEFAULT_JOURNAL_SIZE_LIMIT; - assert(pPager->fd->pMethods||tempFile); - setSectorSize(pPager); - if( memDb ){ - pPager->journalMode = PAGER_JOURNALMODE_MEMORY; - } - /* pPager->xBusyHandler = 0; */ - /* pPager->pBusyHandlerArg = 0; */ - /* memset(pPager->aHash, 0, sizeof(pPager->aHash)); */ - *ppPager = pPager; - return SQLITE_OK; -} - -/* -** Set the busy handler function. +** Transition | Invokes xBusyHandler +** -------------------------------------------------------- +** NO_LOCK -> SHARED_LOCK | Yes +** SHARED_LOCK -> RESERVED_LOCK | No +** SHARED_LOCK -> EXCLUSIVE_LOCK | No +** RESERVED_LOCK -> EXCLUSIVE_LOCK | Yes +** +** If the busy-handler callback returns non-zero, the lock is +** retried. If it returns zero, then the SQLITE_BUSY error is +** returned to the caller of the pager API function. */ void sqlite3PagerSetBusyhandler( - Pager *pPager, - int (*xBusyHandler)(void *), - void *pBusyHandlerArg + Pager *pPager, /* Pager object */ + int (*xBusyHandler)(void *), /* Pointer to busy-handler function */ + void *pBusyHandlerArg /* Argument to pass to xBusyHandler */ ){ pPager->xBusyHandler = xBusyHandler; pPager->pBusyHandlerArg = pBusyHandlerArg; } /* -** Set the reinitializer for this pager. 
If not NULL, the reinitializer -** is called when the content of a page in cache is restored to its original -** value as a result of a rollback. The callback gives higher-level code -** an opportunity to restore the EXTRA section to agree with the restored -** page data. +** Set the reinitializer for this pager. If not NULL, the reinitializer +** is called when the content of a page in cache is modified (restored) +** as part of a transaction or savepoint rollback. The callback gives +** higher-level code an opportunity to restore the EXTRA section to +** agree with the restored page data. */ void sqlite3PagerSetReiniter(Pager *pPager, void (*xReinit)(DbPage*)){ pPager->xReiniter = xReinit; } /* -** Set the page size to *pPageSize. If the suggest new page size is -** inappropriate, then an alternative page size is set to that -** value before returning. +** Change the page size used by the Pager object. The new page size +** is passed in *pPageSize. +** +** If the pager is in the error state when this function is called, it +** is a no-op. The value returned is the error state error code (i.e. +** one of SQLITE_IOERR, SQLITE_CORRUPT or SQLITE_FULL). +** +** Otherwise, if all of the following are true: +** +** * the new page size (value of *pPageSize) is valid (a power +** of two between 512 and SQLITE_MAX_PAGE_SIZE, inclusive), and +** +** * there are no outstanding page references, and +** +** * the database is either not an in-memory database or it is +** an in-memory database that currently consists of zero pages. +** +** then the pager object page size is set to *pPageSize. +** +** If the page size is changed, then this function uses sqlite3PagerMalloc() +** to obtain a new Pager.pTmpSpace buffer. If this allocation attempt +** fails, SQLITE_NOMEM is returned and the page size remains unchanged. +** In all other cases, SQLITE_OK is returned. 
+** +** If the page size is not changed, either because one of the enumerated +** conditions above is not true, the pager was in error state when this +** function was called, or because the memory allocation attempt failed, +** then *pPageSize is set to the old, retained page size before returning. */ int sqlite3PagerSetPagesize(Pager *pPager, u16 *pPageSize){ int rc = pPager->errCode; @@ -2226,7 +2374,6 @@ int sqlite3PagerSetPagesize(Pager *pPager, u16 *pPageSize){ }else{ pager_reset(pPager); pPager->pageSize = pageSize; - if( !pPager->memDb ) setSectorSize(pPager); sqlite3PageFree(pPager->pTmpSpace); pPager->pTmpSpace = pNew; sqlite3PcacheSetPageSize(pPager->pPCache, pageSize); @@ -2292,17 +2439,21 @@ void enable_simulated_io_errors(void){ ** Read the first N bytes from the beginning of the file into memory ** that pDest points to. ** -** No error checking is done. The rational for this is that this function -** may be called even if the file does not exist or contain a header. In -** these cases sqlite3OsRead() will return an error, to which the correct -** response is to zero the memory at pDest and continue. A real IO error -** will presumably recur and be picked up later (Todo: Think about this). +** If the pager was opened on a transient file (zFilename==""), or +** opened on a file less than N bytes in size, the output buffer is +** zeroed and SQLITE_OK returned. The rationale for this is that this +** function is used to read database headers, and a new transient or +** zero sized database has a header than consists entirely of zeroes. +** +** If any IO error apart from SQLITE_IOERR_SHORT_READ is encountered, +** the error code is returned to the caller and the contents of the +** output buffer undefined. 
*/ int sqlite3PagerReadFileheader(Pager *pPager, int N, unsigned char *pDest){ int rc = SQLITE_OK; memset(pDest, 0, N); - assert(pPager->fd->pMethods||pPager->tempFile); - if( pPager->fd->pMethods ){ + assert( isOpen(pPager->fd) || pPager->tempFile ); + if( isOpen(pPager->fd) ){ IOTRACE(("DBHDR %p 0 %d\n", pPager, N)) rc = sqlite3OsRead(pPager->fd, pDest, N, 0); if( rc==SQLITE_IOERR_SHORT_READ ){ @@ -2313,69 +2464,84 @@ int sqlite3PagerReadFileheader(Pager *pPager, int N, unsigned char *pDest){ } /* -** Return the total number of pages in the disk file associated with -** pPager. +** Return the total number of pages in the database file associated +** with pPager. Normally, this is calculated as (<db file size>/<page-size>). +** However, if the file is between 1 and <page-size> bytes in size, then +** this is considered a 1 page file. ** -** If the PENDING_BYTE lies on the page directly after the end of the -** file, then consider this page part of the file too. For example, if -** PENDING_BYTE is byte 4096 (the first byte of page 5) and the size of the -** file is 4096 bytes, 5 is returned instead of 4. +** If the pager is in error state when this function is called, then the +** error state error code is returned and *pnPage left unchanged. Or, +** if the file system has to be queried for the size of the file and +** the query attempt returns an IO error, the IO error code is returned +** and *pnPage is left unchanged. +** +** Otherwise, if everything is successful, then SQLITE_OK is returned +** and *pnPage is set to the number of pages in the database. */ int sqlite3PagerPagecount(Pager *pPager, int *pnPage){ - i64 n = 0; - int rc; - assert( pPager!=0 ); + Pgno nPage; /* Value to return via *pnPage */ + + /* If the pager is already in the error state, return the error code. */ if( pPager->errCode ){ - rc = pPager->errCode; - return rc; + return pPager->errCode; } + + /* Determine the number of pages in the file. Store this in nPage.
*/ if( pPager->dbSizeValid ){ - n = pPager->dbSize; - } else { - assert(pPager->fd->pMethods||pPager->tempFile); - if( (pPager->fd->pMethods) - && (rc = sqlite3OsFileSize(pPager->fd, &n))!=SQLITE_OK ){ + nPage = pPager->dbSize; + }else{ + int rc; /* Error returned by OsFileSize() */ + i64 n = 0; /* File size in bytes returned by OsFileSize() */ + + assert( isOpen(pPager->fd) || pPager->tempFile ); + if( isOpen(pPager->fd) && (rc = sqlite3OsFileSize(pPager->fd, &n)) ){ pager_error(pPager, rc); return rc; } if( n>0 && n<pPager->pageSize ){ - n = 1; + nPage = 1; }else{ - n /= pPager->pageSize; + nPage = n / pPager->pageSize; } if( pPager->state!=PAGER_UNLOCK ){ - pPager->dbSize = (Pgno)n; - pPager->dbFileSize = (Pgno)n; + pPager->dbSize = (Pgno)nPage; + pPager->dbFileSize = (Pgno)nPage; pPager->dbSizeValid = 1; } } - if( n==(PENDING_BYTE/pPager->pageSize) ){ - n++; - } - if( n>pPager->mxPgno ){ - pPager->mxPgno = (Pgno)n; + + /* If the current number of pages in the file is greater than the + ** configured maximum pager number, increase the allowed limit so + ** that the file can be read. + */ + if( nPage>pPager->mxPgno ){ + pPager->mxPgno = (Pgno)nPage; } + + /* Set the output variable and return SQLITE_OK */ if( pnPage ){ - *pnPage = (int)n; + *pnPage = nPage; } return SQLITE_OK; } -/* -** Forward declaration -*/ -static int syncJournal(Pager*); /* -** Try to obtain a lock on a file. Invoke the busy callback if the lock -** is currently not available. Repeat until the busy callback returns -** false or until the lock succeeds. +** Try to obtain a lock of type locktype on the database file. If +** a similar or greater lock is already held, this function is a no-op +** (returning SQLITE_OK immediately). +** +** Otherwise, attempt to obtain the lock using sqlite3OsLock(). Invoke +** the busy callback if the lock is currently not available. Repeat +** until the busy callback returns false or until the attempt to +** obtain the lock succeeds.
** ** Return SQLITE_OK on success and an error code if we cannot obtain -** the lock. +** the lock. If the lock is obtained successfully, set the Pager.state +** variable to locktype before returning. */ static int pager_wait_on_lock(Pager *pPager, int locktype){ - int rc; + int rc; /* Return code */ /* The OS lock values must be the same as the Pager lock values */ assert( PAGER_SHARED==SHARED_LOCK ); @@ -2385,6 +2551,16 @@ static int pager_wait_on_lock(Pager *pPager, int locktype){ /* If the file is currently unlocked then the size must be unknown */ assert( pPager->state>=PAGER_SHARED || pPager->dbSizeValid==0 ); + /* Check that this is either a no-op (because the requested lock is + ** already held, or one of the transistions that the busy-handler + ** may be invoked during, according to the comment above + ** sqlite3PagerSetBusyhandler(). + */ + assert( (pPager->state>=locktype) + || (pPager->state==PAGER_UNLOCK && locktype==PAGER_SHARED) + || (pPager->state==PAGER_RESERVED && locktype==PAGER_EXCLUSIVE) + ); + if( pPager->state>=locktype ){ rc = SQLITE_OK; }else{ @@ -2409,23 +2585,9 @@ static int pager_wait_on_lock(Pager *pPager, int locktype){ void sqlite3PagerTruncateImage(Pager *pPager, Pgno nPage){ assert( pPager->dbSizeValid ); assert( pPager->dbSize>=nPage ); + assert( pPager->state>=PAGER_RESERVED ); pPager->dbSize = nPage; } - -/* -** Return the current size of the database file image in pages. This -** function differs from sqlite3PagerPagecount() in two ways: -** -** a) It may only be called when at least one reference to a database -** page is held. This guarantees that the database size is already -** known and a call to sqlite3OsFileSize() is not required. -** -** b) The return value is not adjusted for the locking page. 
-*/ -Pgno sqlite3PagerImageSize(Pager *pPager){ - assert( pPager->dbSizeValid ); - return pPager->dbSize; -} #endif /* ifndef SQLITE_OMIT_AUTOVACUUM */ /* @@ -2443,13 +2605,14 @@ Pgno sqlite3PagerImageSize(Pager *pPager){ ** to the caller. */ int sqlite3PagerClose(Pager *pPager){ - disable_simulated_io_errors(); sqlite3BeginBenignMalloc(); pPager->errCode = 0; pPager->exclusiveMode = 0; pager_reset(pPager); - if( !MEMDB ){ + if( MEMDB ){ + pager_unlock(pPager); + }else{ /* Set Pager.journalHdr to -1 for the benefit of the pager_playback() ** call which may be made from within pagerUnlockAndRollback(). If it ** is not -1, then the unsynced portion of an open journal file may @@ -2459,87 +2622,89 @@ int sqlite3PagerClose(Pager *pPager){ pPager->journalHdr = -1; pagerUnlockAndRollback(pPager); } - enable_simulated_io_errors(); sqlite3EndBenignMalloc(); + enable_simulated_io_errors(); PAGERTRACE(("CLOSE %d\n", PAGERID(pPager))); IOTRACE(("CLOSE %p\n", pPager)) - if( pPager->journalOpen ){ - sqlite3OsClose(pPager->jfd); - } - sqlite3BitvecDestroy(pPager->pInJournal); - sqlite3BitvecDestroy(pPager->pAlwaysRollback); - releaseAllSavepoint(pPager); sqlite3OsClose(pPager->fd); - /* Temp files are automatically deleted by the OS - ** if( pPager->tempFile ){ - ** sqlite3OsDelete(pPager->zFilename); - ** } - */ - sqlite3PageFree(pPager->pTmpSpace); sqlite3PcacheClose(pPager->pPCache); + + assert( !pPager->aSavepoint && !pPager->pInJournal ); + assert( !isOpen(pPager->jfd) && !isOpen(pPager->sjfd) ); + sqlite3_free(pPager); return SQLITE_OK; } #if !defined(NDEBUG) || defined(SQLITE_TEST) /* -** Return the page number for the given page data. +** Return the page number for page pPg. */ -Pgno sqlite3PagerPagenumber(DbPage *p){ - return p->pgno; +Pgno sqlite3PagerPagenumber(DbPage *pPg){ + return pPg->pgno; } #endif /* -** Increment the reference count for a page. The input pointer is -** a reference to the page data. +** Increment the reference count for page pPg. 
*/ -int sqlite3PagerRef(DbPage *pPg){ +void sqlite3PagerRef(DbPage *pPg){ sqlite3PcacheRef(pPg); - return SQLITE_OK; } /* -** Sync the journal. In other words, make sure all the pages that have +** Sync the journal. In other words, make sure all the pages that have ** been written to the journal have actually reached the surface of the -** disk. It is not safe to modify the original database file until after -** the journal has been synced. If the original database is modified before -** the journal is synced and a power failure occurs, the unsynced journal -** data would be lost and we would be unable to completely rollback the -** database changes. Database corruption would occur. -** -** This routine also updates the nRec field in the header of the journal. -** (See comments on the pager_playback() routine for additional information.) -** If the sync mode is FULL, two syncs will occur. First the whole journal -** is synced, then the nRec field is updated, then a second sync occurs. +** disk and can be restored in the event of a hot-journal rollback. +** +** If the Pager.needSync flag is not set, then this function is a +** no-op. Otherwise, the actions required depend on the journal-mode +** and the device characteristics of the the file-system, as follows: +** +** * If the journal file is an in-memory journal file, no action need +** be taken. +** +** * Otherwise, if the device does not support the SAFE_APPEND property, +** then the nRec field of the most recently written journal header +** is updated to contain the number of journal records that have +** been written following it. If the pager is operating in full-sync +** mode, then the journal file is synced before this field is updated. ** -** For temporary databases, we do not care if we are able to rollback -** after a power failure, so no sync occurs. +** * If the device does not support the SEQUENTIAL property, then +** journal file is synced. 
** -** If the IOCAP_SEQUENTIAL flag is set for the persistent media on which -** the database is stored, then OsSync() is never called on the journal -** file. In this case all that is required is to update the nRec field in -** the journal header. +** Or, in pseudo-code: ** -** This routine clears the needSync field of every page current held in -** memory. +** if( NOT <in-memory journal> ){ +** if( NOT SAFE_APPEND ){ +** if( <full-sync mode> ) xSync(<journal file>); +** <update nRec field> +** } +** if( NOT SEQUENTIAL ) xSync(<journal file>); +** } +** +** The Pager.needSync flag is never be set for temporary files, or any +** file operating in no-sync mode (Pager.noSync set to non-zero). +** +** If successful, this routine clears the PGHDR_NEED_SYNC flag of every +** page currently held in memory before returning SQLITE_OK. If an IO +** error is encountered, then the IO error code is returned to the caller. */ static int syncJournal(Pager *pPager){ - int rc = SQLITE_OK; - - /* Sync the journal before modifying the main database - ** (assuming there is a journal and it needs to be synced.) - */ if( pPager->needSync ){ assert( !pPager->tempFile ); if( pPager->journalMode!=PAGER_JOURNALMODE_MEMORY ){ - int iDc = sqlite3OsDeviceCharacteristics(pPager->fd); - assert( pPager->journalOpen ); + int rc; /* Return code */ + const int iDc = sqlite3OsDeviceCharacteristics(pPager->fd); + assert( isOpen(pPager->jfd) ); if( 0==(iDc&SQLITE_IOCAP_SAFE_APPEND) ){ - i64 jrnlOff = journalHdrOffset(pPager); - u8 zMagic[8]; + /* Variable iNRecOffset is set to the offset in the journal file + ** of the nRec field of the most recently written journal header. + ** This field will be updated following the xSync() operation + ** on the journal file. */ + i64 iNRecOffset = pPager->journalHdr + sizeof(aJournalMagic); /* This block deals with an obscure problem.
If the last connection ** that wrote to this database was operating in persistent-journal @@ -2557,11 +2722,18 @@ static int syncJournal(Pager *pPager){ ** To work around this, if the journal file does appear to contain ** a valid header following Pager.journalOff, then write a 0x00 ** byte to the start of it to prevent it from being recognized. + ** + ** Variable iNextHdrOffset is set to the offset at which this + ** problematic header will occur, if it exists. aMagic is used + ** as a temporary buffer to inspect the first couple of bytes of + ** the potential journal header. */ - rc = sqlite3OsRead(pPager->jfd, zMagic, 8, jrnlOff); - if( rc==SQLITE_OK && 0==memcmp(zMagic, aJournalMagic, 8) ){ + i64 iNextHdrOffset = journalHdrOffset(pPager); + u8 aMagic[8]; + rc = sqlite3OsRead(pPager->jfd, aMagic, 8, iNextHdrOffset); + if( rc==SQLITE_OK && 0==memcmp(aMagic, aJournalMagic, 8) ){ static const u8 zerobyte = 0; - rc = sqlite3OsWrite(pPager->jfd, &zerobyte, 1, jrnlOff); + rc = sqlite3OsWrite(pPager->jfd, &zerobyte, 1, iNextHdrOffset); } if( rc!=SQLITE_OK && rc!=SQLITE_IOERR_SHORT_READ ){ return rc; @@ -2582,13 +2754,11 @@ static int syncJournal(Pager *pPager){ PAGERTRACE(("SYNC journal of %d\n", PAGERID(pPager))); IOTRACE(("JSYNC %p\n", pPager)) rc = sqlite3OsSync(pPager->jfd, pPager->sync_flags); - if( rc!=0 ) return rc; + if( rc!=SQLITE_OK ) return rc; } - - jrnlOff = pPager->journalHdr + sizeof(aJournalMagic); - IOTRACE(("JHDR %p %lld %d\n", pPager, jrnlOff, 4)); - rc = write32bits(pPager->jfd, jrnlOff, pPager->nRec); - if( rc ) return rc; + IOTRACE(("JHDR %p %lld %d\n", pPager, iNRecOffset, 4)); + rc = write32bits(pPager->jfd, iNRecOffset, pPager->nRec); + if( rc!=SQLITE_OK ) return rc; } if( 0==(iDc&SQLITE_IOCAP_SEQUENTIAL) ){ PAGERTRACE(("SYNC journal of %d\n", PAGERID(pPager))); @@ -2596,37 +2766,63 @@ static int syncJournal(Pager *pPager){ rc = sqlite3OsSync(pPager->jfd, pPager->sync_flags| (pPager->sync_flags==SQLITE_SYNC_FULL?SQLITE_SYNC_DATAONLY:0) ); - if( 
rc!=0 ) return rc; + if( rc!=SQLITE_OK ) return rc; } - pPager->journalStarted = 1; } - pPager->needSync = 0; - /* Erase the needSync flag from every page. + /* The journal file was just successfully synced. Set Pager.needSync + ** to zero and clear the PGHDR_NEED_SYNC flag on all pagess. */ + pPager->needSync = 0; + pPager->journalStarted = 1; sqlite3PcacheClearSyncFlags(pPager->pPCache); } - return rc; + return SQLITE_OK; } /* -** Given a list of pages (connected by the PgHdr.pDirty pointer) write -** every one of those pages out to the database file. No calls are made -** to the page-cache to mark the pages as clean. It is the responsibility -** of the caller to use PcacheCleanAll() or PcacheMakeClean() to mark -** the pages as clean. +** The argument is the first in a linked list of dirty pages connected +** by the PgHdr.pDirty pointer. This function writes each one of the +** in-memory pages in the list to the database file. The argument may +** be NULL, representing an empty list. In this case this function is +** a no-op. +** +** The pager must hold at least a RESERVED lock when this function +** is called. Before writing anything to the database file, this lock +** is upgraded to an EXCLUSIVE lock. If the lock cannot be obtained, +** SQLITE_BUSY is returned and no data is written to the database file. +** +** If the pager is a temp-file pager and the actual file-system file +** is not yet open, it is created and opened before any data is +** written out. +** +** Once the lock has been upgraded and, if necessary, the file opened, +** the pages are written out to the database file in list order. Writing +** a page is skipped if it meets either of the following criteria: +** +** * The page number is greater than Pager.dbSize, or +** * The PGHDR_DONT_WRITE flag is set on the page. +** +** If writing out a page causes the database file to grow, Pager.dbFileSize +** is updated accordingly. 
If page 1 is written out, then the value cached +** in Pager.dbFileVers[] is updated to match the new value stored in +** the database file. +** +** If everything is successful, SQLITE_OK is returned. If an IO error +** occurs, an IO error code is returned. Or, if the EXCLUSIVE lock cannot +** be obtained, SQLITE_BUSY is returned. */ static int pager_write_pagelist(PgHdr *pList){ - Pager *pPager; - int rc; + Pager *pPager; /* Pager object */ + int rc; /* Return code */ if( pList==0 ) return SQLITE_OK; pPager = pList->pPager; /* At this point there may be either a RESERVED or EXCLUSIVE lock on the ** database file. If there is already an EXCLUSIVE lock, the following - ** calls to sqlite3OsLock() are no-ops. + ** call is a no-op. ** ** Moving the lock from RESERVED to EXCLUSIVE actually involves going ** through an intermediate state PENDING. A PENDING lock prevents new @@ -2640,67 +2836,82 @@ static int pager_write_pagelist(PgHdr *pList){ ** EXCLUSIVE, it means the database file has been changed and any rollback ** will require a journal playback. */ + assert( pPager->state>=PAGER_RESERVED ); rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK); - if( rc!=SQLITE_OK ){ - return rc; - } - while( pList ){ + /* If the file is a temp-file has not yet been opened, open it now. It + ** is not possible for rc to be other than SQLITE_OK if this branch + ** is taken, as pager_wait_on_lock() is a no-op for temp-files. + */ + if( !isOpen(pPager->fd) ){ + assert( pPager->tempFile && rc==SQLITE_OK ); + rc = pagerOpentemp(pPager, pPager->fd, pPager->vfsFlags); + } - /* If the file has not yet been opened, open it now. 
*/ - if( !pPager->fd->pMethods ){ - assert(pPager->tempFile); - rc = sqlite3PagerOpentemp(pPager, pPager->fd, pPager->vfsFlags); - if( rc ) return rc; - } + while( rc==SQLITE_OK && pList ){ + Pgno pgno = pList->pgno; /* If there are dirty pages in the page cache with page numbers greater ** than Pager.dbSize, this means sqlite3PagerTruncateImage() was called to ** make the file smaller (presumably by auto-vacuum code). Do not write ** any such pages to the file. + ** + ** Also, do not write out any page that has the PGHDR_DONT_WRITE flag + ** set (set by sqlite3PagerDontWrite()). */ - if( pList->pgno<=pPager->dbSize && 0==(pList->flags&PGHDR_DONT_WRITE) ){ - i64 offset = (pList->pgno-1)*(i64)pPager->pageSize; - char *pData = CODEC2(pPager, pList->pData, pList->pgno, 6); + if( pgno<=pPager->dbSize && 0==(pList->flags&PGHDR_DONT_WRITE) ){ + i64 offset = (pgno-1)*(i64)pPager->pageSize; /* Offset to write */ + char *pData = CODEC2(pPager, pList->pData, pgno, 6); /* Data to write */ - PAGERTRACE(("STORE %d page %d hash(%08x)\n", - PAGERID(pPager), pList->pgno, pager_pagehash(pList))); - IOTRACE(("PGOUT %p %d\n", pPager, pList->pgno)); + /* Write out the page data. */ rc = sqlite3OsWrite(pPager->fd, pData, pPager->pageSize, offset); - PAGER_INCR(sqlite3_pager_writedb_count); - PAGER_INCR(pPager->nWrite); - if( pList->pgno==1 ){ + + /* If page 1 was just written, update Pager.dbFileVers to match + ** the value now stored in the database file. If writing this + ** page caused the database file to grow, update dbFileSize. 
+ */ + if( pgno==1 ){ memcpy(&pPager->dbFileVers, &pData[24], sizeof(pPager->dbFileVers)); } - if( pList->pgno>pPager->dbFileSize ){ - pPager->dbFileSize = pList->pgno; + if( pgno>pPager->dbFileSize ){ + pPager->dbFileSize = pgno; } + + PAGERTRACE(("STORE %d page %d hash(%08x)\n", + PAGERID(pPager), pgno, pager_pagehash(pList))); + IOTRACE(("PGOUT %p %d\n", pPager, pgno)); + PAGER_INCR(sqlite3_pager_writedb_count); + PAGER_INCR(pPager->nWrite); + }else{ + PAGERTRACE(("NOSTORE %d page %d\n", PAGERID(pPager), pgno)); } -#ifndef NDEBUG - else{ - PAGERTRACE(("NOSTORE %d page %d\n", PAGERID(pPager), pList->pgno)); - } -#endif - if( rc ) return rc; #ifdef SQLITE_CHECK_PAGES pList->pageHash = pager_pagehash(pList); #endif pList = pList->pDirty; } - return SQLITE_OK; + return rc; } /* -** Add the page to the sub-journal. It is the callers responsibility to -** use subjRequiresPage() to check that it is really required before -** calling this function. +** Append a record of the current state of page pPg to the sub-journal. +** It is the callers responsibility to use subjRequiresPage() to check +** that it is really required before calling this function. +** +** If successful, set the bit corresponding to pPg->pgno in the bitvecs +** for all open savepoints before returning. +** +** This function returns SQLITE_OK if everything is successful, an IO +** error code if the attempt to write to the sub-journal fails, or +** SQLITE_NOMEM if a malloc fails while setting a bit in a savepoint +** bitvec. 
*/ static int subjournalPage(PgHdr *pPg){ int rc; void *pData = pPg->pData; Pager *pPager = pPg->pPager; - i64 offset = pPager->stmtNRec*(4+pPager->pageSize); + i64 offset = pPager->nSubRec*(4+pPager->pageSize); char *pData2 = CODEC2(pPager, pData, pPg->pgno, 7); PAGERTRACE(("STMT-JOURNAL %d page %d\n", PAGERID(pPager), pPg->pgno)); @@ -2711,9 +2922,10 @@ static int subjournalPage(PgHdr *pPg){ rc = sqlite3OsWrite(pPager->sjfd, pData2, pPager->pageSize, offset+4); } if( rc==SQLITE_OK ){ - pPager->stmtNRec++; + pPager->nSubRec++; assert( pPager->nSavepoint>0 ); rc = addToSavepointBitvecs(pPager, pPg->pgno); + testcase( rc!=SQLITE_OK ); } return rc; } @@ -2721,62 +2933,391 @@ static int subjournalPage(PgHdr *pPg){ /* ** This function is called by the pcache layer when it has reached some -** soft memory limit. The argument is a pointer to a purgeable Pager -** object. This function attempts to make a single dirty page that has no -** outstanding references (if one exists) clean so that it can be recycled -** by the pcache layer. +** soft memory limit. The first argument is a pointer to a Pager object +** (cast as a void*). The pager is always 'purgeable' (not an in-memory +** database). The second argument is a reference to a page that is +** currently dirty but has no outstanding references. The page +** is always associated with the Pager object passed as the first +** argument. +** +** The job of this function is to make pPg clean by writing its contents +** out to the database file, if possible. This may involve syncing the +** journal file. +** +** If successful, sqlite3PcacheMakeClean() is called on the page and +** SQLITE_OK returned. If an IO error occurs while trying to make the +** page clean, the IO error code is returned. If the page cannot be +** made clean for some other reason, but no error occurs, then SQLITE_OK +** is returned by sqlite3PcacheMakeClean() is not called. 
*/ static int pagerStress(void *p, PgHdr *pPg){ Pager *pPager = (Pager *)p; int rc = SQLITE_OK; - if( pPager->doNotSync ){ + assert( pPg->pPager==pPager ); + assert( pPg->flags&PGHDR_DIRTY ); + + /* The doNotSync flag is set by the sqlite3PagerWrite() function while it + ** is journalling a set of two or more database pages that are stored + ** on the same disk sector. Syncing the journal is not allowed while + ** this is happening as it is important that all members of such a + ** set of pages are synced to disk together. So, if the page this function + ** is trying to make clean will require a journal sync and the doNotSync + ** flag is set, return without doing anything. The pcache layer will + ** just have to go ahead and allocate a new page buffer instead of + ** reusing pPg. + ** + ** Similarly, if the pager has already entered the error state, do not + ** try to write the contents of pPg to disk. + */ + if( pPager->errCode || (pPager->doNotSync && pPg->flags&PGHDR_NEED_SYNC) ){ return SQLITE_OK; } - assert( pPg->flags&PGHDR_DIRTY ); - if( pPager->errCode==SQLITE_OK ){ - if( pPg->flags&PGHDR_NEED_SYNC ){ - rc = syncJournal(pPager); - if( rc==SQLITE_OK && pPager->fullSync && - !(pPager->journalMode==PAGER_JOURNALMODE_MEMORY) && - !(sqlite3OsDeviceCharacteristics(pPager->fd)&SQLITE_IOCAP_SAFE_APPEND) - ){ - pPager->nRec = 0; - rc = writeJournalHdr(pPager); - } + /* Sync the journal file if required. 
*/ + if( pPg->flags&PGHDR_NEED_SYNC ){ + rc = syncJournal(pPager); + if( rc==SQLITE_OK && pPager->fullSync && + !(pPager->journalMode==PAGER_JOURNALMODE_MEMORY) && + !(sqlite3OsDeviceCharacteristics(pPager->fd)&SQLITE_IOCAP_SAFE_APPEND) + ){ + pPager->nRec = 0; + rc = writeJournalHdr(pPager); } - if( rc==SQLITE_OK ){ - pPg->pDirty = 0; - if( pPg->pgno>pPager->dbSize && subjRequiresPage(pPg) ){ - rc = subjournalPage(pPg); - } - if( rc==SQLITE_OK ){ - rc = pager_write_pagelist(pPg); - } + } + + /* If the page number of this page is larger than the current size of + ** the database image, it may need to be written to the sub-journal. + ** This is because the call to pager_write_pagelist() below will not + ** actually write data to the file in this case. + ** + ** Consider the following sequence of events: + ** + ** BEGIN; + ** <journal page X> + ** <modify page X> + ** SAVEPOINT sp; + ** <shrink database file to Y pages> + ** pagerStress(page X) + ** ROLLBACK TO sp; + ** + ** If (X>Y), then when pagerStress is called page X will not be written + ** out to the database file, but will be dropped from the cache. Then, + ** following the "ROLLBACK TO sp" statement, reading page X will read + ** data from the database file. This will be the copy of page X as it + ** was when the transaction started, not as it was when "SAVEPOINT sp" + ** was executed. + ** + ** The solution is to write the current data for page X into the + ** sub-journal file now (if it is not already there), so that it will + ** be restored to its current value when the "ROLLBACK TO sp" is + ** executed. + */ + if( rc==SQLITE_OK && pPg->pgno>pPager->dbSize && subjRequiresPage(pPg) ){ + rc = subjournalPage(pPg); + } + + /* Write the contents of the page out to the database file. */ + if( rc==SQLITE_OK ){ + pPg->pDirty = 0; + rc = pager_write_pagelist(pPg); + } + + /* Mark the page as clean.
*/ + if( rc==SQLITE_OK ){ + PAGERTRACE(("STRESS %d page %d\n", PAGERID(pPager), pPg->pgno)); + sqlite3PcacheMakeClean(pPg); + } + + return pager_error(pPager, rc); +} + + +/* +** Allocate and initialize a new Pager object and put a pointer to it +** in *ppPager. The pager should eventually be freed by passing it +** to sqlite3PagerClose(). +** +** The zFilename argument is the path to the database file to open. +** If zFilename is NULL then a randomly-named temporary file is created +** and used as the file to be cached. Temporary files are deleted +** automatically when they are closed. If zFilename is ":memory:" then +** all information is held in cache. It is never written to disk. +** This can be used to implement an in-memory database. +** +** The nExtra parameter specifies the number of bytes of space allocated +** along with each page reference. This space is available to the user +** via the sqlite3PagerGetExtra() API. +** +** The flags argument is used to specify properties that affect the +** operation of the pager. It should be passed some bitwise combination +** of the PAGER_OMIT_JOURNAL and PAGER_NO_READLOCK flags. +** +** The vfsFlags parameter is a bitmask to pass to the flags parameter +** of the xOpen() method of the supplied VFS when opening files. +** +** If the pager object is allocated and the specified file opened +** successfully, SQLITE_OK is returned and *ppPager set to point to +** the new pager object. If an error occurs, *ppPager is set to NULL +** and an error code returned. This function may return SQLITE_NOMEM +** (sqlite3Malloc() is used to allocate memory), SQLITE_CANTOPEN or +** various SQLITE_IOERR_XXX errors.
+*/ +int sqlite3PagerOpen( + sqlite3_vfs *pVfs, /* The virtual file system to use */ + Pager **ppPager, /* OUT: Return the Pager structure here */ + const char *zFilename, /* Name of the database file to open */ + int nExtra, /* Extra bytes append to each in-memory page */ + int flags, /* flags controlling this file */ + int vfsFlags /* flags passed through to sqlite3_vfs.xOpen() */ +){ + u8 *pPtr; + Pager *pPager = 0; /* Pager object to allocate and return */ + int rc = SQLITE_OK; /* Return code */ + int tempFile = 0; /* True for temp files (incl. in-memory files) */ + int memDb = 0; /* True if this is an in-memory file */ + int readOnly = 0; /* True if this is a read-only file */ + int journalFileSize; /* Bytes to allocate for each journal fd */ + char *zPathname = 0; /* Full path to database file */ + int nPathname = 0; /* Number of bytes in zPathname */ + int useJournal = (flags & PAGER_OMIT_JOURNAL)==0; /* False to omit journal */ + int noReadlock = (flags & PAGER_NO_READLOCK)!=0; /* True to omit read-lock */ + int pcacheSize = sqlite3PcacheSize(); /* Bytes to allocate for PCache */ + u16 szPageDflt = SQLITE_DEFAULT_PAGE_SIZE; /* Default page size */ + + /* Figure out how much space is required for each journal file-handle + ** (there are two of them, the main journal and the sub-journal). This + ** is the maximum space required for an in-memory journal file handle + ** and a regular journal file-handle. Note that a "regular journal-handle" + ** may be a wrapper capable of caching the first portion of the journal + ** file in memory to implement the atomic-write optimization (see + ** source file journal.c). + */ + if( sqlite3JournalSize(pVfs)>sqlite3MemJournalSize() ){ + journalFileSize = sqlite3JournalSize(pVfs); + }else{ + journalFileSize = sqlite3MemJournalSize(); + } + + /* Set the output variable to NULL in case an error occurs. 
*/ + *ppPager = 0; + + /* Compute and store the full pathname in an allocated buffer pointed + ** to by zPathname, length nPathname. Or, if this is a temporary file, + ** leave both nPathname and zPathname set to 0. + */ + if( zFilename && zFilename[0] ){ + nPathname = pVfs->mxPathname+1; + zPathname = sqlite3Malloc(nPathname*2); + if( zPathname==0 ){ + return SQLITE_NOMEM; + } +#ifndef SQLITE_OMIT_MEMORYDB + if( strcmp(zFilename,":memory:")==0 ){ + memDb = 1; + zPathname[0] = 0; + }else +#endif + { + rc = sqlite3OsFullPathname(pVfs, zFilename, nPathname, zPathname); + } + + nPathname = sqlite3Strlen30(zPathname); + if( rc==SQLITE_OK && nPathname+8>pVfs->mxPathname ){ + /* This branch is taken when the journal path required by + ** the database being opened will be more than pVfs->mxPathname + ** bytes in length. This means the database cannot be opened, + ** as it will not be possible to open the journal file or even + ** check for a hot-journal before reading. + */ + rc = SQLITE_CANTOPEN; } if( rc!=SQLITE_OK ){ - pager_error(pPager, rc); + sqlite3_free(zPathname); + return rc; + } + } + + /* Allocate memory for the Pager structure, PCache object, the + ** three file descriptors, the database file name and the journal + ** file name. 
The layout in memory is as follows: + ** + ** Pager object (sizeof(Pager) bytes) + ** PCache object (sqlite3PcacheSize() bytes) + ** Database file handle (pVfs->szOsFile bytes) + ** Sub-journal file handle (journalFileSize bytes) + ** Main journal file handle (journalFileSize bytes) + ** Database file name (nPathname+1 bytes) + ** Journal file name (nPathname+8+1 bytes) + */ + pPtr = (u8 *)sqlite3MallocZero( + sizeof(*pPager) + /* Pager structure */ + pcacheSize + /* PCache object */ + pVfs->szOsFile + /* The main db file */ + journalFileSize * 2 + /* The two journal files */ + nPathname + 1 + /* zFilename */ + nPathname + 8 + 1 /* zJournal */ + ); + if( !pPtr ){ + sqlite3_free(zPathname); + return SQLITE_NOMEM; + } + pPager = (Pager*)(pPtr); + pPager->pPCache = (PCache*)(pPtr += sizeof(*pPager)); + pPager->fd = (sqlite3_file*)(pPtr += pcacheSize); + pPager->sjfd = (sqlite3_file*)(pPtr += pVfs->szOsFile); + pPager->jfd = (sqlite3_file*)(pPtr += journalFileSize); + pPager->zFilename = (char*)(pPtr += journalFileSize); + + /* Fill in the Pager.zFilename and Pager.zJournal buffers, if required. */ + if( zPathname ){ + pPager->zJournal = (char*)(pPtr += nPathname + 1); + memcpy(pPager->zFilename, zPathname, nPathname); + memcpy(pPager->zJournal, zPathname, nPathname); + memcpy(&pPager->zJournal[nPathname], "-journal", 8); + sqlite3_free(zPathname); + } + pPager->pVfs = pVfs; + pPager->vfsFlags = vfsFlags; + + /* Open the pager file. + */ + if( zFilename && zFilename[0] && !memDb ){ + int fout = 0; /* VFS flags returned by xOpen() */ + rc = sqlite3OsOpen(pVfs, pPager->zFilename, pPager->fd, vfsFlags, &fout); + readOnly = (fout&SQLITE_OPEN_READONLY); + + /* If the file was successfully opened for read/write access, + ** choose a default page size in case we have to create the + ** database file. 
The default page size is the maximum of: + ** + ** + SQLITE_DEFAULT_PAGE_SIZE, + ** + The value returned by sqlite3OsSectorSize() + ** + The largest page size that can be written atomically. + */ + if( rc==SQLITE_OK && !readOnly ){ + setSectorSize(pPager); + if( szPageDflt<pPager->sectorSize ){ + szPageDflt = pPager->sectorSize; + } +#ifdef SQLITE_ENABLE_ATOMIC_WRITE + { + int iDc = sqlite3OsDeviceCharacteristics(pPager->fd); + int ii; + assert(SQLITE_IOCAP_ATOMIC512==(512>>8)); + assert(SQLITE_IOCAP_ATOMIC64K==(65536>>8)); + assert(SQLITE_MAX_DEFAULT_PAGE_SIZE<=65536); + for(ii=szPageDflt; ii<=SQLITE_MAX_DEFAULT_PAGE_SIZE; ii=ii*2){ + if( iDc&(SQLITE_IOCAP_ATOMIC|(ii>>8)) ){ + szPageDflt = ii; + } + } + } +#endif + if( szPageDflt>SQLITE_MAX_DEFAULT_PAGE_SIZE ){ + szPageDflt = SQLITE_MAX_DEFAULT_PAGE_SIZE; + } } + }else{ + /* If a temporary file is requested, it is not opened immediately. + ** In this case we accept the default page size and delay actually + ** opening the file until the first call to OsWrite(). + ** + ** This branch is also run for an in-memory database. An in-memory + ** database is the same as a temp-file that is never written out to + ** disk and uses an in-memory rollback journal. + */ + tempFile = 1; + pPager->state = PAGER_EXCLUSIVE; } + /* The following call to PagerSetPagesize() serves to set the value of + ** Pager.pageSize and to allocate the Pager.pTmpSpace buffer. + */ if( rc==SQLITE_OK ){ - PAGERTRACE(("STRESS %d page %d\n", PAGERID(pPager), pPg->pgno)); - sqlite3PcacheMakeClean(pPg); + assert( pPager->memDb==0 ); + rc = sqlite3PagerSetPagesize(pPager, &szPageDflt); + testcase( rc!=SQLITE_OK ); } - return rc; + + /* If an error occurred in either of the blocks above, free the + ** Pager structure and close the file. + */ + if( rc!=SQLITE_OK ){ + assert( !pPager->pTmpSpace ); + sqlite3OsClose(pPager->fd); + sqlite3_free(pPager); + return rc; + } + + /* Initialize the PCache object.
*/ + nExtra = FORCE_ALIGNMENT(nExtra); + sqlite3PcacheOpen(szPageDflt, nExtra, !memDb, + !memDb?pagerStress:0, (void *)pPager, pPager->pPCache); + + PAGERTRACE(("OPEN %d %s\n", FILEHANDLEID(pPager->fd), pPager->zFilename)); + IOTRACE(("OPEN %p %s\n", pPager, pPager->zFilename)) + + pPager->useJournal = (u8)useJournal; + pPager->noReadlock = (noReadlock && readOnly) ?1:0; + /* pPager->stmtOpen = 0; */ + /* pPager->stmtInUse = 0; */ + /* pPager->nRef = 0; */ + pPager->dbSizeValid = (u8)memDb; + /* pPager->stmtSize = 0; */ + /* pPager->stmtJSize = 0; */ + /* pPager->nPage = 0; */ + pPager->mxPgno = SQLITE_MAX_PAGE_COUNT; + /* pPager->state = PAGER_UNLOCK; */ + assert( pPager->state == (tempFile ? PAGER_EXCLUSIVE : PAGER_UNLOCK) ); + /* pPager->errMask = 0; */ + pPager->tempFile = (u8)tempFile; + assert( tempFile==PAGER_LOCKINGMODE_NORMAL + || tempFile==PAGER_LOCKINGMODE_EXCLUSIVE ); + assert( PAGER_LOCKINGMODE_EXCLUSIVE==1 ); + pPager->exclusiveMode = (u8)tempFile; + pPager->changeCountDone = pPager->tempFile; + pPager->memDb = (u8)memDb; + pPager->readOnly = (u8)readOnly; + /* pPager->needSync = 0; */ + pPager->noSync = (pPager->tempFile || !useJournal) ?1:0; + pPager->fullSync = pPager->noSync ?0:1; + pPager->sync_flags = SQLITE_SYNC_NORMAL; + /* pPager->pFirst = 0; */ + /* pPager->pFirstSynced = 0; */ + /* pPager->pLast = 0; */ + pPager->nExtra = nExtra; + pPager->journalSizeLimit = SQLITE_DEFAULT_JOURNAL_SIZE_LIMIT; + assert( isOpen(pPager->fd) || tempFile ); + setSectorSize(pPager); + if( memDb ){ + pPager->journalMode = PAGER_JOURNALMODE_MEMORY; + } + /* pPager->xBusyHandler = 0; */ + /* pPager->pBusyHandlerArg = 0; */ + /* memset(pPager->aHash, 0, sizeof(pPager->aHash)); */ + *ppPager = pPager; + return SQLITE_OK; } + /* -** Return 1 if there is a hot journal on the given pager. -** A hot journal is one that needs to be played back. +** This function is called after transitioning from PAGER_UNLOCK to +** PAGER_SHARED state. 
It tests if there is a hot journal present in +** the file-system for the given pager. A hot journal is one that +** needs to be played back. According to this function, a hot-journal +** file exists if the following three criteria are met: +** +** * The journal file exists in the file system, and +** * No process holds a RESERVED or greater lock on the database file, and +** * The database file itself is greater than 0 bytes in size. ** ** If the current size of the database file is 0 but a journal file ** exists, that is probably an old journal left over from a prior -** database with the same name. Just delete the journal. -** -** Return negative if unable to determine the status of the journal. +** database with the same name. In this case the journal file is +** just deleted using OsDelete, *pExists is set to 0 and SQLITE_OK +** is returned. ** ** This routine does not open the journal file to examine its ** content. Hence, the journal might contain the name of a master @@ -2786,28 +3327,36 @@ static int pagerStress(void *p, PgHdr *pPg){ ** journal file exists and is not empty this routine assumes it ** is hot. The pager_playback() routine will discover that the ** journal file is not really hot and will no-op. +** +** If a hot-journal file is found to exist, *pExists is set to 1 and +** SQLITE_OK returned. If no hot-journal file is present, *pExists is +** set to 0 and SQLITE_OK returned. If an IO error occurs while trying +** to determine whether or not a hot-journal file exists, the IO error +** code is returned and the value of *pExists is undefined. 
*/ static int hasHotJournal(Pager *pPager, int *pExists){ - sqlite3_vfs *pVfs = pPager->pVfs; - int rc = SQLITE_OK; - int exists = 0; - int locked = 0; + sqlite3_vfs * const pVfs = pPager->pVfs; + int rc; /* Return code */ + int exists = 0; /* True if a journal file is present */ + int locked = 0; /* True if some process holds a RESERVED lock */ + assert( pPager!=0 ); assert( pPager->useJournal ); - assert( pPager->fd->pMethods ); + assert( isOpen(pPager->fd) ); + *pExists = 0; rc = sqlite3OsAccess(pVfs, pPager->zJournal, SQLITE_ACCESS_EXISTS, &exists); if( rc==SQLITE_OK && exists ){ rc = sqlite3OsCheckReservedLock(pPager->fd, &locked); - } - if( rc==SQLITE_OK && exists && !locked ){ - int nPage; - rc = sqlite3PagerPagecount(pPager, &nPage); - if( rc==SQLITE_OK ){ - if( nPage==0 ){ - sqlite3OsDelete(pVfs, pPager->zJournal, 0); - }else{ - *pExists = 1; + if( rc==SQLITE_OK && !locked ){ + int nPage; + rc = sqlite3PagerPagecount(pPager, &nPage); + if( rc==SQLITE_OK ){ + if( nPage==0 ){ + sqlite3OsDelete(pVfs, pPager->zJournal, 0); + }else{ + *pExists = 1; + } } } } @@ -2815,54 +3364,88 @@ static int hasHotJournal(Pager *pPager, int *pExists){ } /* -** Read the content of page pPg out of the database file. +** Read the content for page pPg out of the database file and into +** pPg->pData. A shared lock or greater must be held on the database +** file before this function is called. +** +** If page 1 is read, then the value of Pager.dbFileVers[] is set to +** the value read from the database file. +** +** If an IO error occurs, then the IO error is returned to the caller. +** Otherwise, SQLITE_OK is returned. 
*/ -static int readDbPage(Pager *pPager, PgHdr *pPg, Pgno pgno){ - int rc; - i64 offset; - assert( MEMDB==0 ); - assert(pPager->fd->pMethods||pPager->tempFile); - if( !pPager->fd->pMethods ){ +static int readDbPage(PgHdr *pPg){ + Pager *pPager = pPg->pPager; /* Pager object associated with page pPg */ + Pgno pgno = pPg->pgno; /* Page number to read */ + int rc; /* Return code */ + i64 iOffset; /* Byte offset of file to read from */ + + assert( pPager->state>=PAGER_SHARED && !MEMDB ); + + if( !isOpen(pPager->fd) ){ + assert( pPager->tempFile ); return SQLITE_IOERR_SHORT_READ; } - offset = (pgno-1)*(i64)pPager->pageSize; - rc = sqlite3OsRead(pPager->fd, pPg->pData, pPager->pageSize, offset); + iOffset = (pgno-1)*(i64)pPager->pageSize; + rc = sqlite3OsRead(pPager->fd, pPg->pData, pPager->pageSize, iOffset); + if( pgno==1 ){ + u8 *dbFileVers = &((u8*)pPg->pData)[24]; + memcpy(&pPager->dbFileVers, dbFileVers, sizeof(pPager->dbFileVers)); + } + CODEC1(pPager, pPg->pData, pgno, 3); + PAGER_INCR(sqlite3_pager_readdb_count); PAGER_INCR(pPager->nRead); IOTRACE(("PGIN %p %d\n", pPager, pgno)); - if( pgno==1 ){ - memcpy(&pPager->dbFileVers, &((u8*)pPg->pData)[24], - sizeof(pPager->dbFileVers)); - } - CODEC1(pPager, pPg->pData, pPg->pgno, 3); PAGERTRACE(("FETCH %d page %d hash(%08x)\n", - PAGERID(pPager), pPg->pgno, pager_pagehash(pPg))); + PAGERID(pPager), pgno, pager_pagehash(pPg))); + return rc; } - /* -** This function is called to obtain the shared lock required before -** data may be read from the pager cache. If the shared lock has already -** been obtained, this function is a no-op. +** This function is called whenever the upper layer requests a database +** page, before the cache is checked for a suitable page +** or any data is read from the database.
It performs the following +** two functions: +** +** 1) If the pager is currently in PAGER_UNLOCK state (no lock held +** on the database file), then an attempt is made to obtain a +** SHARED lock on the database file. Immediately after obtaining +** the SHARED lock, the file-system is checked for a hot-journal, +** which is played back if present. Following any hot-journal +** rollback, the contents of the cache are validated by checking +** the 'change-counter' field of the database file header and +** discarded if they are found to be invalid. +** +** 2) If the pager is running in exclusive-mode, and there are currently +** no outstanding references to any pages, and is in the error state, +** then an attempt is made to clear the error state by discarding +** the contents of the page cache and rolling back any open journal +** file. +** +** If the operation described by (2) above is not attempted, and if the +** pager is in an error state other than SQLITE_FULL when this is called, +** the error state error code is returned. It is permitted to read the +** database when in SQLITE_FULL error state. ** -** Immediately after obtaining the shared lock (if required), this function -** checks for a hot-journal file. If one is found, an emergency rollback -** is performed immediately. +** Otherwise, if everything is successful, SQLITE_OK is returned. If an +** IO error occurs while locking the database, checking for a hot-journal +** file or rolling back a journal file, the IO error code is returned. */ static int pagerSharedLock(Pager *pPager){ - int rc = SQLITE_OK; - int isErrorReset = 0; + int rc = SQLITE_OK; /* Return code */ + int isErrorReset = 0; /* True if recovering from error state */ /* If this database is opened for exclusive access, has no outstanding - ** page references and is in an error-state, now is the chance to clear + ** page references and is in an error-state, this is a chance to clear ** the error. 
Discard the contents of the pager-cache and treat any ** open journal file as a hot-journal. */ if( !MEMDB && pPager->exclusiveMode && sqlite3PcacheRefCount(pPager->pPCache)==0 && pPager->errCode ){ - if( pPager->journalOpen ){ + if( isOpen(pPager->jfd) ){ isErrorReset = 1; } pPager->errCode = SQLITE_OK; @@ -2878,7 +3461,7 @@ static int pagerSharedLock(Pager *pPager){ } if( pPager->state==PAGER_UNLOCK || isErrorReset ){ - sqlite3_vfs *pVfs = pPager->pVfs; + sqlite3_vfs * const pVfs = pPager->pVfs; int isHotJournal = 0; assert( !MEMDB ); assert( sqlite3PcacheRefCount(pPager->pPCache)==0 ); @@ -2907,12 +3490,13 @@ static int pagerSharedLock(Pager *pPager){ ** important that a RESERVED lock is not obtained on the way to the ** EXCLUSIVE lock. If it were, another process might open the ** database file, detect the RESERVED lock, and conclude that the - ** database is safe to read while this process is still rolling it - ** back. + ** database is safe to read while this process is still rolling the + ** hot-journal back. ** - ** Because the intermediate RESERVED lock is not requested, the - ** second process will get to this point in the code and fail to - ** obtain its own EXCLUSIVE lock on the database file. + ** Because the intermediate RESERVED lock is not requested, any + ** other process attempting to access the database file will get to + ** this point in the code and fail to obtain its own EXCLUSIVE lock + ** on the database file. */ if( pPager->state<EXCLUSIVE_LOCK ){ rc = sqlite3OsLock(pPager->fd, EXCLUSIVE_LOCK); @@ -2929,7 +3513,7 @@ static int pagerSharedLock(Pager *pPager){ ** OsTruncate() call used in exclusive-access mode also requires ** a read/write file handle.
*/ - if( !isErrorReset && pPager->journalOpen==0 ){ + if( !isOpen(pPager->jfd) ){ int res; rc = sqlite3OsAccess(pVfs,pPager->zJournal,SQLITE_ACCESS_EXISTS,&res); if( rc==SQLITE_OK ){ @@ -2938,7 +3522,7 @@ static int pagerSharedLock(Pager *pPager){ int f = SQLITE_OPEN_READWRITE|SQLITE_OPEN_MAIN_JOURNAL; assert( !pPager->tempFile ); rc = sqlite3OsOpen(pVfs, pPager->zJournal, pPager->jfd, f, &fout); - assert( rc!=SQLITE_OK || pPager->jfd->pMethods ); + assert( rc!=SQLITE_OK || isOpen(pPager->jfd) ); if( rc==SQLITE_OK && fout&SQLITE_OPEN_READONLY ){ rc = SQLITE_CANTOPEN; sqlite3OsClose(pPager->jfd); @@ -2953,7 +3537,8 @@ static int pagerSharedLock(Pager *pPager){ if( rc!=SQLITE_OK ){ goto failed; } - pPager->journalOpen = 1; + + /* TODO: Why are these cleared here? Is it necessary? */ pPager->journalStarted = 0; pPager->journalOff = 0; pPager->setMaster = 0; @@ -2970,8 +3555,8 @@ static int pagerSharedLock(Pager *pPager){ rc = pager_error(pPager, rc); goto failed; } - assert(pPager->state==PAGER_SHARED || - (pPager->exclusiveMode && pPager->state>PAGER_SHARED) + assert( (pPager->state==PAGER_SHARED) + || (pPager->exclusiveMode && pPager->state>PAGER_SHARED) ); } @@ -3027,32 +3612,11 @@ static int pagerSharedLock(Pager *pPager){ } /* -** Make sure we have the content for a page. If the page was -** previously acquired with noContent==1, then the content was -** just initialized to zeros instead of being read from disk. -** But now we need the real data off of disk. So make sure we -** have it. Read it in if we do not have it already. -*/ -static int pager_get_content(PgHdr *pPg){ - if( pPg->flags&PGHDR_NEED_READ ){ - int rc = readDbPage(pPg->pPager, pPg, pPg->pgno); - if( rc==SQLITE_OK ){ - pPg->flags &= ~PGHDR_NEED_READ; - }else{ - return rc; - } - } - return SQLITE_OK; -} - -/* -** If the reference count has reached zero, and the pager is not in the -** middle of a write transaction or opened in exclusive mode, unlock it. 
+** If the reference count has reached zero, rollback any active +** transaction and unlock the pager. */ static void pagerUnlockIfUnused(Pager *pPager){ - if( (sqlite3PcacheRefCount(pPager->pPCache)==0) - && (!pPager->exclusiveMode || pPager->journalOff>0) - ){ + if( sqlite3PcacheRefCount(pPager->pPCache)==0 ){ pagerUnlockAndRollback(pPager); } } @@ -3070,16 +3634,48 @@ static void pagerDropPage(DbPage *pPg){ } /* -** Acquire a page. +** Acquire a reference to page number pgno in pager pPager (a page +** reference has type DbPage*). If the requested reference is +** successfully obtained, it is copied to *ppPage and SQLITE_OK returned. +** +** This function calls pagerSharedLock() to obtain a SHARED lock on +** the database file if such a lock or greater is not already held. +** This may cause hot-journal rollback or a cache purge. See comments +** above function pagerSharedLock() for details. +** +** If the requested page is already in the cache, it is returned. +** Otherwise, a new page object is allocated and populated with data +** read from the database file. In some cases, the pcache module may +** choose not to allocate a new page object and may reuse an existing +** object with no outstanding references. +** +** The extra data appended to a page is always initialized to zeros the +** first time a page is loaded into memory. If the page requested is +** already in the cache when this function is called, then the extra +** data is left as it was when the page object was last used. +** +** If the database image is smaller than the requested page or if a +** non-zero value is passed as the noContent parameter and the +** requested page is not already stored in the cache, then no +** actual disk read occurs. In this case the memory image of the +** page is initialized to all zeros. +** +** If noContent is true, it means that we do not care about the contents +** of the page. 
This occurs in two separate scenarios: +** +** a) When reading a free-list leaf page from the database, and ** -** A read lock on the disk file is obtained when the first page is acquired. -** This read lock is dropped when the last page is released. +** b) When a savepoint is being rolled back and we need to load +** a new page into the cache to populate with the data read +** from the savepoint journal. ** -** This routine works for any page number greater than 0. If the database -** file is smaller than the requested page, then no actual disk -** read occurs and the memory image of the page is initialized to -** all zeros. The extra data appended to a page is always initialized -** to zeros the first time a page is loaded into memory. +** If noContent is true, then the data returned is zeroed instead of +** being read from the database. Additionally, the bits corresponding +** to pgno in Pager.pInJournal (bitvec of pages already written to the +** journal file) and the PagerSavepoint.pInSavepoint bitvecs of any open +** savepoints are set. This means if the page is made writable at any +** point in the future, using a call to sqlite3PagerWrite(), its contents +** will not be journaled. This saves IO. ** ** The acquisition might fail for several reasons. In all cases, ** an appropriate error code is returned and *ppPage is set to NULL. @@ -3091,15 +3687,6 @@ static void pagerDropPage(DbPage *pPg){ ** has to go to disk, and could also playback an old journal if necessary. ** Since Lookup() never goes to disk, it never has to deal with locks ** or journal files. -** -** If noContent is false, the page contents are actually read from disk. -** If noContent is true, it means that we do not care about the contents -** of the page at this time, so do not do a disk read. Just fill in the -** page content with zeros. But mark the fact that we have not read the -** content by setting the PgHdr.needRead flag.
Later on, if -** sqlite3PagerWrite() is called on this page or if this routine is -** called again with noContent==0, that means that the content is needed -** and the disk read should occur at that point. */ int sqlite3PagerAcquire( Pager *pPager, /* The pager open on the database file */ @@ -3110,6 +3697,7 @@ int sqlite3PagerAcquire( PgHdr *pPg = 0; int rc; + assert( assert_pager_state(pPager) ); assert( pPager->state==PAGER_UNLOCK || sqlite3PcacheRefCount(pPager->pPCache)>0 || pgno==1 @@ -3141,6 +3729,7 @@ int sqlite3PagerAcquire( if( rc!=SQLITE_OK ){ return rc; } + assert( pPg->pgno==pgno ); if( pPg->pPager==0 ){ /* The pager cache has created a new page. Its content needs to ** be initialized. @@ -3163,13 +3752,26 @@ int sqlite3PagerAcquire( } memset(pPg->pData, 0, pPager->pageSize); if( noContent ){ - pPg->flags |= PGHDR_NEED_READ; + /* Failure to set the bits in the InJournal bit-vectors is benign. + ** It merely means that we might do some extra work to journal a + ** page that does not need to be journaled. Nevertheless, be sure + ** to test the case where a malloc error occurs while trying to set + ** a bit in a bit vector. + */ + sqlite3BeginBenignMalloc(); + if( pgno<=pPager->dbOrigSize ){ + TESTONLY( rc = ) sqlite3BitvecSet(pPager->pInJournal, pgno); + testcase( rc==SQLITE_NOMEM ); + } + TESTONLY( rc = ) addToSavepointBitvecs(pPager, pgno); + testcase( rc==SQLITE_NOMEM ); + sqlite3EndBenignMalloc(); } IOTRACE(("ZERO %p %d\n", pPager, pgno)); }else{ - rc = readDbPage(pPager, pPg, pgno); + assert( pPg->pPager==pPager ); + rc = readDbPage(pPg); if( rc!=SQLITE_OK && rc!=SQLITE_IOERR_SHORT_READ ){ - /* sqlite3PagerUnref(pPg); */ pagerDropPage(pPg); return rc; } @@ -3179,15 +3781,7 @@ int sqlite3PagerAcquire( #endif }else{ /* The requested page is in the page cache. 
*/ - assert(sqlite3PcacheRefCount(pPager->pPCache)>0 || pgno==1); PAGER_INCR(pPager->nHit); - if( !noContent ){ - rc = pager_get_content(pPg); - if( rc ){ - sqlite3PagerUnref(pPg); - return rc; - } - } } *ppPage = pPg; @@ -3197,7 +3791,9 @@ int sqlite3PagerAcquire( /* ** Acquire a page if it is already in the in-memory cache. Do ** not read the page from disk. Return a pointer to the page, -** or 0 if the page is not in cache. +** or 0 if the page is not in cache. Also, return 0 if the +** pager is in PAGER_UNLOCK state when this function is called, +** or if the pager is in an error state other than SQLITE_FULL. ** ** See also sqlite3PagerGet(). The difference between this routine ** and sqlite3PagerGet() is that _get() will go to the disk and read @@ -3220,20 +3816,19 @@ DbPage *sqlite3PagerLookup(Pager *pPager, Pgno pgno){ } /* -** Release a page. +** Release a page reference. ** ** If the number of references to the page drop to zero, then the ** page is added to the LRU list. When all references to all pages ** are released, a rollback occurs and the lock on the database is ** removed. */ -int sqlite3PagerUnref(DbPage *pPg){ +void sqlite3PagerUnref(DbPage *pPg){ if( pPg ){ Pager *pPager = pPg->pPager; sqlite3PcacheRelease(pPg); pagerUnlockIfUnused(pPager); } - return SQLITE_OK; } /* @@ -3241,54 +3836,79 @@ int sqlite3PagerUnref(DbPage *pPg){ ** sub-journal file is open too. If the main journal is not open, ** this function is a no-op. ** -** SQLITE_OK is returned if everything goes according to plan. An -** SQLITE_IOERR_XXX error code is returned if the call to +** SQLITE_OK is returned if everything goes according to plan. +** An SQLITE_IOERR_XXX error code is returned if a call to ** sqlite3OsOpen() fails. 
*/ static int openSubJournal(Pager *pPager){ int rc = SQLITE_OK; - if( pPager->journalOpen && !pPager->sjfd->pMethods ){ + if( isOpen(pPager->jfd) && !isOpen(pPager->sjfd) ){ if( pPager->journalMode==PAGER_JOURNALMODE_MEMORY ){ sqlite3MemJournalOpen(pPager->sjfd); }else{ - rc = sqlite3PagerOpentemp(pPager, pPager->sjfd, SQLITE_OPEN_SUBJOURNAL); + rc = pagerOpentemp(pPager, pPager->sjfd, SQLITE_OPEN_SUBJOURNAL); } } return rc; } /* -** Create a journal file for pPager. There should already be a RESERVED -** or EXCLUSIVE lock on the database file when this routine is called. +** This function is called at the start of every write transaction. +** There must already be a RESERVED or EXCLUSIVE lock on the database +** file when this routine is called. +** +** Open the journal file for pager pPager and write a journal header +** to the start of it. If there are active savepoints, open the sub-journal +** as well. This function is only used when the journal file is being +** opened to write a rollback log for a transaction. It is not used +** when opening a hot journal file to roll it back. +** +** If the journal file is already open (as it may be in exclusive mode), +** then this function just writes a journal header to the start of the +** already open file. ** -** Return SQLITE_OK if everything. Return an error code and release the -** write lock if anything goes wrong. +** Whether or not the journal file is opened by this function, the +** Pager.pInJournal bitvec structure is allocated. +** +** Return SQLITE_OK if everything is successful. Otherwise, return +** SQLITE_NOMEM if the attempt to allocate Pager.pInJournal fails, or +** an IO error code if opening or writing the journal file fails. 
*/ static int pager_open_journal(Pager *pPager){ - sqlite3_vfs *pVfs = pPager->pVfs; - int flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_EXCLUSIVE|SQLITE_OPEN_CREATE); + int rc = SQLITE_OK; /* Return code */ + sqlite3_vfs * const pVfs = pPager->pVfs; /* Local cache of vfs pointer */ - int rc; assert( pPager->state>=PAGER_RESERVED ); assert( pPager->useJournal ); assert( pPager->pInJournal==0 ); + + /* If already in the error state, this function is a no-op. */ + if( pPager->errCode ){ + return pPager->errCode; + } + + /* TODO: Is it really possible to get here with dbSizeValid==0? If not, + ** the call to PagerPagecount() can be removed. + */ + testcase( pPager->dbSizeValid==0 ); sqlite3PagerPagecount(pPager, 0); + pPager->pInJournal = sqlite3BitvecCreate(pPager->dbSize); if( pPager->pInJournal==0 ){ - rc = SQLITE_NOMEM; - goto failed_to_open_journal; + return SQLITE_NOMEM; } - if( pPager->journalOpen==0 ){ - if( pPager->tempFile ){ - flags |= (SQLITE_OPEN_DELETEONCLOSE|SQLITE_OPEN_TEMP_JOURNAL); - }else{ - flags |= (SQLITE_OPEN_MAIN_JOURNAL); - } + /* Open the journal file if it is not already open. */ + if( !isOpen(pPager->jfd) ){ if( pPager->journalMode==PAGER_JOURNALMODE_MEMORY ){ sqlite3MemJournalOpen(pPager->jfd); - rc = SQLITE_OK; }else{ + const int flags = /* VFS flags to open journal file */ + SQLITE_OPEN_READWRITE|SQLITE_OPEN_EXCLUSIVE|SQLITE_OPEN_CREATE| + (pPager->tempFile ? + (SQLITE_OPEN_DELETEONCLOSE|SQLITE_OPEN_TEMP_JOURNAL): + (SQLITE_OPEN_MAIN_JOURNAL) + ); #ifdef SQLITE_ENABLE_ATOMIC_WRITE rc = sqlite3JournalOpen( pVfs, pPager->zJournal, pPager->jfd, flags, jrnlBufferSize(pPager) @@ -3297,81 +3917,64 @@ static int pager_open_journal(Pager *pPager){ rc = sqlite3OsOpen(pVfs, pPager->zJournal, pPager->jfd, flags, 0); #endif } - assert( rc!=SQLITE_OK || pPager->jfd->pMethods ); + assert( rc!=SQLITE_OK || isOpen(pPager->jfd) ); + } + + + /* Write the first journal header to the journal file and open + ** the sub-journal if necessary. 
+ */ + if( rc==SQLITE_OK ){ + /* TODO: Check if all of these are really required. */ + pPager->dbOrigSize = pPager->dbSize; + pPager->journalStarted = 0; + pPager->needSync = 0; + pPager->nRec = 0; pPager->journalOff = 0; pPager->setMaster = 0; pPager->journalHdr = 0; - if( rc!=SQLITE_OK ){ - if( rc==SQLITE_NOMEM ){ - sqlite3OsDelete(pVfs, pPager->zJournal, 0); - } - goto failed_to_open_journal; - } - } - pPager->journalOpen = 1; - pPager->journalStarted = 0; - pPager->needSync = 0; - pPager->nRec = 0; - if( pPager->errCode ){ - rc = pPager->errCode; - goto failed_to_open_journal; + rc = writeJournalHdr(pPager); } - pPager->dbOrigSize = pPager->dbSize; - - rc = writeJournalHdr(pPager); - - if( pPager->nSavepoint && rc==SQLITE_OK ){ + if( rc==SQLITE_OK && pPager->nSavepoint ){ rc = openSubJournal(pPager); } - if( rc!=SQLITE_OK && rc!=SQLITE_NOMEM && rc!=SQLITE_IOERR_NOMEM ){ - rc = pager_end_transaction(pPager, 0); - if( rc==SQLITE_OK ){ - rc = SQLITE_FULL; - } - } - return rc; -failed_to_open_journal: - sqlite3BitvecDestroy(pPager->pInJournal); - pPager->pInJournal = 0; + if( rc!=SQLITE_OK ){ + sqlite3BitvecDestroy(pPager->pInJournal); + pPager->pInJournal = 0; + } return rc; } /* -** Acquire a write-lock on the database. The lock is removed when -** the any of the following happen: +** Begin a write-transaction on the specified pager object. If a +** write-transaction has already been opened, this function is a no-op. ** -** * sqlite3PagerCommitPhaseTwo() is called. -** * sqlite3PagerRollback() is called. -** * sqlite3PagerClose() is called. -** * sqlite3PagerUnref() is called to on every outstanding page. +** If the exFlag argument is false, then acquire at least a RESERVED +** lock on the database file. If exFlag is true, then acquire at least +** an EXCLUSIVE lock. If such a lock is already held, no locking +** functions need be called. ** -** The first parameter to this routine is a pointer to any open page of the -** database file. 
Nothing changes about the page - it is used merely to -** acquire a pointer to the Pager structure and as proof that there is -** already a read-lock on the database. +** If this is not a temporary or in-memory file and, the journal file is +** opened if it has not been already. For a temporary file, the opening +** of the journal file is deferred until there is an actual need to +** write to the journal. TODO: Why handle temporary files differently? ** -** The second parameter indicates how much space in bytes to reserve for a -** master journal file-name at the start of the journal when it is created. -** -** A journal file is opened if this is not a temporary file. For temporary -** files, the opening of the journal file is deferred until there is an -** actual need to write to the journal. -** -** If the database is already reserved for writing, this routine is a no-op. -** -** If exFlag is true, go ahead and get an EXCLUSIVE lock on the file -** immediately instead of waiting until we try to flush the cache. The -** exFlag is ignored if a transaction is already active. +** If the journal file is opened (or if it is already open), then a +** journal-header is written to the start of it. */ -int sqlite3PagerBegin(DbPage *pPg, int exFlag){ - Pager *pPager = pPg->pPager; +int sqlite3PagerBegin(Pager *pPager, int exFlag){ int rc = SQLITE_OK; - assert( pPg->nRef>0 ); assert( pPager->state!=PAGER_UNLOCK ); if( pPager->state==PAGER_SHARED ){ assert( pPager->pInJournal==0 ); - assert( !MEMDB ); + assert( !MEMDB && !pPager->tempFile ); + + /* Obtain a RESERVED lock on the database file. If the exFlag parameter + ** is true, then immediately upgrade this to an EXCLUSIVE lock. The + ** busy-handler callback can be used when upgrading to the EXCLUSIVE + ** lock, but not when obtaining the RESERVED lock. 
+ */ rc = sqlite3OsLock(pPager->fd, RESERVED_LOCK); if( rc==SQLITE_OK ){ pPager->state = PAGER_RESERVED; @@ -3379,16 +3982,16 @@ int sqlite3PagerBegin(DbPage *pPg, int exFlag){ rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK); } } - if( rc!=SQLITE_OK ){ - return rc; - } - pPager->dirtyCache = 0; - PAGERTRACE(("TRANSACTION %d\n", PAGERID(pPager))); - if( pPager->useJournal && !pPager->tempFile - && pPager->journalMode!=PAGER_JOURNALMODE_OFF ){ + + /* If the required locks were successfully obtained, open the journal + ** file and write the first journal-header to it. + */ + if( rc==SQLITE_OK && pPager->useJournal + && pPager->journalMode!=PAGER_JOURNALMODE_OFF + ){ rc = pager_open_journal(pPager); } - }else if( pPager->journalOpen && pPager->journalOff==0 ){ + }else if( isOpen(pPager->jfd) && pPager->journalOff==0 ){ /* This happens when the pager was in exclusive-access mode the last ** time a (read or write) transaction was successfully concluded ** by this connection. Instead of deleting the journal file it was @@ -3398,35 +4001,20 @@ int sqlite3PagerBegin(DbPage *pPg, int exFlag){ assert( pPager->nRec==0 ); assert( pPager->dbOrigSize==0 ); assert( pPager->pInJournal==0 ); - sqlite3PagerPagecount(pPager, 0); - pPager->pInJournal = sqlite3BitvecCreate( pPager->dbSize ); - if( !pPager->pInJournal ){ - rc = SQLITE_NOMEM; - }else{ - pPager->dbOrigSize = pPager->dbSize; - rc = writeJournalHdr(pPager); - } + rc = pager_open_journal(pPager); } - assert( !pPager->journalOpen || pPager->journalOff>0 || rc!=SQLITE_OK ); + + PAGERTRACE(("TRANSACTION %d\n", PAGERID(pPager))); + assert( !isOpen(pPager->jfd) || pPager->journalOff>0 || rc!=SQLITE_OK ); return rc; } /* -** Mark a data page as writeable. The page is written into the journal -** if it is not there already. This routine must be called before making -** changes to a page. -** -** The first time this routine is called, the pager creates a new -** journal and acquires a RESERVED lock on the database. 
If the RESERVED -** lock could not be acquired, this routine returns SQLITE_BUSY. The -** calling routine must check for that return value and be careful not to -** change any page data until this routine returns SQLITE_OK. -** -** If the journal file could not be written because the disk is full, -** then this routine returns SQLITE_FULL and does an immediate rollback. -** All subsequent write attempts also return SQLITE_FULL until there -** is a call to sqlite3PagerCommit() or sqlite3PagerRollback() to -** reset. +** Mark a single data page as writeable. The page is written into the +** main journal or sub-journal as required. If the page is written into +** one of the journals, the corresponding bit is set in the +** Pager.pInJournal bitvec and the PagerSavepoint.pInSavepoint bitvecs +** of any open savepoints as appropriate. */ static int pager_write(PgHdr *pPg){ void *pData = pPg->pData; @@ -3446,25 +4034,11 @@ static int pager_write(PgHdr *pPg){ CHECK_PAGE(pPg); - /* If this page was previously acquired with noContent==1, that means - ** we didn't really read in the content of the page. This can happen - ** (for example) when the page is being moved to the freelist. But - ** now we are (perhaps) moving the page off of the freelist for - ** reuse and we need to know its original content so that content - ** can be stored in the rollback journal. So do the read at this - ** time. - */ - rc = pager_get_content(pPg); - if( rc ){ - return rc; - } - /* Mark the page as dirty. If the page has already been written ** to the journal then we can return right away. */ sqlite3PcacheMakeDirty(pPg); if( pageInJournal(pPg) && !subjRequiresPage(pPg) ){ - pPager->dirtyCache = 1; pPager->dbModified = 1; }else{ @@ -3476,24 +4050,23 @@ static int pager_write(PgHdr *pPg){ ** create it if it does not. 
*/ assert( pPager->state!=PAGER_UNLOCK ); - rc = sqlite3PagerBegin(pPg, 0); + rc = sqlite3PagerBegin(pPager, 0); if( rc!=SQLITE_OK ){ return rc; } assert( pPager->state>=PAGER_RESERVED ); - if( !pPager->journalOpen && pPager->useJournal + if( !isOpen(pPager->jfd) && pPager->useJournal && pPager->journalMode!=PAGER_JOURNALMODE_OFF ){ rc = pager_open_journal(pPager); if( rc!=SQLITE_OK ) return rc; } - pPager->dirtyCache = 1; pPager->dbModified = 1; /* The transaction journal now exists and we have a RESERVED or an ** EXCLUSIVE lock on the main database file. Write the current page to ** the transaction journal if it is not there already. */ - if( !pageInJournal(pPg) && pPager->journalOpen ){ + if( !pageInJournal(pPg) && isOpen(pPager->jfd) ){ if( pPg->pgno<=pPager->dbOrigSize ){ u32 cksum; char *pData2; @@ -3584,14 +4157,18 @@ static int pager_write(PgHdr *pPg){ } /* -** This function is used to mark a data-page as writable. It uses -** pager_write() to open a journal file (if it is not already open) -** and write the page *pData to the journal. +** Mark a data page as writeable. This routine must be called before +** making changes to a page. The caller must check the return value +** of this function and be careful not to change any page data unless +** this routine returns SQLITE_OK. ** ** The difference between this function and pager_write() is that this ** function also deals with the special case where 2 or more pages ** fit on a single disk sector. In this case all co-resident pages ** must have been written to the journal file before returning. +** +** If an error occurs, SQLITE_NOMEM or an IO error code is returned +** as appropriate. Otherwise, SQLITE_OK. */ int sqlite3PagerWrite(DbPage *pDbPage){ int rc = SQLITE_OK; @@ -3604,8 +4181,8 @@ int sqlite3PagerWrite(DbPage *pDbPage){ Pgno nPageCount; /* Total number of pages in database file */ Pgno pg1; /* First page of the sector pPg is located on. 
*/ int nPage; /* Number of pages starting at pg1 to journal */ - int ii; - int needSync = 0; + int ii; /* Loop counter */ + int needSync = 0; /* True if any page has PGHDR_NEED_SYNC */ /* Set the doNotSync flag to 1. This is because we cannot allow a journal ** header to be written between the pages journaled by this function. @@ -3700,184 +4277,113 @@ int sqlite3PagerIswriteable(DbPage *pPg){ ** content no longer matters. ** ** The overlying software layer calls this routine when all of the data -** on the given page is unused. The pager marks the page as clean so +** on the given page is unused. The pager marks the page as clean so ** that it does not get written to disk. ** -** Tests show that this optimization, together with the -** sqlite3PagerDontRollback() below, more than double the speed -** of large INSERT operations and quadruple the speed of large DELETEs. -** -** When this routine is called, set the bit corresponding to pDbPage in -** the Pager.pAlwaysRollback bitvec. Subsequent calls to -** sqlite3PagerDontRollback() for the same page will thereafter be ignored. -** This is necessary to avoid a problem where a page with data is added to -** the freelist during one part of a transaction then removed from the -** freelist during a later part of the same transaction and reused for some -** other purpose. When it is first added to the freelist, this routine is -** called. When reused, the sqlite3PagerDontRollback() routine is called. -** But because the page contains critical data, we still need to be sure it -** gets rolled back in spite of the sqlite3PagerDontRollback() call. +** Tests show that this optimization can quadruple the speed of large +** DELETE operations. 
*/ -int sqlite3PagerDontWrite(DbPage *pDbPage){ - PgHdr *pPg = pDbPage; +void sqlite3PagerDontWrite(PgHdr *pPg){ Pager *pPager = pPg->pPager; - int rc; - - if( pPg->pgno>pPager->dbOrigSize ){ - return SQLITE_OK; - } - if( pPager->pAlwaysRollback==0 ){ - assert( pPager->pInJournal ); - pPager->pAlwaysRollback = sqlite3BitvecCreate(pPager->dbOrigSize); - if( !pPager->pAlwaysRollback ){ - return SQLITE_NOMEM; - } - } - rc = sqlite3BitvecSet(pPager->pAlwaysRollback, pPg->pgno); - - if( rc==SQLITE_OK && (pPg->flags&PGHDR_DIRTY) && pPager->nSavepoint==0 ){ - assert( pPager->state>=PAGER_SHARED ); - if( pPager->dbSize==pPg->pgno && pPager->dbOrigSizedbSize ){ - /* If this pages is the last page in the file and the file has grown - ** during the current transaction, then do NOT mark the page as clean. - ** When the database file grows, we must make sure that the last page - ** gets written at least once so that the disk file will be the correct - ** size. If you do not write this page and the size of the file - ** on the disk ends up being too small, that can lead to database - ** corruption during the next transaction. - */ - }else{ - PAGERTRACE(("DONT_WRITE page %d of %d\n", pPg->pgno, PAGERID(pPager))); - IOTRACE(("CLEAN %p %d\n", pPager, pPg->pgno)) - pPg->flags |= PGHDR_DONT_WRITE; + if( (pPg->flags&PGHDR_DIRTY) && pPager->nSavepoint==0 ){ + PAGERTRACE(("DONT_WRITE page %d of %d\n", pPg->pgno, PAGERID(pPager))); + IOTRACE(("CLEAN %p %d\n", pPager, pPg->pgno)) + pPg->flags |= PGHDR_DONT_WRITE; #ifdef SQLITE_CHECK_PAGES - pPg->pageHash = pager_pagehash(pPg); + pPg->pageHash = pager_pagehash(pPg); #endif - } } - return rc; } /* -** A call to this routine tells the pager that if a rollback occurs, -** it is not necessary to restore the data on the given page. This -** means that the pager does not have to record the given page in the -** rollback journal. 
-** -** If we have not yet actually read the content of this page (if -** the PgHdr.needRead flag is set) then this routine acts as a promise -** that we will never need to read the page content in the future. -** so the needRead flag can be cleared at this point. +** This routine is called to increment the value of the database file +** change-counter, stored as a 4-byte big-endian integer starting at +** byte offset 24 of the pager file. +** +** If the isDirect flag is zero, then this is done by calling +** sqlite3PagerWrite() on page 1, then modifying the contents of the +** page data. In this case the file will be updated when the current +** transaction is committed. +** +** The isDirect flag may only be non-zero if the library was compiled +** with the SQLITE_ENABLE_ATOMIC_WRITE macro defined. In this case, +** if isDirect is non-zero, then the database file is updated directly +** by writing an updated version of page 1 using a call to the +** sqlite3OsWrite() function. */ -void sqlite3PagerDontRollback(DbPage *pPg){ - Pager *pPager = pPg->pPager; - TESTONLY( int rc; ) /* Return value from sqlite3BitvecSet() */ - - assert( pPager->state>=PAGER_RESERVED ); - - /* If the journal file is not open, or DontWrite() has been called on - ** this page (DontWrite() sets the Pager.pAlwaysRollback bit), then this - ** function is a no-op. - */ - if( pPager->journalOpen==0 - || sqlite3BitvecTest(pPager->pAlwaysRollback, pPg->pgno) - || pPg->pgno>pPager->dbOrigSize - ){ - return; - } - -#ifdef SQLITE_SECURE_DELETE - if( sqlite3BitvecTest(pPager->pInJournal, pPg->pgno)!=0 - || pPg->pgno>pPager->dbOrigSize ){ - return; - } -#endif +static int pager_incr_changecounter(Pager *pPager, int isDirectMode){ + int rc = SQLITE_OK; - /* If SECURE_DELETE is disabled, then there is no way that this - ** routine can be called on a page for which sqlite3PagerDontWrite() - ** has not been previously called during the same transaction. 
- ** And if DontWrite() has previously been called, the following - ** conditions must be met. + /* Declare and initialize constant integer 'isDirect'. If the + ** atomic-write optimization is enabled in this build, then isDirect + ** is initialized to the value passed as the isDirectMode parameter + ** to this function. Otherwise, it is always set to zero. ** - ** (Later:) Not true. If the database is corrupted by having duplicate - ** pages on the freelist (ex: corrupt9.test) then the following is not - ** necessarily true: - */ - /* assert( !pPg->inJournal && (int)pPg->pgno <= pPager->dbOrigSize ); */ - - assert( pPager->pInJournal!=0 ); - pPg->flags &= ~PGHDR_NEED_READ; - - /* Failure to set the bits in the InJournal bit-vectors is benign. - ** It merely means that we might do some extra work to journal a page - ** that does not need to be journaled. Nevertheless, be sure to test the - ** case where a malloc error occurs while trying to set a bit in a - ** bit vector. + ** The idea is that if the atomic-write optimization is not + ** enabled at compile time, the compiler can omit the tests of + ** 'isDirect' below, as well as the block enclosed in the + ** "if( isDirect )" condition. */ - sqlite3BeginBenignMalloc(); - TESTONLY( rc = ) sqlite3BitvecSet(pPager->pInJournal, pPg->pgno); - testcase( rc==SQLITE_NOMEM ); - TESTONLY( rc = ) addToSavepointBitvecs(pPager, pPg->pgno); - testcase( rc==SQLITE_NOMEM ); - sqlite3EndBenignMalloc(); - - - PAGERTRACE(("DONT_ROLLBACK page %d of %d\n", pPg->pgno, PAGERID(pPager))); - IOTRACE(("GARBAGE %p %d\n", pPager, pPg->pgno)) -} - - -/* -** This routine is called to increment the database file change-counter, -** stored at byte 24 of the pager file. 
-*/ -static int pager_incr_changecounter(Pager *pPager, int isDirect){ - PgHdr *pPgHdr; - u32 change_counter; - int rc = SQLITE_OK; - #ifndef SQLITE_ENABLE_ATOMIC_WRITE - assert( isDirect==0 ); /* isDirect is only true for atomic writes */ + const int isDirect = 0; + assert( isDirectMode==0 ); +#else + const int isDirect = isDirectMode; #endif + + assert( pPager->state>=PAGER_RESERVED ); if( !pPager->changeCountDone && pPager->dbSize>0 ){ + PgHdr *pPgHdr; /* Reference to page 1 */ + u32 change_counter; /* Initial value of change-counter field */ + + assert( !pPager->tempFile && isOpen(pPager->fd) ); + /* Open page 1 of the file for writing. */ rc = sqlite3PagerGet(pPager, 1, &pPgHdr); - if( rc!=SQLITE_OK ) return rc; + assert( pPgHdr==0 || rc==SQLITE_OK ); - if( !isDirect ){ + /* If page one was fetched successfully, and this function is not + ** operating in direct-mode, make page 1 writable. + */ + if( rc==SQLITE_OK && !isDirect ){ rc = sqlite3PagerWrite(pPgHdr); - if( rc!=SQLITE_OK ){ - sqlite3PagerUnref(pPgHdr); - return rc; - } } - /* Increment the value just read and write it back to byte 24. */ - change_counter = sqlite3Get4byte((u8*)pPager->dbFileVers); - change_counter++; - put32bits(((char*)pPgHdr->pData)+24, change_counter); + if( rc==SQLITE_OK ){ + /* Increment the value just read and write it back to byte 24. */ + change_counter = sqlite3Get4byte((u8*)pPager->dbFileVers); + change_counter++; + put32bits(((char*)pPgHdr->pData)+24, change_counter); + + /* If running in direct mode, write the contents of page 1 to the file. */ + if( isDirect ){ + const void *zBuf = pPgHdr->pData; + assert( pPager->dbFileSize>0 ); + rc = sqlite3OsWrite(pPager->fd, zBuf, pPager->pageSize, 0); + } -#ifdef SQLITE_ENABLE_ATOMIC_WRITE - if( isDirect && pPager->fd->pMethods ){ - const void *zBuf = pPgHdr->pData; - assert( pPager->dbFileSize>0 ); - rc = sqlite3OsWrite(pPager->fd, zBuf, pPager->pageSize, 0); + /* If everything worked, set the changeCountDone flag. 
*/ + if( rc==SQLITE_OK ){ + pPager->changeCountDone = 1; + } } -#endif /* Release the page reference. */ sqlite3PagerUnref(pPgHdr); - pPager->changeCountDone = 1; } return rc; } /* -** Sync the pager file to disk. +** Sync the pager file to disk. This is a no-op for in-memory files +** or pages with the Pager.noSync flag set. +** +** If successful, or called on a pager for which it is a no-op, this +** function returns SQLITE_OK. Otherwise, an IO error code is returned. */ int sqlite3PagerSync(Pager *pPager){ - int rc; - if( MEMDB ){ + int rc; /* Return code */ + if( MEMDB || pPager->noSync ){ rc = SQLITE_OK; }else{ rc = sqlite3OsSync(pPager->fd, pPager->sync_flags); @@ -3891,10 +4397,17 @@ int sqlite3PagerSync(Pager *pPager){ ** journal file. zMaster may be NULL, which is interpreted as no master ** journal (a single database transaction). ** -** This routine ensures that the journal is synced, all dirty pages written -** to the database file and the database file synced. The only thing that -** remains to commit the transaction is to delete the journal file (or -** master journal file if specified). +** This routine ensures that: +** +** * The database file change-counter is updated, +** * the journal is synced (unless the atomic-write optimization is used), +** * all dirty pages are written to the database file, +** * the database file is truncated (if required), and +** * the database file synced. +** +** The only thing that remains to commit the transaction is to finalize +** (delete, truncate or zero the first part of) the journal file (or +** delete the master journal file if specified). ** ** Note that if zMaster==NULL, this does not overwrite a previous value ** passed to an sqlite3PagerCommitPhaseOne() call. @@ -3905,140 +4418,138 @@ int sqlite3PagerSync(Pager *pPager){ ** journal file in this case. 
*/ int sqlite3PagerCommitPhaseOne( - Pager *pPager, - const char *zMaster, - int noSync + Pager *pPager, /* Pager object */ + const char *zMaster, /* If not NULL, the master journal name */ + int noSync /* True to omit the xSync on the db file */ ){ - int rc = SQLITE_OK; + int rc = SQLITE_OK; /* Return code */ if( pPager->errCode ){ return pPager->errCode; } - /* If no changes have been made, we can leave the transaction early. - */ - if( pPager->dbModified==0 && - (pPager->journalMode!=PAGER_JOURNALMODE_DELETE || - pPager->exclusiveMode!=0) ){ - assert( pPager->dirtyCache==0 || pPager->journalOpen==0 ); - return SQLITE_OK; - } - PAGERTRACE(("DATABASE SYNC: File=%s zMaster=%s nSize=%d\n", pPager->zFilename, zMaster, pPager->dbSize)); /* If this is an in-memory db, or no pages have been written to, or this ** function has already been called, it is a no-op. */ - if( pPager->state!=PAGER_SYNCED && !MEMDB && pPager->dirtyCache ){ - PgHdr *pPg; + if( pPager->state!=PAGER_SYNCED && !MEMDB && pPager->dbModified ){ -#ifdef SQLITE_ENABLE_ATOMIC_WRITE - /* The atomic-write optimization can be used if all of the - ** following are true: + /* The following block updates the change-counter. Exactly how it + ** does this depends on whether or not the atomic-update optimization + ** was enabled at compile time, and if this transaction meets the + ** runtime criteria to use the operation: ** - ** + The file-system supports the atomic-write property for - ** blocks of size page-size, and - ** + This commit is not part of a multi-file transaction, and - ** + Exactly one page has been modified and store in the journal file. + ** * The file-system supports the atomic-write property for + ** blocks of size page-size, and + ** * This commit is not part of a multi-file transaction, and + ** * Exactly one page has been modified and store in the journal file. ** - ** If the optimization can be used, then the journal file will never - ** be created for this transaction. 
+ ** If the optimization was not enabled at compile time, then the + ** pager_incr_changecounter() function is called to update the change + ** counter in 'indirect-mode'. If the optimization is compiled in but + ** is not applicable to this transaction, call sqlite3JournalCreate() + ** to make sure the journal file has actually been created, then call + ** pager_incr_changecounter() to update the change-counter in indirect + ** mode. + ** + ** Otherwise, if the optimization is both enabled and applicable, + ** then call pager_incr_changecounter() to update the change-counter + ** in 'direct' mode. In this case the journal file will never be + ** created for this transaction. */ - int useAtomicWrite; - pPg = sqlite3PcacheDirtyList(pPager->pPCache); - useAtomicWrite = ( - !zMaster && - pPager->journalOpen && - pPager->journalOff==jrnlBufferSize(pPager) && - pPager->dbSize>=pPager->dbFileSize && - (pPg==0 || pPg->pDirty==0) - ); - assert( pPager->journalOpen || pPager->journalMode==PAGER_JOURNALMODE_OFF ); - if( useAtomicWrite ){ - /* Update the nRec field in the journal file. */ - int offset = pPager->journalHdr + sizeof(aJournalMagic); - assert(pPager->nRec==1); - rc = write32bits(pPager->jfd, offset, pPager->nRec); - - /* Update the db file change counter. The following call will modify - ** the in-memory representation of page 1 to include the updated - ** change counter and then write page 1 directly to the database - ** file. Because of the atomic-write property of the host file-system, - ** this is safe. +#ifdef SQLITE_ENABLE_ATOMIC_WRITE + PgHdr *pPg; + assert( isOpen(pPager->jfd) || pPager->journalMode==PAGER_JOURNALMODE_OFF ); + if( !zMaster && isOpen(pPager->jfd) + && pPager->journalOff==jrnlBufferSize(pPager) + && pPager->dbSize>=pPager->dbFileSize + && (0==(pPg = sqlite3PcacheDirtyList(pPager->pPCache)) || 0==pPg->pDirty) + ){ + /* Update the db file change counter via the direct-write method. 
The + ** following call will modify the in-memory representation of page 1 + ** to include the updated change counter and then write page 1 + ** directly to the database file. Because of the atomic-write + ** property of the host file-system, this is safe. */ - if( rc==SQLITE_OK ){ - rc = pager_incr_changecounter(pPager, 1); - } + rc = pager_incr_changecounter(pPager, 1); }else{ rc = sqlite3JournalCreate(pPager->jfd); + if( rc==SQLITE_OK ){ + rc = pager_incr_changecounter(pPager, 0); + } } - - if( !useAtomicWrite && rc==SQLITE_OK ) +#else + rc = pager_incr_changecounter(pPager, 0); #endif + if( rc!=SQLITE_OK ) goto commit_phase_one_exit; - /* If a master journal file name has already been written to the - ** journal file, then no sync is required. This happens when it is - ** written, then the process fails to upgrade from a RESERVED to an - ** EXCLUSIVE lock. The next time the process tries to commit the - ** transaction the m-j name will have already been written. + /* If this transaction has made the database smaller, then all pages + ** being discarded by the truncation must be written to the journal + ** file. This can only happen in auto-vacuum mode. + ** + ** Before reading the pages with page numbers larger than the + ** current value of Pager.dbSize, set dbSize back to the value + ** that it took at the start of the transaction. Otherwise, the + ** calls to sqlite3PagerGet() return zeroed pages instead of + ** reading data from the database file. */ - if( !pPager->setMaster ){ - rc = pager_incr_changecounter(pPager, 0); - if( rc!=SQLITE_OK ) goto sync_exit; - if( pPager->journalMode!=PAGER_JOURNALMODE_OFF ){ #ifndef SQLITE_OMIT_AUTOVACUUM - if( pPager->dbSizedbOrigSize ){ - /* If this transaction has made the database smaller, then all pages - ** being discarded by the truncation must be written to the journal - ** file. 
- */ - Pgno i; - Pgno iSkip = PAGER_MJ_PGNO(pPager); - Pgno dbSize = pPager->dbSize; - pPager->dbSize = pPager->dbOrigSize; - for( i=dbSize+1; i<=pPager->dbOrigSize; i++ ){ - if( !sqlite3BitvecTest(pPager->pInJournal, i) && i!=iSkip ){ - rc = sqlite3PagerGet(pPager, i, &pPg); - if( rc!=SQLITE_OK ) goto sync_exit; - rc = sqlite3PagerWrite(pPg); - sqlite3PagerUnref(pPg); - if( rc!=SQLITE_OK ) goto sync_exit; - } - } - pPager->dbSize = dbSize; + if( pPager->dbSizedbOrigSize + && pPager->journalMode!=PAGER_JOURNALMODE_OFF + ){ + Pgno i; /* Iterator variable */ + const Pgno iSkip = PAGER_MJ_PGNO(pPager); /* Pending lock page */ + const Pgno dbSize = pPager->dbSize; /* Database image size */ + pPager->dbSize = pPager->dbOrigSize; + for( i=dbSize+1; i<=pPager->dbOrigSize; i++ ){ + if( !sqlite3BitvecTest(pPager->pInJournal, i) && i!=iSkip ){ + PgHdr *pPage; /* Page to journal */ + rc = sqlite3PagerGet(pPager, i, &pPage); + if( rc!=SQLITE_OK ) goto commit_phase_one_exit; + rc = sqlite3PagerWrite(pPage); + sqlite3PagerUnref(pPage); + if( rc!=SQLITE_OK ) goto commit_phase_one_exit; } -#endif - rc = writeMasterJournal(pPager, zMaster); - if( rc!=SQLITE_OK ) goto sync_exit; - rc = syncJournal(pPager); - } + } + pPager->dbSize = dbSize; } - if( rc!=SQLITE_OK ) goto sync_exit; +#endif - /* Write all dirty pages to the database file */ - pPg = sqlite3PcacheDirtyList(pPager->pPCache); - rc = pager_write_pagelist(pPg); + /* Write the master journal name into the journal file. If a master + ** journal file name has already been written to the journal file, + ** or if zMaster is NULL (no master journal), then this call is a no-op. + */ + rc = writeMasterJournal(pPager, zMaster); + if( rc!=SQLITE_OK ) goto commit_phase_one_exit; + + /* Sync the journal file. If the atomic-update optimization is being + ** used, this call will not create the journal file or perform any + ** real IO. 
+ */ + rc = syncJournal(pPager); + if( rc!=SQLITE_OK ) goto commit_phase_one_exit; + + /* Write all dirty pages to the database file. */ + rc = pager_write_pagelist(sqlite3PcacheDirtyList(pPager->pPCache)); if( rc!=SQLITE_OK ){ assert( rc!=SQLITE_IOERR_BLOCKED ); - /* The error might have left the dirty list all fouled up here, - ** but that does not matter because if the if the dirty list did - ** get corrupted, then the transaction will roll back and - ** discard the dirty list. There is an assert in - ** pager_get_all_dirty_pages() that verifies that no attempt - ** is made to use an invalid dirty list. - */ - goto sync_exit; + goto commit_phase_one_exit; } sqlite3PcacheCleanAll(pPager->pPCache); - if( pPager->dbSizedbFileSize ){ + /* If the file on disk is not the same size as the database image, + ** then use pager_truncate to grow or shrink the file here. + */ + if( pPager->dbSize!=pPager->dbFileSize ){ + Pgno nNew = pPager->dbSize - (pPager->dbSize==PAGER_MJ_PGNO(pPager)); assert( pPager->state>=PAGER_EXCLUSIVE ); - rc = pager_truncate(pPager, pPager->dbSize); - if( rc!=SQLITE_OK ) goto sync_exit; + rc = pager_truncate(pPager, nNew); + if( rc!=SQLITE_OK ) goto commit_phase_one_exit; } - /* Sync the database file. */ + /* Finally, sync the database file. */ if( !pPager->noSync && !noSync ){ rc = sqlite3OsSync(pPager->fd, pPager->sync_flags); } @@ -4047,13 +4558,13 @@ int sqlite3PagerCommitPhaseOne( pPager->state = PAGER_SYNCED; } -sync_exit: +commit_phase_one_exit: if( rc==SQLITE_IOERR_BLOCKED ){ /* pager_incr_changecounter() may attempt to obtain an exclusive - * lock to spill the cache and return IOERR_BLOCKED. But since - * there is no chance the cache is inconsistent, it is - * better to return SQLITE_BUSY. - */ + ** lock to spill the cache and return IOERR_BLOCKED. But since + ** there is no chance the cache is inconsistent, it is + ** better to return SQLITE_BUSY. 
+ **/ rc = SQLITE_BUSY; } return rc; @@ -4061,50 +4572,107 @@ sync_exit: /* -** Commit all changes to the database and release the write lock. +** When this function is called, the database file has been completely +** updated to reflect the changes made by the current transaction and +** synced to disk. The journal file still exists in the file-system +** though, and if a failure occurs at this point it will eventually +** be used as a hot-journal and the current transaction rolled back. ** -** If the commit fails for any reason, a rollback attempt is made -** and an error code is returned. If the commit worked, SQLITE_OK -** is returned. +** This function finalizes the journal file, either by deleting, +** truncating or partially zeroing it, so that it cannot be used +** for hot-journal rollback. Once this is done the transaction is +** irrevocably committed. +** +** If an error occurs, an IO error code is returned and the pager +** moves into the error state. Otherwise, SQLITE_OK is returned. */ int sqlite3PagerCommitPhaseTwo(Pager *pPager){ - int rc = SQLITE_OK; + int rc = SQLITE_OK; /* Return code */ + /* Do not proceed if the pager is already in the error state. */ if( pPager->errCode ){ return pPager->errCode; } - if( pPager->statestatedbModified==0 && pPager->exclusiveMode && pPager->journalMode==PAGER_JOURNALMODE_PERSIST ){ - assert( pPager->dirtyCache==0 || pPager->journalOpen==0 ); + assert( pPager->journalOff==JOURNAL_HDR_SZ(pPager) ); return SQLITE_OK; } + PAGERTRACE(("COMMIT %d\n", PAGERID(pPager))); - assert( pPager->state==PAGER_SYNCED || MEMDB || !pPager->dirtyCache ); + assert( pPager->state==PAGER_SYNCED || MEMDB || !pPager->dbModified ); rc = pager_end_transaction(pPager, pPager->setMaster); - rc = pager_error(pPager, rc); - return rc; + return pager_error(pPager, rc); } /* -** Rollback all changes. The database falls back to PAGER_SHARED mode. -** All in-memory cache pages revert to their original data contents. -** The journal is deleted. 
-** -** This routine cannot fail unless some other process is not following -** the correct locking protocol or unless some other -** process is writing trash into the journal file (SQLITE_CORRUPT) or -** unless a prior malloc() failed (SQLITE_NOMEM). Appropriate error -** codes are returned for all these occasions. Otherwise, -** SQLITE_OK is returned. +** Rollback all changes. The database falls back to PAGER_SHARED mode. +** +** This function performs two tasks: +** +** 1) It rolls back the journal file, restoring all database file and +** in-memory cache pages to the state they were in when the transaction +** was opened, and +** 2) It finalizes the journal file, so that it is not used for hot +** rollback at any point in the future. +** +** subject to the following qualifications: +** +** * If the journal file is not yet open when this function is called, +** then only (2) is performed. In this case there is no journal file +** to roll back. +** +** * If in an error state other than SQLITE_FULL, then task (1) is +** performed. If successful, task (2). Regardless of the outcome +** of either, the error state error code is returned to the caller +** (i.e. either SQLITE_IOERR or SQLITE_CORRUPT). +** +** * If the pager is in PAGER_RESERVED state, then attempt (1). Whether +** or not (1) is succussful, also attempt (2). If successful, return +** SQLITE_OK. Otherwise, enter the error state and return the first +** error code encountered. +** +** In this case there is no chance that the database was written to. +** So is safe to finalize the journal file even if the playback +** (operation 1) failed. However the pager must enter the error state +** as the contents of the in-memory cache are now suspect. +** +** * Finally, if in PAGER_EXCLUSIVE state, then attempt (1). Only +** attempt (2) if (1) is successful. Return SQLITE_OK if successful, +** otherwise enter the error state and return the error code from the +** failing operation. 
+** +** In this case the database file may have been written to. So if the +** playback operation did not succeed it would not be safe to finalize +** the journal file. It needs to be left in the file-system so that +** some other process can use it to restore the database state (by +** hot-journal rollback). */ int sqlite3PagerRollback(Pager *pPager){ - int rc = SQLITE_OK; + int rc = SQLITE_OK; /* Return code */ PAGERTRACE(("ROLLBACK %d\n", PAGERID(pPager))); - if( !pPager->dirtyCache || !pPager->journalOpen ){ + if( !pPager->dbModified || !isOpen(pPager->jfd) ){ rc = pager_end_transaction(pPager, pPager->setMaster); }else if( pPager->errCode && pPager->errCode!=SQLITE_FULL ){ if( pPager->state>=PAGER_EXCLUSIVE ){ @@ -4183,18 +4751,26 @@ int sqlite3PagerIsMemdb(Pager *pPager){ #endif /* -** Ensure that there are at least nSavepoint savepoints open. +** Check that there are at least nSavepoint savepoints open. If there are +** currently less than nSavepoints open, then open one or more savepoints +** to make up the difference. If the number of savepoints is already +** equal to nSavepoint, then this function is a no-op. +** +** If a memory allocation fails, SQLITE_NOMEM is returned. If an error +** occurs while opening the sub-journal file, then an IO error code is +** returned. Otherwise, SQLITE_OK. 
*/ int sqlite3PagerOpenSavepoint(Pager *pPager, int nSavepoint){ - int rc = SQLITE_OK; + int rc = SQLITE_OK; /* Return code */ + int nCurrent = pPager->nSavepoint; /* Current number of savepoints */ - if( nSavepoint>pPager->nSavepoint && pPager->useJournal ){ - int ii; - PagerSavepoint *aNew; + if( nSavepoint>nCurrent && pPager->useJournal ){ + int ii; /* Iterator variable */ + PagerSavepoint *aNew; /* New Pager.aSavepoint array */ /* Either there is no active journal or the sub-journal is open or ** the journal is always stored in memory */ - assert( pPager->nSavepoint==0 || pPager->sjfd->pMethods || + assert( pPager->nSavepoint==0 || isOpen(pPager->sjfd) || pPager->journalMode==PAGER_JOURNALMODE_MEMORY ); /* Grow the Pager.aSavepoint array using realloc(). Return SQLITE_NOMEM @@ -4207,23 +4783,20 @@ int sqlite3PagerOpenSavepoint(Pager *pPager, int nSavepoint){ if( !aNew ){ return SQLITE_NOMEM; } - memset(&aNew[pPager->nSavepoint], 0, - (nSavepoint - pPager->nSavepoint) * sizeof(PagerSavepoint) - ); + memset(&aNew[nCurrent], 0, (nSavepoint-nCurrent) * sizeof(PagerSavepoint)); pPager->aSavepoint = aNew; - ii = pPager->nSavepoint; pPager->nSavepoint = nSavepoint; /* Populate the PagerSavepoint structures just allocated. */ - for(/* no-op */; iidbSizeValid ); aNew[ii].nOrig = pPager->dbSize; - if( pPager->journalOpen && pPager->journalOff>0 ){ + if( isOpen(pPager->jfd) && pPager->journalOff>0 ){ aNew[ii].iOffset = pPager->journalOff; }else{ aNew[ii].iOffset = JOURNAL_HDR_SZ(pPager); } - aNew[ii].iSubRec = pPager->stmtNRec; + aNew[ii].iSubRec = pPager->nSubRec; aNew[ii].pInSavepoint = sqlite3BitvecCreate(pPager->dbSize); if( !aNew[ii].pInSavepoint ){ return SQLITE_NOMEM; @@ -4238,42 +4811,72 @@ int sqlite3PagerOpenSavepoint(Pager *pPager, int nSavepoint){ } /* +** This function is called to rollback or release (commit) a savepoint. +** The savepoint to release or rollback need not be the most recently +** created savepoint. 
+** ** Parameter op is always either SAVEPOINT_ROLLBACK or SAVEPOINT_RELEASE. ** If it is SAVEPOINT_RELEASE, then release and destroy the savepoint with ** index iSavepoint. If it is SAVEPOINT_ROLLBACK, then rollback all changes -** that have occured since savepoint iSavepoint was created. +** that have occured since the specified savepoint was created. +** +** The savepoint to rollback or release is identified by parameter +** iSavepoint. A value of 0 means to operate on the outermost savepoint +** (the first created). A value of (Pager.nSavepoint-1) means operate +** on the most recently created savepoint. If iSavepoint is greater than +** (Pager.nSavepoint-1), then this function is a no-op. ** -** In either case, all savepoints with an index greater than iSavepoint -** are destroyed. +** If a negative value is passed to this function, then the current +** transaction is rolled back. This is different to calling +** sqlite3PagerRollback() because this function does not terminate +** the transaction or unlock the database, it just restores the +** contents of the database to its original state. ** -** If there are less than (iSavepoint+1) active savepoints when this -** function is called it is a no-op. +** In any case, all savepoints with an index greater than iSavepoint +** are destroyed. If this is a release operation (op==SAVEPOINT_RELEASE), +** then savepoint iSavepoint is also destroyed. +** +** This function may return SQLITE_NOMEM if a memory allocation fails, +** or an IO error code if an IO error occurs while rolling back a +** savepoint. If no errors occur, SQLITE_OK is returned. 
*/ int sqlite3PagerSavepoint(Pager *pPager, int op, int iSavepoint){ int rc = SQLITE_OK; assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK ); + assert( iSavepoint>=0 || op==SAVEPOINT_ROLLBACK ); if( iSavepointnSavepoint ){ - int ii; - int nNew = iSavepoint + (op==SAVEPOINT_ROLLBACK); + int ii; /* Iterator variable */ + int nNew; /* Number of remaining savepoints after this op. */ + + /* Figure out how many savepoints will still be active after this + ** operation. Store this value in nNew. Then free resources associated + ** with any savepoints that are destroyed by this operation. + */ + nNew = iSavepoint + (op==SAVEPOINT_ROLLBACK); for(ii=nNew; iinSavepoint; ii++){ sqlite3BitvecDestroy(pPager->aSavepoint[ii].pInSavepoint); } pPager->nSavepoint = nNew; - if( op==SAVEPOINT_ROLLBACK && pPager->jfd->pMethods ){ + /* If this is a rollback operation, playback the specified savepoint. + ** If this is a temp-file, it is possible that the journal file has + ** not yet been opened. In this case there have been no changes to + ** the database file, so the playback operation can be skipped. + */ + if( op==SAVEPOINT_ROLLBACK && isOpen(pPager->jfd) ){ PagerSavepoint *pSavepoint = (nNew==0)?0:&pPager->aSavepoint[nNew-1]; rc = pagerPlaybackSavepoint(pPager, pSavepoint); assert(rc!=SQLITE_DONE); } /* If this is a release of the outermost savepoint, truncate - ** the sub-journal. */ - if( nNew==0 && op==SAVEPOINT_RELEASE && pPager->sjfd->pMethods ){ + ** the sub-journal to zero bytes in size. */ + if( nNew==0 && op==SAVEPOINT_RELEASE && isOpen(pPager->sjfd) ){ assert( rc==SQLITE_OK ); rc = sqlite3OsTruncate(pPager->sjfd, 0); - pPager->stmtNRec = 0; + pPager->nSubRec = 0; } } return rc; @@ -4302,13 +4905,6 @@ sqlite3_file *sqlite3PagerFile(Pager *pPager){ return pPager->fd; } -/* -** Return the directory of the database file. -*/ -const char *sqlite3PagerDirname(Pager *pPager){ - return pPager->zDirectory; -} - /* ** Return the full pathname of the journal file. 
*/ @@ -4360,11 +4956,14 @@ void sqlite3PagerSetCodec( ** moved as part of a database reorganization just before the transaction ** is being committed. In this case, it is guaranteed that the database page ** pPg refers to will not be written to again within this transaction. +** +** This function may return SQLITE_NOMEM or an IO error code if an error +** occurs. Otherwise, it returns SQLITE_OK. */ int sqlite3PagerMovepage(Pager *pPager, DbPage *pPg, Pgno pgno, int isCommit){ - PgHdr *pPgOld; /* The page being overwritten. */ - Pgno needSyncPgno = 0; - int rc; + PgHdr *pPgOld; /* The page being overwritten. */ + Pgno needSyncPgno = 0; /* Old value of pPg->pgno, if sync is required */ + int rc; /* Return code */ assert( pPg->nRef>0 ); @@ -4380,7 +4979,11 @@ int sqlite3PagerMovepage(Pager *pPager, DbPage *pPg, Pgno pgno, int isCommit){ ** ** If page X were not written to the sub-journal here, it would not ** be possible to restore its contents when the "ROLLBACK TO one" - ** statement were processed. + ** statement were is processed. + ** + ** subjournalPage() may need to allocate space to store pPg->pgno into + ** one or more savepoint bitvecs. This is the reason this function + ** may return SQLITE_NOMEM. */ if( pPg->flags&PGHDR_DIRTY && subjRequiresPage(pPg) @@ -4393,8 +4996,6 @@ int sqlite3PagerMovepage(Pager *pPager, DbPage *pPg, Pgno pgno, int isCommit){ PAGERID(pPager), pPg->pgno, (pPg->flags&PGHDR_NEED_SYNC)?1:0, pgno)); IOTRACE(("MOVE %p %d %d\n", pPager, pPg->pgno, pgno)) - pager_get_content(pPg); - /* If the journal needs to be sync()ed before page pPg->pgno can ** be written to, store pPg->pgno in local variable needSyncPgno. 
** @@ -4427,7 +5028,6 @@ int sqlite3PagerMovepage(Pager *pPager, DbPage *pPg, Pgno pgno, int isCommit){ } sqlite3PcacheMakeDirty(pPg); - pPager->dirtyCache = 1; pPager->dbModified = 1; if( needSyncPgno ){ @@ -4515,12 +5115,12 @@ int sqlite3PagerLockingMode(Pager *pPager, int eMode){ ** PAGER_JOURNALMODE_TRUNCATE ** PAGER_JOURNALMODE_PERSIST ** PAGER_JOURNALMODE_OFF +** PAGER_JOURNALMODE_MEMORY ** ** If the parameter is not _QUERY, then the journal-mode is set to the ** value specified. ** -** The returned indicate the current (possibly updated) -** journal-mode. +** The returned indicate the current (possibly updated) journal-mode. */ int sqlite3PagerJournalMode(Pager *pPager, int eMode){ if( !MEMDB ){ diff --git a/src/pager.h b/src/pager.h index 8aa39adb3e..6fdaccfeb6 100644 --- a/src/pager.h +++ b/src/pager.h @@ -13,15 +13,16 @@ ** subsystem. The page cache subsystem reads and writes a file a page ** at a time and provides a journal for rollback. ** -** @(#) $Id: pager.h,v 1.95 2009/01/16 16:23:38 danielk1977 Exp $ +** @(#) $Id: pager.h,v 1.96 2009/01/20 17:06:27 danielk1977 Exp $ */ #ifndef _PAGER_H_ #define _PAGER_H_ /* -** If defined as non-zero, auto-vacuum is enabled by default. Otherwise -** it must be turned on for each database using "PRAGMA auto_vacuum = 1". +** Default maximum size for persistent journal files. A negative +** value means no limit. This value may be overridden using the +** sqlite3PagerJournalSizeLimit() API. See also "PRAGMA journal_size_limit". */ #ifndef SQLITE_DEFAULT_JOURNAL_SIZE_LIMIT #define SQLITE_DEFAULT_JOURNAL_SIZE_LIMIT -1 @@ -43,10 +44,20 @@ typedef struct Pager Pager; */ typedef struct PgHdr DbPage; +/* +** Page number PAGER_MJ_PGNO is never used in an SQLite database (it is +** reserved for working around a windows/posix incompatibility). It is +** used in the journal to signify that the remainder of the journal file +** is devoted to storing a master journal name - there are no more pages to +** roll back. 
See comments for function writeMasterJournal() in pager.c +** for details. +*/ +#define PAGER_MJ_PGNO(x) ((Pgno)((PENDING_BYTE/((x)->pageSize))+1)) + /* ** Allowed values for the flags parameter to sqlite3PagerOpen(). ** -** NOTE: This values must match the corresponding BTREE_ values in btree.h. +** NOTE: These values must match the corresponding BTREE_ values in btree.h. */ #define PAGER_OMIT_JOURNAL 0x0001 /* Do not use a rollback journal */ #define PAGER_NO_READLOCK 0x0002 /* Omit readlocks on readonly files */ @@ -69,75 +80,84 @@ typedef struct PgHdr DbPage; #define PAGER_JOURNALMODE_MEMORY 4 /* In-memory journal file */ /* -** See source code comments for a detailed description of the following -** routines: +** The remainder of this file contains the declarations of the functions +** that make up the Pager sub-system API. See source code comments for +** a detailed description of each routine. */ + +/* Open and close a Pager connection. */ int sqlite3PagerOpen(sqlite3_vfs *, Pager **ppPager, const char*, int,int,int); +int sqlite3PagerClose(Pager *pPager); + +/* Functions used to configure a Pager object. */ void sqlite3PagerSetBusyhandler(Pager*, int(*)(void *), void *); void sqlite3PagerSetReiniter(Pager*, void(*)(DbPage*)); int sqlite3PagerSetPagesize(Pager*, u16*); int sqlite3PagerMaxPageCount(Pager*, int); int sqlite3PagerReadFileheader(Pager*, int, unsigned char*); void sqlite3PagerSetCachesize(Pager*, int); -int sqlite3PagerClose(Pager *pPager); +void sqlite3PagerSetSafetyLevel(Pager*,int,int); +int sqlite3PagerLockingMode(Pager *, int); +int sqlite3PagerJournalMode(Pager *, int); +i64 sqlite3PagerJournalSizeLimit(Pager *, i64); + +/* Functions used to obtain and release page references. 
*/ int sqlite3PagerAcquire(Pager *pPager, Pgno pgno, DbPage **ppPage, int clrFlag); #define sqlite3PagerGet(A,B,C) sqlite3PagerAcquire(A,B,C,0) DbPage *sqlite3PagerLookup(Pager *pPager, Pgno pgno); -int sqlite3PagerPageRefcount(DbPage*); -int sqlite3PagerRef(DbPage*); -int sqlite3PagerUnref(DbPage*); +void sqlite3PagerRef(DbPage*); +void sqlite3PagerUnref(DbPage*); + +/* Operations on page references. */ int sqlite3PagerWrite(DbPage*); +void sqlite3PagerDontWrite(DbPage*); +int sqlite3PagerMovepage(Pager*,DbPage*,Pgno,int); +int sqlite3PagerPageRefcount(DbPage*); +void *sqlite3PagerGetData(DbPage *); +void *sqlite3PagerGetExtra(DbPage *); + +/* Functions used to manage pager transactions and savepoints. */ int sqlite3PagerPagecount(Pager*, int*); -int sqlite3PagerBegin(DbPage*, int exFlag); +int sqlite3PagerBegin(Pager*, int exFlag); int sqlite3PagerCommitPhaseOne(Pager*,const char *zMaster, int); +int sqlite3PagerSync(Pager *pPager); int sqlite3PagerCommitPhaseTwo(Pager*); int sqlite3PagerRollback(Pager*); +int sqlite3PagerOpenSavepoint(Pager *pPager, int n); +int sqlite3PagerSavepoint(Pager *pPager, int op, int iSavepoint); + +/* Functions used to query pager state and configuration. 
*/ u8 sqlite3PagerIsreadonly(Pager*); -void sqlite3PagerDontRollback(DbPage*); -int sqlite3PagerDontWrite(DbPage*); int sqlite3PagerRefcount(Pager*); -void sqlite3PagerSetSafetyLevel(Pager*,int,int); const char *sqlite3PagerFilename(Pager*); const sqlite3_vfs *sqlite3PagerVfs(Pager*); sqlite3_file *sqlite3PagerFile(Pager*); const char *sqlite3PagerDirname(Pager*); const char *sqlite3PagerJournalname(Pager*); int sqlite3PagerNosync(Pager*); -int sqlite3PagerMovepage(Pager*,DbPage*,Pgno,int); -void *sqlite3PagerGetData(DbPage *); -void *sqlite3PagerGetExtra(DbPage *); -int sqlite3PagerLockingMode(Pager *, int); -int sqlite3PagerJournalMode(Pager *, int); -i64 sqlite3PagerJournalSizeLimit(Pager *, i64); void *sqlite3PagerTempSpace(Pager*); -int sqlite3PagerSync(Pager *pPager); - -int sqlite3PagerOpenSavepoint(Pager *pPager, int n); -int sqlite3PagerSavepoint(Pager *pPager, int op, int iSavepoint); +/* Functions used in auto-vacuum mode to truncate the database file. */ #ifndef SQLITE_OMIT_AUTOVACUUM void sqlite3PagerTruncateImage(Pager*,Pgno); - Pgno sqlite3PagerImageSize(Pager *); #endif +/* Used by encryption extensions. */ #ifdef SQLITE_HAS_CODEC void sqlite3PagerSetCodec(Pager*,void*(*)(void*,void*,Pgno,int),void*); #endif +/* Functions to support testing and debugging. 
*/ #if !defined(NDEBUG) || defined(SQLITE_TEST) Pgno sqlite3PagerPagenumber(DbPage*); int sqlite3PagerIswriteable(DbPage*); #endif - #ifdef SQLITE_TEST int *sqlite3PagerStats(Pager*); void sqlite3PagerRefdump(Pager*); int sqlite3PagerIsMemdb(Pager*); -#endif - -#ifdef SQLITE_TEST -void disable_simulated_io_errors(void); -void enable_simulated_io_errors(void); + void disable_simulated_io_errors(void); + void enable_simulated_io_errors(void); #else # define disable_simulated_io_errors() # define enable_simulated_io_errors() diff --git a/src/pcache.c b/src/pcache.c index c16606dd43..4065a54f4b 100644 --- a/src/pcache.c +++ b/src/pcache.c @@ -11,7 +11,7 @@ ************************************************************************* ** This file implements that page cache. ** -** @(#) $Id: pcache.c,v 1.41 2009/01/16 16:23:38 danielk1977 Exp $ +** @(#) $Id: pcache.c,v 1.42 2009/01/20 17:06:27 danielk1977 Exp $ */ #include "sqliteInt.h" @@ -23,14 +23,13 @@ struct PCache { PgHdr *pSynced; /* Last synced page in dirty page list */ int nRef; /* Number of referenced pages */ int nMax; /* Configured cache size */ - int nMin; /* Configured minimum cache size */ int szPage; /* Size of every page in this cache */ int szExtra; /* Size of extra space for each page */ int bPurgeable; /* True if pages are on backing store */ int (*xStress)(void*,PgHdr*); /* Call to try make a page clean */ void *pStress; /* Argument to xStress */ sqlite3_pcache *pCache; /* Pluggable cache module */ - PgHdr *pPage1; + PgHdr *pPage1; /* Reference to page 1 */ }; /* @@ -181,7 +180,6 @@ void sqlite3PcacheOpen( p->xStress = xStress; p->pStress = pStress; p->nMax = 100; - p->nMin = 10; } /* @@ -431,9 +429,8 @@ void sqlite3PcacheClose(PCache *pCache){ /* ** Discard the contents of the cache. 
*/ -int sqlite3PcacheClear(PCache *pCache){ +void sqlite3PcacheClear(PCache *pCache){ sqlite3PcacheTruncate(pCache, 0); - return SQLITE_OK; } /* diff --git a/src/pcache.h b/src/pcache.h index 7cffac15c8..bc1e7a37a2 100644 --- a/src/pcache.h +++ b/src/pcache.h @@ -12,7 +12,7 @@ ** This header file defines the interface that the sqlite page cache ** subsystem. ** -** @(#) $Id: pcache.h,v 1.18 2009/01/16 16:23:38 danielk1977 Exp $ +** @(#) $Id: pcache.h,v 1.19 2009/01/20 17:06:27 danielk1977 Exp $ */ #ifndef _PCACHE_H_ @@ -111,7 +111,7 @@ void sqlite3PcacheClose(PCache*); void sqlite3PcacheClearSyncFlags(PCache *); /* Discard the contents of the cache */ -int sqlite3PcacheClear(PCache*); +void sqlite3PcacheClear(PCache*); /* Return the total number of outstanding page references */ int sqlite3PcacheRefCount(PCache*); diff --git a/src/sqliteInt.h b/src/sqliteInt.h index 63529f67ec..ded10de9c0 100644 --- a/src/sqliteInt.h +++ b/src/sqliteInt.h @@ -11,7 +11,7 @@ ************************************************************************* ** Internal interface definitions for SQLite. ** -** @(#) $Id: sqliteInt.h,v 1.828 2009/01/20 16:53:41 danielk1977 Exp $ +** @(#) $Id: sqliteInt.h,v 1.829 2009/01/20 17:06:27 danielk1977 Exp $ */ #ifndef _SQLITEINT_H_ #define _SQLITEINT_H_ @@ -2277,6 +2277,7 @@ int sqlite3BitvecTest(Bitvec*, u32); int sqlite3BitvecSet(Bitvec*, u32); void sqlite3BitvecClear(Bitvec*, u32); void sqlite3BitvecDestroy(Bitvec*); +u32 sqlite3BitvecSize(Bitvec*); int sqlite3BitvecBuiltinTest(int,int*); RowSet *sqlite3RowSetInit(sqlite3*, void*, unsigned int); diff --git a/src/test2.c b/src/test2.c index 8ac75a50e8..43096048c7 100644 --- a/src/test2.c +++ b/src/test2.c @@ -13,7 +13,7 @@ ** is not included in the SQLite library. It is used for automated ** testing of the SQLite library. 
** -** $Id: test2.c,v 1.68 2009/01/20 16:53:41 danielk1977 Exp $ +** $Id: test2.c,v 1.69 2009/01/20 17:06:27 danielk1977 Exp $ */ #include "sqliteInt.h" #include "tcl.h" @@ -418,18 +418,13 @@ static int page_unref( const char **argv /* Text of each argument */ ){ DbPage *pPage; - int rc; if( argc!=2 ){ Tcl_AppendResult(interp, "wrong # args: should be \"", argv[0], " PAGE\"", 0); return TCL_ERROR; } pPage = (DbPage *)sqlite3TestTextToPtr(argv[1]); - rc = sqlite3PagerUnref(pPage); - if( rc!=SQLITE_OK ){ - Tcl_AppendResult(interp, errorName(rc), 0); - return TCL_ERROR; - } + sqlite3PagerUnref(pPage); return TCL_OK; } diff --git a/src/vdbeaux.c b/src/vdbeaux.c index 488d53c4bf..54528ea38f 100644 --- a/src/vdbeaux.c +++ b/src/vdbeaux.c @@ -14,7 +14,7 @@ ** to version 2.8.7, all this code was combined into the vdbe.c source file. ** But that file was getting too big so this subroutines were split out. ** -** $Id: vdbeaux.c,v 1.433 2009/01/20 16:53:41 danielk1977 Exp $ +** $Id: vdbeaux.c,v 1.434 2009/01/20 17:06:27 danielk1977 Exp $ */ #include "sqliteInt.h" #include "vdbeInt.h" @@ -1387,10 +1387,10 @@ static int vdbeCommit(sqlite3 *db, Vdbe *p){ /* Sync the master journal file. If the IOCAP_SEQUENTIAL device ** flag is set this is not required. */ - zMainFile = sqlite3BtreeGetDirname(db->aDb[0].pBt); - if( (needSync - && (0==(sqlite3OsDeviceCharacteristics(pMaster)&SQLITE_IOCAP_SEQUENTIAL)) - && (rc=sqlite3OsSync(pMaster, SQLITE_SYNC_NORMAL))!=SQLITE_OK) ){ + if( needSync + && 0==(sqlite3OsDeviceCharacteristics(pMaster)&SQLITE_IOCAP_SEQUENTIAL) + && SQLITE_OK!=(rc = sqlite3OsSync(pMaster, SQLITE_SYNC_NORMAL)) + ){ sqlite3OsCloseFree(pMaster); sqlite3OsDelete(pVfs, zMaster, 0); sqlite3DbFree(db, zMaster);