From: dan Date: Wed, 4 Oct 2017 20:57:14 +0000 (+0000) Subject: Add experimental mode that uses two wal files. Activated using "PRAGMA X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=055cc1e431811d6e87eea672acd2e96e95138e07;p=thirdparty%2Fsqlite.git Add experimental mode that uses two wal files. Activated using "PRAGMA journal_mode = wal2". FossilOrigin-Name: e2fc5c814cf6862d536aacb9eca66ecd31ba7e3e3033fa4c5564d533f4a18dfc --- diff --git a/manifest b/manifest index 1ea0bfb8d7..f6dbc867fd 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Updates\sto\srequirements\smarks. -D 2017-10-04T18:26:44.610 +C Add\sexperimental\smode\sthat\suses\stwo\swal\sfiles.\sActivated\susing\s"PRAGMA\njournal_mode\s=\swal2". +D 2017-10-04T20:57:14.949 F Makefile.in 4bc36d913c2e3e2d326d588d72f618ac9788b2fd4b7efda61102611a6495c3ff F Makefile.linux-gcc 7bc79876b875010e8c8f9502eb935ca92aa3c434 F Makefile.msc 6033b51b6aea702ea059f6ab2d47b1d3cef648695f787247dd4fb395fe60673f @@ -401,7 +401,7 @@ F src/auth.c 6277d63837357549fe14e723490d6dc1a38768d71c795c5eb5c0f8a99f918f73 F src/backup.c faf17e60b43233c214aae6a8179d24503a61e83b F src/bitvec.c 17ea48eff8ba979f1f5b04cc484c7bb2be632f33 F src/btmutex.c 0e9ce2d56159b89b9bc8e197e023ee11e39ff8ca -F src/btree.c cc88a7fca7287dfc004921bb5e2764893dfe4f6dd33be3570126b3fc37932600 +F src/btree.c c7d94975b015058e6a68278e53d841585552bbc541c1b31b6e024eb9fd888746 F src/btree.h 32ef5d3f25dc70ef1ee9cecf84a023c21378f06a57cd701d2e866e141b150f09 F src/btreeInt.h 55b702efce17e5d1941865464227d3802cfc9c7c832fac81d4c94dced47a71fc F src/build.c e71e96a67daf3d1dd23188423e66cd6af38017e2ec73fead5d2b57da2d3c7e16 @@ -442,16 +442,16 @@ F src/os.c 93e0979b9b55df29c0c4923f73b48e9d3fe728f01dd8ed4f6a9d2f1d79779bc8 F src/os.h 8e976e59eb4ca1c0fca6d35ee803e38951cb0343 F src/os_common.h b2f4707a603e36811d9b1a13278bffd757857b85 F src/os_setup.h 0dbaea40a7d36bf311613d31342e0b99e2536586 -F src/os_unix.c 
3984fc069df59e26f000e30609611cecdb4e93293e6ee52313a473a7e874af1b +F src/os_unix.c 1f9c3e771557edd248e1fcec0818739826b2a3121b609a6e3372b670236d085a F src/os_win.c 6892c3ff23b7886577e47f13d827ca220c0831bae3ce00eea8c258352692f8c6 F src/os_win.h 7b073010f1451abe501be30d12f6bc599824944a -F src/pager.c 2aa56a99bb13128d9102e84c7a9f835e546cbb58f0861d481bc3db32973b1628 -F src/pager.h 581698f2177e8bd4008fe4760898ce20b6133d1df22139b9101b5155f900df7a +F src/pager.c bc2d2b6784cc2b5103e07aff13b012dab30c7efa0abb9bc2bb949ec45cba4326 +F src/pager.h e11e516208a460bea1b95fe3da697642306e7f350d5f96d2c1d21231ee4d2bf2 F src/parse.y 52ef3cecd0934e9da4a45b585883a03243ad615d338ad94f44501a05891dcdfa F src/pcache.c 4bada070456980c3c1f16d58ec2e64e389ad77b935e3d77e0c96e7bbd397289c F src/pcache.h 072f94d29281cffd99e46c1539849f248c4b56ae7684c1f36626797fee375170 F src/pcache1.c 716975564c15eb6679e97f734cec1bfd6c16ac3d4010f05f1f8e509fc7d19880 -F src/pragma.c d04725ac25387d9638919e197fb009f378e13af7bf899516979e54b3164e3602 +F src/pragma.c 49a04b2ec3199b7967c19b3239182d9c4e860726909ed80b7a3e21fe5bb9e6c4 F src/pragma.h bb83728944b42f6d409c77f5838a8edbdb0fe83046c5496ffc9602b40340a324 F src/prepare.c 9a141a1b02dca53beaa9771699d390aafcac01f5d1f1c0ae6e23ded8dcdb709a F src/printf.c 40aee47ae9be4bd3dbdc8968bd07fddc027be8edec8daddf24d3391d36698a1c @@ -526,7 +526,7 @@ F src/update.c 5404be9e840717323a69209190cdbc9d0d34adaedaaf1d1a1069babf2c4171c0 F src/utf.c 810fbfebe12359f10bc2a011520a6e10879ab2a163bcb26c74768eab82ea62a5 F src/util.c 5168013cfd937a695d23cce8c67cb07a3dda242d4cb812530ba1148b88e0f159 F src/vacuum.c 90839322fd5f00df9617eb21b68beda9b6e2a2937576b0d65985e4aeb1c53739 -F src/vdbe.c 176c0897af0aedecd3abc9afaf7fa80eaa7cf5eaf62583de256a9961df474373 +F src/vdbe.c c3c3fe95d98df8757570fb265236cfdf31536d1025bf37c232237d1e422ae39b F src/vdbe.h d50cadf12bcf9fb99117ef392ce1ea283aa429270481426b6e8b0280c101fd97 F src/vdbeInt.h 1fe00770144c12c4913128f35262d11527ef3284561baaab59b947a41c08d0d9 F src/vdbeapi.c 
9c670ca0dcc1cd86373aa353b747b26fe531ca5cd4331690c611d1f03842e2a1 @@ -537,8 +537,8 @@ F src/vdbesort.c 731a09e5cb9e96b70c394c1b7cf3860fbe84acca7682e178615eb941a3a0ef2 F src/vdbetrace.c 48e11ebe040c6b41d146abed2602e3d00d621d7ebe4eb29b0a0f1617fd3c2f6c F src/vtab.c 0e4885495172e1bdf54b12cce23b395ac74ef5729031f15e1bc1e3e6b360ed1a F src/vxworks.h d2988f4e5a61a4dfe82c6524dd3d6e4f2ce3cdb9 -F src/wal.c 839db09792fead5052bb35e533fa485e134913d547d05b5f42e537b73e63f07a -F src/wal.h 8de5d2d3de0956d6f6cb48c83a4012d5f227b8fe940f3a349a4b7e85ebcb492a +F src/wal.c 5ca528539a69edd6333dcab1d49e89d4f98efb6a23f0fda85bc52c4ec313db49 +F src/wal.h b6063e6be1b03389372f3f32240e99b8ab92c32cdd05aa0e31b30a21e4e41654 F src/walker.c 3ccfa8637f95355bff61144e01a615b8ef26f79c312880848da73f03367da1e6 F src/where.c 049522adcf5426f1a8c3ed07be15e1ffa3266afd34e8e7bee64b63e2fbfad0b5 F src/whereInt.h 82c04c5075308abbac59180c8bad5ecb45b07453981f60a53f3c7dee21e1e971 @@ -1097,7 +1097,7 @@ F test/parser1.test 391b9bf9a229547a129c61ac345ed1a6f5eb1854 F test/pcache.test c8acbedd3b6fd0f9a7ca887a83b11d24a007972b F test/pcache2.test af7f3deb1a819f77a6d0d81534e97d1cf62cd442 F test/percentile.test 4243af26b8f3f4555abe166f723715a1f74c77ff -F test/permutations.test d911c9ba49088d22054a05dc73743f677872a92ac89288bcdeafa0ebf3f9c531 +F test/permutations.test 5fe80f417441ed6a69acd77f5190305dbf255cc2be214f8e36df4715c4f63f08 F test/pragma.test c31b5e98998c160a4c85b1e04f590655c67f2daa7f73854640cd120610e3ac15 F test/pragma2.test e5d5c176360c321344249354c0c16aec46214c9f F test/pragma3.test 14c12bc5352b1e100e0b6b44f371053a81ccf8ed @@ -1142,11 +1142,11 @@ F test/rowvalue9.test d8dd2c6ecac432dadaa79e41dc2434f007be1b6b F test/rowvaluefault.test 7cd9ccc6c2fbdd881672984087aad0491bb75504 F test/rtree.test 0c8d9dd458d6824e59683c19ab2ffa9ef946f798 F test/run-wordcount.sh 891e89c4c2d16e629cd45951d4ed899ad12afc09 -F test/savepoint.test 1f8a6b1aea9a0d05837adc463d4bf47bd9d0f1c842f1c2a9caccd639baf34bf9 +F test/savepoint.test 
69c56b891ce0ff28f1376b5516bf2b6a8b39d0430433216bfc496e72103baaa5 F test/savepoint2.test 9b8543940572a2f01a18298c3135ad0c9f4f67d7 F test/savepoint4.test c8f8159ade6d2acd9128be61e1230f1c1edc6cc0 F test/savepoint5.test 0735db177e0ebbaedc39812c8d065075d563c4fd -F test/savepoint6.test f41279c5e137139fa5c21485773332c7adb98cd7 +F test/savepoint6.test 48a645a7bb3a59a6fcf06a7364cfe5b655c336760de39068f7c241b0fc80d963 F test/savepoint7.test cde525ea3075283eb950cdcdefe23ead4f700daa F test/savepointfault.test f044eac64b59f09746c7020ee261734de82bf9b2 F test/scanstatus.test 5253c219e331318a437f436268e0e82345700285 @@ -1261,7 +1261,7 @@ F test/temptable.test d2c9b87a54147161bcd1822e30c1d1cd891e5b30 F test/temptable2.test cd396beb41117a5302fff61767c35fa4270a0d5e F test/temptable3.test d11a0974e52b347e45ee54ef1923c91ed91e4637 F test/temptrigger.test 38f0ca479b1822d3117069e014daabcaacefffcc -F test/tester.tcl 9948bd856ce8a1c127f2f7900365387a42a917ce0dc87185bdd128fa5b11aff2 +F test/tester.tcl 71d30287dd22aae0eb9b07e62574336ae6a57e7a50e5dab320a604cfcca4b173 F test/thread001.test 9f22fd3525a307ff42a326b6bc7b0465be1745a5 F test/thread002.test e630504f8a06c00bf8bbe68528774dd96aeb2e58 F test/thread003.test ee4c9efc3b86a6a2767516a37bd64251272560a7 @@ -1520,6 +1520,7 @@ F test/walro.test 4ab7ac01b77c2f894235c699d59e3e3c7f15a160 F test/walshared.test 0befc811dcf0b287efae21612304d15576e35417 F test/walslow.test c05c68d4dc2700a982f89133ce103a1a84cc285f F test/walthread.test de8dbaf6d9e41481c460ba31ca61e163d7348f8e +F test/waltwo2.test 6e4d36500a20ff2d19761cf0e9a5d178e83d1798feda157ebc0681e01a35e56e F test/where.test f0c325563acde44f2c4ea6ba348e9e29f7121757 F test/where2.test 478d2170637b9211f593120648858593bf2445a1 F test/where3.test 54cdeb02157acc979de41530b804ae7b09552bf1 @@ -1655,7 +1656,11 @@ F vsixtest/vsixtest.tcl 6a9a6ab600c25a91a7acc6293828957a386a8a93 F vsixtest/vsixtest.vcxproj.data 2ed517e100c66dc455b492e1a33350c1b20fbcdc F vsixtest/vsixtest.vcxproj.filters 
37e51ffedcdb064aad6ff33b6148725226cd608e F vsixtest/vsixtest_TemporaryKey.pfx e5b1b036facdb453873e7084e1cae9102ccc67a0 -P 7c69f8f1089c3e3843fbf7ec37a897c849a3df822a4ce3b4fcde586adf991a3f -R 61b14913c42f4f709d3e3a3845316f97 -U drh -Z 37903880ad327c432f24949839963d82 +P 40964a4ef7565ea0ddf452f48cb22373d068528e07d40eefc008f2231c969422 +R 07b0a272ae21af8e4cb674b6ac44fa89 +T *branch * wal2 +T *sym-wal2 * +T +closed f04ded1d9b40d54463162264e37e6d92411d09427eea592ef05681035e2f2e64 +T -sym-trunk * +U dan +Z 07422baaaff21d5f15df6853d7e92b06 diff --git a/manifest.uuid b/manifest.uuid index c2c8cc6040..092af1e9bc 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -40964a4ef7565ea0ddf452f48cb22373d068528e07d40eefc008f2231c969422 \ No newline at end of file +e2fc5c814cf6862d536aacb9eca66ecd31ba7e3e3033fa4c5564d533f4a18dfc \ No newline at end of file diff --git a/src/btree.c b/src/btree.c index cadd711e45..b221b7cf9c 100644 --- a/src/btree.c +++ b/src/btree.c @@ -2993,10 +2993,10 @@ static int lockBtree(BtShared *pBt){ goto page1_init_failed; } #else - if( page1[18]>2 ){ + if( page1[18]>3 ){ pBt->btsFlags |= BTS_READ_ONLY; } - if( page1[19]>2 ){ + if( page1[19]>3 ){ goto page1_init_failed; } @@ -3008,9 +3008,9 @@ static int lockBtree(BtShared *pBt){ ** may not be the latest version - there may be a newer one in the log ** file. 
*/ - if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){ + if( page1[19]>=2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){ int isOpen = 0; - rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen); + rc = sqlite3PagerOpenWal(pBt->pPager, (page1[19]==3), &isOpen); if( rc!=SQLITE_OK ){ goto page1_init_failed; }else{ @@ -9866,7 +9866,7 @@ int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){ BtShared *pBt = pBtree->pBt; int rc; /* Return code */ - assert( iVersion==1 || iVersion==2 ); + assert( iVersion==1 || iVersion==2 || iVersion==3 ); /* If setting the version fields to 1, do not automatically open the ** WAL connection, even if the version fields are currently set to 2. diff --git a/src/os_unix.c b/src/os_unix.c index 4445104dd6..19ac47263f 100644 --- a/src/os_unix.c +++ b/src/os_unix.c @@ -4163,7 +4163,7 @@ static int unixShmSystemLock( assert( sqlite3_mutex_held(pShmNode->mutex) || pShmNode->nRef==0 ); /* Shared locks never span more than one byte */ - assert( n==1 || lockType!=F_RDLCK ); + /* assert( n==1 || lockType!=F_RDLCK ); */ /* Locks are within range */ assert( n>=1 && n<=SQLITE_SHM_NLOCK ); @@ -4609,7 +4609,7 @@ static int unixShmLock( || flags==(SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE) || flags==(SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED) || flags==(SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE) ); - assert( n==1 || (flags & SQLITE_SHM_EXCLUSIVE)!=0 ); + /* assert( n==1 || (flags & SQLITE_SHM_EXCLUSIVE)!=0 ); */ assert( pShmNode->h>=0 || pDbFd->pInode->bProcessLock==1 ); assert( pShmNode->h<0 || pDbFd->pInode->bProcessLock==0 ); diff --git a/src/pager.c b/src/pager.c index a43614cdb4..5a2b01f77a 100644 --- a/src/pager.c +++ b/src/pager.c @@ -810,20 +810,6 @@ static const unsigned char aJournalMagic[] = { */ #define PAGER_MAX_PGNO 2147483647 -/* -** The argument to this macro is a file descriptor (type sqlite3_file*). -** Return 0 if it is not open, or non-zero (but not 1) if it is. 
-** -** This is so that expressions can be written as: -** -** if( isOpen(pPager->jfd) ){ ... -** -** instead of -** -** if( pPager->jfd->pMethods ){ ... -*/ -#define isOpen(pFd) ((pFd)->pMethods!=0) - /* ** Return true if this pager uses a write-ahead log to read page pgno. ** Return false if the pager reads pgno directly from the database. @@ -944,6 +930,7 @@ static int assert_pager_state(Pager *p){ assert( isOpen(p->jfd) || p->journalMode==PAGER_JOURNALMODE_OFF || p->journalMode==PAGER_JOURNALMODE_WAL + || p->journalMode==PAGER_JOURNALMODE_WAL2 ); } assert( pPager->dbOrigSize==pPager->dbFileSize ); @@ -958,6 +945,7 @@ static int assert_pager_state(Pager *p){ assert( isOpen(p->jfd) || p->journalMode==PAGER_JOURNALMODE_OFF || p->journalMode==PAGER_JOURNALMODE_WAL + || p->journalMode==PAGER_JOURNALMODE_WAL2 || (sqlite3OsDeviceCharacteristics(p->fd)&SQLITE_IOCAP_BATCH_ATOMIC) ); assert( pPager->dbOrigSize<=pPager->dbHintSize ); @@ -970,6 +958,7 @@ static int assert_pager_state(Pager *p){ assert( isOpen(p->jfd) || p->journalMode==PAGER_JOURNALMODE_OFF || p->journalMode==PAGER_JOURNALMODE_WAL + || p->journalMode==PAGER_JOURNALMODE_WAL2 || (sqlite3OsDeviceCharacteristics(p->fd)&SQLITE_IOCAP_BATCH_ATOMIC) ); break; @@ -2063,7 +2052,7 @@ static int pager_end_transaction(Pager *pPager, int hasMaster, int bCommit){ } pPager->journalOff = 0; }else if( pPager->journalMode==PAGER_JOURNALMODE_PERSIST - || (pPager->exclusiveMode && pPager->journalMode!=PAGER_JOURNALMODE_WAL) + || (pPager->exclusiveMode && pPager->journalMode<PAGER_JOURNALMODE_WAL) ){ rc = zeroJournalHdr(pPager, hasMaster||pPager->tempFile); pPager->journalOff = 0; @@ -2077,7 +2066,8 @@ static int pager_end_transaction(Pager *pPager, int hasMaster, int bCommit){ assert( sqlite3JournalIsInMemory(pPager->jfd)==0 ); assert( pPager->journalMode==PAGER_JOURNALMODE_DELETE || pPager->journalMode==PAGER_JOURNALMODE_MEMORY - || pPager->journalMode==PAGER_JOURNALMODE_WAL + || pPager->journalMode==PAGER_JOURNALMODE_WAL + || pPager->journalMode==PAGER_JOURNALMODE_WAL2 );
sqlite3OsClose(pPager->jfd); if( bDelete ){ @@ -3344,9 +3334,9 @@ static int pagerOpenWalIfPresent(Pager *pPager){ rc = sqlite3OsDelete(pPager->pVfs, pPager->zWal, 0); }else{ testcase( sqlite3PcachePagecount(pPager->pPCache)==0 ); - rc = sqlite3PagerOpenWal(pPager, 0); + rc = sqlite3PagerOpenWal(pPager, 0, 0); } - }else if( pPager->journalMode==PAGER_JOURNALMODE_WAL ){ + }else if( pPager->journalMode>=PAGER_JOURNALMODE_WAL ){ pPager->journalMode = PAGER_JOURNALMODE_DELETE; } } @@ -7245,6 +7235,7 @@ int sqlite3PagerSetJournalMode(Pager *pPager, int eMode){ || eMode==PAGER_JOURNALMODE_PERSIST || eMode==PAGER_JOURNALMODE_OFF || eMode==PAGER_JOURNALMODE_WAL + || eMode==PAGER_JOURNALMODE_WAL2 || eMode==PAGER_JOURNALMODE_MEMORY ); /* This routine is only called from the OP_JournalMode opcode, and @@ -7279,9 +7270,12 @@ int sqlite3PagerSetJournalMode(Pager *pPager, int eMode){ assert( (PAGER_JOURNALMODE_MEMORY & 5)==4 ); assert( (PAGER_JOURNALMODE_OFF & 5)==0 ); assert( (PAGER_JOURNALMODE_WAL & 5)==5 ); + assert( (PAGER_JOURNALMODE_WAL2 & 5)==4 ); assert( isOpen(pPager->fd) || pPager->exclusiveMode ); - if( !pPager->exclusiveMode && (eOld & 5)==1 && (eMode & 1)==0 ){ + if( !pPager->exclusiveMode && (eOld & 5)==1 && (eMode & 1)==0 + && eMode!=PAGER_JOURNALMODE_WAL2 /* TODO: fix this if possible */ + ){ /* In this case we would like to delete the journal file. If it is ** not possible, then that is not a problem. Deleting the journal file @@ -7443,7 +7437,7 @@ static int pagerExclusiveLock(Pager *pPager){ ** lock on the database file and use heap-memory to store the wal-index ** in. Otherwise, use the normal shared-memory. 
*/ -static int pagerOpenWal(Pager *pPager){ +static int pagerOpenWal(Pager *pPager, int bWal2){ int rc = SQLITE_OK; assert( pPager->pWal==0 && pPager->tempFile==0 ); @@ -7464,7 +7458,7 @@ static int pagerOpenWal(Pager *pPager){ if( rc==SQLITE_OK ){ rc = sqlite3WalOpen(pPager->pVfs, pPager->fd, pPager->zWal, pPager->exclusiveMode, - pPager->journalSizeLimit, &pPager->pWal + pPager->journalSizeLimit, bWal2, &pPager->pWal ); } pagerFixMaplimit(pPager); @@ -7490,6 +7484,7 @@ static int pagerOpenWal(Pager *pPager){ */ int sqlite3PagerOpenWal( Pager *pPager, /* Pager object */ + int bWal2, /* Open in wal2 mode if not already open */ int *pbOpen /* OUT: Set to true if call is a no-op */ ){ int rc = SQLITE_OK; /* Return code */ @@ -7506,9 +7501,9 @@ int sqlite3PagerOpenWal( /* Close any rollback journal previously open */ sqlite3OsClose(pPager->jfd); - rc = pagerOpenWal(pPager); + rc = pagerOpenWal(pPager, bWal2); if( rc==SQLITE_OK ){ - pPager->journalMode = PAGER_JOURNALMODE_WAL; + pPager->journalMode = bWal2?PAGER_JOURNALMODE_WAL2:PAGER_JOURNALMODE_WAL; pPager->eState = PAGER_OPEN; } }else{ @@ -7530,7 +7525,9 @@ int sqlite3PagerOpenWal( int sqlite3PagerCloseWal(Pager *pPager, sqlite3 *db){ int rc = SQLITE_OK; - assert( pPager->journalMode==PAGER_JOURNALMODE_WAL ); + assert( pPager->journalMode==PAGER_JOURNALMODE_WAL + || pPager->journalMode==PAGER_JOURNALMODE_WAL2 + ); /* If the log file is not already open, but does exist in the file-system, ** it may need to be checkpointed before the connection can switch to @@ -7545,7 +7542,7 @@ int sqlite3PagerCloseWal(Pager *pPager, sqlite3 *db){ ); } if( rc==SQLITE_OK && logexists ){ - rc = pagerOpenWal(pPager); + rc = pagerOpenWal(pPager, 0); } } diff --git a/src/pager.h b/src/pager.h index 126267bcc8..399fcd4de3 100644 --- a/src/pager.h +++ b/src/pager.h @@ -81,6 +81,23 @@ typedef struct PgHdr DbPage; #define PAGER_JOURNALMODE_TRUNCATE 3 /* Commit by truncating journal */ #define PAGER_JOURNALMODE_MEMORY 4 /* In-memory journal 
file */ #define PAGER_JOURNALMODE_WAL 5 /* Use write-ahead logging */ +#define PAGER_JOURNALMODE_WAL2 6 /* Use write-ahead logging mode 2 */ + +#define isWalMode(x) ((x)==PAGER_JOURNALMODE_WAL || (x)==PAGER_JOURNALMODE_WAL2) + +/* +** The argument to this macro is a file descriptor (type sqlite3_file*). +** Return 0 if it is not open, or non-zero (but not 1) if it is. +** +** This is so that expressions can be written as: +** +** if( isOpen(pPager->jfd) ){ ... +** +** instead of +** +** if( pPager->jfd->pMethods ){ ... +*/ +#define isOpen(pFd) ((pFd)->pMethods!=0) /* ** Flags that make up the mask passed to sqlite3PagerGet(). @@ -177,7 +194,7 @@ int sqlite3PagerSharedLock(Pager *pPager); int sqlite3PagerCheckpoint(Pager *pPager, sqlite3*, int, int*, int*); int sqlite3PagerWalSupported(Pager *pPager); int sqlite3PagerWalCallback(Pager *pPager); - int sqlite3PagerOpenWal(Pager *pPager, int *pisOpen); + int sqlite3PagerOpenWal(Pager *pPager, int, int *pisOpen); int sqlite3PagerCloseWal(Pager *pPager, sqlite3*); # ifdef SQLITE_DIRECT_OVERFLOW_READ int sqlite3PagerUseWal(Pager *pPager, Pgno); diff --git a/src/pragma.c b/src/pragma.c index 918b1d8131..b78940c971 100644 --- a/src/pragma.c +++ b/src/pragma.c @@ -260,7 +260,7 @@ const char *sqlite3JournalModename(int eMode){ static char * const azModeName[] = { "delete", "persist", "off", "truncate", "memory" #ifndef SQLITE_OMIT_WAL - , "wal" + , "wal", "wal2" #endif }; assert( PAGER_JOURNALMODE_DELETE==0 ); @@ -269,6 +269,7 @@ const char *sqlite3JournalModename(int eMode){ assert( PAGER_JOURNALMODE_TRUNCATE==3 ); assert( PAGER_JOURNALMODE_MEMORY==4 ); assert( PAGER_JOURNALMODE_WAL==5 ); + assert( PAGER_JOURNALMODE_WAL2==6 ); assert( eMode>=0 && eMode<=ArraySize(azModeName) ); if( eMode==ArraySize(azModeName) ) return 0; diff --git a/src/vdbe.c b/src/vdbe.c index 9687170bec..c7b5cf74c4 100644 --- a/src/vdbe.c +++ b/src/vdbe.c @@ -6325,6 +6325,7 @@ case OP_JournalMode: { /* out2 */ || eNew==PAGER_JOURNALMODE_OFF || 
eNew==PAGER_JOURNALMODE_MEMORY || eNew==PAGER_JOURNALMODE_WAL + || eNew==PAGER_JOURNALMODE_WAL2 || eNew==PAGER_JOURNALMODE_QUERY ); assert( pOp->p1>=0 && pOp->p1<db->nDb ); @@ -6342,16 +6343,25 @@ case OP_JournalMode: { /* out2 */ /* Do not allow a transition to journal_mode=WAL for a database ** in temporary storage or if the VFS does not support shared memory */ - if( eNew==PAGER_JOURNALMODE_WAL + if( isWalMode(eNew) && (sqlite3Strlen30(zFilename)==0 /* Temp file */ || !sqlite3PagerWalSupported(pPager)) /* No shared-memory support */ ){ eNew = eOld; } - if( (eNew!=eOld) - && (eOld==PAGER_JOURNALMODE_WAL || eNew==PAGER_JOURNALMODE_WAL) - ){ + if( eNew!=eOld && (isWalMode(eNew) || isWalMode(eOld)) ){ + + /* Prevent changing directly to wal2 from wal mode. And vice versa. */ + if( isWalMode(eNew) && isWalMode(eOld) ){ + rc = SQLITE_ERROR; + sqlite3VdbeError(p, "cannot change from %s to %s mode", + sqlite3JournalModename(eOld), sqlite3JournalModename(eNew) + ); + goto abort_due_to_error; + } + + /* Prevent switching into or out of wal/wal2 mode mid-transaction */ if( !db->autoCommit || db->nVdbeRead>1 ){ rc = SQLITE_ERROR; sqlite3VdbeError(p, @@ -6359,31 +6369,33 @@ case OP_JournalMode: { /* out2 */ (eNew==PAGER_JOURNALMODE_WAL ? "into" : "out of") ); goto abort_due_to_error; - }else{ + } - if( eOld==PAGER_JOURNALMODE_WAL ){ - /* If leaving WAL mode, close the log file. If successful, the call - ** to PagerCloseWal() checkpoints and deletes the write-ahead-log - ** file. An EXCLUSIVE lock may still be held on the database file - ** after a successful return. - */ - rc = sqlite3PagerCloseWal(pPager, db); - if( rc==SQLITE_OK ){ - sqlite3PagerSetJournalMode(pPager, eNew); - } - }else if( eOld==PAGER_JOURNALMODE_MEMORY ){ - /* Cannot transition directly from MEMORY to WAL. Use mode OFF - ** as an intermediate */ - sqlite3PagerSetJournalMode(pPager, PAGER_JOURNALMODE_OFF); - } - - /* Open a transaction on the database file.
Regardless of the journal - ** mode, this transaction always uses a rollback journal. + if( isWalMode(eOld) ){ + /* If leaving WAL mode, close the log file. If successful, the call + ** to PagerCloseWal() checkpoints and deletes the write-ahead-log + ** file. An EXCLUSIVE lock may still be held on the database file + ** after a successful return. */ - assert( sqlite3BtreeIsInTrans(pBt)==0 ); + rc = sqlite3PagerCloseWal(pPager, db); if( rc==SQLITE_OK ){ - rc = sqlite3BtreeSetVersion(pBt, (eNew==PAGER_JOURNALMODE_WAL ? 2 : 1)); + sqlite3PagerSetJournalMode(pPager, eNew); } + }else if( eOld==PAGER_JOURNALMODE_MEMORY ){ + /* Cannot transition directly from MEMORY to WAL. Use mode OFF + ** as an intermediate */ + sqlite3PagerSetJournalMode(pPager, PAGER_JOURNALMODE_OFF); + } + + /* Open a transaction on the database file. Regardless of the journal + ** mode, this transaction always uses a rollback journal. + */ + assert( sqlite3BtreeIsInTrans(pBt)==0 ); + if( rc==SQLITE_OK ){ + /* 1==rollback, 2==wal, 3==wal2 */ + rc = sqlite3BtreeSetVersion(pBt, + 1 + isWalMode(eNew) + (eNew==PAGER_JOURNALMODE_WAL2) + ); } } #endif /* ifndef SQLITE_OMIT_WAL */ diff --git a/src/wal.c b/src/wal.c index 9930b84421..36c04289a5 100644 --- a/src/wal.c +++ b/src/wal.c @@ -255,20 +255,19 @@ int sqlite3WalTrace = 0; #endif /* -** The maximum (and only) versions of the wal and wal-index formats -** that may be interpreted by this version of SQLite. +** Both the wal-file and the wal-index contain version fields +** indicating the current version of the system. If a client +** reads the header of a wal file (as part of recovery), or the +** wal-index (as part of opening a read transaction) and (a) the +** header checksum is correct but (b) the version field is not +** recognized, the operation fails with SQLITE_CANTOPEN. 
** -** If a client begins recovering a WAL file and finds that (a) the checksum -** values in the wal-header are correct and (b) the version field is not -** WAL_MAX_VERSION, recovery fails and SQLite returns SQLITE_CANTOPEN. -** -** Similarly, if a client successfully reads a wal-index header (i.e. the -** checksum test is successful) and finds that the version field is not -** WALINDEX_MAX_VERSION, then no read-transaction is opened and SQLite -** returns SQLITE_CANTOPEN. +** Currently, clients support both version-1 ("journal_mode=wal") and +** version-2 ("journal_mode=wal2"). Legacy clients may support version-1 +** only. */ -#define WAL_MAX_VERSION 3007000 -#define WALINDEX_MAX_VERSION 3007000 +#define WAL_VERSION1 3007000 /* For "journal_mode=wal" */ +#define WAL_VERSION2 3021000 /* For "journal_mode=wal2" */ /* ** Indices of various locking bytes. WAL_NREADER is the number @@ -282,6 +281,35 @@ int sqlite3WalTrace = 0; #define WAL_READ_LOCK(I) (3+(I)) #define WAL_NREADER (SQLITE_SHM_NLOCK-3) +/* +** Values that may be stored in Wal.readLock in wal2 mode. +** +** In wal mode, the Wal.readLock member is set to -1 when no read-lock +** is held, or else is the index of the read-mark on which a lock is +** held. +** +** In wal2 mode, Wal.readLock must be set to one of the following values. +** A value of -1 still indicates that no read-lock is held, but the other +** values are symbolic. See the implementation of walLockReader() for +** details of how the symbols map to OS level locks. +*/ +#define WAL_LOCK_NONE -1 +#define WAL_LOCK_PART1 1 +#define WAL_LOCK_PART1_FULL2 2 +#define WAL_LOCK_PART2 3 +#define WAL_LOCK_PART2_FULL1 4 + +/* +** This constant is used in wal2 mode only. +** +** In wal2 mode, when committing a transaction, if the current wal file +** is sufficiently large and there are no conflicting locks held, the +** writer writes the new transaction into the start of the other wal +** file. 
Usually, "sufficiently large" is defined by the value configured +** using "PRAGMA journal_size_limit". However, if no such value has been +** configured, sufficiently large defaults to WAL_DEFAULT_WALSIZE frames. +*/ +#define WAL_DEFAULT_WALSIZE 1000 /* Object declarations */ typedef struct WalIndexHdr WalIndexHdr; @@ -301,21 +329,64 @@ typedef struct WalCkptInfo WalCkptInfo; ** The szPage value can be any power of 2 between 512 and 32768, inclusive. ** Or it can be 1 to represent a 65536-byte page. The latter case was ** added in 3.7.1 when support for 64K pages was added. +** +** WAL2 mode notes: Member variable mxFrame2 is only used in wal2 mode +** (when iVersion is set to WAL_VERSION2). The lower 31 bits store +** the maximum frame number in file *-wal2. The most significant bit +** is a flag - set if clients are currently appending to *-wal2, clear +** otherwise. */ struct WalIndexHdr { u32 iVersion; /* Wal-index version */ - u32 unused; /* Unused (padding) field */ + u32 mxFrame2; /* See "WAL2 mode notes" above */ u32 iChange; /* Counter incremented each transaction */ u8 isInit; /* 1 when initialized */ u8 bigEndCksum; /* True if checksums in WAL are big-endian */ u16 szPage; /* Database page size in bytes. 1==64K */ - u32 mxFrame; /* Index of last valid frame in the WAL */ + u32 mxFrame; /* Index of last valid frame in each WAL */ u32 nPage; /* Size of database in pages */ u32 aFrameCksum[2]; /* Checksum of last frame in log */ u32 aSalt[2]; /* Two salt values copied from WAL header */ u32 aCksum[2]; /* Checksum over all prior fields */ }; +/* +** The following macros and functions are get/set methods for the maximum +** frame numbers and current wal file values stored in the WalIndexHdr +** structure. These are helpful because of the unorthodox way in which +** the values are stored in wal2 mode (see above). They are equivalent +** to functions with the following signatures. 
+** +** u32 walidxGetMxFrame(WalIndexHdr*, int iWal); // get mxFrame +** void walidxSetMxFrame(WalIndexHdr*, int iWal, u32 val); // set mxFrame +** int walidxGetFile(WalIndexHdr*) // get file +** void walidxSetFile(WalIndexHdr*, int val); // set file +*/ +#define walidxGetMxFrame(pHdr, iWal) \ + ((iWal) ? ((pHdr)->mxFrame2 & 0x7FFFFFFF) : (pHdr)->mxFrame) + +static void walidxSetMxFrame(WalIndexHdr *pHdr, int iWal, u32 mxFrame){ + if( iWal ){ + pHdr->mxFrame2 = (pHdr->mxFrame2 & 0x80000000) | mxFrame; + }else{ + pHdr->mxFrame = mxFrame; + } + assert( walidxGetMxFrame(pHdr, iWal)==mxFrame ); +} + +#define walidxGetFile(pHdr) ((pHdr)->mxFrame2 >> 31) + +#define walidxSetFile(pHdr, iWal) ( \ + (pHdr)->mxFrame2 = ((pHdr)->mxFrame2 & 0x7FFFFFFF) | ((iWal)<<31) \ +) + +/* +** Argument is a pointer to a Wal structure. Return true if the current +** cache of the wal-index header indicates "journal_mode=wal2" mode, or +** false otherwise. +*/ +#define isWalMode2(pWal) ((pWal)->hdr.iVersion==WAL_VERSION2) + /* ** A copy of the following object occurs in the wal-index immediately ** following the second copy of the WalIndexHdr.
This object stores @@ -427,7 +498,7 @@ struct WalCkptInfo { struct Wal { sqlite3_vfs *pVfs; /* The VFS used to create pDbFd */ sqlite3_file *pDbFd; /* File handle for the database file */ - sqlite3_file *pWalFd; /* File handle for WAL file */ + sqlite3_file *apWalFd[2]; /* File handle for "*-wal" and "*-wal2" */ u32 iCallback; /* Value to pass to log callback (or 0) */ i64 mxWalSize; /* Truncate WAL to this size upon reset */ int nWiData; /* Size of array apWiData */ @@ -447,6 +518,7 @@ struct Wal { u32 minFrame; /* Ignore wal frames before this one */ u32 iReCksum; /* On commit, recalculate checksums from here */ const char *zWalName; /* Name of WAL file */ + char *zWalName2; /* Name of second WAL file */ u32 nCkpt; /* Checkpoint sequence counter in the wal-header */ #ifdef SQLITE_DEBUG u8 lockError; /* True if a locking error has occurred */ @@ -454,6 +526,7 @@ struct Wal { #ifdef SQLITE_ENABLE_SNAPSHOT WalIndexHdr *pSnapshot; /* Start transaction here if not NULL */ #endif + int bWal2; /* bWal2 flag passed to WalOpen() */ }; /* @@ -667,7 +740,7 @@ static void walIndexWriteHdr(Wal *pWal){ assert( pWal->writeLock ); pWal->hdr.isInit = 1; - pWal->hdr.iVersion = WALINDEX_MAX_VERSION; + assert( pWal->hdr.iVersion==WAL_VERSION1||pWal->hdr.iVersion==WAL_VERSION2 ); walChecksumBytes(1, (u8*)&pWal->hdr, nCksum, 0, pWal->hdr.aCksum); memcpy((void*)&aHdr[1], (const void*)&pWal->hdr, sizeof(WalIndexHdr)); walShmBarrier(pWal); @@ -745,7 +818,7 @@ static int walDecodeFrame( } /* A frame is only valid if a checksum of the WAL header, - ** all prior frams, the first 16 bytes of this frame-header, + ** all prior frames, the first 16 bytes of this frame-header, ** and the frame-data matches the checksum in the last 8 ** bytes of this frame-header. */ @@ -831,6 +904,36 @@ static void walUnlockExclusive(Wal *pWal, int lockIdx, int n){ walLockName(lockIdx), n)); } +/* +** This function is used to take and release read-locks in wal2 mode. 
+** +** Use of WAL_READ_LOCK(x) slots for (1<=x<=4). +** +** 1) Partial read of *-wal-1 (blocks checkpointer from checkpointing) +** 2) Full read of *-wal-2 (blocks writer from writing) +** 3) Partial read of *-wal-2 (blocks checkpointer from checkpointing) +** 4) Full read of *-wal-1 (blocks writer from writing) +*/ +static int walLockReader(Wal *pWal, int eLock, int bLock){ + int i; /* Index of first readmark to lock */ + int n; /* Number of readmarks to lock */ + + assert( pWal->hdr.iVersion==WAL_VERSION2 ); + if( pWal->exclusiveMode ) return SQLITE_OK; + + switch( eLock ){ + case WAL_LOCK_PART1 : i = 1; n = 1; break; + case WAL_LOCK_PART1_FULL2: i = 1; n = 2; break; + case WAL_LOCK_PART2 : i = 3; n = 1; break; + case WAL_LOCK_PART2_FULL1: i = 3; n = 2; break; + default: assert( !"cannot happen" ); + } + + return sqlite3OsShmLock(pWal->pDbFd, WAL_READ_LOCK(i), n, + SQLITE_SHM_SHARED | (bLock ? SQLITE_SHM_LOCK : SQLITE_SHM_UNLOCK) + ); +} + /* ** Compute a hash on a page number. The resulting hash value must land ** between 0 and (HASHTABLE_NSLOT-1). The walHashNext() function advances @@ -891,6 +994,43 @@ static int walHashGet( return rc; } +static u32 walExternalEncode(int iWal, u32 iFrame){ + u32 iRet; + if( iWal ){ + iRet = HASHTABLE_NPAGE_ONE + iFrame; + iRet += ((iFrame-1) / HASHTABLE_NPAGE) * HASHTABLE_NPAGE; + }else{ + iRet = iFrame; + iFrame += HASHTABLE_NPAGE - HASHTABLE_NPAGE_ONE; + iRet += ((iFrame-1) / HASHTABLE_NPAGE) * HASHTABLE_NPAGE; + } + return iRet; +} + +/* +** Parameter iExternal is an external frame identifier. This function +** transforms it to a wal file number (0 or 1) and frame number within +** this wal file (reported via output parameter *piRead). +*/ +static int walExternalDecode(u32 iExternal, u32 *piRead){ + int iHash = (iExternal+HASHTABLE_NPAGE-HASHTABLE_NPAGE_ONE-1)/HASHTABLE_NPAGE; + + if( 0==(iHash & 0x01) ){ + /* A frame in wal file 0 */ + *piRead = (iExternal <= HASHTABLE_NPAGE_ONE) ? 
iExternal : + iExternal - (iHash/2) * HASHTABLE_NPAGE; + return 0; + } + if( iHash==0 ){ + *piRead = iExternal; + return 0; + }else{ + *piRead = iExternal - HASHTABLE_NPAGE_ONE - ((iHash-1)/2) * HASHTABLE_NPAGE; + } + + return (iHash % 2); +} + /* ** Return the number of the wal-index page that contains the hash-table ** and page-number array that contain entries corresponding to WAL frame @@ -908,6 +1048,22 @@ static int walFramePage(u32 iFrame){ return iHash; } +/* +** Return the index of the hash-table corresponding to frame iFrame of wal +** file iWal. +*/ +static int walFramePage2(int iWal, u32 iFrame){ + int iRet; + assert( iWal==0 || iWal==1 ); + assert( iFrame>0 ); + if( iWal==0 ){ + iRet = 2*((iFrame+HASHTABLE_NPAGE-HASHTABLE_NPAGE_ONE-1)/HASHTABLE_NPAGE); + }else{ + iRet = 1 + 2 * ((iFrame-1) / HASHTABLE_NPAGE); + } + return iRet; +} + /* ** Return the page number associated with frame iFrame in this WAL. */ @@ -919,6 +1075,10 @@ static u32 walFramePgno(Wal *pWal, u32 iFrame){ return pWal->apWiData[iHash][(iFrame-1-HASHTABLE_NPAGE_ONE)%HASHTABLE_NPAGE]; } +static u32 walFramePgno2(Wal *pWal, int iWal, u32 iFrame){ + return walFramePgno(pWal, walExternalEncode(iWal, iFrame)); +} + /* ** Remove entries from the hash table that point to WAL slots greater ** than pWal->hdr.mxFrame. 
@@ -938,26 +1098,36 @@ static void walCleanupHash(Wal *pWal){ int iLimit = 0; /* Zero values greater than this */ int nByte; /* Number of bytes to zero in aPgno[] */ int i; /* Used to iterate through aHash[] */ + int iWal = walidxGetFile(&pWal->hdr); + u32 mxFrame = walidxGetMxFrame(&pWal->hdr, iWal); + + u32 iExternal; + if( isWalMode2(pWal) ){ + iExternal = walExternalEncode(iWal, mxFrame); + }else{ + assert( iWal==0 ); + iExternal = mxFrame; + } assert( pWal->writeLock ); - testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE_ONE-1 ); - testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE_ONE ); - testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE_ONE+1 ); + testcase( mxFrame==HASHTABLE_NPAGE_ONE-1 ); + testcase( mxFrame==HASHTABLE_NPAGE_ONE ); + testcase( mxFrame==HASHTABLE_NPAGE_ONE+1 ); - if( pWal->hdr.mxFrame==0 ) return; + if( mxFrame==0 ) return; /* Obtain pointers to the hash-table and page-number array containing ** the entry that corresponds to frame pWal->hdr.mxFrame. It is guaranteed ** that the page said hash-table and array reside on is already mapped. */ - assert( pWal->nWiData>walFramePage(pWal->hdr.mxFrame) ); - assert( pWal->apWiData[walFramePage(pWal->hdr.mxFrame)] ); - walHashGet(pWal, walFramePage(pWal->hdr.mxFrame), &aHash, &aPgno, &iZero); + assert( pWal->nWiData>walFramePage(iExternal) ); + assert( pWal->apWiData[walFramePage(iExternal)] ); + walHashGet(pWal, walFramePage(iExternal), &aHash, &aPgno, &iZero); /* Zero all hash-table entries that correspond to frame numbers greater ** than pWal->hdr.mxFrame. */ - iLimit = pWal->hdr.mxFrame - iZero; + iLimit = iExternal - iZero; assert( iLimit>0 ); for(i=0; iiLimit ){ @@ -966,8 +1136,7 @@ static void walCleanupHash(Wal *pWal){ } /* Zero the entries in the aPgno array that correspond to frames with - ** frame numbers greater than pWal->hdr.mxFrame. - */ + ** frame numbers greater than pWal->hdr.mxFrame. 
*/ nByte = (int)((char *)aHash - (char *)&aPgno[iLimit+1]); memset((void *)&aPgno[iLimit+1], 0, nByte); @@ -988,18 +1157,25 @@ static void walCleanupHash(Wal *pWal){ #endif /* SQLITE_ENABLE_EXPENSIVE_ASSERT */ } - /* ** Set an entry in the wal-index that will map database page number ** pPage into WAL frame iFrame. */ -static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){ +static int walIndexAppend(Wal *pWal, int iWal, u32 iFrame, u32 iPage){ int rc; /* Return code */ u32 iZero = 0; /* One less than frame number of aPgno[1] */ volatile u32 *aPgno = 0; /* Page number array */ volatile ht_slot *aHash = 0; /* Hash table */ + u32 iExternal; + + if( isWalMode2(pWal) ){ + iExternal = walExternalEncode(iWal, iFrame); + }else{ + assert( iWal==0 ); + iExternal = iFrame; + } - rc = walHashGet(pWal, walFramePage(iFrame), &aHash, &aPgno, &iZero); + rc = walHashGet(pWal, walFramePage(iExternal), &aHash, &aPgno, &iZero); /* Assuming the wal-index file was successfully mapped, populate the ** page number array and hash table entry. @@ -1009,7 +1185,7 @@ static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){ int idx; /* Value to write to hash-table slot */ int nCollide; /* Number of hash collisions */ - idx = iFrame - iZero; + idx = iExternal - iZero; assert( idx <= HASHTABLE_NSLOT/2 + 1 ); /* If this is the first entry to be added to this hash-table, zero the @@ -1071,6 +1247,133 @@ static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){ return rc; } +/* +** Recover a single wal file - *-wal if iWal==0, or *-wal2 if iWal==1. 
+*/ +static int walIndexRecoverOne(Wal *pWal, int iWal, u32 *pnCkpt, int *pbZero){ + i64 nSize; /* Size of log file */ + u32 aFrameCksum[2] = {0, 0}; + int rc; + sqlite3_file *pWalFd = pWal->apWalFd[iWal]; + + assert( iWal==0 || iWal==1 ); + + memset(&pWal->hdr, 0, sizeof(WalIndexHdr)); + sqlite3_randomness(8, pWal->hdr.aSalt); + + rc = sqlite3OsFileSize(pWalFd, &nSize); + if( rc==SQLITE_OK ){ + if( nSize>WAL_HDRSIZE ){ + u8 aBuf[WAL_HDRSIZE]; /* Buffer to load WAL header into */ + u8 *aFrame = 0; /* Malloc'd buffer to load entire frame */ + int szFrame; /* Number of bytes in buffer aFrame[] */ + u8 *aData; /* Pointer to data part of aFrame buffer */ + int iFrame; /* Index of last frame read */ + i64 iOffset; /* Next offset to read from log file */ + int szPage; /* Page size according to the log */ + u32 magic; /* Magic value read from WAL header */ + u32 version; /* Magic value read from WAL header */ + int isValid; /* True if this frame is valid */ + + /* Read in the WAL header. */ + rc = sqlite3OsRead(pWalFd, aBuf, WAL_HDRSIZE, 0); + if( rc!=SQLITE_OK ){ + return rc; + } + + /* If the database page size is not a power of two, or is greater than + ** SQLITE_MAX_PAGE_SIZE, conclude that the WAL file contains no valid + ** data. Similarly, if the 'magic' value is invalid, ignore the whole + ** WAL file. 
+ */ + magic = sqlite3Get4byte(&aBuf[0]); + szPage = sqlite3Get4byte(&aBuf[8]); + if( (magic&0xFFFFFFFE)!=WAL_MAGIC + || szPage&(szPage-1) + || szPage>SQLITE_MAX_PAGE_SIZE + || szPage<512 + ){ + return SQLITE_OK; + } + pWal->hdr.bigEndCksum = (u8)(magic&0x00000001); + pWal->szPage = szPage; + + /* Verify that the WAL header checksum is correct */ + walChecksumBytes(pWal->hdr.bigEndCksum==SQLITE_BIGENDIAN, + aBuf, WAL_HDRSIZE-2*4, 0, pWal->hdr.aFrameCksum + ); + if( pWal->hdr.aFrameCksum[0]!=sqlite3Get4byte(&aBuf[24]) + || pWal->hdr.aFrameCksum[1]!=sqlite3Get4byte(&aBuf[28]) + ){ + return SQLITE_OK; + } + + memcpy(&pWal->hdr.aSalt, &aBuf[16], 8); + *pnCkpt = sqlite3Get4byte(&aBuf[12]); + + /* Verify that the version number on the WAL format is one that + ** are able to understand */ + version = sqlite3Get4byte(&aBuf[4]); + if( version!=WAL_VERSION1 && version!=WAL_VERSION2 ){ + return SQLITE_CANTOPEN_BKPT; + } + pWal->hdr.iVersion = version; + + /* Malloc a buffer to read frames into. */ + szFrame = szPage + WAL_FRAME_HDRSIZE; + aFrame = (u8 *)sqlite3_malloc64(szFrame); + if( !aFrame ){ + return SQLITE_NOMEM_BKPT; + } + aData = &aFrame[WAL_FRAME_HDRSIZE]; + + /* Read all frames from the log file. */ + iFrame = 0; + for(iOffset=WAL_HDRSIZE; (iOffset+szFrame)<=nSize; iOffset+=szFrame){ + u32 pgno; /* Database page number for frame */ + u32 nTruncate; /* dbsize field from frame header */ + + /* Read and decode the next log frame. */ + iFrame++; + rc = sqlite3OsRead(pWalFd, aFrame, szFrame, iOffset); + if( rc!=SQLITE_OK ) break; + isValid = walDecodeFrame(pWal, &pgno, &nTruncate, aData, aFrame); + if( !isValid ) break; + rc = walIndexAppend(pWal, iWal, iFrame, pgno); + if( rc!=SQLITE_OK ) break; + + /* If nTruncate is non-zero, this is a commit record. 
*/ + if( nTruncate ){ + pWal->hdr.mxFrame = iFrame; + pWal->hdr.nPage = nTruncate; + pWal->hdr.szPage = (u16)((szPage&0xff00) | (szPage>>16)); + testcase( szPage<=32768 ); + testcase( szPage>=65536 ); + aFrameCksum[0] = pWal->hdr.aFrameCksum[0]; + aFrameCksum[1] = pWal->hdr.aFrameCksum[1]; + } + } + + sqlite3_free(aFrame); + }else if( pbZero && nSize==0 ){ + *pbZero = 1; + } + } + + pWal->hdr.aFrameCksum[0] = aFrameCksum[0]; + pWal->hdr.aFrameCksum[1] = aFrameCksum[1]; + + return rc; +} + +static int walOpenWal2(Wal *pWal){ + int rc = SQLITE_OK; + if( !isOpen(pWal->apWalFd[1]) ){ + int f = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_WAL); + rc = sqlite3OsOpen(pWal->pVfs, pWal->zWalName2, pWal->apWalFd[1], f, &f); + } + return rc; +} /* ** Recover the wal-index by reading the write-ahead log file. @@ -1084,10 +1387,12 @@ static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){ */ static int walIndexRecover(Wal *pWal){ int rc; /* Return Code */ - i64 nSize; /* Size of log file */ - u32 aFrameCksum[2] = {0, 0}; int iLock; /* Lock offset to lock for checkpoint */ int nLock; /* Number of locks to hold */ + u32 nCkpt1 = 0xFFFFFFFF; + u32 nCkpt2 = 0xFFFFFFFF; + int bZero = 0; + WalIndexHdr hdr; /* Obtain an exclusive lock on all byte in the locking range not already ** locked by the caller. 
The caller is guaranteed to have locked the @@ -1107,147 +1412,116 @@ static int walIndexRecover(Wal *pWal){ } WALTRACE(("WAL%p: recovery begin...\n", pWal)); - memset(&pWal->hdr, 0, sizeof(WalIndexHdr)); - - rc = sqlite3OsFileSize(pWal->pWalFd, &nSize); - if( rc!=SQLITE_OK ){ - goto recovery_error; - } - - if( nSize>WAL_HDRSIZE ){ - u8 aBuf[WAL_HDRSIZE]; /* Buffer to load WAL header into */ - u8 *aFrame = 0; /* Malloc'd buffer to load entire frame */ - int szFrame; /* Number of bytes in buffer aFrame[] */ - u8 *aData; /* Pointer to data part of aFrame buffer */ - int iFrame; /* Index of last frame read */ - i64 iOffset; /* Next offset to read from log file */ - int szPage; /* Page size according to the log */ - u32 magic; /* Magic value read from WAL header */ - u32 version; /* Magic value read from WAL header */ - int isValid; /* True if this frame is valid */ - - /* Read in the WAL header. */ - rc = sqlite3OsRead(pWal->pWalFd, aBuf, WAL_HDRSIZE, 0); - if( rc!=SQLITE_OK ){ - goto recovery_error; - } - - /* If the database page size is not a power of two, or is greater than - ** SQLITE_MAX_PAGE_SIZE, conclude that the WAL file contains no valid - ** data. Similarly, if the 'magic' value is invalid, ignore the whole - ** WAL file. - */ - magic = sqlite3Get4byte(&aBuf[0]); - szPage = sqlite3Get4byte(&aBuf[8]); - if( (magic&0xFFFFFFFE)!=WAL_MAGIC - || szPage&(szPage-1) - || szPage>SQLITE_MAX_PAGE_SIZE - || szPage<512 - ){ - goto finished; - } - pWal->hdr.bigEndCksum = (u8)(magic&0x00000001); - pWal->szPage = szPage; - pWal->nCkpt = sqlite3Get4byte(&aBuf[12]); - memcpy(&pWal->hdr.aSalt, &aBuf[16], 8); - - /* Verify that the WAL header checksum is correct */ - walChecksumBytes(pWal->hdr.bigEndCksum==SQLITE_BIGENDIAN, - aBuf, WAL_HDRSIZE-2*4, 0, pWal->hdr.aFrameCksum - ); - if( pWal->hdr.aFrameCksum[0]!=sqlite3Get4byte(&aBuf[24]) - || pWal->hdr.aFrameCksum[1]!=sqlite3Get4byte(&aBuf[28]) - ){ - goto finished; + /* Recover the *-wal file. 
If a valid version-1 header is recovered + ** from it, do not open the *-wal2 file. Even if it exists. + ** + ** Otherwise, if the *-wal2 file exists or if the "wal2" flag was + ** specified when sqlite3WalOpen() was called, open and recover + ** the *-wal2 file. Except, if the *-wal file was zero bytes in size, + ** truncate the *-wal2 to zero bytes in size. + ** + ** After this block has run, if the *-wal2 file is open the system + ** starts up in VERSION2 mode. In this case pWal->hdr contains the + ** wal-index header considering only *-wal2. Stack variable hdr + ** contains the wal-index header considering only *-wal. The hash + ** tables are populated for both. + ** + ** Or, if the *-wal2 file is not open, start up in VERSION1 mode. + ** pWal->hdr is already populated. + */ + rc = walIndexRecoverOne(pWal, 0, &nCkpt1, &bZero); + assert( pWal->hdr.iVersion==0 + || pWal->hdr.iVersion==WAL_VERSION1 + || pWal->hdr.iVersion==WAL_VERSION2 + ); + if( rc==SQLITE_OK && pWal->hdr.iVersion!=WAL_VERSION1 ){ + int bOpen = 1; + sqlite3_vfs *pVfs = pWal->pVfs; + if( pWal->hdr.iVersion==0 && pWal->bWal2==0 ){ + rc = sqlite3OsAccess(pVfs, pWal->zWalName2, SQLITE_ACCESS_EXISTS, &bOpen); } - - /* Verify that the version number on the WAL format is one that - ** are able to understand */ - version = sqlite3Get4byte(&aBuf[4]); - if( version!=WAL_MAX_VERSION ){ - rc = SQLITE_CANTOPEN_BKPT; - goto finished; + if( rc==SQLITE_OK && bOpen ){ + rc = walOpenWal2(pWal); + if( rc==SQLITE_OK ){ + hdr = pWal->hdr; + rc = walIndexRecoverOne(pWal, 1, &nCkpt2, 0); + } } + } - /* Malloc a buffer to read frames into. */ - szFrame = szPage + WAL_FRAME_HDRSIZE; - aFrame = (u8 *)sqlite3_malloc64(szFrame); - if( !aFrame ){ - rc = SQLITE_NOMEM_BKPT; - goto recovery_error; - } - aData = &aFrame[WAL_FRAME_HDRSIZE]; + if( rc==SQLITE_OK ){ + volatile WalCkptInfo *pInfo; - /* Read all frames from the log file. 
*/ - iFrame = 0; - for(iOffset=WAL_HDRSIZE; (iOffset+szFrame)<=nSize; iOffset+=szFrame){ - u32 pgno; /* Database page number for frame */ - u32 nTruncate; /* dbsize field from frame header */ + if( isOpen(pWal->apWalFd[1]) ){ + /* The case where *-wal2 may follow *-wal */ + if( nCkpt2<=0x0F && nCkpt2==nCkpt1+1 ){ + if( sqlite3Get4byte((u8*)(&pWal->hdr.aSalt[0]))==hdr.aFrameCksum[0] + && sqlite3Get4byte((u8*)(&pWal->hdr.aSalt[1]))==hdr.aFrameCksum[1] + ){ + walidxSetFile(&pWal->hdr, 1); + walidxSetMxFrame(&pWal->hdr, 1, pWal->hdr.mxFrame); + walidxSetMxFrame(&pWal->hdr, 0, hdr.mxFrame); + }else{ + pWal->hdr = hdr; + } + }else - /* Read and decode the next log frame. */ - iFrame++; - rc = sqlite3OsRead(pWal->pWalFd, aFrame, szFrame, iOffset); - if( rc!=SQLITE_OK ) break; - isValid = walDecodeFrame(pWal, &pgno, &nTruncate, aData, aFrame); - if( !isValid ) break; - rc = walIndexAppend(pWal, iFrame, pgno); - if( rc!=SQLITE_OK ) break; + /* When *-wal may follow *-wal2 */ + if( (nCkpt2==0x0F && nCkpt1==0) || (nCkpt2<0x0F && nCkpt2==nCkpt1-1) ){ + if( sqlite3Get4byte((u8*)(&hdr.aSalt[0]))==pWal->hdr.aFrameCksum[0] + && sqlite3Get4byte((u8*)(&hdr.aSalt[1]))==pWal->hdr.aFrameCksum[1] + ){ + SWAP(WalIndexHdr, pWal->hdr, hdr); + walidxSetMxFrame(&pWal->hdr, 1, hdr.mxFrame); + } + }else - /* If nTruncate is non-zero, this is a commit record. 
*/ - if( nTruncate ){ - pWal->hdr.mxFrame = iFrame; - pWal->hdr.nPage = nTruncate; - pWal->hdr.szPage = (u16)((szPage&0xff00) | (szPage>>16)); - testcase( szPage<=32768 ); - testcase( szPage>=65536 ); - aFrameCksum[0] = pWal->hdr.aFrameCksum[0]; - aFrameCksum[1] = pWal->hdr.aFrameCksum[1]; + /* Fallback */ + if( nCkpt1<=nCkpt2 ){ + pWal->hdr = hdr; + }else{ + walidxSetFile(&pWal->hdr, 1); } + pWal->hdr.iVersion = WAL_VERSION2; + }else{ + pWal->hdr.iVersion = WAL_VERSION1; } - sqlite3_free(aFrame); - } - -finished: - if( rc==SQLITE_OK ){ - volatile WalCkptInfo *pInfo; - int i; - pWal->hdr.aFrameCksum[0] = aFrameCksum[0]; - pWal->hdr.aFrameCksum[1] = aFrameCksum[1]; walIndexWriteHdr(pWal); /* Reset the checkpoint-header. This is safe because this thread is ** currently holding locks that exclude all other readers, writers and - ** checkpointers. - */ + ** checkpointers. */ pInfo = walCkptInfo(pWal); - pInfo->nBackfill = 0; - pInfo->nBackfillAttempted = pWal->hdr.mxFrame; - pInfo->aReadMark[0] = 0; - for(i=1; iaReadMark[i] = READMARK_NOT_USED; - if( pWal->hdr.mxFrame ) pInfo->aReadMark[1] = pWal->hdr.mxFrame; + memset((void*)pInfo, 0, sizeof(WalCkptInfo)); + if( 0==isWalMode2(pWal) ){ + int i; + pInfo->nBackfillAttempted = pWal->hdr.mxFrame; + pInfo->aReadMark[0] = 0; + for(i=1; iaReadMark[i] = READMARK_NOT_USED; + if( pWal->hdr.mxFrame ) pInfo->aReadMark[1] = pWal->hdr.mxFrame; + } /* If more than one frame was recovered from the log file, report an ** event via sqlite3_log(). This is to help with identifying performance ** problems caused by applications routinely shutting down without - ** checkpointing the log file. - */ + ** checkpointing the log file. */ if( pWal->hdr.nPage ){ sqlite3_log(SQLITE_NOTICE_RECOVER_WAL, - "recovered %d frames from WAL file %s", - pWal->hdr.mxFrame, pWal->zWalName + "recovered (%d,%d) frames from WAL files %s[2] (%s mode)", + walidxGetMxFrame(&pWal->hdr, 0), walidxGetMxFrame(&pWal->hdr, 1), + pWal->zWalName, isWalMode2(pWal) ? 
"wal2" : "wal" ); } } -recovery_error: WALTRACE(("WAL%p: recovery %s\n", pWal, rc ? "failed" : "ok")); walUnlockExclusive(pWal, iLock, nLock); return rc; } /* -** Close an open wal-index. +** Close an open wal-index and wal files. */ static void walIndexClose(Wal *pWal, int isDelete){ if( pWal->exclusiveMode==WAL_HEAPMEMORY_MODE ){ @@ -1259,6 +1533,8 @@ static void walIndexClose(Wal *pWal, int isDelete){ }else{ sqlite3OsShmUnmap(pWal->pDbFd, isDelete); } + sqlite3OsClose(pWal->apWalFd[0]); + sqlite3OsClose(pWal->apWalFd[1]); } /* @@ -1282,11 +1558,14 @@ int sqlite3WalOpen( const char *zWalName, /* Name of the WAL file */ int bNoShm, /* True to run in heap-memory mode */ i64 mxWalSize, /* Truncate WAL to this size on reset */ + int bWal2, /* True to open in wal2 mode */ Wal **ppWal /* OUT: Allocated Wal handle */ ){ int rc; /* Return Code */ Wal *pRet; /* Object to allocate and return */ int flags; /* Flags passed to OsOpen() */ + int nWalName; /* Length of zWalName in bytes */ + int nByte; /* Bytes of space to allocate */ assert( zWalName && zWalName[0] ); assert( pDbFd ); @@ -1306,34 +1585,42 @@ int sqlite3WalOpen( assert( UNIX_SHM_BASE==WALINDEX_LOCK_OFFSET ); #endif + nWalName = sqlite3Strlen30(zWalName); + nByte = sizeof(Wal) + pVfs->szOsFile*2 + nWalName+2; /* Allocate an instance of struct Wal to return. */ *ppWal = 0; - pRet = (Wal*)sqlite3MallocZero(sizeof(Wal) + pVfs->szOsFile); + pRet = (Wal*)sqlite3MallocZero(nByte); if( !pRet ){ return SQLITE_NOMEM_BKPT; } pRet->pVfs = pVfs; - pRet->pWalFd = (sqlite3_file *)&pRet[1]; + pRet->apWalFd[0] = (sqlite3_file*)((char*)pRet+sizeof(Wal)); + pRet->apWalFd[1] = (sqlite3_file*)((char*)pRet+sizeof(Wal)+pVfs->szOsFile); pRet->pDbFd = pDbFd; - pRet->readLock = -1; + pRet->readLock = WAL_LOCK_NONE; pRet->mxWalSize = mxWalSize; pRet->zWalName = zWalName; pRet->syncHeader = 1; pRet->padToSectorBoundary = 1; pRet->exclusiveMode = (bNoShm ? 
WAL_HEAPMEMORY_MODE: WAL_NORMAL_MODE); + pRet->bWal2 = bWal2; + + pRet->zWalName2 = (char*)pRet + sizeof(Wal) + 2*pVfs->szOsFile; + memcpy(pRet->zWalName2, zWalName, nWalName); + pRet->zWalName2[nWalName] = '2'; + pRet->zWalName2[nWalName+1] = '\0'; - /* Open file handle on the write-ahead log file. */ + /* Open a file handle on the first write-ahead log file. */ flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_WAL); - rc = sqlite3OsOpen(pVfs, zWalName, pRet->pWalFd, flags, &flags); + rc = sqlite3OsOpen(pVfs, zWalName, pRet->apWalFd[0], flags, &flags); if( rc==SQLITE_OK && flags&SQLITE_OPEN_READONLY ){ pRet->readOnly = WAL_RDONLY; } if( rc!=SQLITE_OK ){ walIndexClose(pRet, 0); - sqlite3OsClose(pRet->pWalFd); sqlite3_free(pRet); }else{ int iDC = sqlite3OsDeviceCharacteristics(pDbFd); @@ -1546,30 +1833,40 @@ static void walIteratorFree(WalIterator *p){ ** pages in the WAL in ascending order. The caller must hold the checkpoint ** lock. ** -** On success, make *pp point to the newly allocated WalInterator object -** return SQLITE_OK. Otherwise, return an error code. If this routine -** returns an error, the value of *pp is undefined. +** On success, make *pp point to the newly allocated WalIterator object +** and return SQLITE_OK. Otherwise, return an error code. If this routine +** returns an error, the final value of *pp is undefined. ** ** The calling routine should invoke walIteratorFree() to destroy the ** WalIterator object when it has finished with it. */ -static int walIteratorInit(Wal *pWal, WalIterator **pp){ +static int walIteratorInit(Wal *pWal, int iWal, WalIterator **pp){ WalIterator *p; /* Return value */ int nSegment; /* Number of segments to merge */ u32 iLast; /* Last frame in log */ int nByte; /* Number of bytes to allocate */ int i; /* Iterator variable */ + int iLastSeg; /* Last hash table to iterate though */ ht_slot *aTmp; /* Temp space used by merge-sort */ int rc = SQLITE_OK; /* Return Code */ + int iMode = isWalMode2(pWal) ? 
2 : 1; + + assert( isWalMode2(pWal) || iWal==0 ); /* This routine only runs while holding the checkpoint lock. And ** it only runs if there is actually content in the log (mxFrame>0). */ - assert( pWal->ckptLock && pWal->hdr.mxFrame>0 ); - iLast = pWal->hdr.mxFrame; + iLast = walidxGetMxFrame(&pWal->hdr, iWal); + assert( pWal->ckptLock && iLast>0 ); + + if( iMode==2 ){ + iLastSeg = walFramePage2(iWal, iLast); + }else{ + iLastSeg = walFramePage(iLast); + } + nSegment = 1 + (iLastSeg/iMode); /* Allocate space for the WalIterator object. */ - nSegment = walFramePage(iLast) + 1; nByte = sizeof(WalIterator) + (nSegment-1)*sizeof(struct WalSegment) + iLast*sizeof(ht_slot); @@ -1590,19 +1887,28 @@ static int walIteratorInit(Wal *pWal, WalIterator **pp){ rc = SQLITE_NOMEM_BKPT; } - for(i=0; rc==SQLITE_OK && i=2 ); + }else{ + iZero = iExtZero; + } aPgno++; - if( (i+1)==nSegment ){ + if( i==iLastSeg ){ nEntry = (int)(iLast - iZero); }else{ nEntry = (int)((u32*)aHash - (u32*)aPgno); @@ -1614,10 +1920,10 @@ static int walIteratorInit(Wal *pWal, WalIterator **pp){ aIndex[j] = (ht_slot)j; } walMergesort((u32 *)aPgno, aTmp, aIndex, &nEntry); - p->aSegment[i].iZero = iZero; - p->aSegment[i].nEntry = nEntry; - p->aSegment[i].aIndex = aIndex; - p->aSegment[i].aPgno = (u32 *)aPgno; + p->aSegment[i/iMode].iZero = iZero; + p->aSegment[i/iMode].nEntry = nEntry; + p->aSegment[i/iMode].aIndex = aIndex; + p->aSegment[i/iMode].aPgno = (u32 *)aPgno; } } sqlite3_free(aTmp); @@ -1678,6 +1984,7 @@ static void walRestartHdr(Wal *pWal, u32 salt1){ volatile WalCkptInfo *pInfo = walCkptInfo(pWal); int i; /* Loop counter */ u32 *aSalt = pWal->hdr.aSalt; /* Big-endian salt values */ + assert( isWalMode2(pWal)==0 ); pWal->nCkpt++; pWal->hdr.mxFrame = 0; sqlite3Put4byte((u8*)&aSalt[0], 1 + sqlite3Get4byte((u8*)&aSalt[0])); @@ -1739,15 +2046,30 @@ static int walCheckpoint( u32 mxPage; /* Max database page to write */ int i; /* Loop counter */ volatile WalCkptInfo *pInfo; /* The checkpoint status 
information */ + int bWal2 = isWalMode2(pWal); /* True for wal2 connections */ + int iCkpt = bWal2 ? !walidxGetFile(&pWal->hdr) : 0; + mxSafeFrame = walidxGetMxFrame(&pWal->hdr, iCkpt); szPage = walPagesize(pWal); testcase( szPage<=32768 ); testcase( szPage>=65536 ); pInfo = walCkptInfo(pWal); - if( pInfo->nBackfillhdr.mxFrame ){ + if( (bWal2==1 && pInfo->nBackfill==0 && mxSafeFrame) + || (bWal2==0 && pInfo->nBackfillapWalFd[iCkpt]; + mxPage = pWal->hdr.nPage; + + /* If this is a wal2 system, check for a reader holding a lock + ** preventing this checkpoint operation. If one is found, return + ** early. */ + if( bWal2 ){ + rc = walLockExclusive(pWal, WAL_READ_LOCK(1 + iCkpt*2), 1); + if( rc!=SQLITE_OK ) return rc; + } /* Allocate the iterator */ - rc = walIteratorInit(pWal, &pIter); + rc = walIteratorInit(pWal, iCkpt, &pIter); if( rc!=SQLITE_OK ){ return rc; } @@ -1757,52 +2079,53 @@ static int walCheckpoint( ** in the SQLITE_CHECKPOINT_PASSIVE mode. */ assert( eMode!=SQLITE_CHECKPOINT_PASSIVE || xBusy==0 ); - /* Compute in mxSafeFrame the index of the last frame of the WAL that is - ** safe to write into the database. Frames beyond mxSafeFrame might - ** overwrite database pages that are in use by active readers and thus - ** cannot be backfilled from the WAL. + + /* If this is a wal system (not wal2), compute in mxSafeFrame the index + ** of the last frame of the WAL that is safe to write into the database. + ** Frames beyond mxSafeFrame might overwrite database pages that are in + ** use by active readers and thus cannot be backfilled from the WAL. */ - mxSafeFrame = pWal->hdr.mxFrame; - mxPage = pWal->hdr.nPage; - for(i=1; iaReadMark[i]; - if( mxSafeFrame>y ){ - assert( y<=pWal->hdr.mxFrame ); - rc = walBusyLock(pWal, xBusy, pBusyArg, WAL_READ_LOCK(i), 1); - if( rc==SQLITE_OK ){ - pInfo->aReadMark[i] = (i==1 ? 
mxSafeFrame : READMARK_NOT_USED); - walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1); - }else if( rc==SQLITE_BUSY ){ - mxSafeFrame = y; - xBusy = 0; - }else{ - goto walcheckpoint_out; + if( bWal2==0 ){ + for(i=1; iaReadMark[i]; + if( mxSafeFrame>y ){ + assert( y<=pWal->hdr.mxFrame ); + rc = walBusyLock(pWal, xBusy, pBusyArg, WAL_READ_LOCK(i), 1); + if( rc==SQLITE_OK ){ + pInfo->aReadMark[i] = (i==1 ? mxSafeFrame : READMARK_NOT_USED); + walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1); + }else if( rc==SQLITE_BUSY ){ + mxSafeFrame = y; + xBusy = 0; + }else{ + goto walcheckpoint_out; + } } } } - if( pInfo->nBackfillnBackfillnBackfill; + assert( bWal2==0 || nBackfill==0 ); pInfo->nBackfillAttempted = mxSafeFrame; - /* Sync the WAL to disk */ - rc = sqlite3OsSync(pWal->pWalFd, CKPT_SYNC_FLAGS(sync_flags)); + /* Sync the wal file being checkpointed to disk */ + rc = sqlite3OsSync(pWalFd, CKPT_SYNC_FLAGS(sync_flags)); /* If the database may grow as a result of this checkpoint, hint - ** about the eventual size of the db file to the VFS layer. - */ + ** about the eventual size of the db file to the VFS layer. */ if( rc==SQLITE_OK ){ i64 nReq = ((i64)mxPage * szPage); rc = sqlite3OsFileSize(pWal->pDbFd, &nSize); @@ -1811,21 +2134,27 @@ static int walCheckpoint( } } - /* Iterate through the contents of the WAL, copying data to the db file */ while( rc==SQLITE_OK && 0==walIteratorNext(pIter, &iDbpage, &iFrame) ){ i64 iOffset; - assert( walFramePgno(pWal, iFrame)==iDbpage ); + + assert( bWal2==1 || walFramePgno(pWal, iFrame)==iDbpage ); + assert( bWal2==0 || walFramePgno2(pWal, iCkpt, iFrame)==iDbpage ); + if( db->u1.isInterrupted ){ rc = db->mallocFailed ? 
SQLITE_NOMEM_BKPT : SQLITE_INTERRUPT; break; } if( iFrame<=nBackfill || iFrame>mxSafeFrame || iDbpage>mxPage ){ + assert( bWal2==0 || iDbpage>mxPage ); continue; } iOffset = walFrameOffset(iFrame, szPage) + WAL_FRAME_HDRSIZE; + WALTRACE(("WAL%p: checkpoint frame %d of wal %d to db page %d\n", + pWal, (int)iFrame, iCkpt, (int)iDbpage + )); /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL file */ - rc = sqlite3OsRead(pWal->pWalFd, zBuf, szPage, iOffset); + rc = sqlite3OsRead(pWalFd, zBuf, szPage, iOffset); if( rc!=SQLITE_OK ) break; iOffset = (iDbpage-1)*(i64)szPage; testcase( IS_BIG_INT(iOffset) ); @@ -1833,23 +2162,22 @@ static int walCheckpoint( if( rc!=SQLITE_OK ) break; } - /* If work was actually accomplished... */ + /* Truncate the db file, sync the wal file and set the WalCkptInfo + ** flag to indicate that it has been checkpointed. */ + if( !bWal2 && rc==SQLITE_OK && mxSafeFrame==walIndexHdr(pWal)->mxFrame ){ + i64 szDb = pWal->hdr.nPage*(i64)szPage; + testcase( IS_BIG_INT(szDb) ); + rc = sqlite3OsTruncate(pWal->pDbFd, szDb); + } if( rc==SQLITE_OK ){ - if( mxSafeFrame==walIndexHdr(pWal)->mxFrame ){ - i64 szDb = pWal->hdr.nPage*(i64)szPage; - testcase( IS_BIG_INT(szDb) ); - rc = sqlite3OsTruncate(pWal->pDbFd, szDb); - if( rc==SQLITE_OK ){ - rc = sqlite3OsSync(pWal->pDbFd, CKPT_SYNC_FLAGS(sync_flags)); - } - } - if( rc==SQLITE_OK ){ - pInfo->nBackfill = mxSafeFrame; - } + rc = sqlite3OsSync(pWal->pDbFd, CKPT_SYNC_FLAGS(sync_flags)); + } + if( rc==SQLITE_OK ){ + pInfo->nBackfill = bWal2 ? 1 : mxSafeFrame; } /* Release the reader lock held while backfilling */ - walUnlockExclusive(pWal, WAL_READ_LOCK(0), 1); + walUnlockExclusive(pWal, WAL_READ_LOCK(bWal2 ? 1 + iCkpt*2 : 0), 1); } if( rc==SQLITE_BUSY ){ @@ -1864,7 +2192,7 @@ static int walCheckpoint( ** until all readers have finished using the wal file. This ensures that ** the next process to write to the database restarts the wal file. 
*/ - if( rc==SQLITE_OK && eMode!=SQLITE_CHECKPOINT_PASSIVE ){ + if( bWal2==0 && rc==SQLITE_OK && eMode!=SQLITE_CHECKPOINT_PASSIVE ){ assert( pWal->writeLock ); if( pInfo->nBackfillhdr.mxFrame ){ rc = SQLITE_BUSY; @@ -1889,7 +2217,7 @@ static int walCheckpoint( ** file-system. To avoid this, update the wal-index header to ** indicate that the log file contains zero valid frames. */ walRestartHdr(pWal, salt1); - rc = sqlite3OsTruncate(pWal->pWalFd, 0); + rc = sqlite3OsTruncate(pWal->apWalFd[0], 0); } walUnlockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1); } @@ -1906,16 +2234,18 @@ static int walCheckpoint( ** it to exactly nMax bytes. If an error occurs while doing so, ignore it. */ static void walLimitSize(Wal *pWal, i64 nMax){ - i64 sz; - int rx; - sqlite3BeginBenignMalloc(); - rx = sqlite3OsFileSize(pWal->pWalFd, &sz); - if( rx==SQLITE_OK && (sz > nMax ) ){ - rx = sqlite3OsTruncate(pWal->pWalFd, nMax); - } - sqlite3EndBenignMalloc(); - if( rx ){ - sqlite3_log(rx, "cannot limit WAL size: %s", pWal->zWalName); + if( isWalMode2(pWal)==0 ){ + i64 sz; + int rx; + sqlite3BeginBenignMalloc(); + rx = sqlite3OsFileSize(pWal->apWalFd[0], &sz); + if( rx==SQLITE_OK && (sz > nMax ) ){ + rx = sqlite3OsTruncate(pWal->apWalFd[0], nMax); + } + sqlite3EndBenignMalloc(); + if( rx ){ + sqlite3_log(rx, "cannot limit WAL size: %s", pWal->zWalName); + } } } @@ -1944,39 +2274,50 @@ int sqlite3WalClose( if( zBuf!=0 && SQLITE_OK==(rc = sqlite3OsLock(pWal->pDbFd, SQLITE_LOCK_EXCLUSIVE)) ){ + int i; if( pWal->exclusiveMode==WAL_NORMAL_MODE ){ pWal->exclusiveMode = WAL_EXCLUSIVE_MODE; } - rc = sqlite3WalCheckpoint(pWal, db, - SQLITE_CHECKPOINT_PASSIVE, 0, 0, sync_flags, nBuf, zBuf, 0, 0 - ); - if( rc==SQLITE_OK ){ - int bPersist = -1; - sqlite3OsFileControlHint( - pWal->pDbFd, SQLITE_FCNTL_PERSIST_WAL, &bPersist + for(i=0; rc==SQLITE_OK && i<2; i++){ + rc = sqlite3WalCheckpoint(pWal, db, + SQLITE_CHECKPOINT_PASSIVE, 0, 0, sync_flags, nBuf, zBuf, 0, 0 ); - if( bPersist!=1 ){ - /* Try to 
delete the WAL file if the checkpoint completed and - ** fsyned (rc==SQLITE_OK) and if we are not in persistent-wal - ** mode (!bPersist) */ - isDelete = 1; - }else if( pWal->mxWalSize>=0 ){ - /* Try to truncate the WAL file to zero bytes if the checkpoint - ** completed and fsynced (rc==SQLITE_OK) and we are in persistent - ** WAL mode (bPersist) and if the PRAGMA journal_size_limit is a - ** non-negative value (pWal->mxWalSize>=0). Note that we truncate - ** to zero bytes as truncating to the journal_size_limit might - ** leave a corrupt WAL file on disk. */ - walLimitSize(pWal, 0); + if( rc==SQLITE_OK ){ + int bPersist = -1; + sqlite3OsFileControlHint( + pWal->pDbFd, SQLITE_FCNTL_PERSIST_WAL, &bPersist + ); + if( bPersist!=1 ){ + /* Try to delete the WAL file if the checkpoint completed and + ** fsyned (rc==SQLITE_OK) and if we are not in persistent-wal + ** mode (!bPersist) */ + isDelete = 1; + }else if( pWal->mxWalSize>=0 ){ + /* Try to truncate the WAL file to zero bytes if the checkpoint + ** completed and fsynced (rc==SQLITE_OK) and we are in persistent + ** WAL mode (bPersist) and if the PRAGMA journal_size_limit is a + ** non-negative value (pWal->mxWalSize>=0). Note that we truncate + ** to zero bytes as truncating to the journal_size_limit might + ** leave a corrupt WAL file on disk. 
*/ + walLimitSize(pWal, 0); + } } + + if( isWalMode2(pWal)==0 ) break; + + walCkptInfo(pWal)->nBackfill = 0; + walidxSetFile(&pWal->hdr, !walidxGetFile(&pWal->hdr)); + pWal->writeLock = 1; + walIndexWriteHdr(pWal); + pWal->writeLock = 0; } } walIndexClose(pWal, isDelete); - sqlite3OsClose(pWal->pWalFd); if( isDelete ){ sqlite3BeginBenignMalloc(); sqlite3OsDelete(pWal->pVfs, pWal->zWalName, 0); + sqlite3OsDelete(pWal->pVfs, pWal->zWalName2, 0); sqlite3EndBenignMalloc(); } WALTRACE(("WAL%p: closed\n", pWal)); @@ -2115,7 +2456,9 @@ static int walIndexReadHdr(Wal *pWal, int *pChanged){ ** sure the wal-index was not constructed with some future format that ** this version of SQLite cannot understand. */ - if( badHdr==0 && pWal->hdr.iVersion!=WALINDEX_MAX_VERSION ){ + if( badHdr==0 + && pWal->hdr.iVersion!=WAL_VERSION1 && pWal->hdr.iVersion!=WAL_VERSION2 + ){ rc = SQLITE_CANTOPEN_BKPT; } @@ -2180,13 +2523,9 @@ static int walIndexReadHdr(Wal *pWal, int *pChanged){ */ static int walTryBeginRead(Wal *pWal, int *pChanged, int useWal, int cnt){ volatile WalCkptInfo *pInfo; /* Checkpoint information in wal-index */ - u32 mxReadMark; /* Largest aReadMark[] value */ - int mxI; /* Index of largest aReadMark[] value */ - int i; /* Loop counter */ int rc = SQLITE_OK; /* Return code */ - u32 mxFrame; /* Wal frame to lock to */ - assert( pWal->readLock<0 ); /* Not currently locked */ + assert( pWal->readLock==WAL_LOCK_NONE ); /* Not currently locked */ /* Take steps to avoid spinning forever if there is a protocol error. 
** @@ -2248,131 +2587,156 @@ static int walTryBeginRead(Wal *pWal, int *pChanged, int useWal, int cnt){ } pInfo = walCkptInfo(pWal); - if( !useWal && pInfo->nBackfill==pWal->hdr.mxFrame + if( isWalMode2(pWal) ){ + int eLock = 1 + (walidxGetFile(&pWal->hdr)*2); + if( pInfo->nBackfill==0 ){ + eLock += walidxGetMxFrame(&pWal->hdr, !walidxGetFile(&pWal->hdr))>0; + } + rc = walLockReader(pWal, eLock, 1); + if( rc!=SQLITE_OK ){ + return rc; + } + + walShmBarrier(pWal); + if( memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr)) ){ + walLockReader(pWal, eLock, 0); + return WAL_RETRY; + }else{ + pWal->readLock = eLock; + } + assert( pWal->minFrame==0 && walFramePage(pWal->minFrame)==0 ); + }else{ + u32 mxReadMark; /* Largest aReadMark[] value */ + int mxI; /* Index of largest aReadMark[] value */ + int i; /* Loop counter */ + u32 mxFrame; /* Wal frame to lock to */ + + if( !useWal && pInfo->nBackfill==pWal->hdr.mxFrame #ifdef SQLITE_ENABLE_SNAPSHOT - && (pWal->pSnapshot==0 || pWal->hdr.mxFrame==0 - || 0==memcmp(&pWal->hdr, pWal->pSnapshot, sizeof(WalIndexHdr))) + && (pWal->pSnapshot==0 || pWal->hdr.mxFrame==0 + || 0==memcmp(&pWal->hdr, pWal->pSnapshot, sizeof(WalIndexHdr))) #endif - ){ - /* The WAL has been completely backfilled (or it is empty). - ** and can be safely ignored. - */ - rc = walLockShared(pWal, WAL_READ_LOCK(0)); - walShmBarrier(pWal); - if( rc==SQLITE_OK ){ - if( memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr)) ){ - /* It is not safe to allow the reader to continue here if frames - ** may have been appended to the log before READ_LOCK(0) was obtained. - ** When holding READ_LOCK(0), the reader ignores the entire log file, - ** which implies that the database file contains a trustworthy - ** snapshot. Since holding READ_LOCK(0) prevents a checkpoint from - ** happening, this is usually correct. 
- ** - ** However, if frames have been appended to the log (or if the log - ** is wrapped and written for that matter) before the READ_LOCK(0) - ** is obtained, that is not necessarily true. A checkpointer may - ** have started to backfill the appended frames but crashed before - ** it finished. Leaving a corrupt image in the database file. - */ - walUnlockShared(pWal, WAL_READ_LOCK(0)); - return WAL_RETRY; + ){ + /* The WAL has been completely backfilled (or it is empty). + ** and can be safely ignored. + */ + rc = walLockShared(pWal, WAL_READ_LOCK(0)); + walShmBarrier(pWal); + if( rc==SQLITE_OK ){ + if( memcmp((void*)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr)) ){ + /* It is not safe to allow the reader to continue here if frames + ** may have been appended to the log before READ_LOCK(0) was obtained. + ** When holding READ_LOCK(0), the reader ignores the entire log file, + ** which implies that the database file contains a trustworthy + ** snapshot. Since holding READ_LOCK(0) prevents a checkpoint from + ** happening, this is usually correct. + ** + ** However, if frames have been appended to the log (or if the log + ** is wrapped and written for that matter) before the READ_LOCK(0) + ** is obtained, that is not necessarily true. A checkpointer may + ** have started to backfill the appended frames but crashed before + ** it finished. Leaving a corrupt image in the database file. + */ + walUnlockShared(pWal, WAL_READ_LOCK(0)); + return WAL_RETRY; + } + pWal->readLock = 0; + return SQLITE_OK; + }else if( rc!=SQLITE_BUSY ){ + return rc; } - pWal->readLock = 0; - return SQLITE_OK; - }else if( rc!=SQLITE_BUSY ){ - return rc; } - } - /* If we get this far, it means that the reader will want to use - ** the WAL to get at content from recent commits. The job now is - ** to select one of the aReadMark[] entries that is closest to - ** but not exceeding pWal->hdr.mxFrame and lock that entry. 
- */ - mxReadMark = 0; - mxI = 0; - mxFrame = pWal->hdr.mxFrame; + /* If we get this far, it means that the reader will want to use + ** the WAL to get at content from recent commits. The job now is + ** to select one of the aReadMark[] entries that is closest to + ** but not exceeding pWal->hdr.mxFrame and lock that entry. + */ + mxReadMark = 0; + mxI = 0; + mxFrame = pWal->hdr.mxFrame; #ifdef SQLITE_ENABLE_SNAPSHOT - if( pWal->pSnapshot && pWal->pSnapshot->mxFramepSnapshot->mxFrame; - } -#endif - for(i=1; iaReadMark[i]; - if( mxReadMark<=thisMark && thisMark<=mxFrame ){ - assert( thisMark!=READMARK_NOT_USED ); - mxReadMark = thisMark; - mxI = i; + if( pWal->pSnapshot && pWal->pSnapshot->mxFramepSnapshot->mxFrame; } - } - if( (pWal->readOnly & WAL_SHM_RDONLY)==0 - && (mxReadMarkaReadMark[i] = mxFrame; + u32 thisMark = pInfo->aReadMark[i]; + if( mxReadMark<=thisMark && thisMark<=mxFrame ){ + assert( thisMark!=READMARK_NOT_USED ); + mxReadMark = thisMark; mxI = i; - walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1); - break; - }else if( rc!=SQLITE_BUSY ){ - return rc; } } - } - if( mxI==0 ){ - assert( rc==SQLITE_BUSY || (pWal->readOnly & WAL_SHM_RDONLY)!=0 ); - return rc==SQLITE_BUSY ? WAL_RETRY : SQLITE_READONLY_CANTLOCK; - } + if( (pWal->readOnly & WAL_SHM_RDONLY)==0 + && (mxReadMarkaReadMark[i] = mxFrame; + mxI = i; + walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1); + break; + }else if( rc!=SQLITE_BUSY ){ + return rc; + } + } + } + if( mxI==0 ){ + assert( rc==SQLITE_BUSY || (pWal->readOnly & WAL_SHM_RDONLY)!=0 ); + return rc==SQLITE_BUSY ? WAL_RETRY : SQLITE_READONLY_CANTLOCK; + } - rc = walLockShared(pWal, WAL_READ_LOCK(mxI)); - if( rc ){ - return rc==SQLITE_BUSY ? WAL_RETRY : rc; - } - /* Now that the read-lock has been obtained, check that neither the - ** value in the aReadMark[] array or the contents of the wal-index - ** header have changed. 
- ** - ** It is necessary to check that the wal-index header did not change - ** between the time it was read and when the shared-lock was obtained - ** on WAL_READ_LOCK(mxI) was obtained to account for the possibility - ** that the log file may have been wrapped by a writer, or that frames - ** that occur later in the log than pWal->hdr.mxFrame may have been - ** copied into the database by a checkpointer. If either of these things - ** happened, then reading the database with the current value of - ** pWal->hdr.mxFrame risks reading a corrupted snapshot. So, retry - ** instead. - ** - ** Before checking that the live wal-index header has not changed - ** since it was read, set Wal.minFrame to the first frame in the wal - ** file that has not yet been checkpointed. This client will not need - ** to read any frames earlier than minFrame from the wal file - they - ** can be safely read directly from the database file. - ** - ** Because a ShmBarrier() call is made between taking the copy of - ** nBackfill and checking that the wal-header in shared-memory still - ** matches the one cached in pWal->hdr, it is guaranteed that the - ** checkpointer that set nBackfill was not working with a wal-index - ** header newer than that cached in pWal->hdr. If it were, that could - ** cause a problem. The checkpointer could omit to checkpoint - ** a version of page X that lies before pWal->minFrame (call that version - ** A) on the basis that there is a newer version (version B) of the same - ** page later in the wal file. But if version B happens to like past - ** frame pWal->hdr.mxFrame - then the client would incorrectly assume - ** that it can read version A from the database file. However, since - ** we can guarantee that the checkpointer that set nBackfill could not - ** see any pages past pWal->hdr.mxFrame, this problem does not come up. 
- */ - pWal->minFrame = pInfo->nBackfill+1; - walShmBarrier(pWal); - if( pInfo->aReadMark[mxI]!=mxReadMark - || memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr)) - ){ - walUnlockShared(pWal, WAL_READ_LOCK(mxI)); - return WAL_RETRY; - }else{ - assert( mxReadMark<=pWal->hdr.mxFrame ); - pWal->readLock = (i16)mxI; + rc = walLockShared(pWal, WAL_READ_LOCK(mxI)); + if( rc ){ + return rc==SQLITE_BUSY ? WAL_RETRY : rc; + } + /* Now that the read-lock has been obtained, check that neither the + ** value in the aReadMark[] array or the contents of the wal-index + ** header have changed. + ** + ** It is necessary to check that the wal-index header did not change + ** between the time it was read and when the shared-lock was obtained + ** on WAL_READ_LOCK(mxI) was obtained to account for the possibility + ** that the log file may have been wrapped by a writer, or that frames + ** that occur later in the log than pWal->hdr.mxFrame may have been + ** copied into the database by a checkpointer. If either of these things + ** happened, then reading the database with the current value of + ** pWal->hdr.mxFrame risks reading a corrupted snapshot. So, retry + ** instead. + ** + ** Before checking that the live wal-index header has not changed + ** since it was read, set Wal.minFrame to the first frame in the wal + ** file that has not yet been checkpointed. This client will not need + ** to read any frames earlier than minFrame from the wal file - they + ** can be safely read directly from the database file. + ** + ** Because a ShmBarrier() call is made between taking the copy of + ** nBackfill and checking that the wal-header in shared-memory still + ** matches the one cached in pWal->hdr, it is guaranteed that the + ** checkpointer that set nBackfill was not working with a wal-index + ** header newer than that cached in pWal->hdr. If it were, that could + ** cause a problem. 
The checkpointer could omit to checkpoint + ** a version of page X that lies before pWal->minFrame (call that version + ** A) on the basis that there is a newer version (version B) of the same + ** page later in the wal file. But if version B happens to like past + ** frame pWal->hdr.mxFrame - then the client would incorrectly assume + ** that it can read version A from the database file. However, since + ** we can guarantee that the checkpointer that set nBackfill could not + ** see any pages past pWal->hdr.mxFrame, this problem does not come up. + */ + pWal->minFrame = pInfo->nBackfill+1; + walShmBarrier(pWal); + if( pInfo->aReadMark[mxI]!=mxReadMark + || memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr)) + ){ + walUnlockShared(pWal, WAL_READ_LOCK(mxI)); + return WAL_RETRY; + }else{ + assert( mxReadMark<=pWal->hdr.mxFrame ); + pWal->readLock = (i16)mxI; + } } return rc; } @@ -2488,6 +2852,10 @@ int sqlite3WalBeginReadTransaction(Wal *pWal, int *pChanged){ testcase( (rc&0xff)==SQLITE_IOERR ); testcase( rc==SQLITE_PROTOCOL ); testcase( rc==SQLITE_OK ); + + if( rc==SQLITE_OK && pWal->hdr.iVersion==WAL_VERSION2 ){ + rc = walOpenWal2(pWal); + } #ifdef SQLITE_ENABLE_SNAPSHOT if( rc==SQLITE_OK ){ @@ -2561,12 +2929,75 @@ int sqlite3WalBeginReadTransaction(Wal *pWal, int *pChanged){ */ void sqlite3WalEndReadTransaction(Wal *pWal){ sqlite3WalEndWriteTransaction(pWal); - if( pWal->readLock>=0 ){ - walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock)); - pWal->readLock = -1; + if( pWal->readLock!=WAL_LOCK_NONE ){ + if( isWalMode2(pWal) ){ + (void)walLockReader(pWal, pWal->readLock, 0); + }else{ + walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock)); + } + pWal->readLock = WAL_LOCK_NONE; } } +/* Search hash table iHash for an entry matching page number +** pgno. Each call to this function searches a single hash table +** (each hash table indexes up to HASHTABLE_NPAGE frames). 
+** +** This code might run concurrently to the code in walIndexAppend() +** that adds entries to the wal-index (and possibly to this hash +** table). This means the value just read from the hash +** slot (aHash[iKey]) may have been added before or after the +** current read transaction was opened. Values added after the +** read transaction was opened may have been written incorrectly - +** i.e. these slots may contain garbage data. However, we assume +** that any slots written before the current read transaction was +** opened remain unmodified. +** +** For the reasons above, the if(...) condition featured in the inner +** loop of the following block is more stringent that would be required +** if we had exclusive access to the hash-table: +** +** (aPgno[iFrame]==pgno): +** This condition filters out normal hash-table collisions. +** +** (iFrame<=iLast): +** This condition filters out entries that were added to the hash +** table after the current read-transaction had started. +*/ +static int walSearchHash( + Wal *pWal, + u32 iLast, + int iHash, + Pgno pgno, + u32 *piRead +){ + volatile ht_slot *aHash; /* Pointer to hash table */ + volatile u32 *aPgno; /* Pointer to array of page numbers */ + u32 iZero; /* Frame number corresponding to aPgno[0] */ + int iKey; /* Hash slot index */ + int nCollide; /* Number of hash collisions remaining */ + int rc; /* Error code */ + + rc = walHashGet(pWal, iHash, &aHash, &aPgno, &iZero); + if( rc!=SQLITE_OK ){ + return rc; + } + nCollide = HASHTABLE_NSLOT; + for(iKey=walHash(pgno); aHash[iKey]; iKey=walNextHash(iKey)){ + u32 iFrame = aHash[iKey] + iZero; + if( iFrame<=iLast && iFrame>=pWal->minFrame && aPgno[aHash[iKey]]==pgno ){ + assert( iFrame>*piRead || CORRUPT_DB ); + *piRead = iFrame; + } + if( (nCollide--)==0 ){ + return SQLITE_CORRUPT_BKPT; + } + } + + return SQLITE_OK; +} + + /* ** Search the wal file for page pgno. If found, set *piRead to the frame that ** contains the page. 
Otherwise, if pgno is not in the wal file, set *piRead @@ -2580,80 +3011,74 @@ int sqlite3WalFindFrame( Pgno pgno, /* Database page number to read data for */ u32 *piRead /* OUT: Frame number (or zero) */ ){ + int bWal2 = isWalMode2(pWal); + int iApp = walidxGetFile(&pWal->hdr); + int rc = SQLITE_OK; u32 iRead = 0; /* If !=0, WAL frame to return data from */ - u32 iLast = pWal->hdr.mxFrame; /* Last page in WAL for this reader */ + u32 iLast; /* Last frame in wal file */ int iHash; /* Used to loop through N hash tables */ - int iMinHash; /* This routine is only be called from within a read transaction. */ - assert( pWal->readLock>=0 || pWal->lockError ); + assert( pWal->readLock!=WAL_LOCK_NONE ); - /* If the "last page" field of the wal-index header snapshot is 0, then - ** no data will be read from the wal under any circumstances. Return early - ** in this case as an optimization. Likewise, if pWal->readLock==0, - ** then the WAL is ignored by the reader so return early, as if the - ** WAL were empty. - */ - if( iLast==0 || pWal->readLock==0 ){ - *piRead = 0; - return SQLITE_OK; + /* If this is a wal2 system, the client must have a partial-wal lock + ** on wal file iApp. Or if it is a wal system, iApp==0 must be true. */ + assert( bWal2==0 || iApp==1 + || pWal->readLock==WAL_LOCK_PART1 || pWal->readLock==WAL_LOCK_PART1_FULL2 + ); + assert( bWal2==0 || iApp==0 + || pWal->readLock==WAL_LOCK_PART2 || pWal->readLock==WAL_LOCK_PART2_FULL1 + ); + assert( bWal2 || iApp==0 ); + + /* Search the wal file that the client holds a partial lock on first */ + iLast = walidxGetMxFrame(&pWal->hdr, iApp); + if( iLast ){ + u32 iExternal = bWal2 ? 
walExternalEncode(iApp, iLast) : iLast; + int iMinHash = walFramePage(pWal->minFrame); + for(iHash=walFramePage(iExternal); + iHash>=iMinHash && iRead==0; + iHash-=(1+bWal2) + ){ + rc = walSearchHash(pWal, iExternal, iHash, pgno, &iRead); + if( rc!=SQLITE_OK ) break; + } } - /* Search the hash table or tables for an entry matching page number - ** pgno. Each iteration of the following for() loop searches one - ** hash table (each hash table indexes up to HASHTABLE_NPAGE frames). - ** - ** This code might run concurrently to the code in walIndexAppend() - ** that adds entries to the wal-index (and possibly to this hash - ** table). This means the value just read from the hash - ** slot (aHash[iKey]) may have been added before or after the - ** current read transaction was opened. Values added after the - ** read transaction was opened may have been written incorrectly - - ** i.e. these slots may contain garbage data. However, we assume - ** that any slots written before the current read transaction was - ** opened remain unmodified. - ** - ** For the reasons above, the if(...) condition featured in the inner - ** loop of the following block is more stringent that would be required - ** if we had exclusive access to the hash-table: - ** - ** (aPgno[iFrame]==pgno): - ** This condition filters out normal hash-table collisions. - ** - ** (iFrame<=iLast): - ** This condition filters out entries that were added to the hash - ** table after the current read-transaction had started. 
- */ - iMinHash = walFramePage(pWal->minFrame); - for(iHash=walFramePage(iLast); iHash>=iMinHash && iRead==0; iHash--){ - volatile ht_slot *aHash; /* Pointer to hash table */ - volatile u32 *aPgno; /* Pointer to array of page numbers */ - u32 iZero; /* Frame number corresponding to aPgno[0] */ - int iKey; /* Hash slot index */ - int nCollide; /* Number of hash collisions remaining */ - int rc; /* Error code */ - - rc = walHashGet(pWal, iHash, &aHash, &aPgno, &iZero); - if( rc!=SQLITE_OK ){ - return rc; - } - nCollide = HASHTABLE_NSLOT; - for(iKey=walHash(pgno); aHash[iKey]; iKey=walNextHash(iKey)){ - u32 iFrame = aHash[iKey] + iZero; - if( iFrame<=iLast && iFrame>=pWal->minFrame && aPgno[aHash[iKey]]==pgno ){ - assert( iFrame>iRead || CORRUPT_DB ); - iRead = iFrame; - } - if( (nCollide--)==0 ){ - return SQLITE_CORRUPT_BKPT; + /* If the requested page was not found, no error has occured, and + ** the client holds a full-wal lock on the other wal file, search it + ** too. */ + if( rc==SQLITE_OK && bWal2 && iRead==0 && ( + pWal->readLock==WAL_LOCK_PART1_FULL2 + || pWal->readLock==WAL_LOCK_PART2_FULL1 + )){ + iLast = walidxGetMxFrame(&pWal->hdr, !iApp); + if( iLast ){ + u32 iExternal = walExternalEncode(!iApp, iLast); + for(iHash=walFramePage2(!iApp, iLast); iHash>=0 && iRead==0; iHash -= 2){ + rc = walSearchHash(pWal, iExternal, iHash, pgno, &iRead); + if( rc!=SQLITE_OK ) break; } } } +#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) + if( iRead ){ + u32 iFrame; + int iWal = walExternalDecode(iRead, &iFrame); + WALTRACE(("WAL%p: page %d @ frame %d wal %d\n",pWal,(int)pgno,iFrame,iWal)); + }else{ + WALTRACE(("WAL%p: page %d not found\n", pWal, (int)pgno)); + } +#endif + #ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT /* If expensive assert() statements are available, do a linear search ** of the wal-index file content. Make sure the results agree with the - ** result obtained using the hash indexes above. */ + ** result obtained using the hash indexes above. 
+ ** + ** TODO: This is broken for wal2. + */ { u32 iRead2 = 0; u32 iTest; @@ -2679,26 +3104,40 @@ int sqlite3WalFindFrame( */ int sqlite3WalReadFrame( Wal *pWal, /* WAL handle */ - u32 iRead, /* Frame to read */ + u32 iExternal, /* Frame to read */ int nOut, /* Size of buffer pOut in bytes */ u8 *pOut /* Buffer to write page data to */ ){ int sz; + int iWal = 0; + u32 iRead; i64 iOffset; + + /* Figure out the page size */ sz = pWal->hdr.szPage; sz = (sz&0xfe00) + ((sz&0x0001)<<16); testcase( sz<=32768 ); testcase( sz>=65536 ); + + if( isWalMode2(pWal) ){ + /* Figure out which of the two wal files, and the frame within, that + ** iExternal refers to. */ + iWal = walExternalDecode(iExternal, &iRead); + }else{ + iRead = iExternal; + } + + WALTRACE(("WAL%p: reading frame %d wal %d\n", pWal, iRead, iWal)); iOffset = walFrameOffset(iRead, sz) + WAL_FRAME_HDRSIZE; /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */ - return sqlite3OsRead(pWal->pWalFd, pOut, (nOut>sz ? sz : nOut), iOffset); + return sqlite3OsRead(pWal->apWalFd[iWal], pOut, (nOut>sz?sz:nOut), iOffset); } /* ** Return the size of the database in pages (or zero, if unknown). */ Pgno sqlite3WalDbsize(Wal *pWal){ - if( pWal && ALWAYS(pWal->readLock>=0) ){ + if( pWal && ALWAYS(pWal->readLock!=WAL_LOCK_NONE) ){ return pWal->hdr.nPage; } return 0; @@ -2723,7 +3162,7 @@ int sqlite3WalBeginWriteTransaction(Wal *pWal){ /* Cannot start a write transaction without first holding a read ** transaction. 
*/ - assert( pWal->readLock>=0 ); + assert( pWal->readLock!=WAL_LOCK_NONE ); assert( pWal->writeLock==0 && pWal->iReCksum==0 ); if( pWal->readOnly ){ @@ -2781,18 +3220,21 @@ int sqlite3WalEndWriteTransaction(Wal *pWal){ int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx){ int rc = SQLITE_OK; if( ALWAYS(pWal->writeLock) ){ - Pgno iMax = pWal->hdr.mxFrame; + int iWal = walidxGetFile(&pWal->hdr); + Pgno iMax = walidxGetMxFrame(&pWal->hdr, iWal); + Pgno iNew; Pgno iFrame; - + + assert( isWalMode2(pWal) || iWal==0 ); + /* Restore the clients cache of the wal-index header to the state it ** was in before the client began writing to the database. */ memcpy(&pWal->hdr, (void *)walIndexHdr(pWal), sizeof(WalIndexHdr)); + assert( walidxGetFile(&pWal->hdr)==iWal ); + iNew = walidxGetMxFrame(&pWal->hdr, walidxGetFile(&pWal->hdr)); - for(iFrame=pWal->hdr.mxFrame+1; - ALWAYS(rc==SQLITE_OK) && iFrame<=iMax; - iFrame++ - ){ + for(iFrame=iNew+1; ALWAYS(rc==SQLITE_OK) && iFrame<=iMax; iFrame++){ /* This call cannot fail. Unless the page for which the page number ** is passed as the second argument is (a) in the cache and ** (b) has an outstanding reference, then xUndo is either a no-op @@ -2804,10 +3246,16 @@ int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx){ ** page 1 is never written to the log until the transaction is ** committed. As a result, the call to xUndo may not fail. */ - assert( walFramePgno(pWal, iFrame)!=1 ); - rc = xUndo(pUndoCtx, walFramePgno(pWal, iFrame)); + Pgno pgno; + if( isWalMode2(pWal) ){ + pgno = walFramePgno2(pWal, iWal, iFrame); + }else{ + pgno = walFramePgno(pWal, iFrame); + } + assert( pgno!=1 ); + rc = xUndo(pUndoCtx, pgno); } - if( iMax!=pWal->hdr.mxFrame ) walCleanupHash(pWal); + if( iMax!=iNew ) walCleanupHash(pWal); } return rc; } @@ -2819,11 +3267,13 @@ int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx){ ** point in the event of a savepoint rollback (via WalSavepointUndo()). 
*/ void sqlite3WalSavepoint(Wal *pWal, u32 *aWalData){ + int iWal = walidxGetFile(&pWal->hdr); assert( pWal->writeLock ); - aWalData[0] = pWal->hdr.mxFrame; + assert( isWalMode2(pWal) || iWal==0 ); + aWalData[0] = walidxGetMxFrame(&pWal->hdr, iWal); aWalData[1] = pWal->hdr.aFrameCksum[0]; aWalData[2] = pWal->hdr.aFrameCksum[1]; - aWalData[3] = pWal->nCkpt; + aWalData[3] = isWalMode2(pWal) ? iWal : pWal->nCkpt; } /* @@ -2834,21 +3284,24 @@ void sqlite3WalSavepoint(Wal *pWal, u32 *aWalData){ */ int sqlite3WalSavepointUndo(Wal *pWal, u32 *aWalData){ int rc = SQLITE_OK; + int iWal = walidxGetFile(&pWal->hdr); + int iCmp = isWalMode2(pWal) ? iWal : pWal->nCkpt; assert( pWal->writeLock ); - assert( aWalData[3]!=pWal->nCkpt || aWalData[0]<=pWal->hdr.mxFrame ); + assert( isWalMode2(pWal) || iWal==0 ); + assert( aWalData[3]!=iCmp || aWalData[0]<=walidxGetMxFrame(&pWal->hdr,iWal) ); - if( aWalData[3]!=pWal->nCkpt ){ + if( aWalData[3]!=iCmp ){ /* This savepoint was opened immediately after the write-transaction ** was started. Right after that, the writer decided to wrap around ** to the start of the log. Update the savepoint values to match. */ aWalData[0] = 0; - aWalData[3] = pWal->nCkpt; + aWalData[3] = iCmp; } - if( aWalData[0]hdr.mxFrame ){ - pWal->hdr.mxFrame = aWalData[0]; + if( aWalData[0]hdr, iWal) ){ + walidxSetMxFrame(&pWal->hdr, iWal, aWalData[0]); pWal->hdr.aFrameCksum[0] = aWalData[1]; pWal->hdr.aFrameCksum[1] = aWalData[2]; walCleanupHash(pWal); @@ -2857,23 +3310,80 @@ int sqlite3WalSavepointUndo(Wal *pWal, u32 *aWalData){ return rc; } +/* +** This function is used in wal2 mode. +** +** This function is called when writer pWal is just about to start +** writing out frames. The "other" wal file (wal file !pWal->hdr.iAppend) +** has been fully checkpointed. This function returns SQLITE_OK if there +** are no readers preventing the writer from switching to the other wal +** file. Or SQLITE_BUSY if there are. 
+*/ +static int walRestartOk(Wal *pWal){ + int rc; /* Return code */ + int iApp = walidxGetFile(&pWal->hdr); /* Current WAL file */ + + /* No reader can be doing a "partial" read of wal file !iApp - in that + ** case it would not have been possible to checkpoint the file. So + ** it is only necessary to test for "full" readers. See the comment + ** above walLockReader() function for exactly what this means in terms + ** of locks. */ + int i = (iApp==0) ? 2 : 4; + + rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1); + if( rc==SQLITE_OK ){ + walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1); + } + return rc; +} + /* ** This function is called just before writing a set of frames to the log ** file (see sqlite3WalFrames()). It checks to see if, instead of appending -** to the current log file, it is possible to overwrite the start of the -** existing log file with the new frames (i.e. "reset" the log). If so, -** it sets pWal->hdr.mxFrame to 0. Otherwise, pWal->hdr.mxFrame is left -** unchanged. +** to the current log file, it is possible and desirable to switch to the +** other log file and write the new transaction to the start of it. +** If so, the wal-index header is updated accordingly - both in heap memory +** and in the *-shm file. ** ** SQLITE_OK is returned if no error is encountered (regardless of whether -** or not pWal->hdr.mxFrame is modified). An SQLite error code is returned +** or not the wal-index header is modified). An SQLite error code is returned ** if an error occurs. 
*/ static int walRestartLog(Wal *pWal){ int rc = SQLITE_OK; - int cnt; - if( pWal->readLock==0 ){ + if( isWalMode2(pWal) ){ + int iApp = walidxGetFile(&pWal->hdr); + int nWalSize = WAL_DEFAULT_WALSIZE; + if( pWal->mxWalSize>0 ){ + nWalSize = (pWal->mxWalSize-WAL_HDRSIZE+pWal->szPage+WAL_FRAME_HDRSIZE-1) + / (pWal->szPage+WAL_FRAME_HDRSIZE); + nWalSize = MAX(nWalSize, 1); + } + + if( walidxGetMxFrame(&pWal->hdr, iApp)>=nWalSize ){ + volatile WalCkptInfo *pInfo = walCkptInfo(pWal); + if( walidxGetMxFrame(&pWal->hdr, !iApp)==0 || pInfo->nBackfill ){ + rc = walRestartOk(pWal); + if( rc==SQLITE_OK ){ + iApp = !iApp; + pWal->nCkpt++; + walidxSetFile(&pWal->hdr, iApp); + walidxSetMxFrame(&pWal->hdr, iApp, 0); + sqlite3Put4byte((u8*)&pWal->hdr.aSalt[0], pWal->hdr.aFrameCksum[0]); + sqlite3Put4byte((u8*)&pWal->hdr.aSalt[1], pWal->hdr.aFrameCksum[1]); + walIndexWriteHdr(pWal); + pInfo->nBackfill = 0; + walLockReader(pWal, pWal->readLock, 0); + pWal->readLock = iApp ? WAL_LOCK_PART2_FULL1 : WAL_LOCK_PART1_FULL2; + rc = walLockReader(pWal, pWal->readLock, 1); + }else if( rc==SQLITE_BUSY ){ + rc = SQLITE_OK; + } + } + } + }else if( pWal->readLock==0 ){ + int cnt; volatile WalCkptInfo *pInfo = walCkptInfo(pWal); assert( pInfo->nBackfill==pWal->hdr.mxFrame ); if( pInfo->nBackfill>0 ){ @@ -2897,7 +3407,7 @@ static int walRestartLog(Wal *pWal){ } } walUnlockShared(pWal, WAL_READ_LOCK(0)); - pWal->readLock = -1; + pWal->readLock = WAL_LOCK_NONE; cnt = 0; do{ int notUsed; @@ -2908,6 +3418,7 @@ static int walRestartLog(Wal *pWal){ testcase( rc==SQLITE_PROTOCOL ); testcase( rc==SQLITE_OK ); } + return rc; } @@ -2966,6 +3477,18 @@ static int walWriteOneFrame( int rc; /* Result code from subfunctions */ void *pData; /* Data actually written */ u8 aFrame[WAL_FRAME_HDRSIZE]; /* Buffer to assemble frame-header in */ + +#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) + { + int iWal = walidxGetFile(&p->pWal->hdr); + int iFrame = 1 + (iOffset / (WAL_FRAME_HDRSIZE + p->pWal->szPage)); + 
assert( p->pWal->apWalFd[iWal]==p->pFd ); + WALTRACE(("WAL%p: page %d written to frame %d of wal %d\n", + p->pWal, (int)pPage->pgno, iFrame, iWal + )); + } +#endif + #if defined(SQLITE_HAS_CODEC) if( (pData = sqlite3PagerCodec(pPage))==0 ) return SQLITE_NOMEM_BKPT; #else @@ -2988,13 +3511,15 @@ static int walWriteOneFrame( ** SQLITE_OK is returned if successful, or an SQLite error code otherwise. */ static int walRewriteChecksums(Wal *pWal, u32 iLast){ - const int szPage = pWal->szPage;/* Database page size */ int rc = SQLITE_OK; /* Return code */ + const int szPage = pWal->szPage;/* Database page size */ u8 *aBuf; /* Buffer to load data from wal file into */ u8 aFrame[WAL_FRAME_HDRSIZE]; /* Buffer to assemble frame-headers in */ u32 iRead; /* Next frame to read from wal file */ i64 iCksumOff; + assert( isWalMode2(pWal)==0 ); + aBuf = sqlite3_malloc(szPage + WAL_FRAME_HDRSIZE); if( aBuf==0 ) return SQLITE_NOMEM_BKPT; @@ -3009,7 +3534,7 @@ static int walRewriteChecksums(Wal *pWal, u32 iLast){ }else{ iCksumOff = walFrameOffset(pWal->iReCksum-1, szPage) + 16; } - rc = sqlite3OsRead(pWal->pWalFd, aBuf, sizeof(u32)*2, iCksumOff); + rc = sqlite3OsRead(pWal->apWalFd[0], aBuf, sizeof(u32)*2, iCksumOff); pWal->hdr.aFrameCksum[0] = sqlite3Get4byte(aBuf); pWal->hdr.aFrameCksum[1] = sqlite3Get4byte(&aBuf[sizeof(u32)]); @@ -3017,14 +3542,14 @@ static int walRewriteChecksums(Wal *pWal, u32 iLast){ pWal->iReCksum = 0; for(; rc==SQLITE_OK && iRead<=iLast; iRead++){ i64 iOff = walFrameOffset(iRead, szPage); - rc = sqlite3OsRead(pWal->pWalFd, aBuf, szPage+WAL_FRAME_HDRSIZE, iOff); + rc = sqlite3OsRead(pWal->apWalFd[0], aBuf, szPage+WAL_FRAME_HDRSIZE, iOff); if( rc==SQLITE_OK ){ u32 iPgno, nDbSize; iPgno = sqlite3Get4byte(aBuf); nDbSize = sqlite3Get4byte(&aBuf[4]); walEncodeFrame(pWal, iPgno, nDbSize, &aBuf[WAL_FRAME_HDRSIZE], aFrame); - rc = sqlite3OsWrite(pWal->pWalFd, aFrame, sizeof(aFrame), iOff); + rc = sqlite3OsWrite(pWal->apWalFd[0], aFrame, sizeof(aFrame), iOff); } } @@ 
-3054,6 +3579,7 @@ int sqlite3WalFrames( WalWriter w; /* The writer */ u32 iFirst = 0; /* First frame that may be overwritten */ WalIndexHdr *pLive; /* Pointer to shared header */ + int iApp; assert( pList ); assert( pWal->writeLock ); @@ -3062,22 +3588,17 @@ int sqlite3WalFrames( ** nTruncate==0 then this frame set does not complete the transaction. */ assert( (isCommit!=0)==(nTruncate!=0) ); -#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) - { int cnt; for(cnt=0, p=pList; p; p=p->pDirty, cnt++){} - WALTRACE(("WAL%p: frame write begin. %d frames. mxFrame=%d. %s\n", - pWal, cnt, pWal->hdr.mxFrame, isCommit ? "Commit" : "Spill")); - } -#endif - pLive = (WalIndexHdr*)walIndexHdr(pWal); if( memcmp(&pWal->hdr, (void *)pLive, sizeof(WalIndexHdr))!=0 ){ - iFirst = pLive->mxFrame+1; + if( isWalMode2(pWal)==0 ){ + iFirst = pLive->mxFrame+1; + } } /* See if it is possible to write these frames into the start of the ** log file, instead of appending to it at pWal->hdr.mxFrame. */ - if( SQLITE_OK!=(rc = walRestartLog(pWal)) ){ + else if( SQLITE_OK!=(rc = walRestartLog(pWal)) ){ return rc; } @@ -3085,28 +3606,50 @@ int sqlite3WalFrames( ** header to the start of the WAL file. See comments at the top of ** this source file for a description of the WAL header format. */ - iFrame = pWal->hdr.mxFrame; + iApp = walidxGetFile(&pWal->hdr); + iFrame = walidxGetMxFrame(&pWal->hdr, iApp); + assert( iApp==0 || isWalMode2(pWal) ); + +#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) + { int cnt; for(cnt=0, p=pList; p; p=p->pDirty, cnt++){} + WALTRACE(("WAL%p: frame write begin. %d frames. iWal=%d. mxFrame=%d. %s\n", + pWal, cnt, iApp, iFrame, isCommit ? 
"Commit" : "Spill")); + } +#endif + if( iFrame==0 ){ + u32 iCkpt = 0; u8 aWalHdr[WAL_HDRSIZE]; /* Buffer to assemble wal-header in */ u32 aCksum[2]; /* Checksum for wal-header */ sqlite3Put4byte(&aWalHdr[0], (WAL_MAGIC | SQLITE_BIGENDIAN)); - sqlite3Put4byte(&aWalHdr[4], WAL_MAX_VERSION); + sqlite3Put4byte(&aWalHdr[4], pWal->hdr.iVersion); sqlite3Put4byte(&aWalHdr[8], szPage); - sqlite3Put4byte(&aWalHdr[12], pWal->nCkpt); - if( pWal->nCkpt==0 ) sqlite3_randomness(8, pWal->hdr.aSalt); + if( isWalMode2(pWal) ){ + if( walidxGetMxFrame(&pWal->hdr, !iApp)>0 ){ + u8 aPrev[4]; + rc = sqlite3OsRead(pWal->apWalFd[!iApp], aPrev, 4, 12); + if( rc!=SQLITE_OK ){ + return rc; + } + iCkpt = (sqlite3Get4byte(aPrev) + 1) & 0x0F; + } + }else{ + iCkpt = pWal->nCkpt; + } + sqlite3Put4byte(&aWalHdr[12], iCkpt); memcpy(&aWalHdr[16], pWal->hdr.aSalt, 8); walChecksumBytes(1, aWalHdr, WAL_HDRSIZE-2*4, 0, aCksum); sqlite3Put4byte(&aWalHdr[24], aCksum[0]); sqlite3Put4byte(&aWalHdr[28], aCksum[1]); - + pWal->szPage = szPage; pWal->hdr.bigEndCksum = SQLITE_BIGENDIAN; pWal->hdr.aFrameCksum[0] = aCksum[0]; pWal->hdr.aFrameCksum[1] = aCksum[1]; pWal->truncateOnCommit = 1; - rc = sqlite3OsWrite(pWal->pWalFd, aWalHdr, sizeof(aWalHdr), 0); + rc = sqlite3OsWrite(pWal->apWalFd[iApp], aWalHdr, sizeof(aWalHdr), 0); WALTRACE(("WAL%p: wal-header write %s\n", pWal, rc ? 
"failed" : "ok")); if( rc!=SQLITE_OK ){ return rc; @@ -3120,7 +3663,7 @@ int sqlite3WalFrames( ** https://sqlite.org/src/info/ff5be73dee */ if( pWal->syncHeader ){ - rc = sqlite3OsSync(pWal->pWalFd, CKPT_SYNC_FLAGS(sync_flags)); + rc = sqlite3OsSync(pWal->apWalFd[iApp], CKPT_SYNC_FLAGS(sync_flags)); if( rc ) return rc; } } @@ -3128,7 +3671,7 @@ int sqlite3WalFrames( /* Setup information needed to write frames into the WAL */ w.pWal = pWal; - w.pFd = pWal->pWalFd; + w.pFd = pWal->apWalFd[iApp]; w.iSyncPoint = 0; w.syncFlags = sync_flags; w.szPage = szPage; @@ -3158,7 +3701,7 @@ int sqlite3WalFrames( #else pData = p->pData; #endif - rc = sqlite3OsWrite(pWal->pWalFd, pData, szPage, iOff); + rc = sqlite3OsWrite(pWal->apWalFd[iApp], pData, szPage, iOff); if( rc ) return rc; p->flags &= ~PGHDR_WAL_APPEND; continue; @@ -3198,7 +3741,7 @@ int sqlite3WalFrames( if( isCommit && WAL_SYNC_FLAGS(sync_flags)!=0 ){ int bSync = 1; if( pWal->padToSectorBoundary ){ - int sectorSize = sqlite3SectorSize(pWal->pWalFd); + int sectorSize = sqlite3SectorSize(w.pFd); w.iSyncPoint = ((iOffset+sectorSize-1)/sectorSize)*sectorSize; bSync = (w.iSyncPoint==iOffset); testcase( bSync ); @@ -3233,16 +3776,16 @@ int sqlite3WalFrames( ** guarantees that there are no other writers, and no data that may ** be in use by existing readers is being overwritten. 
*/ - iFrame = pWal->hdr.mxFrame; + iFrame = walidxGetMxFrame(&pWal->hdr, iApp); for(p=pList; p && rc==SQLITE_OK; p=p->pDirty){ if( (p->flags & PGHDR_WAL_APPEND)==0 ) continue; iFrame++; - rc = walIndexAppend(pWal, iFrame, p->pgno); + rc = walIndexAppend(pWal, iApp, iFrame, p->pgno); } while( rc==SQLITE_OK && nExtra>0 ){ iFrame++; nExtra--; - rc = walIndexAppend(pWal, iFrame, pLast->pgno); + rc = walIndexAppend(pWal, iApp, iFrame, pLast->pgno); } if( rc==SQLITE_OK ){ @@ -3250,7 +3793,7 @@ int sqlite3WalFrames( pWal->hdr.szPage = (u16)((szPage&0xff00) | (szPage>>16)); testcase( szPage<=32768 ); testcase( szPage>=65536 ); - pWal->hdr.mxFrame = iFrame; + walidxSetMxFrame(&pWal->hdr, iApp, iFrame); if( isCommit ){ pWal->hdr.iChange++; pWal->hdr.nPage = nTruncate; @@ -3258,7 +3801,17 @@ int sqlite3WalFrames( /* If this is a commit, update the wal-index header too. */ if( isCommit ){ walIndexWriteHdr(pWal); - pWal->iCallback = iFrame; + if( isWalMode2(pWal) ){ + int iOther = !walidxGetFile(&pWal->hdr); + if( walidxGetMxFrame(&pWal->hdr, iOther) + && !walCkptInfo(pWal)->nBackfill + ){ + pWal->iCallback = walidxGetMxFrame(&pWal->hdr, 0); + pWal->iCallback += walidxGetMxFrame(&pWal->hdr, 1); + } + }else{ + pWal->iCallback = iFrame; + } } } @@ -3350,7 +3903,9 @@ int sqlite3WalCheckpoint( /* Copy data from the log to the database file. */ if( rc==SQLITE_OK ){ - if( pWal->hdr.mxFrame && walPagesize(pWal)!=nBuf ){ + if( (walPagesize(pWal)!=nBuf) + && (walidxGetMxFrame(&pWal->hdr, 0) || walidxGetMxFrame(&pWal->hdr, 1)) + ){ rc = SQLITE_CORRUPT_BKPT; }else{ rc = walCheckpoint(pWal, db, eMode2, xBusy2, pBusyArg, sync_flags, zBuf); @@ -3358,8 +3913,20 @@ int sqlite3WalCheckpoint( /* If no error occurred, set the output variables. 
*/ if( rc==SQLITE_OK || rc==SQLITE_BUSY ){ - if( pnLog ) *pnLog = (int)pWal->hdr.mxFrame; - if( pnCkpt ) *pnCkpt = (int)(walCkptInfo(pWal)->nBackfill); + if( pnLog ){ + *pnLog = walidxGetMxFrame(&pWal->hdr,0)+walidxGetMxFrame(&pWal->hdr,1); + } + if( pnCkpt ){ + if( isWalMode2(pWal) ){ + if( (int)(walCkptInfo(pWal)->nBackfill) ){ + *pnCkpt = walidxGetMxFrame(&pWal->hdr, !walidxGetFile(&pWal->hdr)); + }else{ + *pnCkpt = 0; + } + }else{ + *pnCkpt = walCkptInfo(pWal)->nBackfill; + } + } } } @@ -3421,6 +3988,7 @@ int sqlite3WalCallback(Wal *pWal){ */ int sqlite3WalExclusiveMode(Wal *pWal, int op){ int rc; + assert( pWal->writeLock==0 ); assert( pWal->exclusiveMode!=WAL_HEAPMEMORY_MODE || op==-1 ); @@ -3430,13 +3998,18 @@ int sqlite3WalExclusiveMode(Wal *pWal, int op){ ** locks are taken in this case). Nor should the pager attempt to ** upgrade to exclusive-mode following such an error. */ - assert( pWal->readLock>=0 || pWal->lockError ); - assert( pWal->readLock>=0 || (op<=0 && pWal->exclusiveMode==0) ); + assert( pWal->readLock!=WAL_LOCK_NONE || pWal->lockError ); + assert( pWal->readLock!=WAL_LOCK_NONE || (op<=0 && pWal->exclusiveMode==0) ); if( op==0 ){ if( pWal->exclusiveMode ){ pWal->exclusiveMode = 0; - if( walLockShared(pWal, WAL_READ_LOCK(pWal->readLock))!=SQLITE_OK ){ + if( isWalMode2(pWal) ){ + rc = walLockReader(pWal, pWal->readLock, 1); + }else{ + rc = walLockShared(pWal, WAL_READ_LOCK(pWal->readLock)); + } + if( rc!=SQLITE_OK ){ /* re-taking the read lock failed: remain in exclusive mode */ pWal->exclusiveMode = 1; } rc = pWal->exclusiveMode==0; @@ -3447,7 +4020,11 @@ int sqlite3WalExclusiveMode(Wal *pWal, int op){ }else if( op>0 ){ assert( pWal->exclusiveMode==0 ); assert( pWal->readLock>=0 ); - walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock)); + if( isWalMode2(pWal) ){ + walLockReader(pWal, pWal->readLock, 0); + }else{ + walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock)); + } pWal->exclusiveMode = 1; rc = 1; }else{ @@ -3531,7 +4108,7 @@ int sqlite3WalFramesize(Wal *pWal){ /* Return the sqlite3_file object for
the WAL file */ sqlite3_file *sqlite3WalFile(Wal *pWal){ - return pWal->pWalFd; + return pWal->apWalFd[0]; } #endif /* #ifndef SQLITE_OMIT_WAL */ diff --git a/src/wal.h b/src/wal.h index d97300a684..29ddd7eb0b 100644 --- a/src/wal.h +++ b/src/wal.h @@ -26,7 +26,7 @@ #define CKPT_SYNC_FLAGS(X) (((X)>>2)&0x03) #ifdef SQLITE_OMIT_WAL -# define sqlite3WalOpen(x,y,z) 0 +# define sqlite3WalOpen(w,x,y,z) 0 # define sqlite3WalLimit(x,y) # define sqlite3WalClose(v,w,x,y,z) 0 # define sqlite3WalBeginReadTransaction(y,z) 0 @@ -55,7 +55,7 @@ typedef struct Wal Wal; /* Open and close a connection to a write-ahead log. */ -int sqlite3WalOpen(sqlite3_vfs*, sqlite3_file*, const char *, int, i64, Wal**); +int sqlite3WalOpen(sqlite3_vfs*, sqlite3_file*, const char *,int,i64,int,Wal**); int sqlite3WalClose(Wal *pWal, sqlite3*, int sync_flags, int, u8 *); /* Set the limiting size of a WAL file. */ diff --git a/test/permutations.test b/test/permutations.test index 5afc51cb7d..7fbf98d412 100644 --- a/test/permutations.test +++ b/test/permutations.test @@ -1019,6 +1019,23 @@ test_suite "wal" -description { fts3c.test fts3d.test fts3e.test fts3query.test } +test_suite "wal2" -description { + Run tests with journal_mode=WAL2 +} -initialize { + set ::G(savepoint6_iterations) 100 +} -shutdown { + unset -nocomplain ::G(savepoint6_iterations) +} -files { + savepoint.test savepoint2.test savepoint6.test + trans.test avtrans.test + + fts3aa.test fts3ab.test fts3ac.test fts3ad.test + fts3ae.test fts3af.test fts3ag.test fts3ah.test + fts3ai.test fts3aj.test fts3ak.test fts3al.test + fts3am.test fts3an.test fts3ao.test fts3b.test + fts3c.test fts3d.test fts3e.test fts3query.test +} + test_suite "rtree" -description { All R-tree related tests. Provides coverage of source file rtree.c. 
} -files [glob -nocomplain $::testdir/../ext/rtree/*.test] diff --git a/test/savepoint.test b/test/savepoint.test index eed8a9e702..f196f8d2fc 100644 --- a/test/savepoint.test +++ b/test/savepoint.test @@ -28,6 +28,7 @@ do_test savepoint-1.1 { RELEASE sp1; } } {} +wal_check_journal_mode savepoint-1.1 do_test savepoint-1.2 { execsql { SAVEPOINT sp1; diff --git a/test/savepoint6.test b/test/savepoint6.test index b1d0d46f5c..6b41ef2da9 100644 --- a/test/savepoint6.test +++ b/test/savepoint6.test @@ -15,6 +15,10 @@ set testdir [file dirname $argv0] source $testdir/tester.tcl proc sql {zSql} { + if {0 && $::debug_op} { + puts stderr "$zSql ;" + flush stderr + } uplevel db eval [list $zSql] #puts stderr "$zSql ;" } @@ -67,11 +71,13 @@ proc x_to_y {x} { # delete_rows XVALUES # proc savepoint {zName} { + if {$::debug_op} { puts stderr "savepoint $zName" ; flush stderr } catch { sql "SAVEPOINT $zName" } lappend ::lSavepoint [list $zName [array get ::aEntry]] } proc rollback {zName} { + if {$::debug_op} { puts stderr "rollback $zName" ; flush stderr } catch { sql "ROLLBACK TO $zName" } for {set i [expr {[llength $::lSavepoint]-1}]} {$i>=0} {incr i -1} { set zSavepoint [lindex $::lSavepoint $i 0] @@ -89,6 +95,7 @@ proc rollback {zName} { } proc release {zName} { + if {$::debug_op} { puts stderr "release $zName" ; flush stderr } catch { sql "RELEASE $zName" } for {set i [expr {[llength $::lSavepoint]-1}]} {$i>=0} {incr i -1} { set zSavepoint [lindex $::lSavepoint $i 0] @@ -104,6 +111,7 @@ proc release {zName} { } proc insert_rows {lX} { + if {$::debug_op} { puts stderr "insert_rows $lX" ; flush stderr } foreach x $lX { set y [x_to_y $x] @@ -116,6 +124,7 @@ proc insert_rows {lX} { } proc delete_rows {lX} { + if {$::debug_op} { puts stderr "delete_rows $lX" ; flush stderr } foreach x $lX { # Update database [db] sql "DELETE FROM t1 WHERE x = $x" @@ -164,6 +173,11 @@ proc random_integers {nRes nRange} { } #------------------------------------------------------------------------- 
+set ::debug_op 0 +proc debug_ops {} { + set ::debug_op 1 +} + proc database_op {} { set i [expr int(rand()*2)] if {$i==0} { @@ -185,9 +199,6 @@ proc savepoint_op {} { set C [lindex $cmds [expr int(rand()*6)]] set N [lindex $names [expr int(rand()*5)]] - #puts stderr " $C $N ; " - #flush stderr - $C $N return ok } diff --git a/test/tester.tcl b/test/tester.tcl index 10a20a47d6..4294b00786 100644 --- a/test/tester.tcl +++ b/test/tester.tcl @@ -586,6 +586,7 @@ proc reset_db {} { forcedelete test.db forcedelete test.db-journal forcedelete test.db-wal + forcedelete test.db-wal2 sqlite3 db ./test.db set ::DB [sqlite3_connection_pointer db] if {[info exists ::SETUP_SQL]} { @@ -2053,17 +2054,32 @@ proc drop_all_indexes {{db db}} { # Returns true if this test should be run in WAL mode. False otherwise. # proc wal_is_wal_mode {} { - expr {[permutation] eq "wal"} + if {[permutation] eq "wal"} { return 1 } + if {[permutation] eq "wal2"} { return 2 } + return 0 } proc wal_set_journal_mode {{db db}} { - if { [wal_is_wal_mode] } { - $db eval "PRAGMA journal_mode = WAL" + switch -- [wal_is_wal_mode] { + 0 { + } + + 1 { + $db eval "PRAGMA journal_mode = WAL" + } + + 2 { + $db eval "PRAGMA journal_mode = WAL2" + } } } proc wal_check_journal_mode {testname {db db}} { if { [wal_is_wal_mode] } { $db eval { SELECT * FROM sqlite_master } - do_test $testname [list $db eval "PRAGMA main.journal_mode"] {wal} + set expected "wal" + if {[wal_is_wal_mode]==2} { + set expected "wal2" + } + do_test $testname [list $db eval "PRAGMA main.journal_mode"] $expected } } diff --git a/test/waltwo2.test b/test/waltwo2.test new file mode 100644 index 0000000000..0ab5636a06 --- /dev/null +++ b/test/waltwo2.test @@ -0,0 +1,127 @@ +# 2017 September 19 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. 
+# May you share freely, never taking more than you give. +# +#*********************************************************************** +# This file implements regression tests for SQLite library. The +# focus of this file is testing the operation of the library in +# "PRAGMA journal_mode=WAL2" mode. +# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +source $testdir/lock_common.tcl +source $testdir/malloc_common.tcl +source $testdir/wal_common.tcl + +set testprefix waltwo2 +ifcapable !wal {finish_test ; return } + +db close +foreach f [glob -nocomplain test.db*] { forcedelete $f } +sqlite3 db test.db + +do_execsql_test 1.0 { + CREATE TABLE t1(x, y); + PRAGMA journal_mode = wal2; +} {wal2} + +do_execsql_test 1.1 { + SELECT * FROM t1; +} {} + +do_execsql_test 1.2 { + INSERT INTO t1 VALUES(1, 2); +} {} + +do_execsql_test 1.3 { + SELECT * FROM t1; +} {1 2} + +do_test 1.4 { + sqlite3 db2 test.db + execsql { SELECT * FROM t1 } db2 +} {1 2} + +do_test 1.5 { + lsort [glob test.db*] +} {test.db test.db-shm test.db-wal test.db-wal2} + +do_test 1.6 { + db close + db2 close + sqlite3 db test.db + execsql { SELECT * FROM t1 } +} {1 2} + +do_execsql_test 1.7 { + PRAGMA journal_size_limit = 4000; + INSERT INTO t1 VALUES(3, 4); + INSERT INTO t1 VALUES(5, 6); + INSERT INTO t1 VALUES(7, 8); + INSERT INTO t1 VALUES(9, 10); + INSERT INTO t1 VALUES(11, 12); + INSERT INTO t1 VALUES(13, 14); + INSERT INTO t1 VALUES(15, 16); + INSERT INTO t1 VALUES(17, 18); + SELECT * FROM t1; +} {4000 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18} + +do_test 1.8 { + sqlite3 db2 test.db + execsql { SELECT * FROM t1 } db2 +} {1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18} + +do_test 1.9 { + db close + db2 close + lsort [glob test.db*] +} {test.db} + +#------------------------------------------------------------------------- +reset_db +do_execsql_test 2.0 { + CREATE TABLE t1(a INTEGER PRIMARY KEY, b, c); + CREATE INDEX i1 ON t1(b, c); + PRAGMA journal_mode = wal2; + PRAGMA
journal_size_limit = 4000; +} {wal2 4000} + +proc wal_hook {DB nm nFrame} { + $DB eval { PRAGMA wal_checkpoint } +} +db wal_hook [list wal_hook db] + + +foreach js {4000 8000 12000} { + foreach NROW [list 100 200 300 400 500 600 1000] { + do_test 2.$js.$NROW.1 { + db eval "DELETE FROM t1" + db eval "PRAGMA journal_size_limit = $js" + set nTotal 0 + for {set i 0} {$i < $NROW} {incr i} { + db eval { INSERT INTO t1 VALUES($i, $i, randomblob(abs(random()%50))) } + incr nTotal $i + } + set {} {} + } {} + + do_test 2.$js.$NROW.2 { + sqlite3 db2 test.db + db2 eval { + PRAGMA integrity_check; + SELECT count(*), sum(b) FROM t1; + } + } [list ok $NROW $nTotal] + + db2 close + } +} + +finish_test +