From d992b150c79b4b87419587aac218a40df879c963 Mon Sep 17 00:00:00 2001 From: drh Date: Tue, 20 Dec 2011 20:13:25 +0000 Subject: [PATCH] Refactor the sqlite3WalFrames() routine for clarity of presentation. Do the padded transaction sync as the write pointer crosses the final sector boundary instead of at the end, for efficiency. Always sync the WAL header immediately after it is written. FossilOrigin-Name: 92c73b421b6242b09247dfb759777a531a107523 --- manifest | 14 ++-- manifest.uuid | 2 +- src/wal.c | 215 ++++++++++++++++++++++++++++---------------------- 3 files changed, 129 insertions(+), 102 deletions(-) diff --git a/manifest b/manifest index 54fa8d4f5e..5a218072ca 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Merge\s[21b76af6ed]\sinto\sstatvfs\sbranch. -D 2011-12-19T11:57:41.110 +C Refactor\sthe\ssqlite3WalFrames()\sroutine\sfor\sclarity\sof\spresentation.\nDo\sthe\spadded\stransaction\ssync\sas\sthe\swrite\spointer\scrosses\sthe\sfinal\nsector\sboundary\sinstead\sof\sat\sthe\send,\sfor\sefficiency.\s\sAlways\ssync\nthe\sWAL\sheader\simmediately\safter\sit\sis\swritten. +D 2011-12-20T20:13:25.386 F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f F Makefile.in 5b4a3e12a850b021547e43daf886b25133b44c07 F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23 @@ -250,7 +250,7 @@ F src/vdbemem.c 2fc78b3e0fabcc1eaa23cd79dd2e30e6dcfe1e56 F src/vdbesort.c 468d43c057063e54da4f1988b38b4f46d60e7790 F src/vdbetrace.c d6e50e04e1ec498150e519058f617d91b8f5c843 F src/vtab.c e9318d88feac85be8e27ee783ac8f5397933fc8a -F src/wal.c 645fdf75d57f2a1b437241513f0ef0904233b8f2 +F src/wal.c 311c36af11a721f8601371c1a5a9b15c84ec2dee F src/wal.h 42f8313f7aaf8913e2d1fdf7b47025c23491ea1d F src/walker.c 3112bb3afe1d85dc52317cb1d752055e9a781f8f F src/where.c af623942514571895818b9b7ae11db95ae3b3d88 @@ -984,7 +984,7 @@ F tool/tostr.awk e75472c2f98dd76e06b8c9c1367f4ab07e122d06 F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f F tool/warnings-clang.sh 9f406d66e750e8ac031c63a9ef3248aaa347ef2a F tool/warnings.sh fbc018d67fd7395f440c28f33ef0f94420226381 -P 68684495f1a62a41ad27934f3a6d3bc9d290a57d 21b76af6edd48f665cdd3af5f99d477f030c7668 -R 13c6e9f68f2269f9e8e46485261d751c -U dan -Z 3609ee926071d362fe7af243818ae9ac +P e694f7b166144a0afba7846e1e18ad568b33a081 +R 5ba56fa16752a6d0272db759071cab34 +U drh +Z ae8908dc690fbf93e21d9e8cf0a20f23 diff --git a/manifest.uuid b/manifest.uuid index 84c4125952..f885437af4 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -e694f7b166144a0afba7846e1e18ad568b33a081 \ No newline at end of file +92c73b421b6242b09247dfb759777a531a107523 \ No newline at end of file diff --git a/src/wal.c b/src/wal.c index b601d42ffb..d61bfc069f 100644 --- a/src/wal.c +++ b/src/wal.c @@ -424,7 +424,7 @@ struct Wal { u8 ckptLock; /* True if holding a checkpoint lock */ u8 readOnly; /* WAL_RDWR, WAL_RDONLY, or WAL_SHM_RDONLY */ u8 truncateOnCommit; /* True to truncate WAL file on commit */ - u8 noSyncHeader; /* Avoid WAL header fsyncs if true */ + u8 syncHeader; /* Fsync the WAL header if true */ u8 padToSectorBoundary; /* Pad transactions out to the next sector */ WalIndexHdr hdr; /* Wal-index header for current transaction */ const char *zWalName; /* Name of WAL file */ @@ -1295,6 +1295,7 @@ int sqlite3WalOpen( pRet->readLock = -1; pRet->mxWalSize = mxWalSize; pRet->zWalName = zWalName; + pRet->syncHeader = 1; pRet->padToSectorBoundary = 1; pRet->exclusiveMode = (bNoShm ? WAL_HEAPMEMORY_MODE: WAL_NORMAL_MODE); @@ -1311,7 +1312,7 @@ int sqlite3WalOpen( sqlite3_free(pRet); }else{ int iDC = sqlite3OsDeviceCharacteristics(pRet->pWalFd); - if( iDC & SQLITE_IOCAP_SEQUENTIAL ){ pRet->noSyncHeader = 1; } + if( iDC & SQLITE_IOCAP_SEQUENTIAL ){ pRet->syncHeader = 0; } if( iDC & SQLITE_IOCAP_ZERO_DAMAGE ){ pRet->padToSectorBoundary = 0; } *ppWal = pRet; WALTRACE(("WAL%d: opened\n", pRet)); @@ -2634,41 +2635,71 @@ static int walRestartLog(Wal *pWal){ return rc; } +/* +** Information about the current state of the WAL file and where +** the next fsync should occur - passed from sqlite3WalFrames() into +** walWriteToLog(). +*/ +typedef struct WalWriter { + Wal *pWal; /* The complete WAL information */ + sqlite3_file *pFd; /* The WAL file to which we write */ + sqlite3_int64 iSyncPoint; /* Fsync at this offset */ + int syncFlags; /* Flags for the fsync */ + int szPage; /* Size of one page */ +} WalWriter; + /* ** Write iAmt bytes of content into the WAL file beginning at iOffset. +** Do a sync when crossing the p->iSyncPoint boundary. ** -** When crossing the boundary between the first and second sectors of the -** file, first write all of the first sector content, then fsync(), then -** continue writing content for the second sector. This ensures that -** the WAL header is overwritten before the first commit mark. +** In other words, if iSyncPoint is in between iOffset and iOffset+iAmt, +** first write the part before iSyncPoint, then sync, then write the +** rest. */ static int walWriteToLog( - Wal *pWal, /* WAL to write to */ + WalWriter *p, /* WAL to write to */ void *pContent, /* Content to be written */ int iAmt, /* Number of bytes to write */ sqlite3_int64 iOffset /* Start writing at this offset */ ){ int rc; - if( iOffset>=pWal->szFirstBlock - || iOffset+iAmtszFirstBlock - || pWal->syncFlags==0 - ){ - /* The common and fast case. Just write the data. */ - rc = sqlite3OsWrite(pWal->pWalFd, pContent, iAmt, iOffset); - }else{ - /* If this write will cross the first sector boundary, it has to - ** be split it two with a sync in between. */ - int iFirstAmt = pWal->szFirstBlock - iOffset; - assert( iFirstAmt>0 && iFirstAmtpWalFd, pContent, iFirstAmt, iOffset); - if( rc ) return rc; - assert( pWal->syncFlags & (SQLITE_SYNC_NORMAL|SQLITE_SYNC_FULL) ); - rc = sqlite3OsSync(pWal->pWalFd, pWal->syncFlags); + if( iOffsetiSyncPoint && iOffset+iAmt>=p->iSyncPoint ){ + int iFirstAmt = (int)(p->iSyncPoint - iOffset); + rc = sqlite3OsWrite(p->pFd, pContent, iFirstAmt, iOffset); if( rc ) return rc; + iOffset += iFirstAmt; + iAmt -= iFirstAmt; pContent = (void*)(iFirstAmt + (char*)pContent); - rc = sqlite3OsWrite(pWal->pWalFd, pContent, - iAmt-iFirstAmt, iOffset+iFirstAmt); + assert( p->syncFlags & (SQLITE_SYNC_NORMAL|SQLITE_SYNC_FULL) ); + rc = sqlite3OsSync(p->pFd, p->syncFlags); + if( rc ) return rc; } + rc = sqlite3OsWrite(p->pFd, pContent, iAmt, iOffset); + return rc; +} + +/* +** Write out a single frame of the WAL +*/ +static int walWriteOneFrame( + WalWriter *p, /* Where to write the frame */ + PgHdr *pPage, /* The page of the frame to be written */ + int nTruncate, /* The commit flag. Usually 0. >0 for commit */ + sqlite3_int64 iOffset /* Byte offset at which to write */ +){ + int rc; /* Result code from subfunctions */ + void *pData; /* Data actually written */ + u8 aFrame[WAL_FRAME_HDRSIZE]; /* Buffer to assemble frame-header in */ +#if defined(SQLITE_HAS_CODEC) + if( (pData = sqlite3PagerCodec(pPage))==0 ) return SQLITE_NOMEM; +#else + pData = pPage->pData; +#endif + walEncodeFrame(p->pWal, pPage->pgno, nTruncate, pData, aFrame); + rc = walWriteToLog(p, aFrame, sizeof(aFrame), iOffset); + if( rc ) return rc; + /* Write the page data */ + rc = walWriteToLog(p, pData, p->szPage, iOffset+sizeof(aFrame)); return rc; } @@ -2686,14 +2717,20 @@ int sqlite3WalFrames( ){ int rc; /* Used to catch return codes */ u32 iFrame; /* Next frame address */ - u8 aFrame[WAL_FRAME_HDRSIZE]; /* Buffer to assemble frame-header in */ PgHdr *p; /* Iterator to run through pList with. */ PgHdr *pLast = 0; /* Last frame in list */ - int nLast = 0; /* Number of extra copies of last page */ + int nExtra = 0; /* Number of extra copies of last page */ + int szFrame; /* The size of a single frame */ + i64 iOffset; /* Next byte to write in WAL file */ + WalWriter w; /* The writer */ assert( pList ); assert( pWal->writeLock ); + /* If this frame set completes a transaction, then nTruncate>0. If + ** nTruncate==0 then this frame set does not complete the transaction. */ + assert( (isCommit!=0)==(nTruncate!=0) ); + #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) { int cnt; for(cnt=0, p=pList; p; p=p->pDirty, cnt++){} WALTRACE(("WAL%p: frame write begin. %d frames. mxFrame=%d. %s\n", @@ -2738,88 +2775,78 @@ int sqlite3WalFrames( if( rc!=SQLITE_OK ){ return rc; } + + /* Sync the header (unless SQLITE_IOCAP_SEQUENTIAL is true or unless + ** all syncing is turned off by PRAGMA synchronous=OFF). Otherwise + ** an out-of-order write following a WAL restart could result in + ** database corruption. See the ticket: + ** + ** http://localhost:591/sqlite/info/ff5be73dee + */ + if( pWal->syncHeader && sync_flags ){ + rc = sqlite3OsSync(pWal->pWalFd, sync_flags & SQLITE_SYNC_MASK); + if( rc ) return rc; + } } assert( (int)pWal->szPage==szPage ); - /* Setup information needed to do the WAL header sync */ - if( pWal->noSyncHeader ){ - assert( pWal->szFirstBlock==0 ); - assert( pWal->syncFlags==0 ); - }else{ - pWal->szFirstBlock = sqlite3OsSectorSize(pWal->pWalFd); - if( szPage>pWal->szFirstBlock ) pWal->szFirstBlock = szPage; - pWal->syncFlags = sync_flags & SQLITE_SYNC_MASK; - } + /* Setup information needed to write frames into the WAL */ + w.pWal = pWal; + w.pFd = pWal->pWalFd; + w.iSyncPoint = 0; + w.syncFlags = sync_flags; + w.szPage = szPage; + iOffset = walFrameOffset(iFrame+1, szPage); + szFrame = szPage + WAL_FRAME_HDRSIZE; - /* Write the log file. */ + /* Write all frames into the log file exactly once */ for(p=pList; p; p=p->pDirty){ - u32 nDbsize; /* Db-size field for frame header */ - i64 iOffset; /* Write offset in log file */ - void *pData; - - iOffset = walFrameOffset(++iFrame, szPage); - /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */ - - /* Populate and write the frame header */ - nDbsize = (isCommit && p->pDirty==0) ? nTruncate : 0; -#if defined(SQLITE_HAS_CODEC) - if( (pData = sqlite3PagerCodec(p))==0 ) return SQLITE_NOMEM; -#else - pData = p->pData; -#endif - walEncodeFrame(pWal, p->pgno, nDbsize, pData, aFrame); - rc = walWriteToLog(pWal, aFrame, sizeof(aFrame), iOffset); - if( rc!=SQLITE_OK ){ - return rc; - } - - /* Write the page data */ - rc = walWriteToLog(pWal, pData, szPage, iOffset+sizeof(aFrame)); - if( rc!=SQLITE_OK ){ - return rc; - } + int nDbSize; /* 0 normally. Positive == commit flag */ + iFrame++; + assert( iOffset==walFrameOffset(iFrame, szPage) ); + nDbSize = (isCommit && p->pDirty==0) ? nTruncate : 0; + rc = walWriteOneFrame(&w, p, nDbSize, iOffset); + if( rc ) return rc; pLast = p; + iOffset += szFrame; } - /* Sync the log file if the 'isSync' flag was specified. */ + /* If this is the end of a transaction, then we might need to pad + ** the transaction and/or sync the WAL file. + ** + ** Padding and syncing only occur if this set of frames complete a + ** transaction and if PRAGMA synchronous=FULL. If synchronous==NORMAL + ** or synchonous==OFF, then no padding or syncing are needed. + ** + ** If SQLITE_IOCAP_ZERO_DAMAGE is defined, then padding is not needed + ** and only the sync is done. If padding is needed, then the final + ** frame is repeated (with its commit mark) until the next sector + ** boundary is crossed. Only the part of the WAL prior to the last + ** sector boundary is synced; the part of the last frame that extends + ** past the sector boundary is written after the sync. + */ if( isCommit && (sync_flags & WAL_SYNC_TRANSACTIONS)!=0 ){ if( pWal->padToSectorBoundary ){ - i64 iSegment = sqlite3OsSectorSize(pWal->pWalFd); - i64 iOffset = walFrameOffset(iFrame+1, szPage); - - assert( iSegment>0 ); - - iSegment = (((iOffset+iSegment-1)/iSegment) * iSegment); - while( iOffsetpData; -#endif - walEncodeFrame(pWal, pLast->pgno, nTruncate, pData, aFrame); - /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */ - rc = walWriteToLog(pWal, aFrame, sizeof(aFrame), iOffset); - if( rc!=SQLITE_OK ){ - return rc; - } - iOffset += WAL_FRAME_HDRSIZE; - rc = walWriteToLog(pWal, pData, szPage, iOffset); - if( rc!=SQLITE_OK ){ - return rc; - } - nLast++; - iOffset += szPage; + int sectorSize = sqlite3OsSectorSize(pWal->pWalFd); + w.iSyncPoint = ((iOffset+sectorSize-1)/sectorSize)*sectorSize; + while( iOffsetpWalFd, sync_flags & SQLITE_SYNC_MASK); + rc = sqlite3OsSync(w.pFd, sync_flags & SQLITE_SYNC_MASK); } + /* If this frame set completes the first transaction in the WAL and + ** if PRAGMA journal_size_limit is set, then truncate the WAL to the + ** journal size limit, if possible. + */ if( isCommit && pWal->truncateOnCommit && pWal->mxWalSize>=0 ){ i64 sz = pWal->mxWalSize; - if( walFrameOffset(iFrame+nLast+1, szPage)>pWal->mxWalSize ){ - sz = walFrameOffset(iFrame+nLast+1, szPage); + if( walFrameOffset(iFrame+nExtra+1, szPage)>pWal->mxWalSize ){ + sz = walFrameOffset(iFrame+nExtra+1, szPage); } walLimitSize(pWal, sz); pWal->truncateOnCommit = 0; @@ -2835,9 +2862,9 @@ int sqlite3WalFrames( iFrame++; rc = walIndexAppend(pWal, iFrame, p->pgno); } - while( nLast>0 && rc==SQLITE_OK ){ + while( nExtra>0 && rc==SQLITE_OK ){ iFrame++; - nLast--; + nExtra--; rc = walIndexAppend(pWal, iFrame, pLast->pgno); } -- 2.47.2