From bfaa3dbd61d55da36027e7c252893a0c08dc650b Mon Sep 17 00:00:00 2001 From: drh Date: Tue, 26 Mar 2019 13:28:15 +0000 Subject: [PATCH] Bring this branch into closer alignment with begin-concurrent. FossilOrigin-Name: 6433d366175c5be586a0e11b540e76a2515466b358332c4356388f4c449ec0d7 --- doc/begin_concurrent.md | 107 ++++++++++++++++++++++++++++++++++++++++ manifest | 25 +++++----- manifest.uuid | 2 +- src/btree.c | 2 +- src/pager.c | 8 ++- src/sqliteInt.h | 2 +- src/wal.c | 14 +++++- src/wal.h | 4 +- test/concfault.test | 41 ++++++++++++++- test/permutations.test | 1 + 10 files changed, 186 insertions(+), 20 deletions(-) create mode 100644 doc/begin_concurrent.md diff --git a/doc/begin_concurrent.md b/doc/begin_concurrent.md new file mode 100644 index 0000000000..2a06b5872e --- /dev/null +++ b/doc/begin_concurrent.md @@ -0,0 +1,107 @@ + +Begin Concurrent +================ + +## Overview + +Usually, SQLite allows at most one writer to proceed concurrently. The +BEGIN CONCURRENT enhancement allows multiple writers to process write +transactions simultanously if the database is in "wal" or "wal2" mode, +although the system still serializes COMMIT commands. + +When a write-transaction is opened with "BEGIN CONCURRENT", actually +locking the database is deferred until a COMMIT is executed. This means +that any number of transactions started with BEGIN CONCURRENT may proceed +concurrently. The system uses optimistic page-level-locking to prevent +conflicting concurrent transactions from being committed. + +When a BEGIN CONCURRENT transaction is committed, the system checks whether +or not any of the database pages that the transaction has read have been +modified since the BEGIN CONCURRENT was opened. In other words - it asks +if the transaction being committed operates on a different set of data than +all other concurrently executing transactions. If the answer is "yes, this +transaction did not read or modify any data modified by any concurrent +transaction", then the transaction is committed as normal. Otherwise, if the +transaction does conflict, it cannot be committed and an SQLITE_BUSY_SNAPSHOT +error is returned. At this point, all the client can do is ROLLBACK the +transaction. + +If SQLITE_BUSY_SNAPSHOT is returned, messages are output via the sqlite3_log +mechanism indicating the page and table or index on which the conflict +occurred. This can be useful when optimizing concurrency. + +## Application Programming Notes + +In order to serialize COMMIT processing, SQLite takes a lock on the database +as part of each COMMIT command and releases it before returning. At most one +writer may hold this lock at any one time. If a writer cannot obtain the lock, +it uses SQLite's busy-handler to pause and retry for a while: + + + https://www.sqlite.org/c3ref/busy_handler.html + + +If there is significant contention for the writer lock, this mechanism can be +inefficient. In this case it is better for the application to use a mutex or +some other mechanism that supports blocking to ensure that at most one writer +is attempting to COMMIT a BEGIN CONCURRENT transaction at a time. This is +usually easier if all writers are part of the same operating system process. + +If all database clients (readers and writers) are located in the same OS +process, and if that OS is a Unix variant, then it can be more efficient to +the built-in VFS "unix-excl" instead of the default "unix". This is because it +uses more efficient locking primitives. + +The key to maximizing concurrency using BEGIN CONCURRENT is to ensure that +there are a large number of non-conflicting transactions. In SQLite, each +table and each index is stored as a separate b-tree, each of which is +distributed over a discrete set of database pages. This means that: + + * Two transactions that write to different sets of tables never + conflict, and that + + * Two transactions that write to the same tables or indexes only + conflict if the values of the keys (either primary keys or indexed + rows) are fairly close together. For example, given a large + table with the schema: + +
     CREATE TABLE t1(a INTEGER PRIMARY KEY, b BLOB);
+ + writing two rows with adjacent values for "a" probably will cause a + conflict (as the two keys are stored on the same page), but writing two + rows with vastly different values for "a" will not (as the keys will likly + be stored on different pages). + +Note that, in SQLite, if values are not explicitly supplied for an INTEGER +PRIMARY KEY, as for example in: + +> + INSERT INTO t1(b) VALUES(<blob-value>); + +then monotonically increasing values are assigned automatically. This is +terrible for concurrency, as it all but ensures that all new rows are +added to the same database page. In such situations, it is better to +explicitly assign random values to INTEGER PRIMARY KEY fields. + +This problem also comes up for non-WITHOUT ROWID tables that do not have an +explicit INTEGER PRIMARY KEY column. In these cases each table has an implicit +INTEGER PRIMARY KEY column that is assigned increasing values, leading to the +same problem as omitting to assign a value to an explicit INTEGER PRIMARY KEY +column. + +For both explicit and implicit INTEGER PRIMARY KEYs, it is possible to have +SQLite assign values at random (instead of the monotonically increasing +values) by writing a row with a rowid equal to the largest possible signed +64-bit integer to the table. For example: + + INSERT INTO t1(a) VALUES(9223372036854775807); + +Applications should take care not to malfunction due to the presence of such +rows. + +The nature of some types of indexes, for example indexes on timestamp fields, +can also cause problems (as concurrent transactions may assign similar +timestamps that will be stored on the same db page to new records). In these +cases the database schema may need to be rethought to increase the concurrency +provided by page-level-locking. + diff --git a/manifest b/manifest index 5863a05066..acd4386c8b 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Merge\sthe\slatest\strunk\schanges. -D 2019-03-26T12:16:24.827 +C Bring\sthis\sbranch\sinto\scloser\salignment\swith\sbegin-concurrent. +D 2019-03-26T13:28:15.115 F .fossil-settings/empty-dirs dbb81e8fc0401ac46a1491ab34a7f2c7c0452f2f06b54ebb845d024ca8283ef1 F .fossil-settings/ignore-glob 35175cdfcf539b2318cb04a9901442804be81cd677d8b889fcc9149c21f239ea F LICENSE.md df5091916dbb40e6e9686186587125e1b2ff51f022cc334e886c19a0e9982724 @@ -38,6 +38,7 @@ F configure b013bf805064650b072817c7c7f0a295cfcec5b1afec15e59ea4e9996543f51e x F configure.ac 3552d3aecade98a9d4b64bceb48ffb7726cbc85902efde956812942f060fbd0a F contrib/sqlitecon.tcl 210a913ad63f9f991070821e599d600bd913e0ad F doc/F2FS.txt c1d4a0ae9711cfe0e1d8b019d154f1c29e0d3abfe820787ba1e9ed7691160fcd +F doc/begin_concurrent.md 4bee2c3990d1eb800f1ce3726a911292a8e4b889300b2ffd4b08d357370db299 F doc/lemon.html 24956ab2995e55fe171e55bdd04f22b553957dc8bb43501dbb9311e30187e0d3 F doc/pager-invariants.txt 27fed9a70ddad2088750c4a2b493b63853da2710 F doc/vfs-shm.txt e101f27ea02a8387ce46a05be2b1a902a021d37a @@ -461,7 +462,7 @@ F src/auth.c 0fac71038875693a937e506bceb492c5f136dd7b1249fbd4ae70b4e8da14f9df F src/backup.c 78d3cecfbe28230a3a9a1793e2ead609f469be43e8f486ca996006be551857ab F src/bitvec.c 8433d9e98dd6f2ea3286e0d2fe5d65de1bfc18a706486eb2026b01be066b5806 F src/btmutex.c 8acc2f464ee76324bf13310df5692a262b801808984c1b79defb2503bbafadb6 -F src/btree.c b517c680d108228e4ff2de9182957f96b9932057fed5e4322eaabe0c95e2b7df +F src/btree.c b8d297f81e69bdb52e405d4ba3dca62c02e1421120309fb1ceef0e05f5eece55 F src/btree.h f5c65a8c7ce7d3c493ab5ee4d35ea95610422362d3d207993b05804d2bbcc17c F src/btreeInt.h 9d7f00ca9402f5e881e30eeba1e65814be8544284d59bd843419b6f73b761730 F src/build.c 1d5dc39cb3fd437c8031c13f52f600b1541977ea68fe1c29bf200ceb22217d0f @@ -507,7 +508,7 @@ F src/os_setup.h 0dbaea40a7d36bf311613d31342e0b99e2536586 F src/os_unix.c 86eca42c3d955bebea0082450f978e5633448235f03f86b27a02538bb26e7fff F src/os_win.c 85d9e532d0444ab6c16d7431490c2e279e282aa0917b0e988996b1ae0de5c5a0 F src/os_win.h 7b073010f1451abe501be30d12f6bc599824944a -F src/pager.c 56e624829268336890dad1ed864ab62df6cb2eb35633a430ca22d09b9eea3cfb +F src/pager.c da57d6cdd925392aa5e8ed99908e037d562a238c8153e458b9a4b1a13021bfdb F src/pager.h 389ba8f526d13026aa7081dc581aa742eb7207e3277e7106c522c5b65ad92590 F src/parse.y e53f90120fc1c47bfdf2a0894e6ffe0d06f757dbd61eacaaa1731334a1cf08fa F src/pcache.c 696a01f1a6370c1b50a09c15972bc3bee3333f8fcd1f2da8e9a76b1b062c59ee @@ -525,7 +526,7 @@ F src/shell.c.in c1986496062f9dba4ed5b70db06b5e0f32e1954cdcfab0b30372c6c18679681 F src/sqlite.h.in 491a9fca996e30b210244f786e062c3513ecf7d0450f93c06a5484f60a10df6a F src/sqlite3.rc 5121c9e10c3964d5755191c80dd1180c122fc3a8 F src/sqlite3ext.h 960f1b86c3610fa23cb6a267572a97dcf286e77aa0dd3b9b23292ffaa1ea8683 -F src/sqliteInt.h 7a67213db1f6217432948f3d898473f48ccfe9c1bc37a11b8d2b8c978fda517c +F src/sqliteInt.h 3a676f6f67929155f6e9bf279230cc339f924104b2151432e3a73d0cbb6d05c2 F src/sqliteLimit.h 1513bfb7b20378aa0041e7022d04acb73525de35b80b252f1b83fedb4de6a76b F src/status.c 46e7aec11f79dad50965a5ca5fa9de009f7d6bde08be2156f1538a0a296d4d0e F src/table.c b46ad567748f24a326d9de40e5b9659f96ffff34 @@ -602,8 +603,8 @@ F src/vdbesort.c 90aad5a92608f2dd771c96749beabdb562c9d881131a860a7a5bccf66dc3be7 F src/vdbetrace.c 79d6dbbc479267b255a7de8080eee6e729928a0ef93ed9b0bfa5618875b48392 F src/vtab.c 2462b7d6fd72b0b916477f5ef210ee49ab58cec195483ebdac0c8c5e3ec42cab F src/vxworks.h d2988f4e5a61a4dfe82c6524dd3d6e4f2ce3cdb9 -F src/wal.c 59b20f8fd0d7420e8896a1ef7cbd5db7352f383235ec25db08d9c22b55f4b603 -F src/wal.h f325a5856b669f5ba449157485915816103857c8574efc746ac55eba3335c5e0 +F src/wal.c 9ad123ad4695cf5883d03754b9b32a2267b1bdc95857059781c575fc796ce8b2 +F src/wal.h ac2100eeda406a4492b8c183154507532d23ab9d5a8e32e208adfe4f9ea554f9 F src/walker.c 7607f1a68130c028255d8d56094ea602fc402c79e1e35a46e6282849d90d5fe4 F src/where.c 8a207cb2ca6b99e1edb1e4bbff9b0504385a759cbf66180d1deb34d80ca4b799 F src/whereInt.h 5f14db426ca46a83eabab1ae9aa6d4b8f27504ad35b64c290916289b1ddb2e88 @@ -740,7 +741,7 @@ F test/collateA.test b8218ab90d1fa5c59dcf156efabb1b2599c580d6 F test/collateB.test 1e68906951b846570f29f20102ed91d29e634854ee47454d725f2151ecac0b95 F test/colmeta.test 2c765ea61ee37bc43bbe6d6047f89004e6508eb1 F test/colname.test fb28b3687e03625425bc216edf8b186ce974aa71008e2aa1f426a7dcb75a601d -F test/concfault.test 500f17c3fcfe7705114422bcc6ddd3c740001a43 +F test/concfault.test a88fdcd2dd959ac127a534581160fbe902080349d8703da3a8ac9633d7785a09 F test/concurrent.test 86661967a680670127a62a819e60dc93c2d3d49043ac95b26dfa70d3e60dbde5 F test/concurrent2.test de748c7dd749c77e2af2c4b914b9b09a28ac09608042ca498c0251dc6f46aa1a F test/concurrent3.test 530671ac706f6a1d0f4992dbdd33a86408330d03cd90fb9e82ecb1b27f5fd081 @@ -1209,7 +1210,7 @@ F test/pagesize.test 5769fc62d8c890a83a503f67d47508dfdc543305 F test/pcache.test c8acbedd3b6fd0f9a7ca887a83b11d24a007972b F test/pcache2.test af7f3deb1a819f77a6d0d81534e97d1cf62cd442 F test/percentile.test 4243af26b8f3f4555abe166f723715a1f74c77ff -F test/permutations.test a2ea383ba27332d9f964b74fb646680c68a83cb5bb32bdf018a22d2f6422d0f3 +F test/permutations.test 52d2c37fe8cc07ec7362024c214b04bb69432995b3a984a3fbabc60fa6ada3ee F test/pg_common.tcl 301ac19c1a52fd55166d26db929b3b89165c634d52b5f8ad76ea8cb06960db30 F test/pragma.test c267bf02742c823a191960895b3d52933cebd7beee26757d1ed694f213fcd867 F test/pragma2.test e5d5c176360c321344249354c0c16aec46214c9f @@ -1820,7 +1821,7 @@ F vsixtest/vsixtest.tcl 6a9a6ab600c25a91a7acc6293828957a386a8a93 F vsixtest/vsixtest.vcxproj.data 2ed517e100c66dc455b492e1a33350c1b20fbcdc F vsixtest/vsixtest.vcxproj.filters 37e51ffedcdb064aad6ff33b6148725226cd608e F vsixtest/vsixtest_TemporaryKey.pfx e5b1b036facdb453873e7084e1cae9102ccc67a0 -P 667cce3dce39487e970c4eb43efd1bae7efed7ac6ee31030c582bfafa846dcce fade103cbac1b067f9544935b767f36dc266aceb3269cc84a3ae3b04ad9a4823 -R 4f3c8653330ec682cd33c2959e9b41eb +P 51e3e835490a12dca6359ca02b8cbf31091fc4376b265995a986d002d376fad9 +R b33d74604a290882b801e01fd4143d6d U drh -Z 165c946130d99e8a2b3dc34a886a51e9 +Z 04a15b76f4ebf30d6bcb9170bc1c04d2 diff --git a/manifest.uuid b/manifest.uuid index 32ac4b9971..dbbdeaa150 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -51e3e835490a12dca6359ca02b8cbf31091fc4376b265995a986d002d376fad9 \ No newline at end of file +6433d366175c5be586a0e11b540e76a2515466b358332c4356388f4c449ec0d7 \ No newline at end of file diff --git a/src/btree.c b/src/btree.c index 154af1f039..3f84825e2e 100644 --- a/src/btree.c +++ b/src/btree.c @@ -8024,6 +8024,7 @@ static int balance_nonroot( memset(apOld, 0, (i+1)*sizeof(MemPage*)); goto balance_cleanup; } + setMempageRoot(apOld[i], pgnoRoot); if( apOld[i]->nFree<0 ){ rc = btreeComputeFreeSpace(apOld[i]); if( rc ){ @@ -8031,7 +8032,6 @@ static int balance_nonroot( goto balance_cleanup; } } - setMempageRoot(apOld[i], pgnoRoot); if( (i--)==0 ) break; if( pParent->nOverflow && i+nxDiv==pParent->aiOvfl[0] ){ diff --git a/src/pager.c b/src/pager.c index 2b5db2ceb6..dd6aa85d82 100644 --- a/src/pager.c +++ b/src/pager.c @@ -3216,7 +3216,13 @@ static int pagerRollbackWal(Pager *pPager){ ** + Reload page content from the database (if refcount>0). */ pPager->dbSize = pPager->dbOrigSize; - rc = sqlite3WalUndo(pPager->pWal, pagerUndoCallback, (void *)pPager); + rc = sqlite3WalUndo(pPager->pWal, pagerUndoCallback, (void *)pPager, +#ifdef SQLITE_OMIT_CONCURRENT + 0 +#else + pPager->pAllRead!=0 +#endif + ); pList = sqlite3PcacheDirtyList(pPager->pPCache); #ifndef SQLITE_OMIT_CONCURRENT diff --git a/src/sqliteInt.h b/src/sqliteInt.h index 31546da49d..732cc7e755 100644 --- a/src/sqliteInt.h +++ b/src/sqliteInt.h @@ -1558,8 +1558,8 @@ struct sqlite3 { #define SQLITE_LegacyAlter 0x04000000 /* Legacy ALTER TABLE behaviour */ #define SQLITE_NoSchemaError 0x08000000 /* Do not report schema parse errors*/ #define SQLITE_Defensive 0x10000000 /* Input SQL is likely hostile */ +#define SQLITE_NoopUpdate 0x20000000 /* UPDATE operations are no-ops */ -#define SQLITE_NoopUpdate 0x01000000 /* UPDATE operations are no-ops */ /* Flags used only if debugging */ #define HI(X) ((u64)(X)<<32) #ifdef SQLITE_DEBUG diff --git a/src/wal.c b/src/wal.c index 4b22ba42ef..bd92fd95c5 100644 --- a/src/wal.c +++ b/src/wal.c @@ -3258,7 +3258,12 @@ int sqlite3WalEndWriteTransaction(Wal *pWal){ ** Otherwise, if the callback function does not return an error, this ** function returns SQLITE_OK. */ -int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx){ +int sqlite3WalUndo( + Wal *pWal, + int (*xUndo)(void *, Pgno), + void *pUndoCtx, + int bConcurrent /* True if this is a CONCURRENT transaction */ +){ int rc = SQLITE_OK; if( pWal->writeLock ){ Pgno iMax = pWal->hdr.mxFrame; @@ -3268,6 +3273,13 @@ int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx){ ** was in before the client began writing to the database. */ memcpy(&pWal->hdr, (void *)walIndexHdr(pWal), sizeof(WalIndexHdr)); +#ifndef SQLITE_OMIT_CONCURRENT + if( bConcurrent ){ + pWal->hdr.aCksum[0]++; + } +#else + UNUSED_PARAMETER(bConcurrent); +#endif for(iFrame=pWal->hdr.mxFrame+1; ALWAYS(rc==SQLITE_OK) && iFrame<=iMax; diff --git a/src/wal.h b/src/wal.h index 70247f1c2b..9878852c76 100644 --- a/src/wal.h +++ b/src/wal.h @@ -34,7 +34,7 @@ # define sqlite3WalDbsize(y) 0 # define sqlite3WalBeginWriteTransaction(y) 0 # define sqlite3WalEndWriteTransaction(x) 0 -# define sqlite3WalUndo(x,y,z) 0 +# define sqlite3WalUndo(w,x,y,z) 0 # define sqlite3WalSavepoint(y,z) # define sqlite3WalSavepointUndo(y,z) 0 # define sqlite3WalFrames(u,v,w,x,y,z) 0 @@ -83,7 +83,7 @@ int sqlite3WalBeginWriteTransaction(Wal *pWal); int sqlite3WalEndWriteTransaction(Wal *pWal); /* Undo any frames written (but not committed) to the log */ -int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx); +int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx, int); /* Return an integer that records the current (uncommitted) write ** position in the WAL */ diff --git a/test/concfault.test b/test/concfault.test index 6d409c8d74..205e1d93be 100644 --- a/test/concfault.test +++ b/test/concfault.test @@ -82,5 +82,44 @@ do_faultsim_test 1.3 -prep { faultsim_integrity_check } -finish_test +#------------------------------------------------------------------------- +reset_db + +do_execsql_test 2.0 { + PRAGMA auto_vacuum = 0; + PRAGMA journal_mode = wal; + CREATE TABLE t1(a PRIMARY KEY, b); + CREATE TABLE t2(a PRIMARY KEY, b); + INSERT INTO t1 VALUES(randomblob(1000), randomblob(100)); + INSERT INTO t1 SELECT randomblob(1000), randomblob(1000) FROM t1; + INSERT INTO t1 SELECT randomblob(1000), randomblob(1000) FROM t1; + INSERT INTO t1 SELECT randomblob(1000), randomblob(1000) FROM t1; + INSERT INTO t1 SELECT randomblob(1000), randomblob(1000) FROM t1; + DELETE FROM t1 WHERE rowid%2; +} {wal} +faultsim_save_and_close +do_faultsim_test 1 -prep { + faultsim_restore_and_reopen + execsql { + SELECT * FROM t1; + BEGIN CONCURRENT; + INSERT INTO t2 VALUES(1, 2); + } + sqlite3 db2 test.db + execsql { + PRAGMA journal_size_limit = 10000; + INSERT INTO t1 VALUES(randomblob(1000), randomblob(1000)); + } db2 + db2 close +} -body { + execsql { COMMIT } +} -test { + faultsim_test_result {0 {}} + catchsql { ROLLBACK } + set res [catchsql { SELECT count(*) FROM t1 }] + if {$res!="0 9"} { error "expected {0 9} got {$res}" } + faultsim_integrity_check +} + +finish_test diff --git a/test/permutations.test b/test/permutations.test index 91d92bdb23..577cc7bd05 100644 --- a/test/permutations.test +++ b/test/permutations.test @@ -89,6 +89,7 @@ foreach f [glob $testdir/*.test] { lappend alltests [file tail $f] } foreach f [glob -nocomplain \ $testdir/../ext/rtree/*.test \ $testdir/../ext/fts5/test/*.test \ + $testdir/../ext/expert/*.test \ $testdir/../ext/lsm1/test/*.test \ ] { lappend alltests $f -- 2.47.3