From: drh Date: Tue, 1 Mar 2016 14:31:59 +0000 (+0000) Subject: Change the estimated row counts in stat1 to be one-third worst-case and X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=268df2d77f7ba057864d6e636ad8ffa88eeb6e88;p=thirdparty%2Fsqlite.git Change the estimated row counts in stat1 to be one-third worst-case and two-threads average case. FossilOrigin-Name: 21bfd47c4245846b12aeeb7cf0212529e300b878 --- diff --git a/manifest b/manifest index 21cddaa2b2..4997303fa7 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Fix\stest\scases\sto\salign\swith\sthe\simproved\sstats\scomputation. -D 2016-03-01T12:45:08.294 +C Change\sthe\sestimated\srow\scounts\sin\sstat1\sto\sbe\sone-third\sworst-case\sand\ntwo-threads\saverage\scase. +D 2016-03-01T14:31:59.247 F Makefile.in 4e90dc1521879022aa9479268a4cd141d1771142 F Makefile.linux-gcc 7bc79876b875010e8c8f9502eb935ca92aa3c434 F Makefile.msc 4f319afb7c049d40aff7af6e8c4e7cc2ba18e079 @@ -286,7 +286,7 @@ F sqlite.pc.in 42b7bf0d02e08b9e77734a47798d1a55a9e0716b F sqlite3.1 fc7ad8990fc8409983309bb80de8c811a7506786 F sqlite3.pc.in 48fed132e7cb71ab676105d2a4dc77127d8c1f3a F src/alter.c 44e18dfd78e8942d65d3cdaec4de972b5cd9f1f2 -F src/analyze.c 997b0ce0f4efd68b0499f941ebebdd1ec0c71549 +F src/analyze.c dac2f49d4a622091df1964a9ec9de1d2cd0584bb F src/attach.c a3724c64de1099d85e30751213d285752aed9505 F src/auth.c b56c78ebe40a2110fd361379f7e8162d23f92240 F src/backup.c f60f0aa55d25d853ffde53d0b0370a7bb7ee41ce @@ -444,7 +444,7 @@ F test/alter3.test b3568d11c38c4599c92f24242eda34144d78dc10 F test/alter4.test c461150723ac957f3b2214aa0b11552cd72023ec F test/altermalloc.test e81ac9657ed25c6c5bb09bebfa5a047cd8e4acfc F test/amatch1.test b5ae7065f042b7f4c1c922933f4700add50cdb9f -F test/analyze.test 6e7bb2c94df69461b83627bdc3d960b0cb2847cd +F test/analyze.test 3eb35a4af972f98422e5dc0586501b17d103d321 F test/analyze3.test 1dccda46a6c374018af617fba00bfe297a61d442 F test/analyze4.test eff2df19b8dd84529966420f29ea52edc6b56213 F test/analyze5.test 765c4e284aa69ca172772aa940946f55629bc8c4 @@ -1451,7 +1451,7 @@ F vsixtest/vsixtest.tcl 6a9a6ab600c25a91a7acc6293828957a386a8a93 F vsixtest/vsixtest.vcxproj.data 2ed517e100c66dc455b492e1a33350c1b20fbcdc F vsixtest/vsixtest.vcxproj.filters 37e51ffedcdb064aad6ff33b6148725226cd608e F vsixtest/vsixtest_TemporaryKey.pfx e5b1b036facdb453873e7084e1cae9102ccc67a0 -P 421b5b544af734b97e3da47699fa0f82b21617d3 -R d260dd34a084c646db10204191cf68af +P 810967bff6ba262f38a2833e2d4efef4ef00f463 +R cd9cc1e55f6145b7b1e4855011dd5873 U drh -Z 232c83ab04b03227d0856ebcd7ccd2ec +Z 85196c62dcaae63dbb4c33e57a54153c diff --git a/manifest.uuid b/manifest.uuid index 43e84cf5bf..6c06608fd7 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -810967bff6ba262f38a2833e2d4efef4ef00f463 \ No newline at end of file +21bfd47c4245846b12aeeb7cf0212529e300b878 \ No newline at end of file diff --git a/src/analyze.c b/src/analyze.c index 9c3c37b05b..b2b85d3d71 100644 --- a/src/analyze.c +++ b/src/analyze.c @@ -828,31 +828,47 @@ static void statGet( assert( argc==1 ); #endif { - /* Return the value to store in the "stat" column of the sqlite_stat1 + /* Return a value for the "stat" column of the sqlite_stat1 ** table for this index. ** ** The value is a string composed of a list of integers describing - ** the index. The first integer in the list is the total number of - ** entries in the index. There is one additional integer in the list - ** for each indexed column. This additional integer is an estimate of - ** the number of rows matched by a query on the index using - ** a key with the corresponding number of fields. In other words, - ** if the index is on columns (a,b) and the sqlite_stat1 value is - ** "100 10 2", then SQLite estimates that: + ** the index. The first integer is the (estimated) total number + ** entries in the index. There is one additional integer for each + ** column in the index. The first added integer is an estimate of + ** the number of rows that match a single key in the first column of + ** the index. The second added integer is an estimate on of the number + ** of rows that match a single key consisting of the first two columns + ** of the index. And so forth. ** - ** * the index contains 100 rows, - ** * "WHERE a=?" matches 10 rows, and - ** * "WHERE a=? AND b=?" matches 2 rows. + ** For example, for an index on columns (a,b), if the sqlite_stat1.stat + ** values is "100 10 2", that means there are about 100 rows in the + ** index, and that a query against a=$key1 will match about 10 rows + ** and a query against "a=$key1 AND b=$key2" will match about 2 rows. ** - ** A worst-case estimate is used: the maximum number of rows that - ** could be select for any set of query parameters. The worst case - ** is the estimate we want for choosing indexes. + ** Let V be the average number of rows that match a key, and let M + ** be the most number of rows that match the key for any possible value + ** of that key. The estimate is computed as: + ** + ** E = (2*V + M)/3 + ** + ** Consider two indexes. Index X has with 100 values of exactly 0 and + ** 100 singleton values between 1 and 100. Index Y has 200 values + ** evenly distributed between 1 and 20. If only the average (V) is + ** used in the estimate, X would have "200 2" and Y would have "200 10" + ** and so the planner would think X is the more selective index. And + ** X often would be more selective. But when searching for 0, index X + ** would perform badly. To avoid this problem, the M is added into the + ** estimate so that the stat for X is "200 34" and Y is still "200 10". + ** In this way, Y is the preferred index (all else being equal) and + ** the pathological case is avoided. ** ** For deciding whether or not to do a skip-scan, we want to know the - ** average number of rows with the same key. We can approximate this - ** using the (worst case) most number of rows with the same key. But - ** sometimes that approximation can be badly off. In those cases, - ** mark the index as "noskipscan" to avoid suboptimal skip-scan plans. + ** average number of rows (V) with the same key, not the mixed estimate + ** E shown above. Usually E will be close enough. However, if E is + ** large but V is small, that could trick the query planner into thinking + ** that a skip-scan might work well on this index. To avoid that, the + ** "noskipscan" flag is added in cases where the divergence between E + ** and V might mislead the query planner. */ char *z; int i; @@ -867,13 +883,14 @@ static void statGet( sqlite3_snprintf(24, zRet, "%llu", (u64)p->nRow); z = zRet + sqlite3Strlen30(zRet); for(i=0; inKeyCol; i++){ - u64 iVal = p->current.amxEq[i]; + u64 nDistinct = p->current.anDLt[i]; + u64 iMx = p->current.amxEq[i]; /* M: Most rows per key */ + u64 iAvg = (p->nRow+nDistinct)/(nDistinct+1); /* V: Average per key */ + u64 iVal = (iMx+iAvg*2)/3; /* E: The estimate */ sqlite3_snprintf(24, z, " %llu", iVal); z += sqlite3Strlen30(z); assert( p->current.anEq[i] ); - if( iVal>=WHERE_SKIPSCAN_ONSET - && p->current.anDLt[i] > p->nRow/(WHERE_SKIPSCAN_ONSET*2/3) - ){ + if( iVal>=WHERE_SKIPSCAN_ONSET && iAvg<(WHERE_SKIPSCAN_ONSET*2/3) ){ noSkipScan = 1; } } diff --git a/test/analyze.test b/test/analyze.test index 76e12c7395..af277cc835 100644 --- a/test/analyze.test +++ b/test/analyze.test @@ -158,7 +158,7 @@ do_test analyze-3.3 { ANALYZE main; SELECT idx, stat FROM sqlite_stat1 ORDER BY idx; } -} {t1i1 {5 4} t1i2 {5 2} t1i3 {5 4 1}} +} {t1i1 {5 3} t1i2 {5 2} t1i3 {5 3 1}} do_test analyze-3.4 { execsql { CREATE TABLE t2 AS SELECT * FROM t1; @@ -168,27 +168,27 @@ do_test analyze-3.4 { ANALYZE; SELECT idx, stat FROM sqlite_stat1 ORDER BY idx; } -} {t1i1 {5 4} t1i2 {5 2} t1i3 {5 4 1} t2i1 {5 4} t2i2 {5 2} t2i3 {5 4 1}} +} {t1i1 {5 3} t1i2 {5 2} t1i3 {5 3 1} t2i1 {5 3} t2i2 {5 2} t2i3 {5 3 1}} do_test analyze-3.5 { execsql { DROP INDEX t2i3; ANALYZE t1; SELECT idx, stat FROM sqlite_stat1 ORDER BY idx; } -} {t1i1 {5 4} t1i2 {5 2} t1i3 {5 4 1} t2i1 {5 4} t2i2 {5 2}} +} {t1i1 {5 3} t1i2 {5 2} t1i3 {5 3 1} t2i1 {5 3} t2i2 {5 2}} do_test analyze-3.6 { execsql { ANALYZE t2; SELECT idx, stat FROM sqlite_stat1 ORDER BY idx; } -} {t1i1 {5 4} t1i2 {5 2} t1i3 {5 4 1} t2i1 {5 4} t2i2 {5 2}} +} {t1i1 {5 3} t1i2 {5 2} t1i3 {5 3 1} t2i1 {5 3} t2i2 {5 2}} do_test analyze-3.7 { execsql { DROP INDEX t2i2; ANALYZE t2; SELECT idx, stat FROM sqlite_stat1 ORDER BY idx; } -} {t1i1 {5 4} t1i2 {5 2} t1i3 {5 4 1} t2i1 {5 4}} +} {t1i1 {5 3} t1i2 {5 2} t1i3 {5 3 1} t2i1 {5 3}} do_test analyze-3.8 { execsql { CREATE TABLE t3 AS SELECT a, b, rowid AS c, 'hi' AS d FROM t1; @@ -205,7 +205,7 @@ do_test analyze-3.9 { ANALYZE; SELECT idx, stat FROM sqlite_stat1 ORDER BY idx; } -} {t3i1 {5 4} t3i2 {5 4 1 1 1} t3i3 {5 5 2 1 1}} +} {t3i1 {5 3} t3i2 {5 3 1 1 1} t3i3 {5 5 2 1 1}} do_test analyze-3.10 { execsql { @@ -217,19 +217,19 @@ do_test analyze-3.10 { ANALYZE; SELECT idx, stat FROM sqlite_stat1 ORDER BY idx; } -} {{another foolish ' name} {2 1} {foolish ' name} {2 1 1} t3i1 {5 4} t3i2 {5 4 1 1 1} t3i3 {5 5 2 1 1}} +} {{another foolish ' name} {2 1} {foolish ' name} {2 1 1} t3i1 {5 3} t3i2 {5 3 1 1 1} t3i3 {5 5 2 1 1}} do_test analyze-3.11 { execsql { DROP INDEX "foolish ' name"; SELECT idx, stat FROM sqlite_stat1 ORDER BY idx; } -} {{another foolish ' name} {2 1} t3i1 {5 4} t3i2 {5 4 1 1 1} t3i3 {5 5 2 1 1}} +} {{another foolish ' name} {2 1} t3i1 {5 3} t3i2 {5 3 1 1 1} t3i3 {5 5 2 1 1}} do_test analyze-3.11 { execsql { DROP TABLE "silly "" name"; SELECT idx, stat FROM sqlite_stat1 ORDER BY idx; } -} {t3i1 {5 4} t3i2 {5 4 1 1 1} t3i3 {5 5 2 1 1}} +} {t3i1 {5 3} t3i2 {5 3 1 1 1} t3i3 {5 5 2 1 1}} # Try corrupting the sqlite_stat1 table and make sure the # database is still able to function. @@ -249,7 +249,7 @@ do_test analyze-4.0 { ANALYZE; SELECT idx, stat FROM sqlite_stat1 ORDER BY idx; } -} {t3i1 {5 4} t3i2 {5 4 1 1 1} t3i3 {5 5 2 1 1} t4i1 {5 4} t4i2 {5 2}} +} {t3i1 {5 3} t3i2 {5 3 1 1 1} t3i3 {5 5 2 1 1} t4i1 {5 3} t4i2 {5 2}} do_test analyze-4.1 { execsql { PRAGMA writable_schema=on;