From: dan Date: Sat, 14 Mar 2015 18:59:58 +0000 (+0000) Subject: When estimating the number of rows visited by a range scan for which the keys consist... X-Git-Tag: version-3.8.9~75^2~1 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=a3d0c1365470318410d4993375f34b13a7edc96a;p=thirdparty%2Fsqlite.git When estimating the number of rows visited by a range scan for which the keys consist of more than one field, consider prefixes of stat4 samples as well as the full samples. FossilOrigin-Name: e1caf93c9ad0ee15d42030af95619f212d3fcf9d --- diff --git a/manifest b/manifest index e1d464f1a0..4946d588ee 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Add\stests\sto\sensure\s"PRAGMA\sincremental_vacuum"\sand\s"PRAGMA\sauto_vacuum\s=\sincremental"\shandle\scorrupt\sdatabases\scorrectly. -D 2015-03-13T15:44:36.085 +C When\sestimating\sthe\snumber\sof\srows\svisited\sby\sa\srange\sscan\sfor\swhich\sthe\skeys\sconsist\sof\smore\sthan\sone\sfield,\sconsider\sprefixes\sof\sstat4\ssamples\sas\swell\sas\sthe\sfull\ssamples. +D 2015-03-14T18:59:58.801 F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f F Makefile.in 88a3e6261286db378fdffa1124cad11b3c05f5bb F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23 @@ -307,7 +307,7 @@ F src/vxworks.h c18586c8edc1bddbc15c004fa16aeb1e1342b4fb F src/wal.c 39303f2c9db02a4e422cd8eb2c8760420c6a51fe F src/wal.h df01efe09c5cb8c8e391ff1715cca294f89668a4 F src/walker.c c253b95b4ee44b21c406e2a1052636c31ea27804 -F src/where.c eb141b075776e9864d38f279333e2472a8653202 +F src/where.c 5a4e4ab378dbddeca59ad283c61aa67c6e56a913 F src/whereInt.h cbe4aa57326998d89e7698ca65bb7c28541d483c F test/8_3_names.test ebbb5cd36741350040fd28b432ceadf495be25b2 F test/aggerror.test a867e273ef9e3d7919f03ef4f0e8c0d2767944f2 @@ -327,7 +327,7 @@ F test/analyze5.test 765c4e284aa69ca172772aa940946f55629bc8c4 F test/analyze6.test f1c552ce39cca4ec922a7e4e0e5d0203d6b3281f F test/analyze7.test bb1409afc9e8629e414387ef048b8e0e3e0bdc4f F test/analyze8.test c05a461d0a6b05991106467d0c47480f2e709c82 -F test/analyze9.test 72795c8113604b5dcd47a1498a61d6d7fb5d041a +F test/analyze9.test 2f6cfeae1fcc61cc531bd19f68e1e28fb6edafbf F test/analyzeA.test 3335697f6700c7052295cfd0067fc5b2aacddf9a F test/analyzeB.test 8bf35ee0a548aea831bf56762cb8e7fdb1db083d F test/analyzeC.test 555a6cc388b9818b6eda6df816f01ce0a75d3a93 @@ -1244,7 +1244,10 @@ F tool/vdbe_profile.tcl 67746953071a9f8f2f668b73fe899074e2c6d8c1 F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4 F tool/warnings.sh 0abfd78ceb09b7f7c27c688c8e3fe93268a13b32 F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f -P 5aa522dcb9bfa18d49683f7cc889516984e2bcd2 -R a79445283a8ae7c7f85ae571d23bc239 +P 1c2166cb2a387a0856f41b399c3648bf8c5fce73 +R f0473fc546184f7826d3d60610130f49 +T *branch * stat4-change +T *sym-stat4-change * +T -sym-trunk * U dan -Z d6c136b9ed1be16533c60706d0427e82 +Z e2a2579617ea31b6dadba1e9a71ba744 diff --git a/manifest.uuid b/manifest.uuid index e474b58d5e..02ad1198b1 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -1c2166cb2a387a0856f41b399c3648bf8c5fce73 \ No newline at end of file +e1caf93c9ad0ee15d42030af95619f212d3fcf9d \ No newline at end of file diff --git a/src/where.c b/src/where.c index 8b9060b0a7..6ecc9c33c2 100644 --- a/src/where.c +++ b/src/where.c @@ -1931,11 +1931,14 @@ static int vtabBestIndex(Parse *pParse, Table *pTab, sqlite3_index_info *p){ ** Estimate the location of a particular key among all keys in an ** index. Store the results in aStat as follows: ** -** aStat[0] Est. number of rows less than pVal -** aStat[1] Est. number of rows equal to pVal +** aStat[0] Est. number of rows less than pRec +** aStat[1] Est. number of rows equal to pRec ** ** Return the index of the sample that is the smallest sample that -** is greater than or equal to pRec. +** is greater than or equal to pRec. Note that this index is not an index +** into the aSample[] array - it is an index into a virtual set of samples +** based on the contents of aSample[] and the number of fields in record +** pRec. */ static int whereKeyStats( Parse *pParse, /* Database connection */ @@ -1946,67 +1949,158 @@ static int whereKeyStats( ){ IndexSample *aSample = pIdx->aSample; int iCol; /* Index of required stats in anEq[] etc. */ + int i; /* Index of first sample >= pRec */ + int iSample; /* Smallest sample larger than or equal to pRec */ int iMin = 0; /* Smallest sample not yet tested */ - int i = pIdx->nSample; /* Smallest sample larger than or equal to pRec */ int iTest; /* Next sample to test */ int res; /* Result of comparison operation */ + int nField; /* Number of fields in pRec */ + tRowcnt iLower = 0; /* anLt[] + anEq[] of largest sample pRec is > */ #ifndef SQLITE_DEBUG UNUSED_PARAMETER( pParse ); #endif assert( pRec!=0 ); - iCol = pRec->nField - 1; assert( pIdx->nSample>0 ); - assert( pRec->nField>0 && iColnSampleCol ); + assert( pRec->nField>0 && pRec->nField<=pIdx->nSampleCol ); + + /* Do a binary search to find the first sample greater than or equal + ** to pRec. If pRec contains a single field, the set of samples to search + ** is simply the aSample[] array. If the samples in aSample[] contain more + ** than one fields, all fields following the first are ignored. + ** + ** If pRec contains N fields, where N is more than one, then as well as the + ** samples in aSample[] (truncated to N fields), the search also has to + ** consider prefixes of those samples. For example, if the set of samples + ** in aSample is: + ** + ** aSample[0] = (a, 5) + ** aSample[1] = (a, 10) + ** aSample[2] = (b, 5) + ** aSample[3] = (c, 100) + ** aSample[4] = (c, 105) + ** + ** Then the search space should ideally be the samples above and the + ** unique prefixes [a], [b] and [c]. But since that is hard to organize, + ** the code actually searches this set: + ** + ** 0: (a) + ** 1: (a, 5) + ** 2: (a, 10) + ** 3: (a, 10) + ** 4: (b) + ** 5: (b, 5) + ** 6: (c) + ** 7: (c, 100) + ** 8: (c, 105) + ** 9: (c, 105) + ** + ** For each sample in the aSample[] array, N samples are present in the + ** effective sample array. In the above, samples 0 and 1 are based on + ** sample aSample[0]. Samples 2 and 3 on aSample[1] etc. + ** + ** Often, sample i of each block of N effective samples has (i+1) fields. + ** Except, each sample may be extended to ensure that it is greater than or + ** equal to the previous sample in the array. For example, in the above, + ** sample 2 is the first sample of a block of N samples, so at first it + ** appears that it should be 1 field in size. However, that would make it + ** smaller than sample 1, so the binary search would not work. As a result, + ** it is extended to two fields. The duplicates that this creates do not + ** cause any problems. + */ + nField = pRec->nField; + iCol = 0; + iSample = pIdx->nSample * nField; do{ - iTest = (iMin+i)/2; - res = sqlite3VdbeRecordCompare(aSample[iTest].n, aSample[iTest].p, pRec); + int iSamp; /* Index in aSample[] of test sample */ + int n; /* Number of fields in test sample */ + + iTest = (iMin+iSample)/2; + iSamp = iTest / nField; + if( iSamp>0 ){ + /* The proposed effective sample is a prefix of sample aSample[iSamp]. + ** Specifically, the shortest prefix of at least (1 + iTest%nField) + ** fields that is greater than the previous effective sample. */ + for(n=(iTest % nField) + 1; nnField = n; + res = sqlite3VdbeRecordCompare(aSample[iSamp].n, aSample[iSamp].p, pRec); if( res<0 ){ + iLower = aSample[iSamp].anLt[n-1] + aSample[iSamp].anEq[n-1]; + iMin = iTest+1; + }else if( res==0 && nnSample ); - assert( 0==sqlite3VdbeRecordCompare(aSample[i].n, aSample[i].p, pRec) - || pParse->db->mallocFailed ); - }else{ - /* Otherwise, pRec must be smaller than sample $i and larger than - ** sample ($i-1). */ - assert( i==pIdx->nSample - || sqlite3VdbeRecordCompare(aSample[i].n, aSample[i].p, pRec)>0 - || pParse->db->mallocFailed ); - assert( i==0 - || sqlite3VdbeRecordCompare(aSample[i-1].n, aSample[i-1].p, pRec)<0 - || pParse->db->mallocFailed ); + if( pParse->db->mallocFailed==0 ){ + if( res==0 ){ + /* If (res==0) is true, then pRec must be equal to sample i. */ + assert( inSample ); + assert( iCol==nField-1 ); + pRec->nField = nField; + assert( 0==sqlite3VdbeRecordCompare(aSample[i].n, aSample[i].p, pRec) + || pParse->db->mallocFailed + ); + }else{ + /* Unless i==pIdx->nSample, indicating that pRec is larger than + ** all samples in the aSample[] array, pRec must be smaller than the + ** (iCol+1) field prefix of sample i. */ + assert( i<=pIdx->nSample && i>=0 ); + pRec->nField = iCol+1; + assert( i==pIdx->nSample + || sqlite3VdbeRecordCompare(aSample[i].n, aSample[i].p, pRec)>0 + || pParse->db->mallocFailed ); + + /* if i==0 and iCol==0, then record pRec is smaller than all samples + ** in the aSample[] array. Otherwise, if (iCol>0) then pRec must + ** be greater than or equal to the (iCol) field prefix of sample i. + ** If (i>0), then pRec must also be greater than sample (i-1). */ + if( iCol>0 ){ + pRec->nField = iCol; + assert( sqlite3VdbeRecordCompare(aSample[i].n, aSample[i].p, pRec)<=0 + || pParse->db->mallocFailed ); + } + if( i>0 ){ + pRec->nField = nField; + assert( sqlite3VdbeRecordCompare(aSample[i-1].n, aSample[i-1].p, pRec)<0 + || pParse->db->mallocFailed ); + } + } } #endif /* ifdef SQLITE_DEBUG */ - /* At this point, aSample[i] is the first sample that is greater than - ** or equal to pVal. Or if i==pIdx->nSample, then all samples are less - ** than pVal. If aSample[i]==pVal, then res==0. - */ if( res==0 ){ + /* Record pRec is equal to sample i */ + assert( iCol==nField-1 ); aStat[0] = aSample[i].anLt[iCol]; aStat[1] = aSample[i].anEq[iCol]; }else{ - tRowcnt iLower, iUpper, iGap; - if( i==0 ){ - iLower = 0; - iUpper = aSample[0].anLt[iCol]; + /* At this point, the (iCol+1) field prefix of aSample[i] is the first + ** sample that is greater than pRec. Or, if i==pIdx->nSample then pRec + ** is larger than all samples in the array. */ + tRowcnt iUpper, iGap; + if( i>=pIdx->nSample ){ + iUpper = sqlite3LogEstToInt(pIdx->aiRowLogEst[0]); }else{ - i64 nRow0 = sqlite3LogEstToInt(pIdx->aiRowLogEst[0]); - iUpper = i>=pIdx->nSample ? nRow0 : aSample[i].anLt[iCol]; - iLower = aSample[i-1].anEq[iCol] + aSample[i-1].anLt[iCol]; + iUpper = aSample[i].anLt[iCol]; } - aStat[1] = pIdx->aAvgEq[iCol]; + if( iLower>=iUpper ){ iGap = 0; }else{ @@ -2018,7 +2112,11 @@ static int whereKeyStats( iGap = iGap/3; } aStat[0] = iLower + iGap; + aStat[1] = pIdx->aAvgEq[iCol]; } + + /* Restore the pRec->nField value before returning. */ + pRec->nField = nField; return i; } #endif /* SQLITE_ENABLE_STAT3_OR_STAT4 */ diff --git a/test/analyze9.test b/test/analyze9.test index 8572cbea00..6e85272ad6 100644 --- a/test/analyze9.test +++ b/test/analyze9.test @@ -1134,4 +1134,67 @@ ifcapable stat4&&cte { } } +#------------------------------------------------------------------------- +# Check that a problem in they way stat4 data is used has been +# resolved (see below). +# +reset_db +do_test 26.1 { + db transaction { + execsql { + CREATE TABLE t1(x, y, z); + CREATE INDEX t1xy ON t1(x, y); + CREATE INDEX t1z ON t1(z); + } + for {set i 0} {$i < 10000} {incr i} { + execsql { INSERT INTO t1(x, y) VALUES($i, $i) } + } + for {set i 0} {$i < 10} {incr i} { + execsql { + WITH cnt(x) AS (SELECT 1 UNION ALL SELECT x+1 FROM cnt WHERE x<100) + INSERT INTO t1(x, y) SELECT 10000+$i, x FROM cnt; + INSERT INTO t1(x, y) SELECT 10000+$i, 100; + } + } + execsql { + UPDATE t1 SET z = rowid / 20; + ANALYZE; + } + } +} {} + +do_execsql_test 26.2 { + SELECT count(*) FROM t1 WHERE x = 10000 AND y < 50; +} {49} +do_execsql_test 26.3 { + SELECT count(*) FROM t1 WHERE z = 444; +} {20} + +# The analyzer knows that any (z=?) expression matches 20 rows. So it +# will use index "t1z" if the estimate of hits for (x=10000 AND y<50) +# is greater than 20 rows. +# +# And it should be. The analyzer has a stat4 sample as follows: +# +# sample=(x=10000, y=100) nLt=(10000 10099) +# +# There should be no other samples that start with (x=10000). So it knows +# that (x=10000 AND y<50) must match somewhere between 0 and 99 rows, but +# know more than that. Guessing less than 20 is therefore unreasonable. +# +# At one point though, due to a problem in whereKeyStats(), the planner was +# estimating that (x=10000 AND y<50) would match only 2 rows. +# +do_eqp_test 26.4 { + SELECT * FROM t1 WHERE x = 10000 AND y < 50 AND z = 444; +} { + 0 0 0 {SEARCH TABLE t1 USING INDEX t1z (z=?)} +} + + + + finish_test + + +