From: drh <drh@noemail.net>
Date: Tue, 1 Mar 2016 14:31:59 +0000 (+0000)
Subject: Change the estimated row counts in stat1 to be one-third worst-case and
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=268df2d77f7ba057864d6e636ad8ffa88eeb6e88;p=thirdparty%2Fsqlite.git

Change the estimated row counts in stat1 to be one-third worst-case and
two-threads average case.

FossilOrigin-Name: 21bfd47c4245846b12aeeb7cf0212529e300b878
---

diff --git a/manifest b/manifest
index 21cddaa2b2..4997303fa7 100644
--- a/manifest
+++ b/manifest
@@ -1,5 +1,5 @@
-C Fix\stest\scases\sto\salign\swith\sthe\simproved\sstats\scomputation.
-D 2016-03-01T12:45:08.294
+C Change\sthe\sestimated\srow\scounts\sin\sstat1\sto\sbe\sone-third\sworst-case\sand\ntwo-threads\saverage\scase.
+D 2016-03-01T14:31:59.247
 F Makefile.in 4e90dc1521879022aa9479268a4cd141d1771142
 F Makefile.linux-gcc 7bc79876b875010e8c8f9502eb935ca92aa3c434
 F Makefile.msc 4f319afb7c049d40aff7af6e8c4e7cc2ba18e079
@@ -286,7 +286,7 @@ F sqlite.pc.in 42b7bf0d02e08b9e77734a47798d1a55a9e0716b
 F sqlite3.1 fc7ad8990fc8409983309bb80de8c811a7506786
 F sqlite3.pc.in 48fed132e7cb71ab676105d2a4dc77127d8c1f3a
 F src/alter.c 44e18dfd78e8942d65d3cdaec4de972b5cd9f1f2
-F src/analyze.c 997b0ce0f4efd68b0499f941ebebdd1ec0c71549
+F src/analyze.c dac2f49d4a622091df1964a9ec9de1d2cd0584bb
 F src/attach.c a3724c64de1099d85e30751213d285752aed9505
 F src/auth.c b56c78ebe40a2110fd361379f7e8162d23f92240
 F src/backup.c f60f0aa55d25d853ffde53d0b0370a7bb7ee41ce
@@ -444,7 +444,7 @@ F test/alter3.test b3568d11c38c4599c92f24242eda34144d78dc10
 F test/alter4.test c461150723ac957f3b2214aa0b11552cd72023ec
 F test/altermalloc.test e81ac9657ed25c6c5bb09bebfa5a047cd8e4acfc
 F test/amatch1.test b5ae7065f042b7f4c1c922933f4700add50cdb9f
-F test/analyze.test 6e7bb2c94df69461b83627bdc3d960b0cb2847cd
+F test/analyze.test 3eb35a4af972f98422e5dc0586501b17d103d321
 F test/analyze3.test 1dccda46a6c374018af617fba00bfe297a61d442
 F test/analyze4.test eff2df19b8dd84529966420f29ea52edc6b56213
 F test/analyze5.test 765c4e284aa69ca172772aa940946f55629bc8c4
@@ -1451,7 +1451,7 @@ F vsixtest/vsixtest.tcl 6a9a6ab600c25a91a7acc6293828957a386a8a93
 F vsixtest/vsixtest.vcxproj.data 2ed517e100c66dc455b492e1a33350c1b20fbcdc
 F vsixtest/vsixtest.vcxproj.filters 37e51ffedcdb064aad6ff33b6148725226cd608e
 F vsixtest/vsixtest_TemporaryKey.pfx e5b1b036facdb453873e7084e1cae9102ccc67a0
-P 421b5b544af734b97e3da47699fa0f82b21617d3
-R d260dd34a084c646db10204191cf68af
+P 810967bff6ba262f38a2833e2d4efef4ef00f463
+R cd9cc1e55f6145b7b1e4855011dd5873
 U drh
-Z 232c83ab04b03227d0856ebcd7ccd2ec
+Z 85196c62dcaae63dbb4c33e57a54153c
diff --git a/manifest.uuid b/manifest.uuid
index 43e84cf5bf..6c06608fd7 100644
--- a/manifest.uuid
+++ b/manifest.uuid
@@ -1 +1 @@
-810967bff6ba262f38a2833e2d4efef4ef00f463
\ No newline at end of file
+21bfd47c4245846b12aeeb7cf0212529e300b878
\ No newline at end of file
diff --git a/src/analyze.c b/src/analyze.c
index 9c3c37b05b..b2b85d3d71 100644
--- a/src/analyze.c
+++ b/src/analyze.c
@@ -828,31 +828,47 @@ static void statGet(
   assert( argc==1 );
 #endif
   {
-    /* Return the value to store in the "stat" column of the sqlite_stat1
+    /* Return a value for the "stat" column of the sqlite_stat1
     ** table for this index.
     **
     ** The value is a string composed of a list of integers describing 
-    ** the index. The first integer in the list is the total number of 
-    ** entries in the index. There is one additional integer in the list 
-    ** for each indexed column. This additional integer is an estimate of
-    ** the number of rows matched by a query on the index using
-    ** a key with the corresponding number of fields. In other words,
-    ** if the index is on columns (a,b) and the sqlite_stat1 value is 
-    ** "100 10 2", then SQLite estimates that:
+    ** the index. The first integer is the (estimated) total number
+    ** entries in the index. There is one additional integer for each
+    ** column in the index.  The first added integer is an estimate of
+    ** the number of rows that match a single key in the first column of
+    ** the index.  The second added integer is an estimate on of the number
+    ** of rows that match a single key consisting of the first two columns
+    ** of the index.  And so forth.
     **
-    **   * the index contains 100 rows,
-    **   * "WHERE a=?" matches 10 rows, and
-    **   * "WHERE a=? AND b=?" matches 2 rows.
+    ** For example, for an index on columns (a,b), if the sqlite_stat1.stat
+    ** values is "100 10 2", that means there are about 100 rows in the
+    ** index, and that a query against a=$key1 will match about 10 rows
+    ** and a query against "a=$key1 AND b=$key2" will match about 2 rows.
     **
-    ** A worst-case estimate is used:  the maximum number of rows that
-    ** could be select for any set of query parameters.  The worst case
-    ** is the estimate we want for choosing indexes.
+    ** Let V be the average number of rows that match a key, and let M
+    ** be the most number of rows that match the key for any possible value
+    ** of that key.  The estimate is computed as:
+    **
+    **     E = (2*V + M)/3
+    **
+    ** Consider two indexes.  Index X has with 100 values of exactly 0 and 
+    ** 100 singleton values between 1 and 100.  Index Y has 200 values
+    ** evenly distributed between 1 and 20.  If only the average (V) is
+    ** used in the estimate, X would have "200 2" and Y would have "200 10"
+    ** and so the planner would think X is the more selective index.  And
+    ** X often would be more selective.  But when searching for 0, index X
+    ** would perform badly.  To avoid this problem, the M is added into the
+    ** estimate so that the stat for X is "200 34" and Y is still "200 10".
+    ** In this way, Y is the preferred index (all else being equal) and
+    ** the pathological case is avoided.
     **
     ** For deciding whether or not to do a skip-scan, we want to know the
-    ** average number of rows with the same key.  We can approximate this
-    ** using the (worst case) most number of rows with the same key.  But
-    ** sometimes that approximation can be badly off.  In those cases,
-    ** mark the index as "noskipscan" to avoid suboptimal skip-scan plans.
+    ** average number of rows (V) with the same key, not the mixed estimate
+    ** E shown above.  Usually E will be close enough.  However, if E is
+    ** large but V is small, that could trick the query planner into thinking
+    ** that a skip-scan might work well on this index.  To avoid that, the
+    ** "noskipscan" flag is added in cases where the divergence between E
+    ** and V might mislead the query planner.
     */
     char *z;
     int i;
@@ -867,13 +883,14 @@ static void statGet(
     sqlite3_snprintf(24, zRet, "%llu", (u64)p->nRow);
     z = zRet + sqlite3Strlen30(zRet);
     for(i=0; i<p->nKeyCol; i++){
-      u64 iVal = p->current.amxEq[i];
+      u64 nDistinct = p->current.anDLt[i];
+      u64 iMx = p->current.amxEq[i];                /* M: Most rows per key */
+      u64 iAvg = (p->nRow+nDistinct)/(nDistinct+1); /* V: Average per key */
+      u64 iVal = (iMx+iAvg*2)/3;                    /* E: The estimate */
       sqlite3_snprintf(24, z, " %llu", iVal);
       z += sqlite3Strlen30(z);
       assert( p->current.anEq[i] );
-      if( iVal>=WHERE_SKIPSCAN_ONSET
-       && p->current.anDLt[i] > p->nRow/(WHERE_SKIPSCAN_ONSET*2/3)
-      ){
+      if( iVal>=WHERE_SKIPSCAN_ONSET && iAvg<(WHERE_SKIPSCAN_ONSET*2/3) ){
         noSkipScan = 1;
       }
     }
diff --git a/test/analyze.test b/test/analyze.test
index 76e12c7395..af277cc835 100644
--- a/test/analyze.test
+++ b/test/analyze.test
@@ -158,7 +158,7 @@ do_test analyze-3.3 {
     ANALYZE main;
     SELECT idx, stat FROM sqlite_stat1 ORDER BY idx;
   }
-} {t1i1 {5 4} t1i2 {5 2} t1i3 {5 4 1}}
+} {t1i1 {5 3} t1i2 {5 2} t1i3 {5 3 1}}
 do_test analyze-3.4 {
   execsql {
     CREATE TABLE t2 AS SELECT * FROM t1;
@@ -168,27 +168,27 @@ do_test analyze-3.4 {
     ANALYZE;
     SELECT idx, stat FROM sqlite_stat1 ORDER BY idx;
   }
-} {t1i1 {5 4} t1i2 {5 2} t1i3 {5 4 1} t2i1 {5 4} t2i2 {5 2} t2i3 {5 4 1}}
+} {t1i1 {5 3} t1i2 {5 2} t1i3 {5 3 1} t2i1 {5 3} t2i2 {5 2} t2i3 {5 3 1}}
 do_test analyze-3.5 {
   execsql {
     DROP INDEX t2i3;
     ANALYZE t1;
     SELECT idx, stat FROM sqlite_stat1 ORDER BY idx;
   }
-} {t1i1 {5 4} t1i2 {5 2} t1i3 {5 4 1} t2i1 {5 4} t2i2 {5 2}}
+} {t1i1 {5 3} t1i2 {5 2} t1i3 {5 3 1} t2i1 {5 3} t2i2 {5 2}}
 do_test analyze-3.6 {
   execsql {
     ANALYZE t2;
     SELECT idx, stat FROM sqlite_stat1 ORDER BY idx;
   }
-} {t1i1 {5 4} t1i2 {5 2} t1i3 {5 4 1} t2i1 {5 4} t2i2 {5 2}}
+} {t1i1 {5 3} t1i2 {5 2} t1i3 {5 3 1} t2i1 {5 3} t2i2 {5 2}}
 do_test analyze-3.7 {
   execsql {
     DROP INDEX t2i2;
     ANALYZE t2;
     SELECT idx, stat FROM sqlite_stat1 ORDER BY idx;
   }
-} {t1i1 {5 4} t1i2 {5 2} t1i3 {5 4 1} t2i1 {5 4}}
+} {t1i1 {5 3} t1i2 {5 2} t1i3 {5 3 1} t2i1 {5 3}}
 do_test analyze-3.8 {
   execsql {
     CREATE TABLE t3 AS SELECT a, b, rowid AS c, 'hi' AS d FROM t1;
@@ -205,7 +205,7 @@ do_test analyze-3.9 {
     ANALYZE;
     SELECT idx, stat FROM sqlite_stat1 ORDER BY idx;
   }
-} {t3i1 {5 4} t3i2 {5 4 1 1 1} t3i3 {5 5 2 1 1}}
+} {t3i1 {5 3} t3i2 {5 3 1 1 1} t3i3 {5 5 2 1 1}}
 
 do_test analyze-3.10 {
   execsql {
@@ -217,19 +217,19 @@ do_test analyze-3.10 {
     ANALYZE;
     SELECT idx, stat FROM sqlite_stat1 ORDER BY idx;
   }
-} {{another foolish ' name} {2 1} {foolish ' name} {2 1 1} t3i1 {5 4} t3i2 {5 4 1 1 1} t3i3 {5 5 2 1 1}}
+} {{another foolish ' name} {2 1} {foolish ' name} {2 1 1} t3i1 {5 3} t3i2 {5 3 1 1 1} t3i3 {5 5 2 1 1}}
 do_test analyze-3.11 {
   execsql {
     DROP INDEX "foolish ' name";
     SELECT idx, stat FROM sqlite_stat1 ORDER BY idx;
   }
-} {{another foolish ' name} {2 1} t3i1 {5 4} t3i2 {5 4 1 1 1} t3i3 {5 5 2 1 1}}
+} {{another foolish ' name} {2 1} t3i1 {5 3} t3i2 {5 3 1 1 1} t3i3 {5 5 2 1 1}}
 do_test analyze-3.11 {
   execsql {
     DROP TABLE "silly "" name";
     SELECT idx, stat FROM sqlite_stat1 ORDER BY idx;
   }
-} {t3i1 {5 4} t3i2 {5 4 1 1 1} t3i3 {5 5 2 1 1}}
+} {t3i1 {5 3} t3i2 {5 3 1 1 1} t3i3 {5 5 2 1 1}}
 
 # Try corrupting the sqlite_stat1 table and make sure the
 # database is still able to function.
@@ -249,7 +249,7 @@ do_test analyze-4.0 {
     ANALYZE;
     SELECT idx, stat FROM sqlite_stat1 ORDER BY idx;
   }
-} {t3i1 {5 4} t3i2 {5 4 1 1 1} t3i3 {5 5 2 1 1} t4i1 {5 4} t4i2 {5 2}}
+} {t3i1 {5 3} t3i2 {5 3 1 1 1} t3i3 {5 5 2 1 1} t4i1 {5 3} t4i2 {5 2}}
 do_test analyze-4.1 {
   execsql {
     PRAGMA writable_schema=on;