From: drh <drh@noemail.net>
Date: Wed, 27 Jul 2005 20:41:43 +0000 (+0000)
Subject: More work on the new optimizer.  Fewer tests fail now. (CVS 2565)
X-Git-Tag: version-3.6.10~3594
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=28c4cf42daf4ee3bcdb0543d4531b571a9c42c7a;p=thirdparty%2Fsqlite.git

More work on the new optimizer.  Fewer tests fail now. (CVS 2565)

FossilOrigin-Name: ee3a08e353f563c36e904479393fcb56f96ee975
---

diff --git a/manifest b/manifest
index aa85641fc2..7103ccf19b 100644
--- a/manifest
+++ b/manifest
@@ -1,5 +1,5 @@
-C A\snew\soptimizer\sthat\sbreaks\sa\slot\sof\stests.\s\sBut\snone\sof\sthem\scritically,\sI\nthink.\s\sNevertheless,\sthere\sis\sa\slot\sof\swork\sahead\sto\sstabilize\sthe\scode.\s(CVS\s2564)
-D 2005-07-23T22:59:56
+C More\swork\son\sthe\snew\soptimizer.\s\sFewer\stests\sfail\snow.\s(CVS\s2565)
+D 2005-07-27T20:41:44
 F Makefile.in 22ea9c0fe748f591712d8fe3c6d972c6c173a165
 F Makefile.linux-gcc 06be33b2a9ad4f005a5f42b22c4a19dab3cbb5c7
 F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028
@@ -33,7 +33,7 @@ F src/attach.c 3615dbe960cbee4aa5ea300b8a213dad36527b0f
 F src/auth.c 18c5a0befe20f3a58a41e3ddd78f372faeeefe1f
 F src/btree.c ec55bd70052cdd0958f3a0e79ad58d93561acb20
 F src/btree.h 41a71ce027db9ddee72cb43df2316bbe3a1d92af
-F src/build.c b78e95dcfcbbe285969e9563560f3e20a23cc8c0
+F src/build.c 4b6f9e61159733f86cd1589f4c10834ba856d280
 F src/callback.c 0910b611e0c158f107ee3ff86f8a371654971e2b
 F src/date.c 7444b0900a28da77e57e3337a636873cff0ae940
 F src/delete.c be1fc25c9e109cd8cbab42a43ee696263da7c04b
@@ -85,7 +85,7 @@ F src/vdbeapi.c 7f392f0792d1258c958083d7de9eae7c3530c9a6
 F src/vdbeaux.c 3732a86566a6be4da4c606e9334baf3fd98667af
 F src/vdbefifo.c b8805850afe13b43f1de78d58088cb5d66f88e1e
 F src/vdbemem.c da8e8d6f29dd1323f782f000d7cd120027c9ff03
-F src/where.c 3e9f8336bac3bbc829b85381227f1341f3fd4362
+F src/where.c 7757b1731d74364bc56d7da7cb9cfe8c9db1c019
 F tclinstaller.tcl 046e3624671962dc50f0481d7c25b38ef803eb42
 F test/all.test 7f0988442ab811dfa41793b5b550f5828ce316f3
 F test/alter.test 9d6837a3d946b73df692b7cef2a7644d2e2f6bc6
@@ -287,7 +287,7 @@ F www/tclsqlite.tcl 425be741b8ae664f55cb1ef2371aab0a75109cf9
 F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0
 F www/version3.tcl a99cf5f6d8bd4d5537584a2b342f0fb9fa601d8b
 F www/whentouse.tcl 528299b8316726dbcc5548e9aa0648c8b1bd055b
-P 868279c78e056d27b2d1bea81127fe689b2ce478
-R 5bb82d998a20777c2205e35147889290
+P 86ce56ccea8297b1fba2b9ee53b5f1a3f228662f
+R 4972a6e268e2ac2613a7a3e2ab4d4526
 U drh
-Z 9d14e39e2e0ab0d8adb6f266b567831d
+Z 231d7effe54ec2e53a7f03147246d17a
diff --git a/manifest.uuid b/manifest.uuid
index 9d523ca771..ac57d3de04 100644
--- a/manifest.uuid
+++ b/manifest.uuid
@@ -1 +1 @@
-86ce56ccea8297b1fba2b9ee53b5f1a3f228662f
\ No newline at end of file
+ee3a08e353f563c36e904479393fcb56f96ee975
\ No newline at end of file
diff --git a/src/build.c b/src/build.c
index 93e94b6679..10d21655ca 100644
--- a/src/build.c
+++ b/src/build.c
@@ -22,7 +22,7 @@
 **     COMMIT
 **     ROLLBACK
 **
-** $Id: build.c,v 1.336 2005/07/23 22:59:56 drh Exp $
+** $Id: build.c,v 1.337 2005/07/27 20:41:44 drh Exp $
 */
 #include "sqliteInt.h"
 #include <ctype.h>
@@ -2390,15 +2390,47 @@ exit_create_index:
 /*
 ** Fill the Index.aiRowEst[] array with default information - information
 ** to be used when we have no ANALYZE command to run.
+**
+** aiRowEst[0] is supposed to contain the number of elements in the index.
+** Since we do not know, guess 1 million.  aiRowEst[1] is an estimate of the
+** number of rows in the table that match any particular value of the
+** first column of the index.  aiRowEst[2] is an estimate of the number
+** of rows that match any particular combination of the first 2 columns
+** of the index.  And so forth.  It must always be the case that
+**
+**           aiRowEst[N]<=aiRowEst[N-1]
+**           aiRowEst[N]>=1
+**
+** Apart from that, we have little to go on besides intuition as to
+** how aiRowEst[] should be initialized.  The numbers generated here
+** are based on typical values found in actual indices.
 */
 void sqlite3DefaultRowEst(Index *pIdx){
+  int *a = pIdx->aiRowEst;
   int i;
-  int n = pIdx->nColumn;
-  int j = 1000000;
-  int f = (1000000-1-100*(pIdx->onError==OE_None))/n;
-  for(i=0; i<=n; i++, j-=f){
-    assert( j>0 );
-    pIdx->aiRowEst[i] = j;
+  assert( a!=0 );
+  a[0] = 1000000;
+  switch( pIdx->nColumn ){
+    case 1: {
+      a[1] = 20;
+      break;
+    }
+    case 2: {
+      a[1] = 350;
+      a[2] = 20;
+      break;
+    }
+    default: {
+      a[1] = 1250;
+      a[2] = 350;
+      a[3] = 20;
+      for(i=pIdx->nColumn; i>=4; i--){
+        a[i] = 10;
+      }
+    }
+  }
+  if( pIdx->onError!=OE_None ){
+    a[pIdx->nColumn] = 1;
   }
 }
 
diff --git a/src/where.c b/src/where.c
index 2dbcf2ea51..28380844fd 100644
--- a/src/where.c
+++ b/src/where.c
@@ -16,7 +16,7 @@
 ** so is applicable.  Because this module is responsible for selecting
 ** indices, you might also think of this module as the "query optimizer".
 **
-** $Id: where.c,v 1.152 2005/07/23 22:59:56 drh Exp $
+** $Id: where.c,v 1.153 2005/07/27 20:41:44 drh Exp $
 */
 #include "sqliteInt.h"
 
@@ -544,6 +544,14 @@ static int isSortingIndex(
   nTerm = pOrderBy->nExpr;
   assert( nTerm>0 );
 
+  /* A UNIQUE index that is fully specified is always a sorting
+  ** index.
+  */
+  if( pIdx->onError!=OE_None && nEqCol==pIdx->nColumn ){
+    *pbRev = 0;
+    return 1;
+  }
+
   /* Match terms of the ORDER BY clause against columns of
   ** the index.
   */
@@ -619,6 +627,25 @@ static int sortableByRowid(
   return 0;
 }
 
+/*
+** Prepare a crude estimate of the logarithm of the input value.
+** The results need not be exact.  This is only used for estimating
+** the total cost of performing operations with O(logN) or O(NlogN)
+** complexity.  Because N is just a guess, it is no great tragedy if
+** logN is a little off.
+**
+** We can assume N>=1.0;
+*/
+static double estLog(double N){
+  double logN = 1.0;
+  double x = 10.0;
+  while( N>x ){
+    logN = logN+1.0;
+    x *= 10;
+  }
+  return logN;
+}
+
 /*
 ** Find the best index for accessing a particular table.  Return a pointer
 ** to the index, flags that describe how the index should be used, the
@@ -668,46 +695,64 @@ static double bestIndex(
     *ppIndex = 0;
     bestFlags = WHERE_ROWID_EQ;
     if( pTerm->operator & WO_EQ ){
+      /* Rowid== is always the best pick.  Look no further.  Because only
+      ** a single row is generated, output is always in sorted order */
       *pFlags = WHERE_ROWID_EQ;
       *pnEq = 1;
       if( pOrderBy ) *pFlags |= WHERE_ORDERBY;
       TRACE(("... best is rowid\n"));
       return 0.0;
     }else if( pTerm->operator & WO_LIST ){
+      /* Rowid IN (LIST): cost is NlogN where N is the number of list
+      ** elements.  */
       lowestCost = pTerm->pExpr->pList->nExpr;
+      lowestCost *= estLog(lowestCost);
     }else{
-      lowestCost = 100.0;
+      /* Rowid IN (SELECT): cost is NlogN where N is the number of rows
+      ** in the result of the inner select.  We have no way to estimate
+      ** that value so make a wild guess. */
+      lowestCost = 200.0;
     }
     TRACE(("... rowid IN cost: %g\n", lowestCost));
   }
 
-  /* Check for constraints on a range of rowids or a full table scan.
+  /* Estimate the cost of a table scan.  If we do not know how many
+  ** entries are in the table, use 1 million as a guess.
   */
   pProbe = pSrc->pTab->pIndex;
-  cost = pProbe ? pProbe->aiRowEst[0] : 100000.0;
-  TRACE(("... base cost: %g\n", cost));
+  cost = pProbe ? pProbe->aiRowEst[0] : 1000000.0;
+  TRACE(("... table scan base cost: %g\n", cost));
+  flags = WHERE_ROWID_RANGE;
+
+  /* Check for constraints on a range of rowids in a table scan.
+  */
   pTerm = findTerm(pWC, iCur, -1, notReady, WO_LT|WO_LE|WO_GT|WO_GE, 0);
   if( pTerm ){
-    flags = WHERE_ROWID_RANGE;
     if( findTerm(pWC, iCur, -1, notReady, WO_LT|WO_LE, 0) ){
       flags |= WHERE_TOP_LIMIT;
-      cost *= 0.25;  /* Guess that rowid<EXPR eliminates 75% of the search */
+      cost *= 0.333;  /* Guess that rowid<EXPR eliminates two-thirds of rows */
     }
     if( findTerm(pWC, iCur, -1, notReady, WO_GT|WO_GE, 0) ){
       flags |= WHERE_BTM_LIMIT;
-      cost *= 0.25;  /* Guess that rowid>EXPR eliminates 75% of the search */
+      cost *= 0.333;  /* Guess that rowid>EXPR eliminates two-thirds of rows */
     }
-    TRACE(("... rowid range cost: %g\n", cost));
+    TRACE(("... rowid range reduces cost to %g\n", cost));
   }else{
     flags = 0;
   }
-  if( pOrderBy && sortableByRowid(iCur, pOrderBy, &rev) ){
-    flags |= WHERE_ORDERBY|WHERE_ROWID_RANGE;
-    cost *= 0.5;
-    if( rev ){
-      flags |= WHERE_REVERSE;
+
+  /* If the table scan does not satisfy the ORDER BY clause, increase
+  ** the cost by NlogN to cover the expense of sorting. */
+  if( pOrderBy ){
+    if( sortableByRowid(iCur, pOrderBy, &rev) ){
+      flags |= WHERE_ORDERBY|WHERE_ROWID_RANGE;
+      if( rev ){
+        flags |= WHERE_REVERSE;
+      }
+    }else{
+      cost += cost*estLog(cost);
+      TRACE(("... sorting increases cost to %g\n", cost));
     }
-    TRACE(("... order by reduces cost to %g\n", cost));
   }
   if( cost<lowestCost ){
     lowestCost = cost;
@@ -718,7 +763,7 @@ static double bestIndex(
   */
   for(; pProbe; pProbe=pProbe->pNext){
     int i;                       /* Loop counter */
-    double inMultiplier = 2.0;   /* Includes built-in index lookup penalty */
+    double inMultiplier = 1.0;
 
     TRACE(("... index %s:\n", pProbe->zName));
 
@@ -740,7 +785,7 @@ static double bestIndex(
         }
       }
     }
-    cost = pProbe->aiRowEst[i] * inMultiplier;
+    cost = pProbe->aiRowEst[i] * inMultiplier * estLog(inMultiplier);
     nEq = i;
     TRACE(("...... nEq=%d inMult=%g cost=%g\n", nEq, inMultiplier, cost));
 
@@ -753,30 +798,32 @@ static double bestIndex(
         flags = WHERE_COLUMN_RANGE;
         if( findTerm(pWC, iCur, j, notReady, WO_LT|WO_LE, pProbe) ){
           flags |= WHERE_TOP_LIMIT;
-          cost *= 0.5;
+          cost *= 0.333;
         }
         if( findTerm(pWC, iCur, j, notReady, WO_GT|WO_GE, pProbe) ){
           flags |= WHERE_BTM_LIMIT;
-          cost *= 0.5;
+          cost *= 0.333;
         }
         TRACE(("...... range reduces cost to %g\n", cost));
       }
     }
 
-    /* Reduce the cost substantially if this index can be used to satisfy
-    ** the ORDER BY clause
+    /* Add the additional cost of sorting if that is a factor.
     */
-    if( pOrderBy && (flags & WHERE_COLUMN_IN)==0 &&
+    if( pOrderBy ){
+      if( (flags & WHERE_COLUMN_IN)==0 &&
         isSortingIndex(pParse, pProbe, pSrc->pTab, iCur, pOrderBy, nEq, &rev) ){
-      if( flags==0 ){
-        flags = WHERE_COLUMN_RANGE;
-      }
-      flags |= WHERE_ORDERBY;
-      cost *= 0.5;
-      if( rev ){
-        flags |= WHERE_REVERSE;
+        if( flags==0 ){
+          flags = WHERE_COLUMN_RANGE;
+        }
+        flags |= WHERE_ORDERBY;
+        if( rev ){
+          flags |= WHERE_REVERSE;
+        }
+      }else{
+        cost += cost*estLog(cost);
+        TRACE(("...... orderby reduces cost to %g\n", cost));
       }
-      TRACE(("...... orderby reduces cost to %g\n", cost));
     }
 
     /* Check to see if we can get away with using just the index without