More work on the new optimizer. Fewer tests fail now. (CVS 2565)

author drh <drh@noemail.net>

Wed, 27 Jul 2005 20:41:43 +0000 (20:41 +0000)

committer drh <drh@noemail.net>

Wed, 27 Jul 2005 20:41:43 +0000 (20:41 +0000)
author drh <drh@noemail.net>
Wed, 27 Jul 2005 20:41:43 +0000 (20:41 +0000)
committer drh <drh@noemail.net>
Wed, 27 Jul 2005 20:41:43 +0000 (20:41 +0000)
diff --git a/manifest b/manifest

index aa85641fc2a6e6a074ff2cd3eadff80a795e6ea7..7103ccf19b1915bb4f9a484ffcc93f402fba4438 100644 (file)
--- a/manifest
+++ b/manifest
@@ -1,5 +1,5 @@
-C A\snew\soptimizer\sthat\sbreaks\sa\slot\sof\stests.\s\sBut\snone\sof\sthem\scritically,\sI\nthink.\s\sNevertheless,\sthere\sis\sa\slot\sof\swork\sahead\sto\sstabilize\sthe\scode.\s(CVS\s2564)
-D 2005-07-23T22:59:56
+C More\swork\son\sthe\snew\soptimizer.\s\sFewer\stests\sfail\snow.\s(CVS\s2565)
+D 2005-07-27T20:41:44
  F Makefile.in 22ea9c0fe748f591712d8fe3c6d972c6c173a165
  F Makefile.linux-gcc 06be33b2a9ad4f005a5f42b22c4a19dab3cbb5c7
  F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028
@@ -33,7 +33,7 @@ F src/attach.c 3615dbe960cbee4aa5ea300b8a213dad36527b0f
  F src/auth.c 18c5a0befe20f3a58a41e3ddd78f372faeeefe1f
  F src/btree.c ec55bd70052cdd0958f3a0e79ad58d93561acb20
  F src/btree.h 41a71ce027db9ddee72cb43df2316bbe3a1d92af
-F src/build.c b78e95dcfcbbe285969e9563560f3e20a23cc8c0
+F src/build.c 4b6f9e61159733f86cd1589f4c10834ba856d280
  F src/callback.c 0910b611e0c158f107ee3ff86f8a371654971e2b
  F src/date.c 7444b0900a28da77e57e3337a636873cff0ae940
  F src/delete.c be1fc25c9e109cd8cbab42a43ee696263da7c04b
@@ -85,7 +85,7 @@ F src/vdbeapi.c 7f392f0792d1258c958083d7de9eae7c3530c9a6
  F src/vdbeaux.c 3732a86566a6be4da4c606e9334baf3fd98667af
  F src/vdbefifo.c b8805850afe13b43f1de78d58088cb5d66f88e1e
  F src/vdbemem.c da8e8d6f29dd1323f782f000d7cd120027c9ff03
-F src/where.c 3e9f8336bac3bbc829b85381227f1341f3fd4362
+F src/where.c 7757b1731d74364bc56d7da7cb9cfe8c9db1c019
  F tclinstaller.tcl 046e3624671962dc50f0481d7c25b38ef803eb42
  F test/all.test 7f0988442ab811dfa41793b5b550f5828ce316f3
  F test/alter.test 9d6837a3d946b73df692b7cef2a7644d2e2f6bc6
@@ -287,7 +287,7 @@ F www/tclsqlite.tcl 425be741b8ae664f55cb1ef2371aab0a75109cf9
  F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0
  F www/version3.tcl a99cf5f6d8bd4d5537584a2b342f0fb9fa601d8b
  F www/whentouse.tcl 528299b8316726dbcc5548e9aa0648c8b1bd055b
-P 868279c78e056d27b2d1bea81127fe689b2ce478
-R 5bb82d998a20777c2205e35147889290
+P 86ce56ccea8297b1fba2b9ee53b5f1a3f228662f
+R 4972a6e268e2ac2613a7a3e2ab4d4526
  U drh
-Z 9d14e39e2e0ab0d8adb6f266b567831d
+Z 231d7effe54ec2e53a7f03147246d17a
diff --git a/manifest.uuid b/manifest.uuid

index 9d523ca771fa88d5533bc2bd641173794f3b6ec1..ac57d3de046383ea6689ab56bc738c5c544f3e2a 100644 (file)
--- a/manifest.uuid
+++ b/manifest.uuid
@@ -1 +1 @@
-86ce56ccea8297b1fba2b9ee53b5f1a3f228662f
-\ No newline at end of file
+ee3a08e353f563c36e904479393fcb56f96ee975
+\ No newline at end of file
diff --git a/src/build.c b/src/build.c

index 93e94b66798e76d195a0f86800be195e5120d39c..10d21655ca85a80a601f19026bee689d95f67483 100644 (file)
--- a/src/build.c
+++ b/src/build.c
@@ -22,7 +22,7 @@
  **     COMMIT
  **     ROLLBACK
  **
-** $Id: build.c,v 1.336 2005/07/23 22:59:56 drh Exp $
+** $Id: build.c,v 1.337 2005/07/27 20:41:44 drh Exp $
  */
  #include "sqliteInt.h"
  #include <ctype.h>
@@ -2390,15 +2390,47 @@ exit_create_index:
  /*
  ** Fill the Index.aiRowEst[] array with default information - information
  ** to be used when we have no ANALYZE command to run.
+**
+** aiRowEst[0] is supposed to contain the number of elements in the index.
+** Since we do not know, guess 1 million.  aiRowEst[1] is an estimate of the
+** number of rows in the table that match any particular value of the
+** first column of the index.  aiRowEst[2] is an estimate of the number
+** of rows that match any particular combination of the first 2 columns
+** of the index.  And so forth.  It must always be the case that
+**
+**           aiRowEst[N]<=aiRowEst[N-1]
+**           aiRowEst[N]>=1
+**
+** Apart from that, we have little to go on besides intuition as to
+** how aiRowEst[] should be initialized.  The numbers generated here
+** are based on typical values found in actual indices.
  */
  void sqlite3DefaultRowEst(Index *pIdx){
+  int *a = pIdx->aiRowEst;
    int i;
-  int n = pIdx->nColumn;
-  int j = 1000000;
-  int f = (1000000-1-100*(pIdx->onError==OE_None))/n;
-  for(i=0; i<=n; i++, j-=f){
-    assert( j>0 );
-    pIdx->aiRowEst[i] = j;
+  assert( a!=0 );
+  a[0] = 1000000;
+  switch( pIdx->nColumn ){
+    case 1: {
+      a[1] = 20;
+      break;
+    }
+    case 2: {
+      a[1] = 350;
+      a[2] = 20;
+      break;
+    }
+    default: {
+      a[1] = 1250;
+      a[2] = 350;
+      a[3] = 20;
+      for(i=pIdx->nColumn; i>=4; i--){
+        a[i] = 10;
+      }
+    }
+  }
+  if( pIdx->onError!=OE_None ){
+    a[pIdx->nColumn] = 1;
    }
  }
  
diff --git a/src/where.c b/src/where.c

index 2dbcf2ea51576fe80d54c47534d4ff8d2d043b26..28380844fd2dd95f40adeb712cad962119923045 100644 (file)
--- a/src/where.c
+++ b/src/where.c
@@ -16,7 +16,7 @@
  ** so is applicable.  Because this module is responsible for selecting
  ** indices, you might also think of this module as the "query optimizer".
  **
-** $Id: where.c,v 1.152 2005/07/23 22:59:56 drh Exp $
+** $Id: where.c,v 1.153 2005/07/27 20:41:44 drh Exp $
  */
  #include "sqliteInt.h"
  
@@ -544,6 +544,14 @@ static int isSortingIndex(
    nTerm = pOrderBy->nExpr;
    assert( nTerm>0 );
  
+  /* A UNIQUE index that is fully specified is always a sorting
+  ** index.
+  */
+  if( pIdx->onError!=OE_None && nEqCol==pIdx->nColumn ){
+    *pbRev = 0;
+    return 1;
+  }
+
    /* Match terms of the ORDER BY clause against columns of
    ** the index.
    */
@@ -619,6 +627,25 @@ static int sortableByRowid(
    return 0;
  }
  
+/*
+** Prepare a crude estimate of the logarithm of the input value.
+** The results need not be exact.  This is only used for estimating
+** the total cost of performing operations with O(logN) or O(NlogN)
+** complexity.  Because N is just a guess, it is no great tragedy if
+** logN is a little off.
+**
+** We can assume N>=1.0;
+*/
+static double estLog(double N){
+  double logN = 1.0;
+  double x = 10.0;
+  while( N>x ){
+    logN = logN+1.0;
+    x *= 10;
+  }
+  return logN;
+}
+
  /*
  ** Find the best index for accessing a particular table.  Return a pointer
  ** to the index, flags that describe how the index should be used, the
@@ -668,46 +695,64 @@ static double bestIndex(
      *ppIndex = 0;
      bestFlags = WHERE_ROWID_EQ;
      if( pTerm->operator & WO_EQ ){
+      /* Rowid== is always the best pick.  Look no further.  Because only
+      ** a single row is generated, output is always in sorted order */
        *pFlags = WHERE_ROWID_EQ;
        *pnEq = 1;
        if( pOrderBy ) *pFlags |= WHERE_ORDERBY;
        TRACE(("... best is rowid\n"));
        return 0.0;
      }else if( pTerm->operator & WO_LIST ){
+      /* Rowid IN (LIST): cost is NlogN where N is the number of list
+      ** elements.  */
        lowestCost = pTerm->pExpr->pList->nExpr;
+      lowestCost *= estLog(lowestCost);
      }else{
-      lowestCost = 100.0;
+      /* Rowid IN (SELECT): cost is NlogN where N is the number of rows
+      ** in the result of the inner select.  We have no way to estimate
+      ** that value so make a wild guess. */
+      lowestCost = 200.0;
      }
      TRACE(("... rowid IN cost: %g\n", lowestCost));
    }
  
-  /* Check for constraints on a range of rowids or a full table scan.
+  /* Estimate the cost of a table scan.  If we do not know how many
+  ** entries are in the table, use 1 million as a guess.
    */
    pProbe = pSrc->pTab->pIndex;
-  cost = pProbe ? pProbe->aiRowEst[0] : 100000.0;
-  TRACE(("... base cost: %g\n", cost));
+  cost = pProbe ? pProbe->aiRowEst[0] : 1000000.0;
+  TRACE(("... table scan base cost: %g\n", cost));
+  flags = WHERE_ROWID_RANGE;
+
+  /* Check for constraints on a range of rowids in a table scan.
+  */
    pTerm = findTerm(pWC, iCur, -1, notReady, WO_LT|WO_LE|WO_GT|WO_GE, 0);
    if( pTerm ){
-    flags = WHERE_ROWID_RANGE;
      if( findTerm(pWC, iCur, -1, notReady, WO_LT|WO_LE, 0) ){
        flags |= WHERE_TOP_LIMIT;
-      cost *= 0.25;  /* Guess that rowid<EXPR eliminates 75% of the search */
+      cost *= 0.333;  /* Guess that rowid<EXPR eliminates two-thirds of rows */
      }
      if( findTerm(pWC, iCur, -1, notReady, WO_GT|WO_GE, 0) ){
        flags |= WHERE_BTM_LIMIT;
-      cost *= 0.25;  /* Guess that rowid>EXPR eliminates 75% of the search */
+      cost *= 0.333;  /* Guess that rowid>EXPR eliminates two-thirds of rows */
      }
-    TRACE(("... rowid range cost: %g\n", cost));
+    TRACE(("... rowid range reduces cost to %g\n", cost));
    }else{
      flags = 0;
    }
-  if( pOrderBy && sortableByRowid(iCur, pOrderBy, &rev) ){
-    flags |= WHERE_ORDERBY|WHERE_ROWID_RANGE;
-    cost *= 0.5;
-    if( rev ){
-      flags |= WHERE_REVERSE;
+
+  /* If the table scan does not satisfy the ORDER BY clause, increase
+  ** the cost by NlogN to cover the expense of sorting. */
+  if( pOrderBy ){
+    if( sortableByRowid(iCur, pOrderBy, &rev) ){
+      flags |= WHERE_ORDERBY|WHERE_ROWID_RANGE;
+      if( rev ){
+        flags |= WHERE_REVERSE;
+      }
+    }else{
+      cost += cost*estLog(cost);
+      TRACE(("... sorting increases cost to %g\n", cost));
      }
-    TRACE(("... order by reduces cost to %g\n", cost));
    }
    if( cost<lowestCost ){
      lowestCost = cost;
@@ -718,7 +763,7 @@ static double bestIndex(
    */
    for(; pProbe; pProbe=pProbe->pNext){
      int i;                       /* Loop counter */
-    double inMultiplier = 2.0;   /* Includes built-in index lookup penalty */
+    double inMultiplier = 1.0;
  
      TRACE(("... index %s:\n", pProbe->zName));
  
@@ -740,7 +785,7 @@ static double bestIndex(
          }
        }
      }
-    cost = pProbe->aiRowEst[i] * inMultiplier;
+    cost = pProbe->aiRowEst[i] * inMultiplier * estLog(inMultiplier);
      nEq = i;
      TRACE(("...... nEq=%d inMult=%g cost=%g\n", nEq, inMultiplier, cost));
  
@@ -753,30 +798,32 @@ static double bestIndex(
          flags = WHERE_COLUMN_RANGE;
          if( findTerm(pWC, iCur, j, notReady, WO_LT|WO_LE, pProbe) ){
            flags |= WHERE_TOP_LIMIT;
-          cost *= 0.5;
+          cost *= 0.333;
          }
          if( findTerm(pWC, iCur, j, notReady, WO_GT|WO_GE, pProbe) ){
            flags |= WHERE_BTM_LIMIT;
-          cost *= 0.5;
+          cost *= 0.333;
          }
          TRACE(("...... range reduces cost to %g\n", cost));
        }
      }
  
-    /* Reduce the cost substantially if this index can be used to satisfy
-    ** the ORDER BY clause
+    /* Add the additional cost of sorting if that is a factor.
      */
-    if( pOrderBy && (flags & WHERE_COLUMN_IN)==0 &&
+    if( pOrderBy ){
+      if( (flags & WHERE_COLUMN_IN)==0 &&
          isSortingIndex(pParse, pProbe, pSrc->pTab, iCur, pOrderBy, nEq, &rev) ){
-      if( flags==0 ){
-        flags = WHERE_COLUMN_RANGE;
-      }
-      flags |= WHERE_ORDERBY;
-      cost *= 0.5;
-      if( rev ){
-        flags |= WHERE_REVERSE;
+        if( flags==0 ){
+          flags = WHERE_COLUMN_RANGE;
+        }
+        flags |= WHERE_ORDERBY;
+        if( rev ){
+          flags |= WHERE_REVERSE;
+        }
+      }else{
+        cost += cost*estLog(cost);
+        TRACE(("...... orderby reduces cost to %g\n", cost));
        }
-      TRACE(("...... orderby reduces cost to %g\n", cost));
      }
  
      /* Check to see if we can get away with using just the index without
author	drh <drh@noemail.net>
	Wed, 27 Jul 2005 20:41:43 +0000 (20:41 +0000)
committer	drh <drh@noemail.net>
	Wed, 27 Jul 2005 20:41:43 +0000 (20:41 +0000)
manifest		patch \| blob \| blame \| history
manifest.uuid		patch \| blob \| blame \| history
src/build.c		patch \| blob \| blame \| history
src/where.c		patch \| blob \| blame \| history