From: drh Date: Wed, 27 Jul 2005 20:41:43 +0000 (+0000) Subject: More work on the new optimizer. Fewer tests fail now. (CVS 2565) X-Git-Tag: version-3.6.10~3594 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=28c4cf42daf4ee3bcdb0543d4531b571a9c42c7a;p=thirdparty%2Fsqlite.git More work on the new optimizer. Fewer tests fail now. (CVS 2565) FossilOrigin-Name: ee3a08e353f563c36e904479393fcb56f96ee975 --- diff --git a/manifest b/manifest index aa85641fc2..7103ccf19b 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C A\snew\soptimizer\sthat\sbreaks\sa\slot\sof\stests.\s\sBut\snone\sof\sthem\scritically,\sI\nthink.\s\sNevertheless,\sthere\sis\sa\slot\sof\swork\sahead\sto\sstabilize\sthe\scode.\s(CVS\s2564) -D 2005-07-23T22:59:56 +C More\swork\son\sthe\snew\soptimizer.\s\sFewer\stests\sfail\snow.\s(CVS\s2565) +D 2005-07-27T20:41:44 F Makefile.in 22ea9c0fe748f591712d8fe3c6d972c6c173a165 F Makefile.linux-gcc 06be33b2a9ad4f005a5f42b22c4a19dab3cbb5c7 F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028 @@ -33,7 +33,7 @@ F src/attach.c 3615dbe960cbee4aa5ea300b8a213dad36527b0f F src/auth.c 18c5a0befe20f3a58a41e3ddd78f372faeeefe1f F src/btree.c ec55bd70052cdd0958f3a0e79ad58d93561acb20 F src/btree.h 41a71ce027db9ddee72cb43df2316bbe3a1d92af -F src/build.c b78e95dcfcbbe285969e9563560f3e20a23cc8c0 +F src/build.c 4b6f9e61159733f86cd1589f4c10834ba856d280 F src/callback.c 0910b611e0c158f107ee3ff86f8a371654971e2b F src/date.c 7444b0900a28da77e57e3337a636873cff0ae940 F src/delete.c be1fc25c9e109cd8cbab42a43ee696263da7c04b @@ -85,7 +85,7 @@ F src/vdbeapi.c 7f392f0792d1258c958083d7de9eae7c3530c9a6 F src/vdbeaux.c 3732a86566a6be4da4c606e9334baf3fd98667af F src/vdbefifo.c b8805850afe13b43f1de78d58088cb5d66f88e1e F src/vdbemem.c da8e8d6f29dd1323f782f000d7cd120027c9ff03 -F src/where.c 3e9f8336bac3bbc829b85381227f1341f3fd4362 +F src/where.c 7757b1731d74364bc56d7da7cb9cfe8c9db1c019 F tclinstaller.tcl 046e3624671962dc50f0481d7c25b38ef803eb42 F test/all.test 
7f0988442ab811dfa41793b5b550f5828ce316f3 F test/alter.test 9d6837a3d946b73df692b7cef2a7644d2e2f6bc6 @@ -287,7 +287,7 @@ F www/tclsqlite.tcl 425be741b8ae664f55cb1ef2371aab0a75109cf9 F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0 F www/version3.tcl a99cf5f6d8bd4d5537584a2b342f0fb9fa601d8b F www/whentouse.tcl 528299b8316726dbcc5548e9aa0648c8b1bd055b -P 868279c78e056d27b2d1bea81127fe689b2ce478 -R 5bb82d998a20777c2205e35147889290 +P 86ce56ccea8297b1fba2b9ee53b5f1a3f228662f +R 4972a6e268e2ac2613a7a3e2ab4d4526 U drh -Z 9d14e39e2e0ab0d8adb6f266b567831d +Z 231d7effe54ec2e53a7f03147246d17a diff --git a/manifest.uuid b/manifest.uuid index 9d523ca771..ac57d3de04 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -86ce56ccea8297b1fba2b9ee53b5f1a3f228662f \ No newline at end of file +ee3a08e353f563c36e904479393fcb56f96ee975 \ No newline at end of file diff --git a/src/build.c b/src/build.c index 93e94b6679..10d21655ca 100644 --- a/src/build.c +++ b/src/build.c @@ -22,7 +22,7 @@ ** COMMIT ** ROLLBACK ** -** $Id: build.c,v 1.336 2005/07/23 22:59:56 drh Exp $ +** $Id: build.c,v 1.337 2005/07/27 20:41:44 drh Exp $ */ #include "sqliteInt.h" #include @@ -2390,15 +2390,47 @@ exit_create_index: /* ** Fill the Index.aiRowEst[] array with default information - information ** to be used when we have no ANALYZE command to run. +** +** aiRowEst[0] is supposed to contain the number of elements in the index. +** Since we do not know, guess 1 million. aiRowEst[1] is an estimate of the +** number of rows in the table that match any particular value of the +** first column of the index. aiRowEst[2] is an estimate of the number +** of rows that match any particular combination of the first 2 columns +** of the index. And so forth. It must always be the case that +** +** aiRowEst[N]<=aiRowEst[N-1] +** aiRowEst[N]>=1 +** +** Apart from that, we have little to go on besides intuition as to +** how aiRowEst[] should be initialized.
The numbers generated here +** are based on typical values found in actual indices. */ void sqlite3DefaultRowEst(Index *pIdx){ + int *a = pIdx->aiRowEst; int i; - int n = pIdx->nColumn; - int j = 1000000; - int f = (1000000-1-100*(pIdx->onError==OE_None))/n; - for(i=0; i<=n; i++, j-=f){ - assert( j>0 ); - pIdx->aiRowEst[i] = j; + assert( a!=0 ); + a[0] = 1000000; + switch( pIdx->nColumn ){ + case 1: { + a[1] = 20; + break; + } + case 2: { + a[1] = 350; + a[2] = 20; + break; + } + default: { + a[1] = 1250; + a[2] = 350; + a[3] = 20; + for(i=pIdx->nColumn; i>=4; i--){ + a[i] = 10; + } + } + } + if( pIdx->onError!=OE_None ){ + a[pIdx->nColumn] = 1; } } diff --git a/src/where.c b/src/where.c index 2dbcf2ea51..28380844fd 100644 --- a/src/where.c +++ b/src/where.c @@ -16,7 +16,7 @@ ** so is applicable. Because this module is responsible for selecting ** indices, you might also think of this module as the "query optimizer". ** -** $Id: where.c,v 1.152 2005/07/23 22:59:56 drh Exp $ +** $Id: where.c,v 1.153 2005/07/27 20:41:44 drh Exp $ */ #include "sqliteInt.h" @@ -544,6 +544,14 @@ static int isSortingIndex( nTerm = pOrderBy->nExpr; assert( nTerm>0 ); + /* A UNIQUE index that is fully specified is always a sorting + ** index. + */ + if( pIdx->onError!=OE_None && nEqCol==pIdx->nColumn ){ + *pbRev = 0; + return 1; + } + /* Match terms of the ORDER BY clause against columns of ** the index. */ @@ -619,6 +627,25 @@ static int sortableByRowid( return 0; } +/* +** Prepare a crude estimate of the logarithm of the input value. +** The results need not be exact. This is only used for estimating +** the total cost of performing operations with O(logN) or O(NlogN) +** complexity. Because N is just a guess, it is no great tragedy if +** logN is a little off.
+** +** We can assume N>=1.0; +*/ +static double estLog(double N){ + double logN = 1.0; + double x = 10.0; + while( N>x ){ + logN = logN+1.0; + x *= 10; + } + return logN; +} + /* ** Find the best index for accessing a particular table. Return a pointer ** to the index, flags that describe how the index should be used, the @@ -668,46 +695,64 @@ static double bestIndex( *ppIndex = 0; bestFlags = WHERE_ROWID_EQ; if( pTerm->operator & WO_EQ ){ + /* Rowid== is always the best pick. Look no further. Because only + ** a single row is generated, output is always in sorted order */ *pFlags = WHERE_ROWID_EQ; *pnEq = 1; if( pOrderBy ) *pFlags |= WHERE_ORDERBY; TRACE(("... best is rowid\n")); return 0.0; }else if( pTerm->operator & WO_LIST ){ + /* Rowid IN (LIST): cost is NlogN where N is the number of list + ** elements. */ lowestCost = pTerm->pExpr->pList->nExpr; + lowestCost *= estLog(lowestCost); }else{ - lowestCost = 100.0; + /* Rowid IN (SELECT): cost is NlogN where N is the number of rows + ** in the result of the inner select. We have no way to estimate + ** that value so make a wild guess. */ + lowestCost = 200.0; } TRACE(("... rowid IN cost: %g\n", lowestCost)); } - /* Check for constraints on a range of rowids or a full table scan. + /* Estimate the cost of a table scan. If we do not know how many + ** entries are in the table, use 1 million as a guess. */ pProbe = pSrc->pTab->pIndex; - cost = pProbe ? pProbe->aiRowEst[0] : 100000.0; - TRACE(("... base cost: %g\n", cost)); + cost = pProbe ? pProbe->aiRowEst[0] : 1000000.0; + TRACE(("... table scan base cost: %g\n", cost)); + flags = WHERE_ROWID_RANGE; + + /* Check for constraints on a range of rowids in a table scan. 
+ */ pTerm = findTerm(pWC, iCur, -1, notReady, WO_LT|WO_LE|WO_GT|WO_GE, 0); if( pTerm ){ - flags = WHERE_ROWID_RANGE; if( findTerm(pWC, iCur, -1, notReady, WO_LT|WO_LE, 0) ){ flags |= WHERE_TOP_LIMIT; - cost *= 0.25; /* Guess that rowidEXPR eliminates 75% of the search */ + cost *= 0.333; /* Guess that rowid>EXPR eliminates two-thirds of rows */ } - TRACE(("... rowid range cost: %g\n", cost)); + TRACE(("... rowid range reduces cost to %g\n", cost)); }else{ flags = 0; } - if( pOrderBy && sortableByRowid(iCur, pOrderBy, &rev) ){ - flags |= WHERE_ORDERBY|WHERE_ROWID_RANGE; - cost *= 0.5; - if( rev ){ - flags |= WHERE_REVERSE; + + /* If the table scan does not satisfy the ORDER BY clause, increase + ** the cost by NlogN to cover the expense of sorting. */ + if( pOrderBy ){ + if( sortableByRowid(iCur, pOrderBy, &rev) ){ + flags |= WHERE_ORDERBY|WHERE_ROWID_RANGE; + if( rev ){ + flags |= WHERE_REVERSE; + } + }else{ + cost += cost*estLog(cost); + TRACE(("... sorting increases cost to %g\n", cost)); } - TRACE(("... order by reduces cost to %g\n", cost)); } if( costpNext){ int i; /* Loop counter */ - double inMultiplier = 2.0; /* Includes built-in index lookup penalty */ + double inMultiplier = 1.0; TRACE(("... index %s:\n", pProbe->zName)); @@ -740,7 +785,7 @@ static double bestIndex( } } } - cost = pProbe->aiRowEst[i] * inMultiplier; + cost = pProbe->aiRowEst[i] * inMultiplier * estLog(inMultiplier); nEq = i; TRACE(("...... nEq=%d inMult=%g cost=%g\n", nEq, inMultiplier, cost)); @@ -753,30 +798,32 @@ static double bestIndex( flags = WHERE_COLUMN_RANGE; if( findTerm(pWC, iCur, j, notReady, WO_LT|WO_LE, pProbe) ){ flags |= WHERE_TOP_LIMIT; - cost *= 0.5; + cost *= 0.333; } if( findTerm(pWC, iCur, j, notReady, WO_GT|WO_GE, pProbe) ){ flags |= WHERE_BTM_LIMIT; - cost *= 0.5; + cost *= 0.333; } TRACE(("...... 
range reduces cost to %g\n", cost)); } } - /* Reduce the cost substantially if this index can be used to satisfy - ** the ORDER BY clause + /* Add the additional cost of sorting if that is a factor. */ - if( pOrderBy && (flags & WHERE_COLUMN_IN)==0 && + if( pOrderBy ){ + if( (flags & WHERE_COLUMN_IN)==0 && isSortingIndex(pParse, pProbe, pSrc->pTab, iCur, pOrderBy, nEq, &rev) ){ - if( flags==0 ){ - flags = WHERE_COLUMN_RANGE; - } - flags |= WHERE_ORDERBY; - cost *= 0.5; - if( rev ){ - flags |= WHERE_REVERSE; + if( flags==0 ){ + flags = WHERE_COLUMN_RANGE; + } + flags |= WHERE_ORDERBY; + if( rev ){ + flags |= WHERE_REVERSE; + } + }else{ + cost += cost*estLog(cost); + TRACE(("...... orderby reduces cost to %g\n", cost)); } - TRACE(("...... orderby reduces cost to %g\n", cost)); } /* Check to see if we can get away with using just the index without