From: drh Date: Sun, 29 Mar 2009 00:13:03 +0000 (+0000) Subject: Improvements to cost estimation for evaluating the IN operator. X-Git-Tag: version-3.6.15~347 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=75572e9de9e38820d516fa8e8762d69c61c55343;p=thirdparty%2Fsqlite.git Improvements to cost estimation for evaluating the IN operator. Ticket #3757. (CVS 6403) FossilOrigin-Name: 0c438e813c411e8f9e92d6c7405fccb7a36e110a --- diff --git a/manifest b/manifest index e19b47bd4c..114c5b6c22 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Previous\scommit\s((6401))\sdid\snot\squite\sfix\sthe\sproblem.\s\sThis\sshould\swork\nbetter.\s(CVS\s6402) -D 2009-03-28T23:47:11 +C Improvements\sto\scost\sestimation\sfor\sevaluating\sthe\sIN\soperator.\nTicket\s#3757.\s(CVS\s6403) +D 2009-03-29T00:13:03 F Makefile.arm-wince-mingw32ce-gcc fcd5e9cd67fe88836360bb4f9ef4cb7f8e2fb5a0 F Makefile.in 583e87706abc3026960ed759aff6371faf84c211 F Makefile.linux-gcc d53183f4aa6a9192d249731c90dbdffbd2c68654 @@ -209,7 +209,7 @@ F src/vdbeblob.c e67757450ae8581a8b354d9d7e467e41502dfe38 F src/vdbemem.c 38615b5d4b1b3b5a1221a5623578e5e3864e4888 F src/vtab.c f1aba5a6dc1f83b97a39fbbc58ff8cbc76311347 F src/walker.c 42bd3f00ca2ef5ae842304ec0d59903ef051412d -F src/where.c d0a78f876593b596c1e97286921cdc746e2f65ce +F src/where.c 72b84f31a0bed42c665fb922b74e9aea5ae3ced2 F test/aggerror.test a867e273ef9e3d7919f03ef4f0e8c0d2767944f2 F test/alias.test 597662c5d777a122f9a3df0047ea5c5bd383a911 F test/all.test 14165b3e32715b700b5f0cbf8f6e3833dda0be45 @@ -631,6 +631,7 @@ F test/tkt35xx.test 53bca895091e968126a858ee7da186f59f328994 F test/tkt3630.test 929f64852103054125200bc825c316d5f75d42f7 F test/tkt3718.test 3ee5e25702f3f5a31340b2766d7a7fac2b5ce99c F test/tkt3731.test 8a6e3732f5a8a24eb875a6faf287ef77bb8c0579 +F test/tkt3757.test 8f2208930655bbd4f92c14e19e72303a43e098ef F test/tokenize.test ce430a7aed48fc98301611429595883fdfcab5d7 F test/trace.test 
951cd0f5f571e7f36bf7bfe04be70f90fb16fb00 F test/trans.test 8b79967a7e085289ec64890c6fdf9d089e1b4a5f @@ -710,7 +711,7 @@ F tool/speedtest16.c c8a9c793df96db7e4933f0852abb7a03d48f2e81 F tool/speedtest2.tcl ee2149167303ba8e95af97873c575c3e0fab58ff F tool/speedtest8.c 2902c46588c40b55661e471d7a86e4dd71a18224 F tool/speedtest8inst1.c 293327bc76823f473684d589a8160bde1f52c14e -P fb35cff855e17771caee2a992e7b2b4105b94862 -R 6c5a32ed1dbda923c028324f6e381112 +P 2e7d3cc9f04de1fe7ef95cd5736dbc409c209cef +R f4372d788a5fa3a6a07058dbb62c2eb3 U drh -Z f8f9e15c31b81a3e0355740fc1bcee00 +Z 1bf6a0cf4ef6100a42ff75750d967094 diff --git a/manifest.uuid b/manifest.uuid index 4cdd3d75e3..7c1ae35646 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -2e7d3cc9f04de1fe7ef95cd5736dbc409c209cef \ No newline at end of file +0c438e813c411e8f9e92d6c7405fccb7a36e110a \ No newline at end of file diff --git a/src/where.c b/src/where.c index d3947edc8d..c2e8e1af5b 100644 --- a/src/where.c +++ b/src/where.c @@ -16,7 +16,7 @@ ** so is applicable. Because this module is responsible for selecting ** indices, you might also think of this module as the "query optimizer". ** -** $Id: where.c,v 1.377 2009/03/25 16:51:43 drh Exp $ +** $Id: where.c,v 1.378 2009/03/29 00:13:03 drh Exp $ */ #include "sqliteInt.h" @@ -26,7 +26,7 @@ #if defined(SQLITE_TEST) || defined(SQLITE_DEBUG) int sqlite3WhereTrace = 0; #endif -#if 0 +#if 1 # define WHERETRACE(X) if(sqlite3WhereTrace) sqlite3DebugPrintf X #else # define WHERETRACE(X) @@ -1926,12 +1926,18 @@ static void bestIndex( pProbe = pSrc->pIndex; } for(; pProbe; pProbe=(pSrc->pIndex ? 0 : pProbe->pNext)){ - double inMultiplier = 1; + double inMultiplier = 1; /* Number of equality look-ups needed */ + int inMultIsEst = 0; /* True if inMultiplier is an estimate */ WHERETRACE(("... index %s:\n", pProbe->zName)); /* Count the number of columns in the index that are satisfied - ** by x=EXPR constraints or x IN (...) constraints. + ** by x=EXPR constraints or x IN (...) 
constraints. For a term + ** of the form x=EXPR we only have to do a single binary search. + ** But for x IN (...) we have to do a number of binary searches + ** equal to the number of entries on the RHS of the IN operator. + ** The inMultiplier variable will try to estimate the number of + ** binary searches needed. */ wsFlags = 0; for(i=0; i<pProbe->nColumn; i++){ @@ -1944,21 +1950,31 @@ static void bestIndex( wsFlags |= WHERE_COLUMN_IN; if( ExprHasProperty(pExpr, EP_xIsSelect) ){ inMultiplier *= 25; + inMultIsEst = 1; }else if( pExpr->x.pList ){ inMultiplier *= pExpr->x.pList->nExpr + 1; } } } nRow = pProbe->aiRowEst[i] * inMultiplier; - cost = nRow * estLog(inMultiplier); + /* If inMultiplier is an estimate and that estimate results in an + ** nRow that is more than half the number of rows in the table, + ** then reduce inMultiplier */ + if( inMultIsEst && nRow*2 > pProbe->aiRowEst[0] ){ + nRow = pProbe->aiRowEst[0]/2; + inMultiplier = nRow/pProbe->aiRowEst[i]; + } + cost = nRow + inMultiplier*estLog(pProbe->aiRowEst[0]); nEq = i; if( pProbe->onError!=OE_None && (wsFlags & WHERE_COLUMN_IN)==0 && nEq==pProbe->nColumn ){ wsFlags |= WHERE_UNIQUE; } - WHERETRACE(("...... nEq=%d inMult=%.9g cost=%.9g\n",nEq,inMultiplier,cost)); + WHERETRACE(("...... nEq=%d inMult=%.9g nRow=%.9g cost=%.9g\n", + nEq, inMultiplier, nRow, cost)); - /* Look for range constraints + /* Look for range constraints. Assume that each range constraint + ** shrinks the search space to 1/3rd of its size. */ if( nEq<pProbe->nColumn ){ int j = pProbe->aiColumn[nEq]; @@ -1975,7 +1991,8 @@ static void bestIndex( cost /= 3; nRow /= 3; } - WHERETRACE(("...... range reduces cost to %.9g\n", cost)); + WHERETRACE(("...... range reduces nRow to %.9g and cost to %.9g\n", + nRow, cost)); } } diff --git a/test/tkt3757.test b/test/tkt3757.test new file mode 100644 index 0000000000..14bfb23be7 --- /dev/null +++ b/test/tkt3757.test @@ -0,0 +1,60 @@ +# 2009 March 28 +# +# The author disclaims copyright to this source code.
In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# +# Ticket #3757: The cost functions in the query optimizer for the +# IN operator can be improved. +# +# $Id: tkt3757.test,v 1.1 2009/03/29 00:13:04 drh Exp $ + +set testdir [file dirname $argv0] +source $testdir/tester.tcl + +# Evaluate SQL. Return the result set followed by the number of +# full-scan steps and the number of sort operations. +# +proc count_steps {sql} { + set r [db eval $sql] + lappend r scan [db status step] sort [db status sort] +} + +# Construct tables +# +do_test tkt3757-1.1 { + db eval { + CREATE TABLE t1(x INTEGER, y INTEGER, z TEXT); + CREATE INDEX t1i1 ON t1(y,z); + INSERT INTO t1 VALUES(1,2,'three'); + CREATE TABLE t2(a INTEGER, b TEXT); + INSERT INTO t2 VALUES(2, 'two'); + ANALYZE; + SELECT * FROM sqlite_stat1; + } +} {t1 t1i1 {1 1 1}} + +# Modify statistics in order to make the optimizer think that: +# +# (1) Table T1 has about 250K entries +# (2) There are only about 5 distinct values of T1.y +# +# Then run a query with "t1.y IN (SELECT ..)" in the WHERE clause. +# Make sure the index is used. +# +do_test tkt3757-1.2 { + db eval { + DELETE FROM sqlite_stat1; + INSERT INTO sqlite_stat1 VALUES('t1','t1i1','250000 50000 30'); + } + count_steps { + SELECT * FROM t1 WHERE y IN (SELECT a FROM t2) + } +} {1 2 three scan 0 sort 0} + +finish_test