From: drh Date: Wed, 31 Aug 2005 01:49:59 +0000 (+0000) Subject: Adding the first cut of the optimizer overview document. (CVS 2647) X-Git-Tag: version-3.6.10~3512 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=f6ac657f2720b5416eeb1b6b16542822e101eeff;p=thirdparty%2Fsqlite.git Adding the first cut of the optimizer overview document. (CVS 2647) FossilOrigin-Name: 4c8d0a4c26e44088ffb9f21aa7641a5ebd0058be --- diff --git a/main.mk b/main.mk index f4d26b901f..d8247f6899 100644 --- a/main.mk +++ b/main.mk @@ -472,6 +472,9 @@ opcode.html: $(TOP)/www/opcode.tcl $(TOP)/src/vdbe.c optimizer.html: $(TOP)/www/optimizer.tcl tclsh $(TOP)/www/optimizer.tcl >optimizer.html +optoverview.html: $(TOP)/www/optoverview.tcl + tclsh $(TOP)/www/optoverview.tcl >optoverview.html + quickstart.html: $(TOP)/www/quickstart.tcl tclsh $(TOP)/www/quickstart.tcl >quickstart.html @@ -528,6 +531,7 @@ DOC = \ omitted.html \ opcode.html \ optimizer.html \ + optoverview.html \ pragma.html \ quickstart.html \ speed.html \ diff --git a/manifest b/manifest index c1130b358b..9deb051f14 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Adjust\sthe\smakefile\sfor\snew\soptimizer\sdocumentation.\s(CVS\s2646) -D 2005-08-30T22:44:30 +C Adding\sthe\sfirst\scut\sof\sthe\soptimizer\soverview\sdocument.\s(CVS\s2647) +D 2005-08-31T01:49:59 F Makefile.in 12784cdce5ffc8dfb707300c34e4f1eb3b8a14f1 F Makefile.linux-gcc 06be33b2a9ad4f005a5f42b22c4a19dab3cbb5c7 F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028 @@ -17,7 +17,7 @@ F doc/lemon.html f0f682f50210928c07e562621c3b7e8ab912a538 F doc/report1.txt a031aaf37b185e4fa540223cb516d3bccec7eeac F install-sh 9d4de14ab9fb0facae2f48780b874848cbf2f895 F ltmain.sh f6b283068efa69f06eb8aa1fe4bddfdbdeb35826 -F main.mk 940fb3d09568be099eaa183ed372650723d0d05b +F main.mk 72176d7299c2fd6ca2850623195a6984f75380a6 F mkdll.sh 5ec23622515d5bf8969404e80cfb5e220ddf0512 F mkopcodec.awk bd46ad001c98dfbab07b1713cb8e692fa0e5415d F mkopcodeh.awk 7563ad235670e864ead95cf672be3fe081450ae0 @@ -295,6 +295,7 @@ F www/omitted.tcl f1e57977299c3ed54fbae55e4b5ea6a64de39e19 F www/opcode.tcl 5bd68059416b223515a680d410a9f7cb6736485f F www/optimizer.tcl d6812a10269bd0d7c488987aac0ad5036cace9dc F www/optimizing.tcl f0b2538988d1bbad16cbfe63ec6e8f48c9eb04e5 +F www/optoverview.tcl e16344994a91318965d573ecc9ef2848153d5bf0 F www/pragma.tcl 44f7b665ca598ad24724f35991653638a36a6e3f F www/quickstart.tcl 6f6f694b6139be2d967b1492eb9a6bdf7058aa60 F www/speed.tcl 656ed5be8cc9d536353e1a96927b925634a62933 @@ -305,7 +306,7 @@ F www/tclsqlite.tcl 3df553505b6efcad08f91e9b975deb2e6c9bb955 F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0 F www/version3.tcl a99cf5f6d8bd4d5537584a2b342f0fb9fa601d8b F www/whentouse.tcl 97e2b5cd296f7d8057e11f44427dea8a4c2db513 -P 5cebd7ba3ccbdd0f4c8fe77091992f52d3a4b24c -R c4def8daacbbc5823b9d6194ef76988e +P 14668c85b87f7c9b83efd592ddc693522c764969 +R 3e38efd9bc5b22f2b9f7cfdf76added5 U drh -Z e3c05aeda8ce259a105b8164e4009189 +Z f4986d89db2306d81b4b595a9239ef43 diff --git a/manifest.uuid b/manifest.uuid index bc6cd8cd49..4f014875d9 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -14668c85b87f7c9b83efd592ddc693522c764969 \ No newline at end of file +4c8d0a4c26e44088ffb9f21aa7641a5ebd0058be \ No newline at end of file diff --git a/www/optoverview.tcl b/www/optoverview.tcl new file mode 100644 index 0000000000..38bc981b70 --- /dev/null +++ b/www/optoverview.tcl @@ -0,0 +1,469 @@ +# +# Run this TCL script to generate HTML for the goals.html file. +# +set rcsid {$Id: optoverview.tcl,v 1.1 2005/08/31 01:50:00 drh Exp $} +source common.tcl +header {The SQLite Query Optimizer Overview} + +proc CODE {text} { + puts "
"
+  puts $text
+  puts "
" +} +proc SYNTAX {text} { + puts "
"
+  set t2 [string map {& & < < > >} $text]
+  regsub -all "/(\[^\n/\]+)/" $t2 {\1} t3
+  puts "$t3"
+  puts "
" +} +proc IMAGE {name {caption {}}} { + puts "
" + if {$caption!=""} { + puts "
$caption" + } + puts "
" +} +proc PARAGRAPH {text} { + regsub -all "/(\[^\n/\]+)/" $text {\1} t2 + regsub -all "\\*(\[^\n*\]+)\\*" $t2 {\1} t3 + puts "

$t3

\n" +} +set level(0) 0 +set level(1) 0 +proc HEADING {n name {tag {}}} { + if {$tag!=""} { + puts "" + } + global level + incr level($n) + for {set i [expr {$n+1}]} {$i<10} {incr i} { + set level($i) 0 + } + if {$n==0} { + set num {} + } elseif {$n==1} { + set num $level(1).0 + } else { + set num $level(1) + for {set i 2} {$i<=$n} {incr i} { + append num .$level($i) + } + } + incr n 1 + puts "$num $name" +} + +HEADING 0 {The SQLite Query Optimizer Overview} + +PARAGRAPH { + This document provides a terse overview of how the query optimizer + for SQLite works. This is not a tutoral. Some prior knowledge of how + database engines operate is likely needed in order to fully understand + this text. +} + +HEADING 1 {WHERE clause analysis} where_clause + +PARAGRAPH { + The WHERE clause on a query is broken up into "terms" where each term + is separated from the others by an AND operator. + Parentheses are ignored when dividing the WHERE clause into terms. +} +PARAGRAPH { + All terms of the WHERE clause are analyzed. + Terms that cannot be satisfied through the use of indices become + tests that are evaluated against each row of the relevant input + tables. No tests are done for terms that are completely satisfied by + indices. Sometimes + one or more terms will provide hints to indices but still must be + evaluated against each row of the input tables. +} + +PARAGRAPH { + Sometimes the analysis of a term will cause new "virtual" terms to + be added to the WHERE clause. Virtual terms can be used with + indices to restrict a search. But virtual terms never generate code + that is tested against input rows. +} + +PARAGRAPH { + In order be used by an index, a term must be of one of the following + forms: +} +SYNTAX { + /column/ = /expression/ + /column/ > /expression/ + /column/ >= /expression/ + /column/ < /expression/ + /column/ <= /expression/ + /expression/ = /column/ + /expression/ > /column/ + /expression/ >= /column/ + /expression/ < /column/ + /expression/ <= /column/ + /column/ IN (/expression-list/) + /column/ IN (/subquery/) +} +PARAGRAPH { + If an index is created using a statement like this: +} +CODE { + CREATE INDEX idx_ex1 ON ex1(a,b,c,d,e,...,y,z); +} +PARAGRAPH { + Then the index might be used if the initial columns of the index + (columns a, b, and so forth) appear in WHERE clause terms. + All index columns must be used with + the *=* or *IN* operators except for + the right-most column which can use inequalities. For the right-most + column of an index that is used, there can be up to two inequalities + that must sandwich the allowed values of the column between two extremes. +} +PARAGRAPH { + It is not necessary for every column of an index to appear in a + WHERE clause term in order for that index to be used. In fact, there + are sometimes advantages if this is not the case. + But there can not be gaps in the columns of the index that are used. + Thus for the example index above, if there is no WHERE clause term + that constraints column c, then terms that constraint columns a and b can + be used with the index but not terms that constraint columns d through z. + Similarly, no index column will be used (for indexing purposes) + that is to the right of a + column that is constrained only by inequalities. + Thus for the index above and WHERE clause like this: +} +CODE { + ... WHERE a=5 AND b IN (1,2,3) AND c>12 AND d='hello' +} +PARAGRAPH { + Only columns a, b, and c of the index would be usable. The d column + would not be usable because it occurs to the right of c and c is + constrained only by inequalities. +} + +HEADING 1 {The BETWEEN optimization} between_opt + +PARAGRAPH { + If a term of the WHERE clause is of the following form: +} +SYNTAX { + /expr1/ BETWEEN /expr2/ AND /expr3/ +} +PARAGRAPH { + Then two virtual terms are added as follows: +} +SYNTAX { + /expr1/ >= /expr2/ AND /expr1/ <= /expr3/ +} +PARAGRAPH { + If both virtual terms end up being used as constraints on an index, + then the original BETWEEN term is omitted and the corresponding test + is not performed on input rows. + Thus if the BETWEEN term ends up being used as an index constraint + no tests are ever performed on that term. + On the other hand, the + virtual terms themselves never causes tests to be performed on + input rows. + Thus if the BETWEEN term is not used as an index constraint and + instead must be used to test input rows, the /expr1/ expression is + only evaluated once. +} + +HEADING 1 {The OR optimization} or_opt + +PARAGRAPH { + If a term consists of multiple subterms containing a common column + name and separated by OR, like this: +} +SYNTAX { + /column/ = /expr1/ OR /column/ = /expr2/ OR /column/ = /expr3/ OR ... +} +PARAGRAPH { + Then the term is rewritten as follows: +} +SYNTAX { + /column/ IN (/expr1/,/expr2/,/expr3/,/expr4/,...) +} +PARAGRAPH { + The rewritten term then might go on to constraint an index using the + normal rules for *IN* operators. + Note that /column/ must be the same column in every OR-connected subterm, + although the column can occur on either the left or the right side of + the *=* operator. +} + +HEADING 1 {The LIKE optimization} like_opt + +PARAGRAPH { + Terms that are composed of the LIKE or GLOB operator + can sometimes be used to constrain indices. + There are many conditions on this use: +} +PARAGRAPH { +
    +
  1. The left-hand side of the LIKE or GLOB operator must be the name + of an indexed column.
  2. +
  3. The right-hand side of the LIKE or GLOB must be a string literal + that does not begin with a wildcard character.
  4. +
  5. The ESCAPE clause cannot appear on the LIKE operator.
  6. +
  7. The build-in functions used to implement LIKE and GLOB must not + have been overloaded using the sqlite3_create_function() API.
  8. +
  9. For the GLOB operator, the column must use the default BINARY + collating sequence.
  10. +
  11. For the LIKE operator, if case_sensitive_like mode is enabled then + the column must use the default BINARY collating sequence, or if + case_sensitive_like mode is disabled then the column must use the + built-in NOCASE collating sequence.
  12. +
+} +PARAGRAPH { + The LIKE operator has two modes that can be set by a pragma. The + default mode is for LIKE comparisons to be insensitive to differences + of case for latin1 characters. Thus, by default, the following + expression is true: +} +CODE { + 'a' LIKE 'A' +} +PARAGRAPH { + By turned on the case_sensitive_like pragma as follows: +} +CODE { + PRAGMA case_sensitive_like=ON; +} +PARAGRAPH { + Then the LIKE operator pays attention to case and the example above would + evaluate to false. Note that case insensitivity only applies to + latin1 characters - basically the upper and lower case letters of English + in the lower 127 byte codes of ASCII. International character sets + are always case sensitive in SQLite unless a user-supplied collating + sequence is used. But if you employ a user-supplied collating sequence, + the LIKE optimization describe here will never be taken. +} +PARAGRAPH { + The LIKE operator is case insensitive by default because this is what + the SQL standard requires. You can change the default behavior at + compile time by using the -DSQLITE_CASE_SENSITIVE_LIKE command-line option + to the compiler. +} +PARAGRAPH { + The LIKE optimization might occur if the column named on the left of the + operator uses the BINARY collating sequence (which is the default) and + case_sensitive_like is turned on. Or the optimization might occur if + the column uses the built-in NOCASE collating sequence and the + case_sensitive_like mode is off. These are the only two combinations + under which LIKE operators will be optimized. If the column on the + right-hand side of the LIKE operator uses any collating sequence other + than the built-in BINARY and NOCASE collating sequences, then no optimizations + will ever be attempted on the LIKE operator. +} +PARAGRAPH { + The GLOB operator is always case sensitive. The column on the left side + of the GLOB operator must always use the built-in BINARY collating sequence + or no attempt will be made to optimize that operator with indices. +} +PARAGRAPH { + The right-hand side of the GLOB or LIKE operator must be a literal string + value that does not begin with a wildcard. If the right-hand side is a + parameter that is bound to a string, then no optimization is attempted. + If the right-hand side begins with a wildcard character then no + optimization is attempted. +} +PARAGRAPH { + Suppose the initial sequence of non-wildcard characters on the right-hand + side of the LIKE or GLOB operator is /x/. We are using a single + character to denote this non-wildcard prefix but the reader should + understand that the prefix can consist of more than 1 character. + Let /y/ the smallest string that is the same length as /x/ but which + compares greater than /x/. For example, if /x/ is *hello* then + /y/ would be *hellp*. + The LIKE and GLOB optimizations consist of adding two virtual terms + like this: +} +SYNTAX { + /column/ >= /x/ AND /column/ < /y/ +} +PARAGRAPH { + Under most circumstances, the original LIKE or GLOB operator is still + tested against each input row even if the virtual terms are used to + constrain an index. This is because we do not know what additional + constraints may be imposed by characters to the right + of the /x/ prefix. However, if there is only a single global wildcard + to the right of /x/, then the original LIKE or GLOB test is disabled. + In other words, if the pattern is like this: +} +SYNTAX { + /column/ LIKE /x/% + /column/ GLOB /x/* +} +PARAGRAPH { + Then the original LIKE or GLOB tests are disabled when the virtual + terms constrain an index because in that case we know that all of the + rows selected by the index will pass the LIKE or GLOB test. +} + +HEADING 1 {Joins} joins + +PARAGRAPH { + The current implementation of + SQLite uses only loop joins. That is to say, joins are implemented as + nested loops. +} +PARAGRAPH { + The default order of the nested loops in a join is for the left-most + table in the FROM clause to form the outer loop and the right-most + table to form the inner loop. + However, SQLite will nest the loops in a different order if doing so + will help it to select better indices. +} +PARAGRAPH { + Inner joins can be freely reordered. However a left outer join is + neither commutative nor associative and hence will not be reordered. + Inner joins to the left and right of the outer join might be reordered + if the optimizer thinks that is advantageous but the outer joins are + always evaluated in the order in which they occur. +} +PARAGRAPH { + When selecting the order of tables in a join, SQLite uses a greedy + algorithm that runs in polynomial time. +} +PARAGRAPH { + The ON and USING clauses of a join are converted into additional + terms of the WHERE clause prior to WHERE clause analysis described + above in paragraph 1.0. Thus + with SQLite, there is no advantage to use the newer SQL92 join syntax + over the older SQL89 comma-join syntax. They both end up accomplishing + exactly the same thing. +} + +HEADING 1 {Choosing between multiple indices} multi_index + +PARAGRAPH { + Each table in the FROM clause of a query can use at most one index, + and SQLite strives to use at least one index on each table. Sometimes, + two or more indices might be candidates for use on a single table. + For example: +} +CODE { + CREATE TABLE ex2(x,y,z); + CREATE INDEX ex2i1 ON ex2(x); + CREATE INDEX ex2i2 ON ex2(y); + SELECT z FROM ex2 WHERE x=5 AND y=6; +} +PARAGRAPH { + For the SELECT statement above, the optimizer can use the ex2i1 index + to lookup rows of ex2 that contain x=5 and then test each row against + the y=6 term. Or it can use the ex2i2 index to lookup rows + of ex2 that contain y=6 then test each of those rows against the + x=5 term. +} +PARAGRAPH { + When faced with a choice of two or more indices, SQLite tries to estimate + the total amount of work needed to perform the query using each option. + It then selects the option that gives the least estimated work. +} +PARAGRAPH { + To help the optimizer get a more accurate estimate of the work involved + in using various indices, the user may optional run the ANALYZE command. + The ANALYZE command scans all indices of database where there might + be a choice between two or more indices and gathers statistics on the + selectiveness of those indices. The results of this scan are stored + in the sqlite_stat1 table. + The contents of the sqlite_stat1 table are not updated as the database + changes so after making significant changes it might be prudent to + rerun ANALYZE. + The results of an ANALYZE command are only available to database connections + that are opened after the ANALYZE command completes. +} +PARAGRAPH { + Once created, the sqlite_stat1 table cannot be dropped. But its + content can be viewed, modified, or erased. Erasing the entire content + of the sqlite_stat1 table has the effect of undoing the ANALYZE command. + Changing the content of the sqlite_stat1 table can get the optimizer + deeply confused and cause it to make silly index choices. Making + updates to the sqlite_stat1 table (except by running ANALYZE) is + not recommended. +} +PARAGRAPH { + Terms of the WHERE clause can be manually disqualified for use with + indices by prepending a unary *+* operator to the column name. The + unary *+* is a no-op and will not slow down the evaluation of the test + in any way. But it will prevent that term from constraining an index. + So, in the example above, if the query were rewritten as: +} +CODE { + SELECT z FROM ex2 WHERE +x=5 AND y=6; +} +PARAGRAPH { + The *+* operator on the *x* column would prevent that term from + constraining an index. This would force the use of the ex2i2 index. +} + +HEADING 1 {Avoidance of table lookups} index_only + +PARAGRAPH { + When doing an indexed lookup of a row, the usual procedure is to + do a binary search on the index to find the index entry, then extract + the rowid from the index and use that rowid to do a binary search on + the original table. Thus a typical indexed lookup involves two + binary searches. + If, however, all columns that were to be fetched from the table are + already available in the index itself, SQLite will use the values + contained in the index and will never look up the original table + row. This saves a binary search for each table and can make many + queries run twice as fast. +} + +HEADING 1 {ORDER BY optimizations} order_by + +PARAGRAPH { + SQLite attempts to use an index to satisfy the ORDER BY clause of a + query when possible. + When faced with the choice of using an index to satisfy WHERE clause + constraints or satisfying an ORDER BY clause, SQLite does the same + work analysis that it normally does when choosing between two indices + and selects that one that it believes will result in the fastest answer. + Usually this means satisfying the WHERE clause constraint. +} + +HEADING 1 {Subquery flattening} flattening + +PARAGRAPH { + When a subquery occurs in the FROM clause of a SELECT, the default + behavior is to evaluate the subquery into a transient table, then run + the outer SELECT against the transient table. + This is problematic since the transient table will not have any indices + and the outer query (which is likely a join) will be forced to do a + full table scan on the transient table. +} +PARAGRAPH { + To overcome this problem, SQLite attempts to flatten subqueries in + the FROM clause of a SELECT. + This involves inserting the FROM clause of the subquery into the + FROM clause of the outer query and rewriting certain expressions in + the outer query. + There is a long list of conditions that must be met in order for + query flattening to occur. These conditions are fully documented in + source code comments in the select.c source file. +} +PARAGRAPH { + Query flattening is an important optimization when views are used as + each use of a view is translated into a subquery. +} + +HEADING 1 {The MIN/MAX optimization} minmax + +PARAGRAPH { + Queries of the following forms will be optimized to run in logorithmic + time assuming appropriate indices exist: +} +CODE { + SELECT MIN(x) FROM table; + SELECT MAX(x) FROM table; +} +PARAGRAPH { + In order for these optimizations to occur, they must appear in exactly + the form shown above - changing only the name of the table and column. + And the column in the MIN or MAX function must be an indexed column. +}