--- /dev/null
+#
+# Run this TCL script to generate HTML for the goals.html file.
+#
+set rcsid {$Id: optoverview.tcl,v 1.1 2005/08/31 01:50:00 drh Exp $}
+source common.tcl
+header {The SQLite Query Optimizer Overview}
+
+proc CODE {text} {
+ puts "<blockquote><pre>"
+ puts $text
+ puts "</pre></blockquote>"
+}
+proc SYNTAX {text} {
+ puts "<blockquote><pre>"
+ set t2 [string map {& & < < > >} $text]
+ regsub -all "/(\[^\n/\]+)/" $t2 {</b><i>\1</i><b>} t3
+ puts "<b>$t3</b>"
+ puts "</pre></blockquote>"
+}
+proc IMAGE {name {caption {}}} {
+ puts "<center><img src=\"$name\">"
+ if {$caption!=""} {
+ puts "<br>$caption"
+ }
+ puts "</center>"
+}
+proc PARAGRAPH {text} {
+ regsub -all "/(\[^\n/\]+)/" $text {<i>\1</i>} t2
+ regsub -all "\\*(\[^\n*\]+)\\*" $t2 {<tt><b><big>\1</big></b></tt>} t3
+ puts "<p>$t3</p>\n"
+}
+set level(0) 0
+set level(1) 0
+proc HEADING {n name {tag {}}} {
+ if {$tag!=""} {
+ puts "<a name=\"$tag\">"
+ }
+ global level
+ incr level($n)
+ for {set i [expr {$n+1}]} {$i<10} {incr i} {
+ set level($i) 0
+ }
+ if {$n==0} {
+ set num {}
+ } elseif {$n==1} {
+ set num $level(1).0
+ } else {
+ set num $level(1)
+ for {set i 2} {$i<=$n} {incr i} {
+ append num .$level($i)
+ }
+ }
+ incr n 1
+ puts "<h$n>$num $name</h$n>"
+}
+
+HEADING 0 {The SQLite Query Optimizer Overview}
+
+PARAGRAPH {
+ This document provides a terse overview of how the query optimizer
+ for SQLite works. This is not a tutoral. Some prior knowledge of how
+ database engines operate is likely needed in order to fully understand
+ this text.
+}
+
+HEADING 1 {WHERE clause analysis} where_clause
+
+PARAGRAPH {
+ The WHERE clause on a query is broken up into "terms" where each term
+ is separated from the others by an AND operator.
+ Parentheses are ignored when dividing the WHERE clause into terms.
+}
+PARAGRAPH {
+ All terms of the WHERE clause are analyzed.
+ Terms that cannot be satisfied through the use of indices become
+ tests that are evaluated against each row of the relevant input
+ tables. No tests are done for terms that are completely satisfied by
+ indices. Sometimes
+ one or more terms will provide hints to indices but still must be
+ evaluated against each row of the input tables.
+}
+
+PARAGRAPH {
+ Sometimes the analysis of a term will cause new "virtual" terms to
+ be added to the WHERE clause. Virtual terms can be used with
+ indices to restrict a search. But virtual terms never generate code
+ that is tested against input rows.
+}
+
+PARAGRAPH {
+ In order be used by an index, a term must be of one of the following
+ forms:
+}
+SYNTAX {
+ /column/ = /expression/
+ /column/ > /expression/
+ /column/ >= /expression/
+ /column/ < /expression/
+ /column/ <= /expression/
+ /expression/ = /column/
+ /expression/ > /column/
+ /expression/ >= /column/
+ /expression/ < /column/
+ /expression/ <= /column/
+ /column/ IN (/expression-list/)
+ /column/ IN (/subquery/)
+}
+PARAGRAPH {
+ If an index is created using a statement like this:
+}
+CODE {
+ CREATE INDEX idx_ex1 ON ex1(a,b,c,d,e,...,y,z);
+}
+PARAGRAPH {
+ Then the index might be used if the initial columns of the index
+ (columns a, b, and so forth) appear in WHERE clause terms.
+ All index columns must be used with
+ the *=* or *IN* operators except for
+ the right-most column which can use inequalities. For the right-most
+ column of an index that is used, there can be up to two inequalities
+ that must sandwich the allowed values of the column between two extremes.
+}
+PARAGRAPH {
+ It is not necessary for every column of an index to appear in a
+ WHERE clause term in order for that index to be used. In fact, there
+ are sometimes advantages if this is not the case.
+ But there can not be gaps in the columns of the index that are used.
+ Thus for the example index above, if there is no WHERE clause term
+ that constraints column c, then terms that constraint columns a and b can
+ be used with the index but not terms that constraint columns d through z.
+ Similarly, no index column will be used (for indexing purposes)
+ that is to the right of a
+ column that is constrained only by inequalities.
+ Thus for the index above and WHERE clause like this:
+}
+CODE {
+ ... WHERE a=5 AND b IN (1,2,3) AND c>12 AND d='hello'
+}
+PARAGRAPH {
+ Only columns a, b, and c of the index would be usable. The d column
+ would not be usable because it occurs to the right of c and c is
+ constrained only by inequalities.
+}
+
+HEADING 1 {The BETWEEN optimization} between_opt
+
+PARAGRAPH {
+ If a term of the WHERE clause is of the following form:
+}
+SYNTAX {
+ /expr1/ BETWEEN /expr2/ AND /expr3/
+}
+PARAGRAPH {
+ Then two virtual terms are added as follows:
+}
+SYNTAX {
+ /expr1/ >= /expr2/ AND /expr1/ <= /expr3/
+}
+PARAGRAPH {
+ If both virtual terms end up being used as constraints on an index,
+ then the original BETWEEN term is omitted and the corresponding test
+ is not performed on input rows.
+ Thus if the BETWEEN term ends up being used as an index constraint
+ no tests are ever performed on that term.
+ On the other hand, the
+ virtual terms themselves never causes tests to be performed on
+ input rows.
+ Thus if the BETWEEN term is not used as an index constraint and
+ instead must be used to test input rows, the /expr1/ expression is
+ only evaluated once.
+}
+
+HEADING 1 {The OR optimization} or_opt
+
+PARAGRAPH {
+ If a term consists of multiple subterms containing a common column
+ name and separated by OR, like this:
+}
+SYNTAX {
+ /column/ = /expr1/ OR /column/ = /expr2/ OR /column/ = /expr3/ OR ...
+}
+PARAGRAPH {
+ Then the term is rewritten as follows:
+}
+SYNTAX {
+ /column/ IN (/expr1/,/expr2/,/expr3/,/expr4/,...)
+}
+PARAGRAPH {
+ The rewritten term then might go on to constraint an index using the
+ normal rules for *IN* operators.
+ Note that /column/ must be the same column in every OR-connected subterm,
+ although the column can occur on either the left or the right side of
+ the *=* operator.
+}
+
+HEADING 1 {The LIKE optimization} like_opt
+
+PARAGRAPH {
+ Terms that are composed of the LIKE or GLOB operator
+ can sometimes be used to constrain indices.
+ There are many conditions on this use:
+}
+PARAGRAPH {
+ <ol>
+ <li>The left-hand side of the LIKE or GLOB operator must be the name
+ of an indexed column.</li>
+ <li>The right-hand side of the LIKE or GLOB must be a string literal
+ that does not begin with a wildcard character.</li>
+ <li>The ESCAPE clause cannot appear on the LIKE operator.</li>
+ <li>The build-in functions used to implement LIKE and GLOB must not
+ have been overloaded using the sqlite3_create_function() API.</li>
+ <li>For the GLOB operator, the column must use the default BINARY
+ collating sequence.</li>
+ <li>For the LIKE operator, if case_sensitive_like mode is enabled then
+ the column must use the default BINARY collating sequence, or if
+ case_sensitive_like mode is disabled then the column must use the
+ built-in NOCASE collating sequence.</li>
+ </ol>
+}
+PARAGRAPH {
+ The LIKE operator has two modes that can be set by a pragma. The
+ default mode is for LIKE comparisons to be insensitive to differences
+ of case for latin1 characters. Thus, by default, the following
+ expression is true:
+}
+CODE {
+ 'a' LIKE 'A'
+}
+PARAGRAPH {
+ By turned on the case_sensitive_like pragma as follows:
+}
+CODE {
+ PRAGMA case_sensitive_like=ON;
+}
+PARAGRAPH {
+ Then the LIKE operator pays attention to case and the example above would
+ evaluate to false. Note that case insensitivity only applies to
+ latin1 characters - basically the upper and lower case letters of English
+ in the lower 127 byte codes of ASCII. International character sets
+ are always case sensitive in SQLite unless a user-supplied collating
+ sequence is used. But if you employ a user-supplied collating sequence,
+ the LIKE optimization describe here will never be taken.
+}
+PARAGRAPH {
+ The LIKE operator is case insensitive by default because this is what
+ the SQL standard requires. You can change the default behavior at
+ compile time by using the -DSQLITE_CASE_SENSITIVE_LIKE command-line option
+ to the compiler.
+}
+PARAGRAPH {
+ The LIKE optimization might occur if the column named on the left of the
+ operator uses the BINARY collating sequence (which is the default) and
+ case_sensitive_like is turned on. Or the optimization might occur if
+ the column uses the built-in NOCASE collating sequence and the
+ case_sensitive_like mode is off. These are the only two combinations
+ under which LIKE operators will be optimized. If the column on the
+ right-hand side of the LIKE operator uses any collating sequence other
+ than the built-in BINARY and NOCASE collating sequences, then no optimizations
+ will ever be attempted on the LIKE operator.
+}
+PARAGRAPH {
+ The GLOB operator is always case sensitive. The column on the left side
+ of the GLOB operator must always use the built-in BINARY collating sequence
+ or no attempt will be made to optimize that operator with indices.
+}
+PARAGRAPH {
+ The right-hand side of the GLOB or LIKE operator must be a literal string
+ value that does not begin with a wildcard. If the right-hand side is a
+ parameter that is bound to a string, then no optimization is attempted.
+ If the right-hand side begins with a wildcard character then no
+ optimization is attempted.
+}
+PARAGRAPH {
+ Suppose the initial sequence of non-wildcard characters on the right-hand
+ side of the LIKE or GLOB operator is /x/. We are using a single
+ character to denote this non-wildcard prefix but the reader should
+ understand that the prefix can consist of more than 1 character.
+ Let /y/ the smallest string that is the same length as /x/ but which
+ compares greater than /x/. For example, if /x/ is *hello* then
+ /y/ would be *hellp*.
+ The LIKE and GLOB optimizations consist of adding two virtual terms
+ like this:
+}
+SYNTAX {
+ /column/ >= /x/ AND /column/ < /y/
+}
+PARAGRAPH {
+ Under most circumstances, the original LIKE or GLOB operator is still
+ tested against each input row even if the virtual terms are used to
+ constrain an index. This is because we do not know what additional
+ constraints may be imposed by characters to the right
+ of the /x/ prefix. However, if there is only a single global wildcard
+ to the right of /x/, then the original LIKE or GLOB test is disabled.
+ In other words, if the pattern is like this:
+}
+SYNTAX {
+ /column/ LIKE /x/%
+ /column/ GLOB /x/*
+}
+PARAGRAPH {
+ Then the original LIKE or GLOB tests are disabled when the virtual
+ terms constrain an index because in that case we know that all of the
+ rows selected by the index will pass the LIKE or GLOB test.
+}
+
+HEADING 1 {Joins} joins
+
+PARAGRAPH {
+ The current implementation of
+ SQLite uses only loop joins. That is to say, joins are implemented as
+ nested loops.
+}
+PARAGRAPH {
+ The default order of the nested loops in a join is for the left-most
+ table in the FROM clause to form the outer loop and the right-most
+ table to form the inner loop.
+ However, SQLite will nest the loops in a different order if doing so
+ will help it to select better indices.
+}
+PARAGRAPH {
+ Inner joins can be freely reordered. However a left outer join is
+ neither commutative nor associative and hence will not be reordered.
+ Inner joins to the left and right of the outer join might be reordered
+ if the optimizer thinks that is advantageous but the outer joins are
+ always evaluated in the order in which they occur.
+}
+PARAGRAPH {
+ When selecting the order of tables in a join, SQLite uses a greedy
+ algorithm that runs in polynomial time.
+}
+PARAGRAPH {
+ The ON and USING clauses of a join are converted into additional
+ terms of the WHERE clause prior to WHERE clause analysis described
+ above in paragraph 1.0. Thus
+ with SQLite, there is no advantage to use the newer SQL92 join syntax
+ over the older SQL89 comma-join syntax. They both end up accomplishing
+ exactly the same thing.
+}
+
+HEADING 1 {Choosing between multiple indices} multi_index
+
+PARAGRAPH {
+ Each table in the FROM clause of a query can use at most one index,
+ and SQLite strives to use at least one index on each table. Sometimes,
+ two or more indices might be candidates for use on a single table.
+ For example:
+}
+CODE {
+ CREATE TABLE ex2(x,y,z);
+ CREATE INDEX ex2i1 ON ex2(x);
+ CREATE INDEX ex2i2 ON ex2(y);
+ SELECT z FROM ex2 WHERE x=5 AND y=6;
+}
+PARAGRAPH {
+ For the SELECT statement above, the optimizer can use the ex2i1 index
+ to lookup rows of ex2 that contain x=5 and then test each row against
+ the y=6 term. Or it can use the ex2i2 index to lookup rows
+ of ex2 that contain y=6 then test each of those rows against the
+ x=5 term.
+}
+PARAGRAPH {
+ When faced with a choice of two or more indices, SQLite tries to estimate
+ the total amount of work needed to perform the query using each option.
+ It then selects the option that gives the least estimated work.
+}
+PARAGRAPH {
+ To help the optimizer get a more accurate estimate of the work involved
+ in using various indices, the user may optional run the ANALYZE command.
+ The ANALYZE command scans all indices of database where there might
+ be a choice between two or more indices and gathers statistics on the
+ selectiveness of those indices. The results of this scan are stored
+ in the sqlite_stat1 table.
+ The contents of the sqlite_stat1 table are not updated as the database
+ changes so after making significant changes it might be prudent to
+ rerun ANALYZE.
+ The results of an ANALYZE command are only available to database connections
+ that are opened after the ANALYZE command completes.
+}
+PARAGRAPH {
+ Once created, the sqlite_stat1 table cannot be dropped. But its
+ content can be viewed, modified, or erased. Erasing the entire content
+ of the sqlite_stat1 table has the effect of undoing the ANALYZE command.
+ Changing the content of the sqlite_stat1 table can get the optimizer
+ deeply confused and cause it to make silly index choices. Making
+ updates to the sqlite_stat1 table (except by running ANALYZE) is
+ not recommended.
+}
+PARAGRAPH {
+ Terms of the WHERE clause can be manually disqualified for use with
+ indices by prepending a unary *+* operator to the column name. The
+ unary *+* is a no-op and will not slow down the evaluation of the test
+ in any way. But it will prevent that term from constraining an index.
+ So, in the example above, if the query were rewritten as:
+}
+CODE {
+ SELECT z FROM ex2 WHERE +x=5 AND y=6;
+}
+PARAGRAPH {
+ The *+* operator on the *x* column would prevent that term from
+ constraining an index. This would force the use of the ex2i2 index.
+}
+
+HEADING 1 {Avoidance of table lookups} index_only
+
+PARAGRAPH {
+ When doing an indexed lookup of a row, the usual procedure is to
+ do a binary search on the index to find the index entry, then extract
+ the rowid from the index and use that rowid to do a binary search on
+ the original table. Thus a typical indexed lookup involves two
+ binary searches.
+ If, however, all columns that were to be fetched from the table are
+ already available in the index itself, SQLite will use the values
+ contained in the index and will never look up the original table
+ row. This saves a binary search for each table and can make many
+ queries run twice as fast.
+}
+
+HEADING 1 {ORDER BY optimizations} order_by
+
+PARAGRAPH {
+ SQLite attempts to use an index to satisfy the ORDER BY clause of a
+ query when possible.
+ When faced with the choice of using an index to satisfy WHERE clause
+ constraints or satisfying an ORDER BY clause, SQLite does the same
+ work analysis that it normally does when choosing between two indices
+ and selects that one that it believes will result in the fastest answer.
+ Usually this means satisfying the WHERE clause constraint.
+}
+
+HEADING 1 {Subquery flattening} flattening
+
+PARAGRAPH {
+ When a subquery occurs in the FROM clause of a SELECT, the default
+ behavior is to evaluate the subquery into a transient table, then run
+ the outer SELECT against the transient table.
+ This is problematic since the transient table will not have any indices
+ and the outer query (which is likely a join) will be forced to do a
+ full table scan on the transient table.
+}
+PARAGRAPH {
+ To overcome this problem, SQLite attempts to flatten subqueries in
+ the FROM clause of a SELECT.
+ This involves inserting the FROM clause of the subquery into the
+ FROM clause of the outer query and rewriting certain expressions in
+ the outer query.
+ There is a long list of conditions that must be met in order for
+ query flattening to occur. These conditions are fully documented in
+ source code comments in the select.c source file.
+}
+PARAGRAPH {
+ Query flattening is an important optimization when views are used as
+ each use of a view is translated into a subquery.
+}
+
+HEADING 1 {The MIN/MAX optimization} minmax
+
+PARAGRAPH {
+ Queries of the following forms will be optimized to run in logorithmic
+ time assuming appropriate indices exist:
+}
+CODE {
+ SELECT MIN(x) FROM table;
+ SELECT MAX(x) FROM table;
+}
+PARAGRAPH {
+ In order for these optimizations to occur, they must appear in exactly
+ the form shown above - changing only the name of the table and column.
+ And the column in the MIN or MAX function must be an indexed column.
+}