From: drh <drh@noemail.net>
Date: Wed, 31 Aug 2005 01:49:59 +0000 (+0000)
Subject: Adding the first cut of the optimizer overview document. (CVS 2647)
X-Git-Tag: version-3.6.10~3512
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=f6ac657f2720b5416eeb1b6b16542822e101eeff;p=thirdparty%2Fsqlite.git

Adding the first cut of the optimizer overview document. (CVS 2647)

FossilOrigin-Name: 4c8d0a4c26e44088ffb9f21aa7641a5ebd0058be
---

diff --git a/main.mk b/main.mk
index f4d26b901f..d8247f6899 100644
--- a/main.mk
+++ b/main.mk
@@ -472,6 +472,9 @@ opcode.html:	$(TOP)/www/opcode.tcl $(TOP)/src/vdbe.c
 optimizer.html:	$(TOP)/www/optimizer.tcl
 	tclsh $(TOP)/www/optimizer.tcl >optimizer.html
 
+optoverview.html:	$(TOP)/www/optoverview.tcl
+	tclsh $(TOP)/www/optoverview.tcl >optoverview.html
+
 quickstart.html:	$(TOP)/www/quickstart.tcl
 	tclsh $(TOP)/www/quickstart.tcl >quickstart.html
 
@@ -528,6 +531,7 @@ DOC = \
   omitted.html \
   opcode.html \
   optimizer.html \
+  optoverview.html \
   pragma.html \
   quickstart.html \
   speed.html \
diff --git a/manifest b/manifest
index c1130b358b..9deb051f14 100644
--- a/manifest
+++ b/manifest
@@ -1,5 +1,5 @@
-C Adjust\sthe\smakefile\sfor\snew\soptimizer\sdocumentation.\s(CVS\s2646)
-D 2005-08-30T22:44:30
+C Adding\sthe\sfirst\scut\sof\sthe\soptimizer\soverview\sdocument.\s(CVS\s2647)
+D 2005-08-31T01:49:59
 F Makefile.in 12784cdce5ffc8dfb707300c34e4f1eb3b8a14f1
 F Makefile.linux-gcc 06be33b2a9ad4f005a5f42b22c4a19dab3cbb5c7
 F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028
@@ -17,7 +17,7 @@ F doc/lemon.html f0f682f50210928c07e562621c3b7e8ab912a538
 F doc/report1.txt a031aaf37b185e4fa540223cb516d3bccec7eeac
 F install-sh 9d4de14ab9fb0facae2f48780b874848cbf2f895
 F ltmain.sh f6b283068efa69f06eb8aa1fe4bddfdbdeb35826
-F main.mk 940fb3d09568be099eaa183ed372650723d0d05b
+F main.mk 72176d7299c2fd6ca2850623195a6984f75380a6
 F mkdll.sh 5ec23622515d5bf8969404e80cfb5e220ddf0512
 F mkopcodec.awk bd46ad001c98dfbab07b1713cb8e692fa0e5415d
 F mkopcodeh.awk 7563ad235670e864ead95cf672be3fe081450ae0
@@ -295,6 +295,7 @@ F www/omitted.tcl f1e57977299c3ed54fbae55e4b5ea6a64de39e19
 F www/opcode.tcl 5bd68059416b223515a680d410a9f7cb6736485f
 F www/optimizer.tcl d6812a10269bd0d7c488987aac0ad5036cace9dc
 F www/optimizing.tcl f0b2538988d1bbad16cbfe63ec6e8f48c9eb04e5
+F www/optoverview.tcl e16344994a91318965d573ecc9ef2848153d5bf0
 F www/pragma.tcl 44f7b665ca598ad24724f35991653638a36a6e3f
 F www/quickstart.tcl 6f6f694b6139be2d967b1492eb9a6bdf7058aa60
 F www/speed.tcl 656ed5be8cc9d536353e1a96927b925634a62933
@@ -305,7 +306,7 @@ F www/tclsqlite.tcl 3df553505b6efcad08f91e9b975deb2e6c9bb955
 F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0
 F www/version3.tcl a99cf5f6d8bd4d5537584a2b342f0fb9fa601d8b
 F www/whentouse.tcl 97e2b5cd296f7d8057e11f44427dea8a4c2db513
-P 5cebd7ba3ccbdd0f4c8fe77091992f52d3a4b24c
-R c4def8daacbbc5823b9d6194ef76988e
+P 14668c85b87f7c9b83efd592ddc693522c764969
+R 3e38efd9bc5b22f2b9f7cfdf76added5
 U drh
-Z e3c05aeda8ce259a105b8164e4009189
+Z f4986d89db2306d81b4b595a9239ef43
diff --git a/manifest.uuid b/manifest.uuid
index bc6cd8cd49..4f014875d9 100644
--- a/manifest.uuid
+++ b/manifest.uuid
@@ -1 +1 @@
-14668c85b87f7c9b83efd592ddc693522c764969
\ No newline at end of file
+4c8d0a4c26e44088ffb9f21aa7641a5ebd0058be
\ No newline at end of file
diff --git a/www/optoverview.tcl b/www/optoverview.tcl
new file mode 100644
index 0000000000..38bc981b70
--- /dev/null
+++ b/www/optoverview.tcl
@@ -0,0 +1,469 @@
+#
+# Run this TCL script to generate HTML for the goals.html file.
+#
+set rcsid {$Id: optoverview.tcl,v 1.1 2005/08/31 01:50:00 drh Exp $}
+source common.tcl
+header {The SQLite Query Optimizer Overview}
+
+proc CODE {text} {
+  puts "<blockquote><pre>"
+  puts $text
+  puts "</pre></blockquote>"
+}
+proc SYNTAX {text} {
+  puts "<blockquote><pre>"
+  set t2 [string map {& &amp; < &lt; > &gt;} $text]
+  regsub -all "/(\[^\n/\]+)/" $t2 {</b><i>\1</i><b>} t3
+  puts "<b>$t3</b>"
+  puts "</pre></blockquote>"
+}
+proc IMAGE {name {caption {}}} {
+  puts "<center><img src=\"$name\">"
+  if {$caption!=""} {
+    puts "<br>$caption"
+  }
+  puts "</center>"
+}
+proc PARAGRAPH {text} {
+  regsub -all "/(\[^\n/\]+)/" $text {<i>\1</i>} t2
+  regsub -all "\\*(\[^\n*\]+)\\*" $t2 {<tt><b><big>\1</big></b></tt>} t3
+  puts "<p>$t3</p>\n"
+}
+set level(0) 0
+set level(1) 0
+proc HEADING {n name {tag {}}} {
+  if {$tag!=""} {
+    puts "<a name=\"$tag\">"
+  }
+  global level
+  incr level($n)
+  for {set i [expr {$n+1}]} {$i<10} {incr i} {
+    set level($i) 0
+  }
+  if {$n==0} {
+    set num {}
+  } elseif {$n==1} {
+    set num $level(1).0
+  } else {
+    set num $level(1)
+    for {set i 2} {$i<=$n} {incr i} {
+      append num .$level($i)
+    }
+  }
+  incr n 1
+  puts "<h$n>$num $name</h$n>"
+}
+
+HEADING 0 {The SQLite Query Optimizer Overview}
+
+PARAGRAPH {
+  This document provides a terse overview of how the query optimizer
+  for SQLite works.  This is not a tutoral.  Some prior knowledge of how
+  database engines operate is likely needed in order to fully understand
+  this text.
+}
+
+HEADING 1 {WHERE clause analysis} where_clause
+
+PARAGRAPH {
+  The WHERE clause on a query is broken up into "terms" where each term
+  is separated from the others by an AND operator.
+  Parentheses are ignored when dividing the WHERE clause into terms.
+}
+PARAGRAPH {
+  All terms of the WHERE clause are analyzed.
+  Terms that cannot be satisfied through the use of indices become
+  tests that are evaluated against each row of the relevant input
+  tables.  No tests are done for terms that are completely satisfied by
+  indices.  Sometimes
+  one or more terms will provide hints to indices but still must be
+  evaluated against each row of the input tables.
+}
+
+PARAGRAPH {
+  Sometimes the analysis of a term will cause new "virtual" terms to
+  be added to the WHERE clause.  Virtual terms can be used with
+  indices to restrict a search.  But virtual terms never generate code
+  that is tested against input rows.
+}
+
+PARAGRAPH {
+  In order be used by an index, a term must be of one of the following
+  forms:
+}
+SYNTAX {
+  /column/ = /expression/
+  /column/ > /expression/
+  /column/ >= /expression/
+  /column/ < /expression/
+  /column/ <= /expression/
+  /expression/ = /column/
+  /expression/ > /column/
+  /expression/ >= /column/
+  /expression/ < /column/
+  /expression/ <= /column/
+  /column/ IN (/expression-list/)
+  /column/ IN (/subquery/)
+}
+PARAGRAPH {
+  If an index is created using a statement like this:
+}
+CODE {
+  CREATE INDEX idx_ex1 ON ex1(a,b,c,d,e,...,y,z);
+}
+PARAGRAPH {
+  Then the index might be used if the initial columns of the index
+  (columns a, b, and so forth) appear in WHERE clause terms.
+  All index columns must be used with
+  the *=* or *IN* operators except for
+  the right-most column which can use inequalities.  For the right-most
+  column of an index that is used, there can be up to two inequalities
+  that must sandwich the allowed values of the column between two extremes.
+}
+PARAGRAPH {
+  It is not necessary for every column of an index to appear in a
+  WHERE clause term in order for that index to be used.  In fact, there
+  are sometimes advantages if this is not the case.
+  But there can not be gaps in the columns of the index that are used.
+  Thus for the example index above, if there is no WHERE clause term
+  that constraints column c, then terms that constraint columns a and b can
+  be used with the index but not terms that constraint columns d through z.
+  Similarly, no index column will be used (for indexing purposes)
+  that is to the right of a 
+  column that is constrained only by inequalities.
+  Thus for the index above and WHERE clause like this:
+}
+CODE {
+  ... WHERE a=5 AND b IN (1,2,3) AND c>12 AND d='hello'
+}
+PARAGRAPH {
+  Only columns a, b, and c of the index would be usable.  The d column
+  would not be usable because it occurs to the right of c and c is
+  constrained only by inequalities.
+}
+
+HEADING 1 {The BETWEEN optimization} between_opt
+
+PARAGRAPH {
+  If a term of the WHERE clause is of the following form:
+}
+SYNTAX {
+  /expr1/ BETWEEN /expr2/ AND /expr3/
+}
+PARAGRAPH {
+  Then two virtual terms are added as follows:
+}
+SYNTAX {
+  /expr1/ >= /expr2/ AND /expr1/ <= /expr3/
+}
+PARAGRAPH {
+  If both virtual terms end up being used as constraints on an index,
+  then the original BETWEEN term is omitted and the corresponding test
+  is not performed on input rows.
+  Thus if the BETWEEN term ends up being used as an index constraint
+  no tests are ever performed on that term.
+  On the other hand, the
+  virtual terms themselves never causes tests to be performed on
+  input rows.
+  Thus if the BETWEEN term is not used as an index constraint and
+  instead must be used to test input rows, the /expr1/ expression is
+  only evaluated once.
+}
+
+HEADING 1 {The OR optimization} or_opt
+
+PARAGRAPH {
+  If a term consists of multiple subterms containing a common column
+  name and separated by OR, like this:
+}
+SYNTAX {
+  /column/ = /expr1/ OR /column/ = /expr2/ OR /column/ = /expr3/ OR ...
+}
+PARAGRAPH {
+  Then the term is rewritten as follows:
+}
+SYNTAX {
+  /column/ IN (/expr1/,/expr2/,/expr3/,/expr4/,...)
+}
+PARAGRAPH {
+  The rewritten term then might go on to constraint an index using the
+  normal rules for *IN* operators.
+  Note that /column/ must be the same column in every OR-connected subterm,
+  although the column can occur on either the left or the right side of
+  the *=* operator.
+}
+
+HEADING 1 {The LIKE optimization} like_opt
+
+PARAGRAPH {
+  Terms that are composed of the LIKE or GLOB operator
+  can sometimes be used to constrain indices.
+  There are many conditions on this use:
+}
+PARAGRAPH {
+  <ol>
+  <li>The left-hand side of the LIKE or GLOB operator must be the name
+      of an indexed column.</li>
+  <li>The right-hand side of the LIKE or GLOB must be a string literal
+      that does not begin with a wildcard character.</li>
+  <li>The ESCAPE clause cannot appear on the LIKE operator.</li>
+  <li>The build-in functions used to implement LIKE and GLOB must not
+      have been overloaded using the sqlite3_create_function() API.</li>
+  <li>For the GLOB operator, the column must use the default BINARY
+      collating sequence.</li>
+  <li>For the LIKE operator, if case_sensitive_like mode is enabled then
+      the column must use the default BINARY collating sequence, or if
+      case_sensitive_like mode is disabled then the column must use the
+      built-in NOCASE collating sequence.</li>
+  </ol>
+}
+PARAGRAPH {
+  The LIKE operator has two modes that can be set by a pragma.  The
+  default mode is for LIKE comparisons to be insensitive to differences
+  of case for latin1 characters.  Thus, by default, the following
+  expression is true:
+}
+CODE {
+  'a' LIKE 'A'
+}
+PARAGRAPH {
+  By turned on the case_sensitive_like pragma as follows:
+}
+CODE {
+  PRAGMA case_sensitive_like=ON;
+}
+PARAGRAPH {
+  Then the LIKE operator pays attention to case and the example above would
+  evaluate to false.  Note that case insensitivity only applies to
+  latin1 characters - basically the upper and lower case letters of English
+  in the lower 127 byte codes of ASCII.  International character sets
+  are always case sensitive in SQLite unless a user-supplied collating
+  sequence is used.  But if you employ a user-supplied collating sequence,
+  the LIKE optimization describe here will never be taken.
+}
+PARAGRAPH {
+  The LIKE operator is case insensitive by default because this is what
+  the SQL standard requires.  You can change the default behavior at
+  compile time by using the -DSQLITE_CASE_SENSITIVE_LIKE command-line option
+  to the compiler.
+}
+PARAGRAPH {
+  The LIKE optimization might occur if the column named on the left of the
+  operator uses the BINARY collating sequence (which is the default) and
+  case_sensitive_like is turned on.  Or the optimization might occur if
+  the column uses the built-in NOCASE collating sequence and the 
+  case_sensitive_like mode is off.  These are the only two combinations
+  under which LIKE operators will be optimized.  If the column on the
+  right-hand side of the LIKE operator uses any collating sequence other
+  than the built-in BINARY and NOCASE collating sequences, then no optimizations
+  will ever be attempted on the LIKE operator.
+}
+PARAGRAPH {
+  The GLOB operator is always case sensitive.  The column on the left side
+  of the GLOB operator must always use the built-in BINARY collating sequence
+  or no attempt will be made to optimize that operator with indices.
+}
+PARAGRAPH {
+  The right-hand side of the GLOB or LIKE operator must be a literal string
+  value that does not begin with a wildcard.  If the right-hand side is a
+  parameter that is bound to a string, then no optimization is attempted.
+  If the right-hand side begins with a wildcard character then no 
+  optimization is attempted.
+}
+PARAGRAPH {
+  Suppose the initial sequence of non-wildcard characters on the right-hand
+  side of the LIKE or GLOB operator is /x/.  We are using a single 
+  character to denote this non-wildcard prefix but the reader should
+  understand that the prefix can consist of more than 1 character.
+  Let /y/ the smallest string that is the same length as /x/ but which
+  compares greater than /x/.  For example, if /x/ is *hello* then
+  /y/ would be *hellp*.
+  The LIKE and GLOB optimizations consist of adding two virtual terms
+  like this:
+}
+SYNTAX {
+  /column/ >= /x/ AND /column/ < /y/
+}
+PARAGRAPH {
+  Under most circumstances, the original LIKE or GLOB operator is still
+  tested against each input row even if the virtual terms are used to
+  constrain an index.  This is because we do not know what additional
+  constraints may be imposed by characters to the right
+  of the /x/ prefix.  However, if there is only a single global wildcard
+  to the right of /x/, then the original LIKE or GLOB test is disabled.
+  In other words, if the pattern is like this:
+}
+SYNTAX {
+  /column/ LIKE /x/%
+  /column/ GLOB /x/*
+}
+PARAGRAPH {
+  Then the original LIKE or GLOB tests are disabled when the virtual
+  terms constrain an index because in that case we know that all of the
+  rows selected by the index will pass the LIKE or GLOB test.
+}
+
+HEADING 1 {Joins} joins
+
+PARAGRAPH {
+  The current implementation of 
+  SQLite uses only loop joins.  That is to say, joins are implemented as
+  nested loops.
+}
+PARAGRAPH {
+  The default order of the nested loops in a join is for the left-most
+  table in the FROM clause to form the outer loop and the right-most
+  table to form the inner loop.
+  However, SQLite will nest the loops in a different order if doing so
+  will help it to select better indices.
+}
+PARAGRAPH {
+  Inner joins can be freely reordered.  However a left outer join is
+  neither commutative nor associative and hence will not be reordered.
+  Inner joins to the left and right of the outer join might be reordered
+  if the optimizer thinks that is advantageous but the outer joins are
+  always evaluated in the order in which they occur.
+}
+PARAGRAPH {
+  When selecting the order of tables in a join, SQLite uses a greedy
+  algorithm that runs in polynomial time.
+}
+PARAGRAPH {
+  The ON and USING clauses of a join are converted into additional
+  terms of the WHERE clause prior to WHERE clause analysis described
+  above in paragraph 1.0.  Thus
+  with SQLite, there is no advantage to use the newer SQL92 join syntax
+  over the older SQL89 comma-join syntax.  They both end up accomplishing
+  exactly the same thing.
+}
+
+HEADING 1 {Choosing between multiple indices} multi_index
+
+PARAGRAPH {
+  Each table in the FROM clause of a query can use at most one index,
+  and SQLite strives to use at least one index on each table.  Sometimes,
+  two or more indices might be candidates for use on a single table.
+  For example:
+}
+CODE {
+  CREATE TABLE ex2(x,y,z);
+  CREATE INDEX ex2i1 ON ex2(x);
+  CREATE INDEX ex2i2 ON ex2(y);
+  SELECT z FROM ex2 WHERE x=5 AND y=6;
+}
+PARAGRAPH {
+  For the SELECT statement above, the optimizer can use the ex2i1 index
+  to lookup rows of ex2 that contain x=5 and then test each row against
+  the y=6 term.  Or it can use the ex2i2 index to lookup rows
+  of ex2 that contain y=6 then test each of those rows against the
+  x=5 term.
+}
+PARAGRAPH {
+  When faced with a choice of two or more indices, SQLite tries to estimate
+  the total amount of work needed to perform the query using each option.
+  It then selects the option that gives the least estimated work.
+}
+PARAGRAPH {
+  To help the optimizer get a more accurate estimate of the work involved
+  in using various indices, the user may optional run the ANALYZE command.
+  The ANALYZE command scans all indices of database where there might
+  be a choice between two or more indices and gathers statistics on the
+  selectiveness of those indices.  The results of this scan are stored
+  in the sqlite_stat1 table.
+  The contents of the sqlite_stat1 table are not updated as the database
+  changes so after making significant changes it might be prudent to
+  rerun ANALYZE.
+  The results of an ANALYZE command are only available to database connections
+  that are opened after the ANALYZE command completes.
+}
+PARAGRAPH {
+  Once created, the sqlite_stat1 table cannot be dropped.  But its
+  content can be viewed, modified, or erased.  Erasing the entire content
+  of the sqlite_stat1 table has the effect of undoing the ANALYZE command.
+  Changing the content of the sqlite_stat1 table can get the optimizer
+  deeply confused and cause it to make silly index choices.  Making
+  updates to the sqlite_stat1 table (except by running ANALYZE) is
+  not recommended.
+}
+PARAGRAPH {
+  Terms of the WHERE clause can be manually disqualified for use with
+  indices by prepending a unary *+* operator to the column name.  The
+  unary *+* is a no-op and will not slow down the evaluation of the test
+  in any way.  But it will prevent that term from constraining an index.
+  So, in the example above, if the query were rewritten as:
+}
+CODE {
+  SELECT z FROM ex2 WHERE +x=5 AND y=6;
+}
+PARAGRAPH {
+  The *+* operator on the *x* column would prevent that term from 
+  constraining an index.  This would force the use of the ex2i2 index.
+}
+
+HEADING 1 {Avoidance of table lookups} index_only
+
+PARAGRAPH {
+  When doing an indexed lookup of a row, the usual procedure is to
+  do a binary search on the index to find the index entry, then extract
+  the rowid from the index and use that rowid to do a binary search on
+  the original table.  Thus a typical indexed lookup involves two
+  binary searches.
+  If, however, all columns that were to be fetched from the table are
+  already available in the index itself, SQLite will use the values
+  contained in the index and will never look up the original table
+  row.  This saves a binary search for each table and can make many
+  queries run twice as fast.
+}
+
+HEADING 1 {ORDER BY optimizations} order_by
+
+PARAGRAPH {
+  SQLite attempts to use an index to satisfy the ORDER BY clause of a
+  query when possible.
+  When faced with the choice of using an index to satisfy WHERE clause
+  constraints or satisfying an ORDER BY clause, SQLite does the same
+  work analysis that it normally does when choosing between two indices
+  and selects that one that it believes will result in the fastest answer.
+  Usually this means satisfying the WHERE clause constraint.
+}
+
+HEADING 1 {Subquery flattening} flattening
+
+PARAGRAPH {
+  When a subquery occurs in the FROM clause of a SELECT, the default
+  behavior is to evaluate the subquery into a transient table, then run
+  the outer SELECT against the transient table. 
+  This is problematic since the transient table will not have any indices
+  and the outer query (which is likely a join) will be forced to do a
+  full table scan on the transient table.
+}
+PARAGRAPH {
+  To overcome this problem, SQLite attempts to flatten subqueries in
+  the FROM clause of a SELECT.
+  This involves inserting the FROM clause of the subquery into the
+  FROM clause of the outer query and rewriting certain expressions in
+  the outer query.
+  There is a long list of conditions that must be met in order for
+  query flattening to occur.  These conditions are fully documented in
+  source code comments in the select.c source file.
+}
+PARAGRAPH {
+  Query flattening is an important optimization when views are used as
+  each use of a view is translated into a subquery.
+}
+
+HEADING 1 {The MIN/MAX optimization} minmax
+
+PARAGRAPH {
+  Queries of the following forms will be optimized to run in logorithmic
+  time assuming appropriate indices exist:
+}
+CODE {
+  SELECT MIN(x) FROM table;
+  SELECT MAX(x) FROM table;
+}
+PARAGRAPH {
+  In order for these optimizations to occur, they must appear in exactly
+  the form shown above - changing only the name of the table and column.
+  And the column in the MIN or MAX function must be an indexed column.
+}