Add documentation for fts5 synonym support.

author dan <dan@noemail.net>

Thu, 3 Sep 2015 18:05:09 +0000 (18:05 +0000)

committer dan <dan@noemail.net>

Thu, 3 Sep 2015 18:05:09 +0000 (18:05 +0000)
author dan <dan@noemail.net>
Thu, 3 Sep 2015 18:05:09 +0000 (18:05 +0000)
committer dan <dan@noemail.net>
Thu, 3 Sep 2015 18:05:09 +0000 (18:05 +0000)
diff --git a/ext/fts5/extract_api_docs.tcl b/ext/fts5/extract_api_docs.tcl

index afb2699be5033dd922c9e958047eaf3ac9e1720d..2320d70b7d5bcc92b4ee0625e406be1a9e26ede6 100644 (file)
--- a/ext/fts5/extract_api_docs.tcl
+++ b/ext/fts5/extract_api_docs.tcl
@@ -108,13 +108,15 @@ proc get_tokenizer_docs {data} {
        append res "<dt><b>$line</b></dt><dd><p style=margin-top:0>\n"
        continue
      }
+    if {[regexp {SYNONYM SUPPORT} $line]} {
+      set line "</dl><h3>Synonym Support</h3>"
+    }
      if {[string trim $line] == ""} {
        append res "<p>\n"
      } else {
        append res "$line\n"
      }
    }
-  append res "</dl>\n"
  
    set res
  }
@@ -208,6 +210,10 @@ proc main {data} {
  
      fts5_tokenizer {
        output [get_fts5_struct $data "typedef struct Fts5Tokenizer" "^\};"]
+      output [get_fts5_struct $data \
+        "Flags that may be passed as the third argument to xTokenize()" \
+        "#define FTS5_TOKEN_COLOCATED"
+      ]
      }
  
      fts5_extension {
diff --git a/ext/fts5/fts5.h b/ext/fts5/fts5.h

index c9eb91d4cca0450e4547388eed39e2ba604dcb7a..6872918e9428437f70793814df0c7b47308238db 100644 (file)
--- a/ext/fts5/fts5.h
+++ b/ext/fts5/fts5.h
@@ -278,18 +278,46 @@ struct Fts5ExtensionApi {
  **
  ** xTokenize:
  **   This function is expected to tokenize the nText byte string indicated 
-**   by argument pText. pText may not be nul-terminated. The first argument
-**   passed to this function is a pointer to an Fts5Tokenizer object returned 
-**   by an earlier call to xCreate().
+**   by argument pText. pText may or may not be nul-terminated. The first
+**   argument passed to this function is a pointer to an Fts5Tokenizer object
+**   returned by an earlier call to xCreate().
+**
+**   The second argument indicates the reason that FTS5 is requesting
+**   tokenization of the supplied text. This is always one of the following
+**   four values:
+**
+**   <ul><li> <b>FTS5_TOKENIZE_DOCUMENT</b> - A document is being inserted into
+**            or removed from the FTS table. The tokenizer is being invoked to
+**            determine the set of tokens to add to (or delete from) the
+**            FTS index.
+**
+**       <li> <b>FTS5_TOKENIZE_QUERY</b> - A MATCH query is being executed 
+**            against the FTS index. The tokenizer is being called to tokenize 
+**            a bareword or quoted string specified as part of the query.
+**
+**       <li> <b>(FTS5_TOKENIZE_QUERY | FTS5_TOKENIZE_PREFIX)</b> - Same as
+**            FTS5_TOKENIZE_QUERY, except that the bareword or quoted string is
+**            followed by a "*" character, indicating that the last token
+**            returned by the tokenizer will be treated as a token prefix.
+**
+**       <li> <b>FTS5_TOKENIZE_AUX</b> - The tokenizer is being invoked to
+**            satisfy an fts5_api.xTokenize() request made by an auxiliary
+**            function, or an fts5_api.xColumnSize() request made by an
+**            auxiliary function on a columnsize=0 database.
+**   </ul>
  **
  **   For each token in the input string, the supplied callback xToken() must
  **   be invoked. The first argument to it should be a copy of the pointer
-**   passed as the second argument to xTokenize(). The next two arguments
-**   are a pointer to a buffer containing the token text, and the size of
-**   the token in bytes. The 4th and 5th arguments are the byte offsets of
-**   the first byte of and first byte immediately following the text from 
+**   passed as the second argument to xTokenize(). The third and fourth
+**   arguments are a pointer to a buffer containing the token text, and the
+**   size of the token in bytes. The 5th and 6th arguments are the byte offsets
+**   of the first byte of and first byte immediately following the text from
  **   which the token is derived within the input.
  **
+**   The second argument passed to the xToken() callback ("tflags") should
+**   normally be set to 0. The exception is if the tokenizer supports 
+**   synonyms. In this case see the discussion below for details.
+**
  **   FTS5 assumes the xToken() callback is invoked for each token in the 
  **   order that they occur within the input text.
  **
@@ -301,6 +329,112 @@ struct Fts5ExtensionApi {
  **   may abandon the tokenization and return any error code other than
  **   SQLITE_OK or SQLITE_DONE.
  **
+** SYNONYM SUPPORT
+**
+**   Custom tokenizers may also support synonyms. Consider a case in which a
+**   user wishes to query for a phrase such as "first place". Using the 
+**   built-in tokenizers, the FTS5 query 'first + place' will match instances
+**   of "first place" within the document set, but not alternative forms
+**   such as "1st place". In some applications, it would be better to match
+**   all instances of "first place" or "1st place" regardless of which form
+**   the user specified in the MATCH query text.
+**
+**   There are several ways to approach this in FTS5:
+**
+**   <ol><li> By mapping all synonyms to a single token. In this case, using
+**            the above example, this means that the tokenizer returns the
+**            same token for inputs "first" and "1st". Say that token is in
+**            fact "first", so that when the user inserts the document "I won
+**            1st place" entries are added to the index for tokens "i", "won",
+**            "first" and "place". If the user then queries for '1st + place',
+**            the tokenizer substitutes "first" for "1st" and the query works
+**            as expected.
+**
+**       <li> By querying the index for all synonyms of each query term.
+**            In this case, when tokenizing query text, the tokenizer may 
+**            provide multiple synonyms for a single term within the document.
+**            FTS5 then queries the index for each synonym individually. For
+**            example, faced with the query:
+**
+**   <codeblock>
+**     ... MATCH 'first place'</codeblock>
+**
+**            the tokenizer offers both "1st" and "first" as synonyms for the
+**            first token in the MATCH query and FTS5 effectively runs a query 
+**            similar to:
+**
+**   <codeblock>
+**     ... MATCH '(first OR 1st) place'</codeblock>
+**
+**            except that, for the purposes of auxiliary functions, the query
+**            still appears to contain just two phrases - "(first OR 1st)" 
+**            being treated as a single phrase.
+**
+**       <li> By adding multiple synonyms for a single term to the FTS index.
+**            Using this method, when tokenizing document text, the tokenizer
+**            provides multiple synonyms for each token. So that when a 
+**            document such as "I won first place" is tokenized, entries are
+**            added to the FTS index for "i", "won", "first", "1st" and
+**            "place".
+**
+**            This way, even if the tokenizer does not provide synonyms
+**            when tokenizing query text (it should not - to do so would be
+**            inefficient), it doesn't matter if the user queries for
+**            'first + place' or '1st + place', as there are entries in the
+**            FTS index corresponding to both forms of the first token.
+**   </ol>
+**
+**   Whether it is parsing document or query text, any call to xToken that
+**   specifies a <i>tflags</i> argument with the FTS5_TOKEN_COLOCATED bit
+**   is considered to supply a synonym for the previous token. For example,
+**   when parsing the document "I won first place", a tokenizer that supports
+**   synonyms would call xToken() 5 times, as follows:
+**
+**   <codeblock>
+**       xToken(pCtx, 0, "i",                      1,  0,  1);
+**       xToken(pCtx, 0, "won",                    3,  2,  5);
+**       xToken(pCtx, 0, "first",                  5,  6, 11);
+**       xToken(pCtx, FTS5_TOKEN_COLOCATED, "1st", 3,  6, 11);
+**       xToken(pCtx, 0, "place",                  5, 12, 17);
+**</codeblock>
+**
+**   It is an error to specify the FTS5_TOKEN_COLOCATED flag the first time
+**   xToken() is called. Multiple synonyms may be specified for a single token
+**   by making multiple calls to xToken(FTS5_TOKEN_COLOCATED) in sequence. 
+**   There is no limit to the number of synonyms that may be provided for a
+**   single token.
+**
+**   In many cases, method (1) above is the best approach. It does not add 
+**   extra data to the FTS index or require FTS5 to query for multiple terms,
+**   so it is efficient in terms of disk space and query speed. However, it
+**   does not support prefix queries very well. If, as suggested above, the
+**   token "first" is substituted for "1st" by the tokenizer, then the query:
+**
+**   <codeblock>
+**     ... MATCH '1s*'</codeblock>
+**
+**   will not match documents that contain the token "1st" (as the tokenizer
+**   will probably not map "1s" to any prefix of "first").
+**
+**   For full prefix support, method (3) may be preferred. In this case, 
+**   because the index contains entries for both "first" and "1st", prefix
+**   queries such as 'fi*' or '1s*' will match correctly. However, because
+**   extra entries are added to the FTS index, this method uses more space
+**   within the database.
+**
+**   Method (2) offers a midpoint between (1) and (3). Using this method,
+**   a query such as '1s*' will match documents that contain the literal 
+**   token "1st", but not "first" (assuming the tokenizer is not able to
+**   provide synonyms for prefixes). However, a non-prefix query like '1st'
+**   will match against "1st" and "first". This method does not require
+**   extra disk space, as no extra entries are added to the FTS index. 
+**   On the other hand, it may require more CPU cycles to run MATCH queries,
+**   as separate queries of the FTS index are required for each synonym.
+**
+**   When using methods (2) or (3), it is important that the tokenizer only
+**   provide synonyms when tokenizing document text (method (3)) or query
+**   text (method (2)), not both. Doing so will not cause any errors, but is
+**   inefficient.
  */
  typedef struct Fts5Tokenizer Fts5Tokenizer;
  typedef struct fts5_tokenizer fts5_tokenizer;
@@ -309,7 +443,7 @@ struct fts5_tokenizer {
    void (*xDelete)(Fts5Tokenizer*);
    int (*xTokenize)(Fts5Tokenizer*, 
        void *pCtx,
-      int flags,
+      int flags,            /* Mask of FTS5_TOKENIZE_* flags */
        const char *pText, int nText, 
        int (*xToken)(
          void *pCtx,         /* Copy of 2nd argument to xTokenize() */
diff --git a/manifest b/manifest

index c46ab774286448e3ced9afe06d5fee1e0774c3f7..2777d3602178caf2fd35c998c09c791d68789a9c 100644 (file)
--- a/manifest
+++ b/manifest
@@ -1,5 +1,5 @@
-C Add\stests\sto\simprove\scoverage\sof\sfts5_varint.c.
-D 2015-09-03T15:37:26.095
+C Add\sdocumentation\sfor\sfts5\ssynonym\ssupport.
+D 2015-09-03T18:05:09.505
  F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
  F Makefile.in e2218eb228374422969de7b1680eda6864affcef
  F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
@@ -104,8 +104,8 @@ F ext/fts3/unicode/CaseFolding.txt 8c678ca52ecc95e16bc7afc2dbf6fc9ffa05db8c
  F ext/fts3/unicode/UnicodeData.txt cd07314edb62d49fde34debdaf92fa2aa69011e7
  F ext/fts3/unicode/mkunicode.tcl 95cf7ec186e48d4985e433ff8a1c89090a774252
  F ext/fts3/unicode/parseunicode.tcl da577d1384810fb4e2b209bf3313074353193e95
-F ext/fts5/extract_api_docs.tcl 06583c935f89075ea0b32f85efa5dd7619fcbd03
-F ext/fts5/fts5.h 0784692f406588e6c90e13a78e1f36e7e3236e42
+F ext/fts5/extract_api_docs.tcl a36e54ec777172ddd3f9a88daf593b00848368e0
+F ext/fts5/fts5.h f04659e0df5af83731b102189a32280f74f4a6bc
  F ext/fts5/fts5Int.h f65d41f66accad0a289d6bd66b13c07d2932f9be
  F ext/fts5/fts5_aux.c 7a307760a9c57c750d043188ec0bad59f5b5ec7e
  F ext/fts5/fts5_buffer.c 80f9ba4431848cb857e3d2158f5280093dcd8015
@@ -1382,7 +1382,7 @@ F tool/vdbe_profile.tcl 67746953071a9f8f2f668b73fe899074e2c6d8c1
  F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4
  F tool/warnings.sh 48bd54594752d5be3337f12c72f28d2080cb630b
  F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f
-P 59ae30b97b40faa363c55aa2664dead9eaeeddc0
-R e54371f57da08ab0c80326ea4baedd7e
+P 89f24f31a8f7d7cb0a66ee53523881f566dcb035
+R 763079caf8ff69ca16b541c2d3f4f0ad
  U dan
-Z 416751bdad2bf139cde3272ad789aa1a
+Z 726cf8268b191707cc8a3f300ac29036
diff --git a/manifest.uuid b/manifest.uuid

index 3c272efde7f9b949c66307f97b3c9ae2b98a2dd3..fc803bdc2cd69a9f67257008f010a1838f998472 100644 (file)
--- a/manifest.uuid
+++ b/manifest.uuid
@@ -1 +1 @@
-89f24f31a8f7d7cb0a66ee53523881f566dcb035
-\ No newline at end of file
+58aa1f435959852df74f1bca8e0bdbc4f47c256a
+\ No newline at end of file
author	dan <dan@noemail.net>
	Thu, 3 Sep 2015 18:05:09 +0000 (18:05 +0000)
committer	dan <dan@noemail.net>
	Thu, 3 Sep 2015 18:05:09 +0000 (18:05 +0000)
ext/fts5/extract_api_docs.tcl		patch \| blob \| blame \| history
ext/fts5/fts5.h		patch \| blob \| blame \| history
manifest		patch \| blob \| blame \| history
manifest.uuid		patch \| blob \| blame \| history