From: dan Date: Thu, 3 Sep 2015 18:05:09 +0000 (+0000) Subject: Add documentation for fts5 synonym support. X-Git-Tag: version-3.9.0~153^2~3 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=ea8b93ee2041a1f4e22132cb937eff137127f4b5;p=thirdparty%2Fsqlite.git Add documentation for fts5 synonym support. FossilOrigin-Name: 58aa1f435959852df74f1bca8e0bdbc4f47c256a --- diff --git a/ext/fts5/extract_api_docs.tcl b/ext/fts5/extract_api_docs.tcl index afb2699be5..2320d70b7d 100644 --- a/ext/fts5/extract_api_docs.tcl +++ b/ext/fts5/extract_api_docs.tcl @@ -108,13 +108,15 @@ proc get_tokenizer_docs {data} { append res "
$line

\n" continue } + if {[regexp {SYNONYM SUPPORT} $line]} { + set line "

Synonym Support

" + } if {[string trim $line] == ""} { append res "

\n" } else { append res "$line\n" } } - append res "\n" set res } @@ -208,6 +210,10 @@ proc main {data} { fts5_tokenizer { output [get_fts5_struct $data "typedef struct Fts5Tokenizer" "^\};"] + output [get_fts5_struct $data \ + "Flags that may be passed as the third argument to xTokenize()" \ + "#define FTS5_TOKEN_COLOCATED" + ] } fts5_extension { diff --git a/ext/fts5/fts5.h b/ext/fts5/fts5.h index c9eb91d4cc..6872918e94 100644 --- a/ext/fts5/fts5.h +++ b/ext/fts5/fts5.h @@ -278,18 +278,46 @@ struct Fts5ExtensionApi { ** ** xTokenize: ** This function is expected to tokenize the nText byte string indicated -** by argument pText. pText may not be nul-terminated. The first argument -** passed to this function is a pointer to an Fts5Tokenizer object returned -** by an earlier call to xCreate(). +** by argument pText. pText may or may not be nul-terminated. The first +** argument passed to this function is a pointer to an Fts5Tokenizer object +** returned by an earlier call to xCreate(). +** +** The third argument indicates the reason that FTS5 is requesting +** tokenization of the supplied text. This is always one of the following +** four values: +** +**

** ** For each token in the input string, the supplied callback xToken() must ** be invoked. The first argument to it should be a copy of the pointer -** passed as the second argument to xTokenize(). The next two arguments -** are a pointer to a buffer containing the token text, and the size of -** the token in bytes. The 4th and 5th arguments are the byte offsets of -** the first byte of and first byte immediately following the text from +** passed as the second argument to xTokenize(). The third and fourth +** arguments are a pointer to a buffer containing the token text, and the +** size of the token in bytes. The 5th and 6th arguments are the byte offsets +** of the first byte of and first byte immediately following the text from ** which the token is derived within the input. ** +** The second argument passed to the xToken() callback ("tflags") should +** normally be set to 0. The exception is if the tokenizer supports +** synonyms. In this case see the discussion below for details. +** ** FTS5 assumes the xToken() callback is invoked for each token in the ** order that they occur within the input text. ** @@ -301,6 +329,112 @@ struct Fts5ExtensionApi { ** may abandon the tokenization and return any error code other than ** SQLITE_OK or SQLITE_DONE. ** +** SYNONYM SUPPORT +** +** Custom tokenizers may also support synonyms. Consider a case in which a +** user wishes to query for a phrase such as "first place". Using the +** built-in tokenizers, the FTS5 query 'first + place' will match instances +** of "first place" within the document set, but not alternative forms +** such as "1st place". In some applications, it would be better to match +** all instances of "first place" or "1st place" regardless of which form +** the user specified in the MATCH query text. +** +** There are several ways to approach this in FTS5: +** +**
  1. By mapping all synonyms to a single token. In this case, using +** the above example, this means that the tokenizer returns the +** same token for inputs "first" and "1st". Say that token is in +** fact "first", so that when the user inserts the document "I won +** 1st place" entries are added to the index for tokens "i", "won", +** "first" and "place". If the user then queries for '1st + place', +** the tokenizer substitutes "first" for "1st" and the query works +** as expected. +** +**
  2. By querying the index for all synonyms of each query term separately. +** In this case, when tokenizing query text, the tokenizer may +** provide multiple synonyms for a single term within the document. +** FTS5 then queries the index for each synonym individually. For +** example, faced with the query: +** +** +** ... MATCH 'first place' +** +** the tokenizer offers both "1st" and "first" as synonyms for the +** first token in the MATCH query and FTS5 effectively runs a query +** similar to: +** +** +** ... MATCH '(first OR 1st) place' +** +** except that, for the purposes of auxiliary functions, the query +** still appears to contain just two phrases - "(first OR 1st)" +** being treated as a single phrase. +** +**
  3. By adding multiple synonyms for a single term to the FTS index. +** Using this method, when tokenizing document text, the tokenizer +** provides multiple synonyms for each token. So that when a +** document such as "I won first place" is tokenized, entries are +** added to the FTS index for "i", "won", "first", "1st" and +** "place". +** +** This way, even if the tokenizer does not provide synonyms +** when tokenizing query text (it should not - to do so would be +** inefficient), it doesn't matter if the user queries for +** 'first + place' or '1st + place', as there are entries in the +** FTS index corresponding to both forms of the first token. +**
+** +** Whether it is parsing document or query text, any call to xToken that +** specifies a tflags argument with the FTS5_TOKEN_COLOCATED bit +** is considered to supply a synonym for the previous token. For example, +** when parsing the document "I won first place", a tokenizer that supports +** synonyms would call xToken() 5 times, as follows: +** +** +** xToken(pCtx, 0, "i", 1, 0, 1); +** xToken(pCtx, 0, "won", 3, 2, 5); +** xToken(pCtx, 0, "first", 5, 6, 11); +** xToken(pCtx, FTS5_TOKEN_COLOCATED, "1st", 3, 6, 11); +** xToken(pCtx, 0, "place", 5, 12, 17); +** +** +** It is an error to specify the FTS5_TOKEN_COLOCATED flag the first time +** xToken() is called. Multiple synonyms may be specified for a single token +** by making multiple calls to xToken(FTS5_TOKEN_COLOCATED) in sequence. +** There is no limit to the number of synonyms that may be provided for a +** single token. +** +** In many cases, method (1) above is the best approach. It does not add +** extra data to the FTS index or require FTS5 to query for multiple terms, +** so it is efficient in terms of disk space and query speed. However, it +** does not support prefix queries very well. If, as suggested above, the +** token "first" is substituted for "1st" by the tokenizer, then the query: +** +** +** ... MATCH '1s*' +** +** will not match documents that contain the token "1st" (as the tokenizer +** will probably not map "1s" to any prefix of "first"). +** +** For full prefix support, method (3) may be preferred. In this case, +** because the index contains entries for both "first" and "1st", prefix +** queries such as 'fi*' or '1s*' will match correctly. However, because +** extra entries are added to the FTS index, this method uses more space +** within the database. +** +** Method (2) offers a midpoint between (1) and (3). 
Using this method, +** a query such as '1s*' will match documents that contain the literal +** token "1st", but not "first" (assuming the tokenizer is not able to +** provide synonyms for prefixes). However, a non-prefix query like '1st' +** will match against "1st" and "first". This method does not require +** extra disk space, as no extra entries are added to the FTS index. +** On the other hand, it may require more CPU cycles to run MATCH queries, +** as separate queries of the FTS index are required for each synonym. +** +** When using methods (2) or (3), it is important that the tokenizer only +** provide synonyms when tokenizing document text (method (3)) or query +** text (method (2)), not both. Doing both will not cause any errors, but is +** inefficient. */ typedef struct Fts5Tokenizer Fts5Tokenizer; typedef struct fts5_tokenizer fts5_tokenizer; @@ -309,7 +443,7 @@ struct fts5_tokenizer { void (*xDelete)(Fts5Tokenizer*); int (*xTokenize)(Fts5Tokenizer*, void *pCtx, - int flags, + int flags, /* Mask of FTS5_TOKENIZE_* flags */ const char *pText, int nText, int (*xToken)( void *pCtx, /* Copy of 2nd argument to xTokenize() */ diff --git a/manifest b/manifest index c46ab77428..2777d36021 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Add\stests\sto\simprove\scoverage\sof\sfts5_varint.c. -D 2015-09-03T15:37:26.095 +C Add\sdocumentation\sfor\sfts5\ssynonym\ssupport. 
+D 2015-09-03T18:05:09.505 F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f F Makefile.in e2218eb228374422969de7b1680eda6864affcef F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23 @@ -104,8 +104,8 @@ F ext/fts3/unicode/CaseFolding.txt 8c678ca52ecc95e16bc7afc2dbf6fc9ffa05db8c F ext/fts3/unicode/UnicodeData.txt cd07314edb62d49fde34debdaf92fa2aa69011e7 F ext/fts3/unicode/mkunicode.tcl 95cf7ec186e48d4985e433ff8a1c89090a774252 F ext/fts3/unicode/parseunicode.tcl da577d1384810fb4e2b209bf3313074353193e95 -F ext/fts5/extract_api_docs.tcl 06583c935f89075ea0b32f85efa5dd7619fcbd03 -F ext/fts5/fts5.h 0784692f406588e6c90e13a78e1f36e7e3236e42 +F ext/fts5/extract_api_docs.tcl a36e54ec777172ddd3f9a88daf593b00848368e0 +F ext/fts5/fts5.h f04659e0df5af83731b102189a32280f74f4a6bc F ext/fts5/fts5Int.h f65d41f66accad0a289d6bd66b13c07d2932f9be F ext/fts5/fts5_aux.c 7a307760a9c57c750d043188ec0bad59f5b5ec7e F ext/fts5/fts5_buffer.c 80f9ba4431848cb857e3d2158f5280093dcd8015 @@ -1382,7 +1382,7 @@ F tool/vdbe_profile.tcl 67746953071a9f8f2f668b73fe899074e2c6d8c1 F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4 F tool/warnings.sh 48bd54594752d5be3337f12c72f28d2080cb630b F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f -P 59ae30b97b40faa363c55aa2664dead9eaeeddc0 -R e54371f57da08ab0c80326ea4baedd7e +P 89f24f31a8f7d7cb0a66ee53523881f566dcb035 +R 763079caf8ff69ca16b541c2d3f4f0ad U dan -Z 416751bdad2bf139cde3272ad789aa1a +Z 726cf8268b191707cc8a3f300ac29036 diff --git a/manifest.uuid b/manifest.uuid index 3c272efde7..fc803bdc2c 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -89f24f31a8f7d7cb0a66ee53523881f566dcb035 \ No newline at end of file +58aa1f435959852df74f1bca8e0bdbc4f47c256a \ No newline at end of file