From: dan Date: Sat, 29 Aug 2015 15:44:27 +0000 (+0000) Subject: Another change to the fts5 tokenizer API. X-Git-Tag: version-3.9.0~153^2~18 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=ee0c0a8de3cdcb336b062677eedd92e8f3d44cdb;p=thirdparty%2Fsqlite.git Another change to the fts5 tokenizer API. FossilOrigin-Name: fc71868496f45f9c7a79ed2bf2d164a7c4718ce1 --- diff --git a/ext/fts5/fts5.h b/ext/fts5/fts5.h index 2e145d1068..c9eb91d4cc 100644 --- a/ext/fts5/fts5.h +++ b/ext/fts5/fts5.h @@ -217,7 +217,7 @@ struct Fts5ExtensionApi { int (*xTokenize)(Fts5Context*, const char *pText, int nText, /* Text to tokenize */ void *pCtx, /* Context passed to xToken() */ - int (*xToken)(void*, const char*, int, int, int, int) /* Callback */ + int (*xToken)(void*, int, const char*, int, int, int) /* Callback */ ); int (*xPhraseCount)(Fts5Context*); @@ -313,20 +313,25 @@ struct fts5_tokenizer { const char *pText, int nText, int (*xToken)( void *pCtx, /* Copy of 2nd argument to xTokenize() */ + int tflags, /* Mask of FTS5_TOKEN_* flags */ const char *pToken, /* Pointer to buffer containing token */ int nToken, /* Size of token in bytes */ int iStart, /* Byte offset of token within input text */ - int iEnd, /* Byte offset of end of token within input text */ - int iPos /* Number of tokens before this one in input text */ + int iEnd /* Byte offset of end of token within input text */ ) ); }; +/* Flags that may be passed as the third argument to xTokenize() */ #define FTS5_TOKENIZE_QUERY 0x0001 #define FTS5_TOKENIZE_PREFIX 0x0002 #define FTS5_TOKENIZE_DOCUMENT 0x0004 #define FTS5_TOKENIZE_AUX 0x0008 +/* Flags that may be passed by the tokenizer implementation back to FTS5 +** as the third argument to the supplied xToken callback. */ +#define FTS5_TOKEN_COLOCATED 0x0001 /* Same position as prev. 
token */ + /* ** END OF CUSTOM TOKENIZERS *************************************************************************/ diff --git a/ext/fts5/fts5Int.h b/ext/fts5/fts5Int.h index 0357903c7b..c61f8c4052 100644 --- a/ext/fts5/fts5Int.h +++ b/ext/fts5/fts5Int.h @@ -169,7 +169,7 @@ int sqlite3Fts5Tokenize( int flags, /* FTS5_TOKENIZE_* flags */ const char *pText, int nText, /* Text to tokenize */ void *pCtx, /* Context passed to xToken() */ - int (*xToken)(void*, const char*, int, int, int, int) /* Callback */ + int (*xToken)(void*, int, const char*, int, int, int) /* Callback */ ); void sqlite3Fts5Dequote(char *z); diff --git a/ext/fts5/fts5_aux.c b/ext/fts5/fts5_aux.c index 0e608721f7..2e33d5132f 100644 --- a/ext/fts5/fts5_aux.c +++ b/ext/fts5/fts5_aux.c @@ -148,17 +148,18 @@ static void fts5HighlightAppend( */ static int fts5HighlightCb( void *pContext, /* Pointer to HighlightContext object */ + int tflags, /* Mask of FTS5_TOKEN_* flags */ const char *pToken, /* Buffer containing token */ int nToken, /* Size of token in bytes */ int iStartOff, /* Start offset of token */ - int iEndOff, /* End offset of token */ - int iPos + int iEndOff /* End offset of token */ ){ HighlightContext *p = (HighlightContext*)pContext; int rc = SQLITE_OK; + int iPos; - if( iPos<p->iPos ) return SQLITE_OK; - p->iPos = iPos+1; + if( tflags & FTS5_TOKEN_COLOCATED ) return SQLITE_OK; + iPos = p->iPos++; if( p->iRangeEnd>0 ){ if( iPos<p->iRangeStart || iPos>p->iRangeEnd ) return SQLITE_OK; diff --git a/ext/fts5/fts5_config.c b/ext/fts5/fts5_config.c index 7987b8af76..74faf6dd30 100644 --- a/ext/fts5/fts5_config.c +++ b/ext/fts5/fts5_config.c @@ -648,7 +648,7 @@ int sqlite3Fts5Tokenize( int flags, /* FTS5_TOKENIZE_* flags */ const char *pText, int nText, /* Text to tokenize */ void *pCtx, /* Context passed to xToken() */ - int (*xToken)(void*, const char*, int, int, int, int) /* Callback */ + int (*xToken)(void*, int, const char*, int, int, int) /* Callback */ ){ if( pText==0 ) return SQLITE_OK; return 
pConfig->pTokApi->xTokenize( diff --git a/ext/fts5/fts5_expr.c b/ext/fts5/fts5_expr.c index 46b4091e37..9b52bb08c6 100644 --- a/ext/fts5/fts5_expr.c +++ b/ext/fts5/fts5_expr.c @@ -1338,11 +1338,11 @@ struct TokenCtx { */ static int fts5ParseTokenize( void *pContext, /* Pointer to Fts5InsertCtx object */ + int tflags, /* Mask of FTS5_TOKEN_* flags */ const char *pToken, /* Buffer containing token */ int nToken, /* Size of token in bytes */ int iStart, /* Start offset of token */ - int iEnd, /* End offset of token */ - int iPos + int iEnd /* End offset of token */ ){ int rc = SQLITE_OK; const int SZALLOC = 8; @@ -1350,6 +1350,8 @@ static int fts5ParseTokenize( Fts5ExprPhrase *pPhrase = pCtx->pPhrase; Fts5ExprTerm *pTerm; + if( tflags & FTS5_TOKEN_COLOCATED ) return rc; + if( pPhrase==0 || (pPhrase->nTerm % SZALLOC)==0 ){ Fts5ExprPhrase *pNew; int nNew = SZALLOC + (pPhrase ? pPhrase->nTerm : 0); diff --git a/ext/fts5/fts5_main.c b/ext/fts5/fts5_main.c index 5f0a90eadf..4704fb8906 100644 --- a/ext/fts5/fts5_main.c +++ b/ext/fts5/fts5_main.c @@ -1498,7 +1498,7 @@ static int fts5ApiTokenize( Fts5Context *pCtx, const char *pText, int nText, void *pUserData, - int (*xToken)(void*, const char*, int, int, int, int) + int (*xToken)(void*, int, const char*, int, int, int) ){ Fts5Cursor *pCsr = (Fts5Cursor*)pCtx; Fts5Table *pTab = (Fts5Table*)(pCsr->base.pVtab); @@ -1657,14 +1657,16 @@ static int fts5ApiColumnText( static int fts5ColumnSizeCb( void *pContext, /* Pointer to int */ + int tflags, const char *pToken, /* Buffer containing token */ int nToken, /* Size of token in bytes */ int iStart, /* Start offset of token */ - int iEnd, /* End offset of token */ - int iPos + int iEnd /* End offset of token */ ){ int *pCnt = (int*)pContext; - *pCnt = iPos+1; + if( (tflags & FTS5_TOKEN_COLOCATED)==0 ){ + (*pCnt)++; + } return SQLITE_OK; } diff --git a/ext/fts5/fts5_storage.c b/ext/fts5/fts5_storage.c index 6b1c69c008..d0572e60e6 100644 --- a/ext/fts5/fts5_storage.c +++ 
b/ext/fts5/fts5_storage.c @@ -359,17 +359,18 @@ struct Fts5InsertCtx { */ static int fts5StorageInsertCallback( void *pContext, /* Pointer to Fts5InsertCtx object */ + int tflags, const char *pToken, /* Buffer containing token */ int nToken, /* Size of token in bytes */ int iStart, /* Start offset of token */ - int iEnd, /* End offset of token */ - int iPos + int iEnd /* End offset of token */ ){ Fts5InsertCtx *pCtx = (Fts5InsertCtx*)pContext; Fts5Index *pIdx = pCtx->pStorage->pIndex; - assert( iPos+1>=pCtx->szCol ); - pCtx->szCol = iPos+1; - return sqlite3Fts5IndexWrite(pIdx, pCtx->iCol, iPos, pToken, nToken); + if( (tflags & FTS5_TOKEN_COLOCATED)==0 ){ + pCtx->szCol++; + } + return sqlite3Fts5IndexWrite(pIdx, pCtx->iCol, pCtx->szCol-1, pToken, nToken); } /* @@ -844,17 +845,18 @@ struct Fts5IntegrityCtx { */ static int fts5StorageIntegrityCallback( void *pContext, /* Pointer to Fts5InsertCtx object */ + int tflags, const char *pToken, /* Buffer containing token */ int nToken, /* Size of token in bytes */ int iStart, /* Start offset of token */ - int iEnd, /* End offset of token */ - int iPos + int iEnd /* End offset of token */ ){ Fts5IntegrityCtx *pCtx = (Fts5IntegrityCtx*)pContext; - assert( iPos+1>=pCtx->szCol ); - pCtx->szCol = iPos+1; + if( (tflags & FTS5_TOKEN_COLOCATED)==0 ){ + pCtx->szCol++; + } pCtx->cksum ^= sqlite3Fts5IndexCksum( - pCtx->pConfig, pCtx->iRowid, pCtx->iCol, iPos, pToken, nToken + pCtx->pConfig, pCtx->iRowid, pCtx->iCol, pCtx->szCol-1, pToken, nToken ); return SQLITE_OK; } diff --git a/ext/fts5/fts5_tcl.c b/ext/fts5/fts5_tcl.c index fecbd41d32..528ca958f2 100644 --- a/ext/fts5/fts5_tcl.c +++ b/ext/fts5/fts5_tcl.c @@ -141,8 +141,9 @@ struct F5tAuxData { static int xTokenizeCb( void *pCtx, + int tflags, const char *zToken, int nToken, - int iStart, int iEnd, int iPos + int iStart, int iEnd ){ F5tFunction *p = (F5tFunction*)pCtx; Tcl_Obj *pEval = Tcl_DuplicateObj(p->pScript); @@ -584,8 +585,9 @@ struct F5tTokenizeCtx { static int 
xTokenizeCb2( void *pCtx, + int tflags, const char *zToken, int nToken, - int iStart, int iEnd, int iPos + int iStart, int iEnd ){ F5tTokenizeCtx *p = (F5tTokenizeCtx*)pCtx; if( p->bSubst ){ @@ -694,7 +696,7 @@ typedef struct F5tTokenizerModule F5tTokenizerInstance; struct F5tTokenizerContext { void *pCtx; - int (*xToken)(void*, const char*, int, int, int); + int (*xToken)(void*, int, const char*, int, int, int); }; struct F5tTokenizerModule { @@ -752,11 +754,11 @@ static int f5tTokenizerTokenize( void *pCtx, int flags, const char *pText, int nText, - int (*xToken)(void*, const char*, int, int, int, int) + int (*xToken)(void*, int, const char*, int, int, int) ){ F5tTokenizerInstance *pInst = (F5tTokenizerInstance*)p; void *pOldCtx; - int (*xOldToken)(void*, const char*, int, int, int); + int (*xOldToken)(void*, int, const char*, int, int, int); Tcl_Obj *pEval; int rc; @@ -813,7 +815,7 @@ static int f5tTokenizerReturn( return TCL_ERROR; } - rc = p->xToken(p->pCtx, zToken, nToken, iStart, iEnd); + rc = p->xToken(p->pCtx, 0, zToken, nToken, iStart, iEnd); Tcl_SetResult(interp, (char*)sqlite3ErrName(rc), TCL_VOLATILE); return TCL_OK; } diff --git a/ext/fts5/fts5_tokenize.c b/ext/fts5/fts5_tokenize.c index d2477a050b..6b1129c24d 100644 --- a/ext/fts5/fts5_tokenize.c +++ b/ext/fts5/fts5_tokenize.c @@ -118,13 +118,12 @@ static int fts5AsciiTokenize( void *pCtx, int flags, const char *pText, int nText, - int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd, int iPos) + int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd) ){ AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer; int rc = SQLITE_OK; int ie; int is = 0; - int iPos = 0; char aFold[64]; int nFold = sizeof(aFold); @@ -160,7 +159,7 @@ static int fts5AsciiTokenize( asciiFold(pFold, &pText[is], nByte); /* Invoke the token callback */ - rc = xToken(pCtx, pFold, nByte, is, ie, iPos++); + rc = xToken(pCtx, 0, pFold, nByte, is, ie); is = ie+1; } @@ -389,12 +388,11 @@ static int 
fts5UnicodeTokenize( void *pCtx, int flags, const char *pText, int nText, - int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd, int iPos) + int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd) ){ Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer; int rc = SQLITE_OK; unsigned char *a = p->aTokenChar; - int iPos = 0; unsigned char *zTerm = (unsigned char*)&pText[nText]; unsigned char *zCsr = (unsigned char *)pText; @@ -479,7 +477,7 @@ static int fts5UnicodeTokenize( } /* Invoke the token callback */ - rc = xToken(pCtx, aFold, zOut-aFold, is, ie, iPos++); + rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie); } tokenize_done: @@ -557,7 +555,7 @@ static int fts5PorterCreate( typedef struct PorterContext PorterContext; struct PorterContext { void *pCtx; - int (*xToken)(void*, const char*, int, int, int, int); + int (*xToken)(void*, int, const char*, int, int, int); char *aBuf; }; @@ -1122,11 +1120,11 @@ static void fts5PorterStep1A(char *aBuf, int *pnBuf){ static int fts5PorterCb( void *pCtx, + int tflags, const char *pToken, int nToken, int iStart, - int iEnd, - int iPos + int iEnd ){ PorterContext *p = (PorterContext*)pCtx; @@ -1180,10 +1178,10 @@ static int fts5PorterCb( nBuf--; } - return p->xToken(p->pCtx, aBuf, nBuf, iStart, iEnd, iPos); + return p->xToken(p->pCtx, tflags, aBuf, nBuf, iStart, iEnd); pass_through: - return p->xToken(p->pCtx, pToken, nToken, iStart, iEnd, iPos); + return p->xToken(p->pCtx, tflags, pToken, nToken, iStart, iEnd); } /* @@ -1194,7 +1192,7 @@ static int fts5PorterTokenize( void *pCtx, int flags, const char *pText, int nText, - int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd, int iPos) + int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd) ){ PorterTokenizer *p = (PorterTokenizer*)pTokenizer; PorterContext sCtx; diff --git a/manifest b/manifest index 824f32d4c0..964f295f10 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C 
Change\sthe\sfts5\stokenizer\sAPI\sto\sallow\smore\sthan\sone\stoken\sto\soccupy\sa\ssingle\sposition\swithin\sa\sdocument. -D 2015-08-28T19:56:47.300 +C Another\schange\sto\sthe\sfts5\stokenizer\sAPI. +D 2015-08-29T15:44:27.938 F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f F Makefile.in e2218eb228374422969de7b1680eda6864affcef F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23 @@ -105,19 +105,19 @@ F ext/fts3/unicode/UnicodeData.txt cd07314edb62d49fde34debdaf92fa2aa69011e7 F ext/fts3/unicode/mkunicode.tcl 95cf7ec186e48d4985e433ff8a1c89090a774252 F ext/fts3/unicode/parseunicode.tcl da577d1384810fb4e2b209bf3313074353193e95 F ext/fts5/extract_api_docs.tcl 06583c935f89075ea0b32f85efa5dd7619fcbd03 -F ext/fts5/fts5.h b9dfb487ada3caab4400210609b8309b71a4fb4d -F ext/fts5/fts5Int.h b0cfe44ec9451f766b77c4e5f771e7919c6dc8d5 -F ext/fts5/fts5_aux.c 7d0e275ee94ad7afdd4208d6b071b4319e8f9ca0 +F ext/fts5/fts5.h 0784692f406588e6c90e13a78e1f36e7e3236e42 +F ext/fts5/fts5Int.h 9fd31e682acae32806f77e7c3b543c4294274c92 +F ext/fts5/fts5_aux.c 7a307760a9c57c750d043188ec0bad59f5b5ec7e F ext/fts5/fts5_buffer.c 80f9ba4431848cb857e3d2158f5280093dcd8015 -F ext/fts5/fts5_config.c ab81c8ccff6c0fb79f21c369e18e8e0dec365ec5 -F ext/fts5/fts5_expr.c f53917b6e68dee62e4c525466edacacf82eb7cbc +F ext/fts5/fts5_config.c 80b61fd2c6844b64a3e72a64572d50a812da9384 +F ext/fts5/fts5_expr.c 7ea46f676491989069d31ae1f75c9439b0858711 F ext/fts5/fts5_hash.c 4bf4b99708848357b8a2b5819e509eb6d3df9246 F ext/fts5/fts5_index.c 076c4995bf06a6d1559a6e31f9a86b90f2105374 -F ext/fts5/fts5_main.c 7afdb84ac40b0e5bbb920a07a5cd5e062963816c -F ext/fts5/fts5_storage.c 9c263323479a4aa554738e421813cd05615d379c -F ext/fts5/fts5_tcl.c 41e2d6b455547a157085fd35fd59d4fd890dc7d3 +F ext/fts5/fts5_main.c b00834ac543431dc35edbe18018b4befe0c7fd42 +F ext/fts5/fts5_storage.c 9820e7b53ea12baf3c818485efd66346b73030c3 +F ext/fts5/fts5_tcl.c 058f8da51964458e9859edfc1ee13b1863edaeae F ext/fts5/fts5_test_mi.c 
80a9e86fb4c5b6b58f8fefac05e9b96d1a6574e1 -F ext/fts5/fts5_tokenize.c 07a894410bc074685ddc0a9d89b5e7bf57ea4482 +F ext/fts5/fts5_tokenize.c 710541513ecf3fe6d9365326fc85aee6efe97229 F ext/fts5/fts5_unicode2.c 78273fbd588d1d9bd0a7e4e0ccc9207348bae33c F ext/fts5/fts5_varint.c 3f86ce09cab152e3d45490d7586b7ed2e40c13f1 F ext/fts5/fts5_vocab.c 4622e0b7d84a488a1585aaa56eb214ee67a988bc @@ -1380,10 +1380,7 @@ F tool/vdbe_profile.tcl 67746953071a9f8f2f668b73fe899074e2c6d8c1 F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4 F tool/warnings.sh 48bd54594752d5be3337f12c72f28d2080cb630b F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f -P 0fdc36fe35ae2fc8e9688fe6c53437f4d47502d9 -R 694c0e23ba08ed9bcc32d2c502ed8f13 -T *branch * fts5-incompatible -T *sym-fts5-incompatible * -T -sym-trunk * +P 90b85b42f2b2dd3e939b129b7df2b822a05e243d +R f343432805e01f14633e088d58d566cf U dan -Z 745a50831400d199b74f44c2476ec260 +Z e4288542e4294b868813263b0597051d diff --git a/manifest.uuid b/manifest.uuid index 91a79bb98f..c7283c3a07 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -90b85b42f2b2dd3e939b129b7df2b822a05e243d \ No newline at end of file +fc71868496f45f9c7a79ed2bf2d164a7c4718ce1 \ No newline at end of file