From: dan Date: Mon, 12 Aug 2024 17:03:37 +0000 (+0000) Subject: Fix further issues to do with fts5 locale support. X-Git-Tag: version-3.47.0~220^2~15 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=10e54e365a5f9b2a2ea4ab103f762171e733a903;p=thirdparty%2Fsqlite.git Fix further issues to do with fts5 locale support. FossilOrigin-Name: e626123580065986f7df50b6140f00048944becce179b9391fbf09f97ac55485 --- diff --git a/ext/fts5/fts5.h b/ext/fts5/fts5.h index 784511de26..dfa075f0a9 100644 --- a/ext/fts5/fts5.h +++ b/ext/fts5/fts5.h @@ -395,10 +395,10 @@ struct Fts5ExtensionApi { ** ** Applications may also register custom tokenizer types. A tokenizer ** is registered by providing fts5 with a populated instance of the -** following structure. Of the three structure methods, xCreate, xDelete and -** xTokenize must be supplied, any fo these three members of the -** fts5_tokenizer_v2 struct to NULL leads to undefined behaviour. The -** structure methods are expected to function as follows: +** following structure. All structure methods must be defined, setting +** +** any member of the fts5_tokenizer struct to NULL leads to undefined +** behaviour. The structure methods are expected to function as follows: ** ** xCreate: ** This function is used to allocate and initialize a tokenizer instance. @@ -481,8 +481,8 @@ struct Fts5ExtensionApi { ** and nLocale. These specify the locale that the tokenizer should use ** for the current request. If pLocale and nLocale are both 0, then the ** tokenizer should use its default locale. Otherwise, pLocale points to -** a buffer containing the name of the locale to use as utf-8 text. nLocale -** contains the number of bytes in pLocale. pLocale is not nul-terminated. +** an nLocale byte buffer containing the name of the locale to use as utf-8 +** text. pLocale is not nul-terminated. ** ** SYNONYM SUPPORT ** @@ -658,7 +658,7 @@ struct fts5_tokenizer { */ typedef struct fts5_api fts5_api; struct fts5_api { - int iVersion; /* Currently 3, was once 2 */ + int iVersion; /* Currently always set to 3 */ /* Create a new tokenizer */ int (*xCreateTokenizer)( diff --git a/ext/fts5/fts5Int.h b/ext/fts5/fts5Int.h index 9ed15fb3ac..7e42ea82c1 100644 --- a/ext/fts5/fts5Int.h +++ b/ext/fts5/fts5Int.h @@ -168,7 +168,7 @@ struct Fts5TokenizerConfig { int nArg; int ePattern; /* FTS_PATTERN_XXX constant */ const char *pLocale; /* Current locale to use */ - int nLocale; + int nLocale; /* Size of pLocale in bytes */ }; /* diff --git a/ext/fts5/fts5_aux.c b/ext/fts5/fts5_aux.c index fb177d561e..eb3f7e359d 100644 --- a/ext/fts5/fts5_aux.c +++ b/ext/fts5/fts5_aux.c @@ -257,8 +257,8 @@ static void fts5HighlightFunction( sqlite3_result_text(pCtx, "", -1, SQLITE_STATIC); rc = SQLITE_OK; }else if( ctx.zIn ){ - const char *pLoc = 0; - int nLoc = 0; + const char *pLoc = 0; /* Locale of column iCol */ + int nLoc = 0; /* Size of pLoc in bytes */ if( rc==SQLITE_OK ){ rc = fts5CInstIterInit(pApi, pFts, iCol, &ctx.iter); } @@ -466,8 +466,8 @@ static void fts5SnippetFunction( memset(&sFinder, 0, sizeof(Fts5SFinder)); for(i=0; ixColumnSize(pFts, iBestCol, &nColSize); } if( ctx.zIn ){ - const char *pLoc = 0; /* Locale to tokenize in */ + const char *pLoc = 0; /* Locale of column iBestCol */ int nLoc = 0; /* Bytes in pLoc */ if( rc==SQLITE_OK ){ diff --git a/ext/fts5/fts5_expr.c b/ext/fts5/fts5_expr.c index b1a0f48209..a39cc16bbf 100644 --- a/ext/fts5/fts5_expr.c +++ b/ext/fts5/fts5_expr.c @@ -3092,8 +3092,8 @@ int sqlite3Fts5ExprPopulatePoslists( } } - return sqlite3Fts5Tokenize(pConfig, FTS5_TOKENIZE_DOCUMENT, z, n, - (void*)&sCtx, fts5ExprPopulatePoslistsCb + return sqlite3Fts5Tokenize(pConfig, + FTS5_TOKENIZE_DOCUMENT, z, n, (void*)&sCtx, fts5ExprPopulatePoslistsCb ); } diff --git a/ext/fts5/fts5_main.c b/ext/fts5/fts5_main.c index 9cc7b5cbf3..b4ccd04028 100644 --- a/ext/fts5/fts5_main.c +++ b/ext/fts5/fts5_main.c @@ -115,13 +115,16 @@ struct Fts5Auxiliary { ** Of course, if bV2Native is false, then x1 contains the real routines and ** x2 the synthesized ones. In this case a pointer to the Fts5TokenizerModule ** object should be passed to x2.xCreate. +** +** The synthesized wrapper routines are necessary for xFindTokenizer(_v2) +** calls. */ struct Fts5TokenizerModule { char *zName; /* Name of tokenizer */ void *pUserData; /* User pointer passed to xCreate() */ int bV2Native; /* True if v2 native tokenizer */ fts5_tokenizer x1; /* Tokenizer functions */ - fts5_tokenizer_v2 x2; /* Tokenizer functions */ + fts5_tokenizer_v2 x2; /* V2 tokenizer functions */ void (*xDestroy)(void*); /* Destructor function */ Fts5TokenizerModule *pNext; /* Next registered tokenizer module */ }; @@ -179,12 +182,6 @@ struct Fts5Sorter { ** If the cursor iterates in descending order of rowid, iFirstRowid ** is the upper limit (i.e. the "first" rowid visited) and iLastRowid ** the lower. -** -** pLocale, nLocale: -** These are set by API method xTokenizeSetLocale(). xTokenizeSetLocale() -** does not actually configure the tokenizer, it just stores the values -** it is passed in these variables. The fts5_tokenizer_v2.xSetLocale() -** method is called from within the xTokenize() API method if required. */ struct Fts5Cursor { sqlite3_vtab_cursor base; /* Base class used by SQLite core */ @@ -251,7 +248,7 @@ struct Fts5Cursor { #define BitFlagTest(x,y) (((x) & (y))!=0) /* -** The subtype values returned by fts5_locale() are tagged with. +** The subtype value and header bytes used by fts5_locale(). */ #define FTS5_LOCALE_SUBTYPE ((unsigned int)'L') #define FTS5_LOCALE_HEADER "\x00\xE0\xB2\xEB" @@ -1255,24 +1252,24 @@ static void fts5SetVtabError(Fts5FullTable *p, const char *zFormat, ...){ } /* -** Configure the tokenizer to use the locale specified by nLocale byte -** buffer zLocale. Return SQLITE_OK if successful, or an SQLite error -** code otherwise. +** Arrange for subsequent calls to sqlite3Fts5Tokenize() to use the locale +** specified by pLocale/nLocale. The buffer indicated by pLocale must remain +** valid until after the final call to sqlite3Fts5Tokenize() that will use +** the locale. */ -static int fts5SetLocale( +static void fts5SetLocale( Fts5Config *pConfig, const char *zLocale, int nLocale ){ - int rc = SQLITE_OK; Fts5TokenizerConfig *pT = &pConfig->t; pT->pLocale = zLocale; pT->nLocale = nLocale; - return rc; } /* -** Reset the locale of the tokenizer to its default. +** Clear any locale configured by an earlier call to fts5SetLocale() or +** sqlite3Fts5ExtractText(). */ void sqlite3Fts5ClearLocale(Fts5Config *pConfig){ fts5SetLocale(pConfig, 0, 0); @@ -1293,9 +1290,20 @@ void sqlite3Fts5ClearLocale(Fts5Config *pConfig){ ** 1) Ordinary values. The text can be extracted from these using ** sqlite3_value_text(). ** -** 2) Blobs tagged with sub-type FTS5_LOCALE_SUBTYPE, or those read from -** the content table of a normal content or external-conten table -** with locale=1 set. +** 2) Combination text/locale blobs created by fts5_locale(). There +** are several cases for these: +** +** * Blobs tagged with FTS5_LOCALE_SUBTYPE. +** * Blobs read from the content table of a locale=1 external-content +** table, and +** * Blobs read from the content table of a locale=1 regular +** content table. +** +** The first two cases above should have the 4 byte FTS5_LOCALE_HEADER +** header. It is an error if a blob with the subtype or a blob read +** from the content table of an external content table does not have +** the required header. A blob read from the content table of a regular +** locale=1 table does not have the header. This is to save space. ** ** If successful, SQLITE_OK is returned and output parameters (*ppText) ** and (*pnText) are set to point to a buffer containing the extracted utf-8 @@ -1306,11 +1314,11 @@ void sqlite3Fts5ClearLocale(Fts5Config *pConfig){ ** Parameter bContent must be true if the value was read from an indexed ** column (i.e. not UNINDEXED) of the on disk content. ** -** If pbResetTokenizer is not NULL and if case (2) is used, then the -** tokenizer is configured to use the locale. In this case (*pbResetTokenizer) -** is set to true before returning, to indicate that the caller must -** call sqlite3Fts5ClearLocale() to reset the tokenizer after tokenizing -** the text. +** If pbResetTokenizer is not NULL and if case (2) is used, then +** fts5SetLocale() is called to ensure subsequent sqlite3Fts5Tokenize() calls +** use the locale. In this case (*pbResetTokenizer) is set to true before +** returning, to indicate that the caller must call sqlite3Fts5ClearLocale() +** to clear the locale after tokenizing the text. */ int sqlite3Fts5ExtractText( Fts5Config *pConfig, @@ -1367,7 +1375,7 @@ int sqlite3Fts5ExtractText( nText = nBlob-nLocale-1; if( pbResetTokenizer ){ - rc = fts5SetLocale(pConfig, (const char*)pBlob, nLocale); + fts5SetLocale(pConfig, (const char*)pBlob, nLocale); *pbResetTokenizer = 1; } } @@ -1389,18 +1397,18 @@ int sqlite3Fts5ExtractText( ** the text of the expression, and sets output variable (*pzText) to ** point to a nul-terminated buffer containing the expression. ** -** If pVal was an fts5_locale() value, then the tokenizer has been -** configured to us the required locale. +** If pVal was an fts5_locale() value, then fts5SetLocale() is called to +** set the tokenizer to use the specified locale. ** ** If output variable (*pbFreeAndReset) is set to true, then the caller ** is required to (a) call sqlite3Fts5ClearLocale() to reset the tokenizer ** locale, and (b) call sqlite3_free() to free (*pzText). */ static int fts5ExtractExprText( - Fts5FullTable *pTab, - sqlite3_value *pVal, - char **pzText, - int *pbFreeAndReset + Fts5Config *pConfig, /* Fts5 configuration */ + sqlite3_value *pVal, /* Value to extract expression text from */ + char **pzText, /* OUT: nul-terminated buffer of text */ + int *pbFreeAndReset /* OUT: Free (*pzText) and clear locale */ ){ const char *zText = 0; int nText = 0; @@ -1408,12 +1416,12 @@ static int fts5ExtractExprText( int bReset = 0; *pbFreeAndReset = 0; - rc = sqlite3Fts5ExtractText(pTab->p.pConfig, pVal, 0, &bReset, &zText,&nText); + rc = sqlite3Fts5ExtractText(pConfig, pVal, 0, &bReset, &zText, &nText); if( rc==SQLITE_OK ){ if( bReset ){ *pzText = sqlite3Fts5Mprintf(&rc, "%.*s", nText, zText); if( rc!=SQLITE_OK ){ - sqlite3Fts5ClearLocale(pTab->p.pConfig); + sqlite3Fts5ClearLocale(pConfig); }else{ *pbFreeAndReset = 1; } @@ -1494,7 +1502,7 @@ static int fts5FilterMethod( int bFreeAndReset = 0; int bInternal = 0; - rc = fts5ExtractExprText(pTab, apVal[i], &zText, &bFreeAndReset); + rc = fts5ExtractExprText(pConfig, apVal[i], &zText, &bFreeAndReset); if( rc!=SQLITE_OK ) goto filter_out; if( zText==0 ) zText = ""; @@ -2124,6 +2132,9 @@ static int fts5ApiRowCount(Fts5Context *pCtx, i64 *pnRow){ return sqlite3Fts5StorageRowCount(pTab->pStorage, pnRow); } +/* +** Implementation of xTokenize_v2() API. +*/ static int fts5ApiTokenize_v2( Fts5Context *pCtx, const char *pText, int nText, @@ -2143,6 +2154,11 @@ static int fts5ApiTokenize_v2( return rc; } + +/* +** Implementation of xTokenize() API. This is just xTokenize_v2() with NULL/0 +** passed as the locale. +*/ static int fts5ApiTokenize( Fts5Context *pCtx, const char *pText, int nText, @@ -2190,11 +2206,18 @@ static int fts5ApiColumnText( return rc; } +/* +** This is called by various API functions - xInst, xPhraseFirst, +** xPhraseFirstColumn etc. - to obtain the position list for phrase iPhrase +** of the current row. This function works for both detail=full tables (in +** which case the position-list was read from the fts index) or for other +** detail= modes if the row content is available. +*/ static int fts5CsrPoslist( - Fts5Cursor *pCsr, - int iPhrase, - const u8 **pa, - int *pn + Fts5Cursor *pCsr, /* Fts5 cursor object */ + int iPhrase, /* Phrase to find position list for */ + const u8 **pa, /* OUT: Pointer to position list buffer */ + int *pn /* OUT: Size of (*pa) in bytes */ ){ Fts5Config *pConfig = ((Fts5Table*)(pCsr->base.pVtab))->pConfig; int rc = SQLITE_OK; @@ -2240,7 +2263,6 @@ static int fts5CsrPoslist( *pn = 0; } - return rc; } @@ -2808,6 +2830,11 @@ static Fts5Cursor *fts5CursorFromCsrid(Fts5Global *pGlobal, i64 iCsrId){ return pCsr; } +/* +** Parameter zFmt is a printf() style formatting string. This function +** formats it using the trailing arguments and returns the result as +** an error message to the context passed as the first argument. +*/ static void fts5ResultError(sqlite3_context *pCtx, const char *zFmt, ...){ char *zErr = 0; va_list ap; @@ -2931,12 +2958,13 @@ static int fts5PoslistBlob(sqlite3_context *pCtx, Fts5Cursor *pCsr){ /* ** Value pVal was read from column iCol of the FTS5 table. This function ** returns it to the owner of pCtx via a call to an sqlite3_result_xxx() -** function. This function deals with the same 3 cases as +** function. This function deals with the same cases as ** sqlite3Fts5ExtractText(): ** ** 1) Ordinary values. These can be returned using sqlite3_result_value(). ** -** 2) Blobs from fts5_locale(). +** 2) Blobs from fts5_locale(). The text is extracted from these and +** returned via sqlite3_result_text(). The locale is discarded. */ static void fts5ExtractValueFromColumn( sqlite3_context *pCtx, @@ -3176,6 +3204,21 @@ static int fts5CreateAux( return rc; } +/* +** This function is used by xCreateTokenizer_v2() and xCreateTokenizer(). +** It allocates and partially populates a new Fts5TokenizerModule object. +** The new object is already linked into the Fts5Global context before +** returning. +** +** If successful, SQLITE_OK is returned and a pointer to the new +** Fts5TokenizerModule object returned via output parameter (*ppNew). All +** that is required is for the caller to fill in the methods in +** Fts5TokenizerModule.x1 and x2, and to set Fts5TokenizerModule.bV2Native +** as appropriate. +** +** If an error occurs, an SQLite error code is returned and the final value +** of (*ppNew) undefined. +*/ static int fts5NewTokenizerModule( Fts5Global *pGlobal, /* Global context (one per db handle) */ const char *zName, /* Name of new function */ diff --git a/manifest b/manifest index ce9f94a445..6750b90740 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Update\sthe\sporter\stokenizer\sto\suse\slocales. -D 2024-08-12T11:46:09.154 +C Fix\sfurther\sissues\sto\sdo\swith\sfts5\slocale\ssupport. +D 2024-08-12T17:03:37.726 F .fossil-settings/empty-dirs dbb81e8fc0401ac46a1491ab34a7f2c7c0452f2f06b54ebb845d024ca8283ef1 F .fossil-settings/ignore-glob 35175cdfcf539b2318cb04a9901442804be81cd677d8b889fcc9149c21f239ea F LICENSE.md df5091916dbb40e6e9686186587125e1b2ff51f022cc334e886c19a0e9982724 @@ -92,15 +92,15 @@ F ext/fts3/unicode/UnicodeData.txt cd07314edb62d49fde34debdaf92fa2aa69011e7 F ext/fts3/unicode/mkunicode.tcl d5aebf022fa4577ee8cdf27468f0d847879993959101f6dbd6348ef0cfc324a7 F ext/fts3/unicode/parseunicode.tcl a981bd6466d12dd17967515801c3ff23f74a281be1a03cf1e6f52a6959fc77eb F ext/fts5/extract_api_docs.tcl bc3a0ca78be7d3df08e7602c00ca48021ebae40682d75eb001bfdf6e54ffb44e -F ext/fts5/fts5.h 7f1197009fc0e9822a8a584aa1f90591bdbf04f4503ecfe06949f3afe7a1fe06 -F ext/fts5/fts5Int.h b40bb0bd54aaa4ac4712b6c5763b2167764614aaef204dbae81638b4548bca5d -F ext/fts5/fts5_aux.c 0d0ee62dfebe93ccf6b293edb0b21ebe5c8bdc85e962a001745f2d13ea3e79d2 +F ext/fts5/fts5.h 4c6998c6186268b4dbe9baef2c0d2ab974bd90996d61d4dbe801367249be6de4 +F ext/fts5/fts5Int.h 776b21159eef8d30379e5bc4627eae9e841d36e43f19dc8908c786e62aaf9e38 +F ext/fts5/fts5_aux.c 12cd2512f869217c38b70c31de5b5f741812734fafa80f55b32ea9bbd96e2152 F ext/fts5/fts5_buffer.c 0eec58bff585f1a44ea9147eae5da2447292080ea435957f7488c70673cb6f09 F ext/fts5/fts5_config.c 187f7ffa5eddd6539ffa592de85e95b18be951728491390121bb215549a24a2a -F ext/fts5/fts5_expr.c ee1949c5c20901cbaca0885902f1d0c136679262dee71b457a34a92e1d16ddac +F ext/fts5/fts5_expr.c 3a24c6ab5b7545312a5ec03085ae705ede820a08f9a63f1d72829ed4a35da6f6 F ext/fts5/fts5_hash.c adda4272be401566a6e0ba1acbe70ee5cb97fce944bc2e04dc707152a0ec91b1 F ext/fts5/fts5_index.c eb9a0dda3bc6ef969a6be8d2746af56856e67251810ddba08622b45be8477abe -F ext/fts5/fts5_main.c cd61abbfd02f0f22e3c124ae2ad10c2a51cdf8acf38177410d44e134c1d1364b +F ext/fts5/fts5_main.c 4fe8349b812a9fde8e44ac5568f19d713ccc4790eb3ecb692f6551729c481b2b F ext/fts5/fts5_storage.c 5bf88213ff5911625c142ac332ddba10dcd0869e757f91f2a3d27f27ba595992 F ext/fts5/fts5_tcl.c 50c7e16753fde0c4d80d8abd00a4ed2b0e998d5d3899a484510d01923c5da43b F ext/fts5/fts5_test_mi.c 08c11ec968148d4cb4119d96d819f8c1f329812c568bac3684f5464be177d3ee @@ -2207,8 +2207,8 @@ F vsixtest/vsixtest.tcl 6195aba1f12a5e10efc2b8c0009532167be5e301abe5b31385638080 F vsixtest/vsixtest.vcxproj.data 2ed517e100c66dc455b492e1a33350c1b20fbcdc F vsixtest/vsixtest.vcxproj.filters 37e51ffedcdb064aad6ff33b6148725226cd608e F vsixtest/vsixtest_TemporaryKey.pfx e5b1b036facdb453873e7084e1cae9102ccc67a0 -P f7d56a1f2149f0da117167db62e2c28ec337e8da3403873b64cdfc6a951e2e8e -R 7151af5ed6816182b47b60322cc8dcba +P 3291ce3a3359a80e51e4546a3d7a187cbe4c7530fca6632f0bb2728525efe212 +R 5bdde041363e74c68796cadade4d8480 U dan -Z a17240af0068f64d6da9a8176108962c +Z 098e7ed7a851f6658bf54618988e8ebd # Remove this line to create a well-formed Fossil manifest. diff --git a/manifest.uuid b/manifest.uuid index 6edb67265e..902e51759c 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -3291ce3a3359a80e51e4546a3d7a187cbe4c7530fca6632f0bb2728525efe212 +e626123580065986f7df50b6140f00048944becce179b9391fbf09f97ac55485