From: dan Date: Fri, 26 Jul 2024 20:50:33 +0000 (+0000) Subject: Add the fts5_locale() function, and begin adding the related functionality to fts5. X-Git-Tag: version-3.47.0~220^2~32 X-Git-Url: http://git.ipfire.org/gitweb/gitweb.cgi?a=commitdiff_plain;h=2ec78c0e4bab11ce288a830c3a13fe34a9224c4a;p=thirdparty%2Fsqlite.git Add the fts5_locale() function, and begin adding the related functionality to fts5. FossilOrigin-Name: 8839ef7cfb49239e7f1c4812a53a93a672827c88d6921408b1d5062b352c87cc --- diff --git a/ext/fts5/fts5.h b/ext/fts5/fts5.h index d3042fcb8c..ccdebbe1ab 100644 --- a/ext/fts5/fts5.h +++ b/ext/fts5/fts5.h @@ -570,6 +570,29 @@ struct fts5_tokenizer { ); }; +typedef struct fts5_tokenizer_v2 fts5_tokenizer_v2; +struct fts5_tokenizer_v2 { + int iVersion; /* Currently always 2 */ + + int (*xCreate)(void*, const char **azArg, int nArg, Fts5Tokenizer **ppOut); + void (*xDelete)(Fts5Tokenizer*); + int (*xTokenize)(Fts5Tokenizer*, + void *pCtx, + int flags, /* Mask of FTS5_TOKENIZE_* flags */ + const char *pText, int nText, + int (*xToken)( + void *pCtx, /* Copy of 2nd argument to xTokenize() */ + int tflags, /* Mask of FTS5_TOKEN_* flags */ + const char *pToken, /* Pointer to buffer containing token */ + int nToken, /* Size of token in bytes */ + int iStart, /* Byte offset of token within input text */ + int iEnd /* Byte offset of end of token within input text */ + ) + ); + + int (*xSetLocale)(Fts5Tokenizer*, const char *pLocale, int nLocale); +}; + /* Flags that may be passed as the third argument to xTokenize() */ #define FTS5_TOKENIZE_QUERY 0x0001 #define FTS5_TOKENIZE_PREFIX 0x0002 @@ -589,7 +612,7 @@ struct fts5_tokenizer { */ typedef struct fts5_api fts5_api; struct fts5_api { - int iVersion; /* Currently always set to 2 */ + int iVersion; /* Currently 3, was once 2 */ /* Create a new tokenizer */ int (*xCreateTokenizer)( @@ -616,6 +639,25 @@ struct fts5_api { fts5_extension_function xFunction, void (*xDestroy)(void*) ); + + /* APIs below this point are 
only available if iVersion>=3 */ + + /* Create a new tokenizer */ + int (*xCreateTokenizer_v2)( + fts5_api *pApi, + const char *zName, + void *pUserData, + fts5_tokenizer_v2 *pTokenizer, + void (*xDestroy)(void*) + ); + + /* Find an existing tokenizer */ + int (*xFindTokenizer_v2)( + fts5_api *pApi, + const char *zName, + void **ppUserData, + fts5_tokenizer_v2 **ppTokenizer + ); }; /* diff --git a/ext/fts5/fts5Int.h b/ext/fts5/fts5Int.h index 4311faceb5..e8fdd8a4fa 100644 --- a/ext/fts5/fts5Int.h +++ b/ext/fts5/fts5Int.h @@ -162,7 +162,7 @@ typedef struct Fts5TokenizerConfig Fts5TokenizerConfig; struct Fts5TokenizerConfig { Fts5Tokenizer *pTok; - fts5_tokenizer *pTokApi; + fts5_tokenizer_v2 *pTokApi; const char **azArg; int nArg; int ePattern; /* FTS_PATTERN_XXX constant */ @@ -223,6 +223,7 @@ struct Fts5Config { char *zContentRowid; /* "content_rowid=" option value */ int bColumnsize; /* "columnsize=" option value (dflt==1) */ int bTokendata; /* "tokendata=" option value (dflt==0) */ + int bLocale; /* "locale=" option value (dflt==0) */ int eDetail; /* FTS5_DETAIL_XXX value */ char *zContentExprlist; Fts5TokenizerConfig t; @@ -292,6 +293,8 @@ int sqlite3Fts5ConfigSetValue(Fts5Config*, const char*, sqlite3_value*, int*); int sqlite3Fts5ConfigParseRank(const char*, char**, char**); +void sqlite3Fts5ConfigErrmsg(Fts5Config *pConfig, const char *zFmt, ...); + /* ** End of interface to code in fts5_config.c. 
**************************************************************************/ @@ -627,6 +630,17 @@ Fts5Table *sqlite3Fts5TableFromCsrid(Fts5Global*, i64); int sqlite3Fts5FlushToDisk(Fts5Table*); +int sqlite3Fts5ExtractText( + Fts5Config *pConfig, + int bContent, /* Loaded from content table */ + sqlite3_value *pVal, /* Value to extract text from */ + int *pbResetTokenizer, /* OUT: True if xSetLocale(NULL) required */ + const char **ppText, /* OUT: Pointer to text buffer */ + int *pnText /* OUT: Size of (*ppText) in bytes */ +); + +void sqlite3Fts5ClearLocale(Fts5Config *pConfig); + /* ** End of interface to code in fts5.c. **************************************************************************/ diff --git a/ext/fts5/fts5_config.c b/ext/fts5/fts5_config.c index 01f40455a0..3736f8685f 100644 --- a/ext/fts5/fts5_config.c +++ b/ext/fts5/fts5_config.c @@ -380,6 +380,16 @@ static int fts5ConfigParseSpecial( return rc; } + if( sqlite3_strnicmp("locale", zCmd, nCmd)==0 ){ + if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1]!='\0' ){ + *pzErr = sqlite3_mprintf("malformed locale=... directive"); + rc = SQLITE_ERROR; + }else{ + pConfig->bLocale = (zArg[0]=='1'); + } + return rc; + } + if( sqlite3_strnicmp("detail", zCmd, nCmd)==0 ){ const Fts5Enum aDetail[] = { { "none", FTS5_DETAIL_NONE }, @@ -605,6 +615,11 @@ int sqlite3Fts5ConfigParse( sqlite3_free(zTwo); } + /* If this is not an FTS5_CONTENT_NORMAL table, set bLocale */ + if( pRet->eContent!=FTS5_CONTENT_NORMAL ){ + pRet->bLocale = 1; + } + /* We only allow contentless_delete=1 if the table is indeed contentless. */ if( rc==SQLITE_OK && pRet->bContentlessDelete @@ -1027,3 +1042,20 @@ int sqlite3Fts5ConfigLoad(Fts5Config *pConfig, int iCookie){ } return rc; } + +void sqlite3Fts5ConfigErrmsg(Fts5Config *pConfig, const char *zFmt, ...){ + va_list ap; /* ... 
printf arguments */ + char *zMsg = 0; + + va_start(ap, zFmt); + zMsg = sqlite3_vmprintf(zFmt, ap); + if( pConfig->pzErrmsg ){ + *pConfig->pzErrmsg = zMsg; + }else{ + sqlite3_free(zMsg); + } + + va_end(ap); +} + + diff --git a/ext/fts5/fts5_main.c b/ext/fts5/fts5_main.c index c6e7e346aa..06bbd820c6 100644 --- a/ext/fts5/fts5_main.c +++ b/ext/fts5/fts5_main.c @@ -107,7 +107,7 @@ struct Fts5Auxiliary { struct Fts5TokenizerModule { char *zName; /* Name of tokenizer */ void *pUserData; /* User pointer passed to xCreate() */ - fts5_tokenizer x; /* Tokenizer functions */ + fts5_tokenizer_v2 x; /* Tokenizer functions */ void (*xDestroy)(void*); /* Destructor function */ Fts5TokenizerModule *pNext; /* Next registered tokenizer module */ }; @@ -230,6 +230,8 @@ struct Fts5Cursor { #define BitFlagAllTest(x,y) (((x) & (y))==(y)) #define BitFlagTest(x,y) (((x) & (y))!=0) +#define FTS5_LOCALE_SUBTYPE ((unsigned int)'L') + /* ** Macros to Set(), Clear() and Test() cursor flags. @@ -1229,6 +1231,116 @@ static void fts5SetVtabError(Fts5FullTable *p, const char *zFormat, ...){ } +static int fts5SetLocale( + Fts5Config *pConfig, + const char *zLocale, + int nLocale +){ + Fts5TokenizerConfig *pT = &pConfig->t; + int rc = SQLITE_OK; + if( pT->pTokApi->xSetLocale ){ + rc = pT->pTokApi->xSetLocale(pT->pTok, zLocale, nLocale); + } + return rc; +} + +void sqlite3Fts5ClearLocale(Fts5Config *pConfig){ + fts5SetLocale(pConfig, 0, 0); +} + +int sqlite3Fts5ExtractText( + Fts5Config *pConfig, + int bContent, + sqlite3_value *pVal, /* Value to extract text from */ + int *pbResetTokenizer, /* OUT: True if xSetLocale(NULL) required */ + const char **ppText, /* OUT: Pointer to text buffer */ + int *pnText /* OUT: Size of (*ppText) in bytes */ +){ + const char *pText = 0; + int nText = 0; + int bResetTokenizer = 0; + int rc = SQLITE_OK; + + int bDecodeBlob = 0; + if( sqlite3_value_type(pVal)==SQLITE_BLOB ){ + if( sqlite3_value_subtype(pVal)==FTS5_LOCALE_SUBTYPE + || (bContent && pConfig->bLocale && 
pConfig->eContent==FTS5_CONTENT_NORMAL) + ){ + bDecodeBlob = 1; + } + } + + if( bDecodeBlob ){ + const u8 *pBlob = sqlite3_value_blob(pVal); + int nBlob = sqlite3_value_bytes(pVal); + int nLocale = 0; + + for(nLocale=0; nLocalep.pConfig, 0, pVal, &bReset, &zText,&nText); + if( rc==SQLITE_OK ){ + if( bReset ){ + *pzText = sqlite3Fts5Mprintf(&rc, "%.*s", nText, zText); + if( rc!=SQLITE_OK ){ + sqlite3Fts5ClearLocale(pTab->p.pConfig); + }else{ + *pbFreeAndReset = 1; + } + }else{ + *pzText = (char*)zText; + } + } + + return rc; +} + + /* ** This is the xFilter interface for the virtual table. See ** the virtual table xFilter method documentation for additional @@ -1293,8 +1405,13 @@ static int fts5FilterMethod( pRank = apVal[i]; break; case 'M': { - const char *zText = (const char*)sqlite3_value_text(apVal[i]); + char *zText = 0; + int bFreeAndReset = 0; + + rc = fts5ExtractExprText(pTab, apVal[i], &zText, &bFreeAndReset); + if( rc!=SQLITE_OK ) goto filter_out; if( zText==0 ) zText = ""; + iCol = 0; do{ iCol = iCol*10 + (idxStr[iIdxStr]-'0'); @@ -1306,7 +1423,6 @@ static int fts5FilterMethod( ** indicates that the MATCH expression is not a full text query, ** but a request for an internal parameter. 
*/ rc = fts5SpecialMatch(pTab, pCsr, &zText[1]); - goto filter_out; }else{ char **pzErr = &pTab->p.base.zErrMsg; rc = sqlite3Fts5ExprNew(pConfig, 0, iCol, zText, &pExpr, pzErr); @@ -1314,9 +1430,15 @@ static int fts5FilterMethod( rc = sqlite3Fts5ExprAnd(&pCsr->pExpr, pExpr); pExpr = 0; } - if( rc!=SQLITE_OK ) goto filter_out; } + { + int bDone = (zText[0]=='*' || rc!=SQLITE_OK); /* Evaluate before zText is freed; reading zText[0] after sqlite3_free() is a use-after-free */ + if( bFreeAndReset ) sqlite3_free(zText); + if( bFreeAndReset ) sqlite3Fts5ClearLocale(pConfig); + if( bDone ) goto filter_out; + } + break; } case 'L': @@ -2596,6 +2718,35 @@ static int fts5PoslistBlob(sqlite3_context *pCtx, Fts5Cursor *pCsr){ return rc; } +static void fts5ExtractValueFromColumn( + sqlite3_context *pCtx, + Fts5Config *pConfig, + sqlite3_value *pVal +){ + if( sqlite3_value_type(pVal)==SQLITE_BLOB ){ + if( sqlite3_value_subtype(pVal)==FTS5_LOCALE_SUBTYPE + || (pConfig->bLocale && pConfig->eContent==FTS5_CONTENT_NORMAL) + ){ + const u8 *pBlob = sqlite3_value_blob(pVal); + int nBlob = sqlite3_value_bytes(pVal); + + if( nBlob>=4 && memcmp(pBlob, "\0\0\0\0", 4)==0 ){ + sqlite3_result_blob(pCtx, &pBlob[4], nBlob-4, SQLITE_TRANSIENT); + }else{ + int ii; + for(ii=0; iipzErrmsg = &pTab->p.base.zErrMsg; rc = fts5SeekCursor(pCsr, 1); if( rc==SQLITE_OK ){ - sqlite3_result_value(pCtx, sqlite3_column_value(pCsr->pStmt, iCol+1)); + sqlite3_value *pVal = sqlite3_column_value(pCsr->pStmt, iCol+1); + fts5ExtractValueFromColumn(pCtx, pConfig, pVal); } pConfig->pzErrmsg = 0; }else if( pConfig->bContentlessDelete && sqlite3_vtab_nochange(pCtx) ){ @@ -2791,43 +2943,66 @@ static int fts5CreateAux( /* ** Register a new tokenizer. This is the implementation of the -** fts5_api.xCreateTokenizer() method. +** fts5_api.xCreateTokenizer_v2() method. */ -static int fts5CreateTokenizer( +static int fts5CreateTokenizer_v2( fts5_api *pApi, /* Global context (one per db handle) */ const char *zName, /* Name of new function */ void *pUserData, /* User data for aux. 
function */ - fts5_tokenizer *pTokenizer, /* Tokenizer implementation */ + fts5_tokenizer_v2 *pTokenizer, /* Tokenizer implementation */ void(*xDestroy)(void*) /* Destructor for pUserData */ ){ Fts5Global *pGlobal = (Fts5Global*)pApi; - Fts5TokenizerModule *pNew; - sqlite3_int64 nName; /* Size of zName and its \0 terminator */ - sqlite3_int64 nByte; /* Bytes of space to allocate */ int rc = SQLITE_OK; - nName = strlen(zName) + 1; - nByte = sizeof(Fts5TokenizerModule) + nName; - pNew = (Fts5TokenizerModule*)sqlite3_malloc64(nByte); - if( pNew ){ - memset(pNew, 0, (size_t)nByte); - pNew->zName = (char*)&pNew[1]; - memcpy(pNew->zName, zName, nName); - pNew->pUserData = pUserData; - pNew->x = *pTokenizer; - pNew->xDestroy = xDestroy; - pNew->pNext = pGlobal->pTok; - pGlobal->pTok = pNew; - if( pNew->pNext==0 ){ - pGlobal->pDfltTok = pNew; - } + if( pTokenizer->iVersion>2 ){ + rc = SQLITE_ERROR; }else{ - rc = SQLITE_NOMEM; + Fts5TokenizerModule *pNew; + sqlite3_int64 nName; /* Size of zName and its \0 terminator */ + sqlite3_int64 nByte; /* Bytes of space to allocate */ + + nName = strlen(zName) + 1; + nByte = sizeof(Fts5TokenizerModule) + nName; + pNew = (Fts5TokenizerModule*)sqlite3Fts5MallocZero(&rc, nByte); + if( pNew ){ + pNew->zName = (char*)&pNew[1]; + memcpy(pNew->zName, zName, nName); + pNew->pUserData = pUserData; + pNew->x = *pTokenizer; + pNew->xDestroy = xDestroy; + pNew->pNext = pGlobal->pTok; + pGlobal->pTok = pNew; + if( pNew->pNext==0 ){ + pGlobal->pDfltTok = pNew; + } + } } return rc; } +/* +** The fts5_api.xCreateTokenizer() method. +*/ +static int fts5CreateTokenizer( + fts5_api *pApi, /* Global context (one per db handle) */ + const char *zName, /* Name of new function */ + void *pUserData, /* User data for aux. 
function */ + fts5_tokenizer *pTokenizer, /* Tokenizer implementation */ + void(*xDestroy)(void*) /* Destructor for pUserData */ +){ + fts5_tokenizer_v2 tok; + + memset(&tok, 0, sizeof(tok)); + tok.iVersion = 2; + tok.xCreate = pTokenizer->xCreate; + tok.xTokenize = pTokenizer->xTokenize; + tok.xDelete = pTokenizer->xDelete; + + return fts5CreateTokenizer_v2(pApi, zName, pUserData, &tok, xDestroy); +} + static Fts5TokenizerModule *fts5LocateTokenizer( Fts5Global *pGlobal, const char *zName @@ -2847,29 +3022,53 @@ static Fts5TokenizerModule *fts5LocateTokenizer( /* ** Find a tokenizer. This is the implementation of the -** fts5_api.xFindTokenizer() method. +** fts5_api.xFindTokenizer_v2() method. */ -static int fts5FindTokenizer( +static int fts5FindTokenizer_v2( fts5_api *pApi, /* Global context (one per db handle) */ - const char *zName, /* Name of new function */ + const char *zName, /* Name of tokenizer */ void **ppUserData, - fts5_tokenizer *pTokenizer /* Populate this object */ + fts5_tokenizer_v2 **ppTokenizer /* Populate this object */ ){ int rc = SQLITE_OK; Fts5TokenizerModule *pMod; pMod = fts5LocateTokenizer((Fts5Global*)pApi, zName); if( pMod ){ - *pTokenizer = pMod->x; + *ppTokenizer = &pMod->x; *ppUserData = pMod->pUserData; }else{ - memset(pTokenizer, 0, sizeof(fts5_tokenizer)); + *ppTokenizer = 0; + *ppUserData = 0; rc = SQLITE_ERROR; } return rc; } +/* +** Find a tokenizer. This is the implementation of the +** fts5_api.xFindTokenizer() method. 
+*/ +static int fts5FindTokenizer( + fts5_api *pApi, /* Global context (one per db handle) */ + const char *zName, /* Name of new function */ + void **ppUserData, + fts5_tokenizer *pTokenizer /* Populate this object */ +){ + fts5_tokenizer_v2 *pV2 = 0; + int rc = SQLITE_OK; + + rc = fts5FindTokenizer_v2(pApi, zName, ppUserData, &pV2); + if( rc==SQLITE_OK ){ + pTokenizer->xCreate = pV2->xCreate; + pTokenizer->xDelete = pV2->xDelete; + pTokenizer->xTokenize = pV2->xTokenize; + } + + return rc; +} + int fts5GetTokenizer( Fts5Global *pGlobal, const char **azArg, @@ -2966,6 +3165,46 @@ static void fts5SourceIdFunc( sqlite3_result_text(pCtx, "--FTS5-SOURCE-ID--", -1, SQLITE_TRANSIENT); } +/* +** Implementation of fts5_locale() function. +*/ +static void fts5LocaleFunc( + sqlite3_context *pCtx, /* Function call context */ + int nArg, /* Number of args */ + sqlite3_value **apArg /* Function arguments */ +){ + u8 *pBlob = 0; + int nBlob = 0; + + const char *zLocale = 0; + int nLocale = 0; + const char *zText = 0; + int nText = 0; + + assert( nArg==2 ); + UNUSED_PARAM(nArg); + + zLocale = (const char*)sqlite3_value_text(apArg[0]); + nLocale = sqlite3_value_bytes(apArg[0]); + + zText = (const char*)sqlite3_value_text(apArg[1]); + nText = sqlite3_value_bytes(apArg[1]); + + nBlob = nLocale + 1 + nText; + pBlob = (u8*)sqlite3_malloc(nBlob); + if( pBlob==0 ){ + sqlite3_result_error_nomem(pCtx); + return; + } + + if( zLocale ) memcpy(pBlob, zLocale, nLocale); + pBlob[nLocale] = 0x00; + if( zText ) memcpy(&pBlob[nLocale+1], zText, nText); + + sqlite3_result_blob(pCtx, pBlob, nBlob, sqlite3_free); + sqlite3_result_subtype(pCtx, FTS5_LOCALE_SUBTYPE); +} + /* ** Return true if zName is the extension on one of the shadow tables used ** by this module. 
@@ -3058,10 +3297,12 @@ static int fts5Init(sqlite3 *db){ void *p = (void*)pGlobal; memset(pGlobal, 0, sizeof(Fts5Global)); pGlobal->db = db; - pGlobal->api.iVersion = 2; + pGlobal->api.iVersion = 3; pGlobal->api.xCreateFunction = fts5CreateAux; pGlobal->api.xCreateTokenizer = fts5CreateTokenizer; pGlobal->api.xFindTokenizer = fts5FindTokenizer; + pGlobal->api.xCreateTokenizer_v2 = fts5CreateTokenizer_v2; + pGlobal->api.xFindTokenizer_v2 = fts5FindTokenizer_v2; rc = sqlite3_create_module_v2(db, "fts5", &fts5Mod, p, fts5ModuleDestroy); if( rc==SQLITE_OK ) rc = sqlite3Fts5IndexInit(db); if( rc==SQLITE_OK ) rc = sqlite3Fts5ExprInit(pGlobal, db); @@ -3080,6 +3321,13 @@ static int fts5Init(sqlite3 *db){ p, fts5SourceIdFunc, 0, 0 ); } + if( rc==SQLITE_OK ){ + rc = sqlite3_create_function( + db, "fts5_locale", 2, + SQLITE_UTF8|SQLITE_INNOCUOUS|SQLITE_RESULT_SUBTYPE, + p, fts5LocaleFunc, 0, 0 + ); + } } /* If SQLITE_FTS5_ENABLE_TEST_MI is defined, assume that the file diff --git a/ext/fts5/fts5_storage.c b/ext/fts5/fts5_storage.c index 0b676e6b4f..5d7f3f055e 100644 --- a/ext/fts5/fts5_storage.c +++ b/ext/fts5/fts5_storage.c @@ -429,26 +429,30 @@ static int fts5StorageDeleteFromIndex( ctx.iCol = -1; for(iCol=1; rc==SQLITE_OK && iCol<=pConfig->nCol; iCol++){ if( pConfig->abUnindexed[iCol-1]==0 ){ - const char *zText; - int nText; + sqlite3_value *pVal = 0; + const char *pText = 0; + int nText = 0; + int bReset = 0; + assert( pSeek==0 || apVal==0 ); assert( pSeek!=0 || apVal!=0 ); if( pSeek ){ - zText = (const char*)sqlite3_column_text(pSeek, iCol); - nText = sqlite3_column_bytes(pSeek, iCol); - }else if( ALWAYS(apVal) ){ - zText = (const char*)sqlite3_value_text(apVal[iCol-1]); - nText = sqlite3_value_bytes(apVal[iCol-1]); + pVal = sqlite3_column_value(pSeek, iCol); }else{ - continue; + pVal = apVal[iCol-1]; } - ctx.szCol = 0; - rc = sqlite3Fts5Tokenize(pConfig, FTS5_TOKENIZE_DOCUMENT, - zText, nText, (void*)&ctx, fts5StorageInsertCallback - ); - p->aTotalSize[iCol-1] -= 
(i64)ctx.szCol; - if( p->aTotalSize[iCol-1]<0 && rc==SQLITE_OK ){ - rc = FTS5_CORRUPT; + + rc = sqlite3Fts5ExtractText(pConfig,pSeek!=0,pVal,&bReset,&pText,&nText); + if( rc==SQLITE_OK ){ + ctx.szCol = 0; + rc = sqlite3Fts5Tokenize(pConfig, FTS5_TOKENIZE_DOCUMENT, + pText, nText, (void*)&ctx, fts5StorageInsertCallback + ); + p->aTotalSize[iCol-1] -= (i64)ctx.szCol; + if( p->aTotalSize[iCol-1]<0 && rc==SQLITE_OK ){ + rc = FTS5_CORRUPT; + } + if( bReset ) sqlite3Fts5ClearLocale(pConfig); } } } @@ -684,14 +688,22 @@ int sqlite3Fts5StorageRebuild(Fts5Storage *p){ for(ctx.iCol=0; rc==SQLITE_OK && ctx.iColnCol; ctx.iCol++){ ctx.szCol = 0; if( pConfig->abUnindexed[ctx.iCol]==0 ){ - const char *zText = (const char*)sqlite3_column_text(pScan, ctx.iCol+1); - int nText = sqlite3_column_bytes(pScan, ctx.iCol+1); - rc = sqlite3Fts5Tokenize(pConfig, - FTS5_TOKENIZE_DOCUMENT, - zText, nText, - (void*)&ctx, - fts5StorageInsertCallback + int bReset = 0; + int nText = 0; + const char *pText = 0; + rc = sqlite3Fts5ExtractText(pConfig, 1, + sqlite3_column_value(pScan, ctx.iCol+1), &bReset, &pText, &nText ); + + if( rc==SQLITE_OK ){ + rc = sqlite3Fts5Tokenize(pConfig, + FTS5_TOKENIZE_DOCUMENT, + pText, nText, + (void*)&ctx, + fts5StorageInsertCallback + ); + if( bReset ) sqlite3Fts5ClearLocale(pConfig); + } } sqlite3Fts5BufferAppendVarint(&rc, &buf, ctx.szCol); p->aTotalSize[ctx.iCol] += (i64)ctx.szCol; @@ -810,14 +822,26 @@ int sqlite3Fts5StorageIndexInsert( for(ctx.iCol=0; rc==SQLITE_OK && ctx.iColnCol; ctx.iCol++){ ctx.szCol = 0; if( pConfig->abUnindexed[ctx.iCol]==0 ){ - const char *zText = (const char*)sqlite3_value_text(apVal[ctx.iCol+2]); - int nText = sqlite3_value_bytes(apVal[ctx.iCol+2]); - rc = sqlite3Fts5Tokenize(pConfig, - FTS5_TOKENIZE_DOCUMENT, - zText, nText, - (void*)&ctx, - fts5StorageInsertCallback + int bReset = 0; + int nText = 0; + const char *pText = 0; + rc = sqlite3Fts5ExtractText( + pConfig, 0, apVal[ctx.iCol+2], &bReset, &pText, &nText ); + if( rc==SQLITE_OK 
){ + if( bReset && pConfig->bLocale==0 ){ + rc = SQLITE_ERROR; + sqlite3Fts5ConfigErrmsg(pConfig, + "fts5_locale() may not be used without locale=1" + ); + }else{ + rc = sqlite3Fts5Tokenize(pConfig, + FTS5_TOKENIZE_DOCUMENT, pText, nText, (void*)&ctx, + fts5StorageInsertCallback + ); + } + if( bReset ) sqlite3Fts5ClearLocale(pConfig); + } } sqlite3Fts5BufferAppendVarint(&rc, &buf, ctx.szCol); p->aTotalSize[ctx.iCol] += (i64)ctx.szCol; @@ -988,14 +1012,23 @@ int sqlite3Fts5StorageIntegrity(Fts5Storage *p, int iArg){ rc = sqlite3Fts5TermsetNew(&ctx.pTermset); } if( rc==SQLITE_OK ){ - const char *zText = (const char*)sqlite3_column_text(pScan, i+1); - int nText = sqlite3_column_bytes(pScan, i+1); - rc = sqlite3Fts5Tokenize(pConfig, - FTS5_TOKENIZE_DOCUMENT, - zText, nText, - (void*)&ctx, - fts5StorageIntegrityCallback + const char *pText = 0; + int nText = 0; + int bReset = 0; + + rc = sqlite3Fts5ExtractText(pConfig, 1, + sqlite3_column_value(pScan, i+1), &bReset, &pText, &nText ); + + if( rc==SQLITE_OK ){ + rc = sqlite3Fts5Tokenize(pConfig, + FTS5_TOKENIZE_DOCUMENT, + pText, nText, + (void*)&ctx, + fts5StorageIntegrityCallback + ); + if( bReset ) sqlite3Fts5ClearLocale(pConfig); + } } if( rc==SQLITE_OK && pConfig->bColumnsize && ctx.szCol!=aColSize[i] ){ rc = FTS5_CORRUPT; diff --git a/ext/fts5/fts5_tcl.c b/ext/fts5/fts5_tcl.c index c5b5f41f83..c1fc7f82ae 100644 --- a/ext/fts5/fts5_tcl.c +++ b/ext/fts5/fts5_tcl.c @@ -801,6 +801,7 @@ typedef struct F5tTokenizerInstance F5tTokenizerInstance; struct F5tTokenizerContext { void *pCtx; int (*xToken)(void*, int, const char*, int, int, int); + F5tTokenizerInstance *pInst; }; struct F5tTokenizerModule { @@ -809,12 +810,37 @@ struct F5tTokenizerModule { F5tTokenizerContext *pContext; }; +/* +** zLocale: +** Buffer zLocale contains the current locale, as configured by the most +** recent call to xSetLocale(). A NULL (default) locale is represented as +** a 0 byte string - "\0". 
+** +** This can be retrieved by a Tcl tokenize script using [sqlite3_fts5_locale]. +*/ struct F5tTokenizerInstance { Tcl_Interp *interp; Tcl_Obj *pScript; F5tTokenizerContext *pContext; + char zLocale[128]; }; +static int f5tTokenizerSetLocale( + Fts5Tokenizer *pTokenizer, + const char *pLocale, + int nLocale +){ + F5tTokenizerInstance *pInst = (F5tTokenizerInstance*)pTokenizer; + if( nLocale>=sizeof(pInst->zLocale) ){ + return SQLITE_ERROR; + } + + memset(pInst->zLocale, 0, sizeof(pInst->zLocale)); + if( pLocale && nLocale>0 ) memcpy(pInst->zLocale, pLocale, nLocale); /* xSetLocale(p,0,0) resets to the default locale: never pass NULL to memcpy() */ + + return SQLITE_OK; +} + static int f5tTokenizerCreate( void *pCtx, const char **azArg, @@ -867,6 +893,7 @@ static int f5tTokenizerTokenize( int (*xToken)(void*, int, const char*, int, int, int) ){ F5tTokenizerInstance *pInst = (F5tTokenizerInstance*)p; + F5tTokenizerInstance *pOldInst = 0; void *pOldCtx; int (*xOldToken)(void*, int, const char*, int, int, int); Tcl_Obj *pEval; @@ -875,9 +902,11 @@ static int f5tTokenizerTokenize( pOldCtx = pInst->pContext->pCtx; xOldToken = pInst->pContext->xToken; + pOldInst = pInst->pContext->pInst; pInst->pContext->pCtx = pCtx; pInst->pContext->xToken = xToken; + pInst->pContext->pInst = pInst; assert( flags==FTS5_TOKENIZE_DOCUMENT @@ -913,9 +942,37 @@ static int f5tTokenizerTokenize( pInst->pContext->pCtx = pOldCtx; pInst->pContext->xToken = xOldToken; + pInst->pContext->pInst = pOldInst; return rc; } +/* +** sqlite3_fts5_locale +*/ +static int SQLITE_TCLAPI f5tTokenizerLocale( + void * clientData, + Tcl_Interp *interp, + int objc, + Tcl_Obj *CONST objv[] +){ + F5tTokenizerContext *p = (F5tTokenizerContext*)clientData; + + if( objc!=1 ){ + Tcl_WrongNumArgs(interp, 1, objv, ""); + return TCL_ERROR; + } + + if( p->xToken==0 ){ + Tcl_AppendResult(interp, + "sqlite3_fts5_locale may only be used by tokenizer callback", 0 + ); + return TCL_ERROR; + } + + Tcl_SetObjResult(interp, Tcl_NewStringObj(p->pInst->zLocale, -1)); + return TCL_OK; +} + /* ** sqlite3_fts5_token ?-colocated? 
TEXT START END */ @@ -1001,12 +1058,21 @@ static int SQLITE_TCLAPI f5tCreateTokenizer( fts5_api *pApi; char *zName; Tcl_Obj *pScript; - fts5_tokenizer t; F5tTokenizerModule *pMod; int rc; + int bV2 = 0; /* True to use _v2 API */ + + if( objc==5 ){ + const char *zArg = Tcl_GetString(objv[1]); + if( 0==strcmp(zArg, "-v2") ){ + objv++; + objc--; + bV2 = 1; + } + } if( objc!=4 ){ - Tcl_WrongNumArgs(interp, 1, objv, "DB NAME SCRIPT"); + Tcl_WrongNumArgs(interp, 1, objv, "?-v2? DB NAME SCRIPT"); return TCL_ERROR; } if( f5tDbAndApi(interp, objv[1], &db, &pApi) ){ @@ -1015,18 +1081,34 @@ static int SQLITE_TCLAPI f5tCreateTokenizer( zName = Tcl_GetString(objv[2]); pScript = objv[3]; - t.xCreate = f5tTokenizerCreate; - t.xTokenize = f5tTokenizerTokenize; - t.xDelete = f5tTokenizerDelete; - pMod = (F5tTokenizerModule*)ckalloc(sizeof(F5tTokenizerModule)); pMod->interp = interp; pMod->pScript = pScript; pMod->pContext = pContext; Tcl_IncrRefCount(pScript); - rc = pApi->xCreateTokenizer(pApi, zName, (void*)pMod, &t, f5tDelTokenizer); + + if( bV2==0 ){ + fts5_tokenizer t; + t.xCreate = f5tTokenizerCreate; + t.xTokenize = f5tTokenizerTokenize; + t.xDelete = f5tTokenizerDelete; + rc = pApi->xCreateTokenizer(pApi, zName, (void*)pMod, &t, f5tDelTokenizer); + }else{ + fts5_tokenizer_v2 t2; + memset(&t2, 0, sizeof(t2)); + t2.iVersion = 2; + t2.xCreate = f5tTokenizerCreate; + t2.xTokenize = f5tTokenizerTokenize; + t2.xDelete = f5tTokenizerDelete; + t2.xSetLocale = f5tTokenizerSetLocale; + rc = pApi->xCreateTokenizer_v2(pApi, zName,(void*)pMod,&t2,f5tDelTokenizer); + } + if( rc!=SQLITE_OK ){ - Tcl_AppendResult(interp, "error in fts5_api.xCreateTokenizer()", 0); + Tcl_AppendResult(interp, ( + bV2 ? 
"error in fts5_api.xCreateTokenizer_v2()" + : "error in fts5_api.xCreateTokenizer()" + ), 0); return TCL_ERROR; } @@ -1333,6 +1415,7 @@ int Fts5tcl_Init(Tcl_Interp *interp){ } aCmd[] = { { "sqlite3_fts5_create_tokenizer", f5tCreateTokenizer, 1 }, { "sqlite3_fts5_token", f5tTokenizerReturn, 1 }, + { "sqlite3_fts5_locale", f5tTokenizerLocale, 1 }, { "sqlite3_fts5_tokenize", f5tTokenize, 0 }, { "sqlite3_fts5_create_function", f5tCreateFunction, 0 }, { "sqlite3_fts5_may_be_corrupt", f5tMayBeCorrupt, 0 }, diff --git a/ext/fts5/test/fts5locale.test b/ext/fts5/test/fts5locale.test new file mode 100644 index 0000000000..923990f5a9 --- /dev/null +++ b/ext/fts5/test/fts5locale.test @@ -0,0 +1,176 @@ +# 2014 Dec 20 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# +# Tests focusing on the built-in fts5 tokenizers. +# + +source [file join [file dirname [info script]] fts5_common.tcl] +set testprefix fts5locale + +# If SQLITE_ENABLE_FTS5 is not defined, omit this file. +ifcapable !fts5 { + finish_test + return +} + +proc transform_token {locale token} { + switch -- $locale { + reverse { + set ret "" + foreach c [split $token ""] { + set ret "$c$ret" + } + set token $ret + } + + default { + # no-op + } + } + + set token +} + +proc tcl_create {args} { return "tcl_tokenize" } +proc tcl_tokenize {tflags text} { + foreach {w iStart iEnd} [fts5_tokenize_split $text] { + set w [transform_token [sqlite3_fts5_locale] $w] + sqlite3_fts5_token $w $iStart $iEnd + } +} + +#------------------------------------------------------------------------- +# Check that queries can have a locale attached to them. 
+# +reset_db +sqlite3_fts5_create_tokenizer -v2 db tcl tcl_create + +do_execsql_test 1.0 { + CREATE VIRTUAL TABLE t1 USING fts5(a, tokenize=tcl); + INSERT INTO t1 VALUES('abc'); + INSERT INTO t1 VALUES('cba'); +} {} + +do_execsql_test 1.1 { + SELECT rowid, a FROM t1( fts5_locale('en_US', 'abc') ); +} {1 abc} + +do_execsql_test 1.2 { + SELECT rowid, a FROM t1( fts5_locale('reverse', 'abc') ); +} {2 cba} + +#------------------------------------------------------------------------- +# Test that the locale= option exists and seems to accept values. And +# that fts5_locale() values may only be inserted into an internal-content +# table if the locale=1 option was specified. +# +reset_db +sqlite3_fts5_create_tokenizer -v2 db tcl tcl_create + +do_execsql_test 2.1 { + CREATE VIRTUAL TABLE b1 USING fts5(x, y, locale=1, tokenize=tcl); + CREATE VIRTUAL TABLE b2 USING fts5(x, y, locale=0, tokenize=tcl); + + CREATE VIRTUAL TABLE ttt USING fts5vocab('b1', instance); +} + +do_catchsql_test 2.2 { + CREATE VIRTUAL TABLE b3 USING fts5(x, y, locale=2); +} {1 {malformed locale=... 
directive}} + +do_catchsql_test 2.3 { + INSERT INTO b1(b1, rank) VALUES('locale', 0); +} {1 {SQL logic error}} + +do_execsql_test 2.4 { + INSERT INTO b1 VALUES('abc', 'one two three'); + INSERT INTO b1 VALUES('def', fts5_locale('reverse', 'four five six')); +} + +do_execsql_test 2.5 { + INSERT INTO b2 VALUES('abc', 'one two three'); +} + +do_catchsql_test 2.6 { + INSERT INTO b2 VALUES('def', fts5_locale('reverse', 'four five six')); +} {1 {fts5_locale() may not be used without locale=1}} + +do_execsql_test 2.7 { SELECT rowid FROM b1('one') } {1} +do_execsql_test 2.8 { SELECT rowid FROM b1('four') } {} +do_execsql_test 2.9 { SELECT rowid FROM b1('ruof') } 2 +do_execsql_test 2.10 { SELECT rowid FROM b1(fts5_locale('reverse', 'five'))} 2 + +do_execsql_test 2.11 { + SELECT x, quote(y) FROM b1 +} { + abc {'one two three'} + def {'four five six'} +} + +do_execsql_test 2.12 { SELECT quote(y) FROM b1('ruof') } { + {'four five six'} +} + +do_execsql_test 2.13 { + INSERT INTO b1(b1) VALUES('integrity-check'); +} +do_execsql_test 2.14 { + INSERT INTO b1(b1) VALUES('rebuild'); +} +do_execsql_test 2.15 { + INSERT INTO b1(b1) VALUES('integrity-check'); +} + +do_execsql_test 2.16 { + DELETE FROM b1 WHERE rowid=2 +} +do_execsql_test 2.17 { + INSERT INTO b1(b1) VALUES('integrity-check'); +} + +#------------------------------------------------------------------------- +# Test the 'delete' command with contentless tables. 
+# +reset_db +sqlite3_fts5_create_tokenizer -v2 db tcl tcl_create + +do_execsql_test 3.1 { + CREATE VIRTUAL TABLE c1 USING fts5(x, content=, tokenize=tcl); + CREATE VIRTUAL TABLE c2 USING fts5vocab('c1', instance); + + INSERT INTO c1 VALUES('hello world'); + INSERT INTO c1 VALUES( fts5_locale('reverse', 'one two three') ); +} + +do_execsql_test 3.2 { + SELECT DISTINCT term FROM c2 ORDER BY 1 +} { + eerht eno hello owt world +} + +do_execsql_test 3.3 { + INSERT INTO c1(c1, rowid, x) + VALUES('delete', 2, fts5_locale('reverse', 'one two three') ); +} + +do_execsql_test 3.4 { + SELECT DISTINCT term FROM c2 ORDER BY 1 +} { + hello world +} + + + +# execsql_pp { SELECT * FROM ttt } + +finish_test + + diff --git a/manifest b/manifest index bccfb3c0a2..6f223ff085 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Add\sthe\spercentile\sextension\sto\sthe\sCLI\sby\sdefault. -D 2024-07-24T13:53:51.649 +C Add\sthe\sfts5_locale()\sfunction,\sand\sbegin\sadding\sthe\srelated\sfunctionality\sto\sfts5. 
+D 2024-07-26T20:50:33.303 F .fossil-settings/empty-dirs dbb81e8fc0401ac46a1491ab34a7f2c7c0452f2f06b54ebb845d024ca8283ef1 F .fossil-settings/ignore-glob 35175cdfcf539b2318cb04a9901442804be81cd677d8b889fcc9149c21f239ea F LICENSE.md df5091916dbb40e6e9686186587125e1b2ff51f022cc334e886c19a0e9982724 @@ -92,17 +92,17 @@ F ext/fts3/unicode/UnicodeData.txt cd07314edb62d49fde34debdaf92fa2aa69011e7 F ext/fts3/unicode/mkunicode.tcl d5aebf022fa4577ee8cdf27468f0d847879993959101f6dbd6348ef0cfc324a7 F ext/fts3/unicode/parseunicode.tcl a981bd6466d12dd17967515801c3ff23f74a281be1a03cf1e6f52a6959fc77eb F ext/fts5/extract_api_docs.tcl bc3a0ca78be7d3df08e7602c00ca48021ebae40682d75eb001bfdf6e54ffb44e -F ext/fts5/fts5.h 6b49ce6eb2e395e7fd84557b21d32f5de8041f2fada4c617e481e99427e24b6e -F ext/fts5/fts5Int.h 41fb3a2dd40e818cc96c6f4176dbdf2aaa8f57043cfc9a8f2676e7e6a72ad764 +F ext/fts5/fts5.h 38a9553791828b3cf677b9347735fc531d54015ce4f5229d5cf1e2a5c1d3955a +F ext/fts5/fts5Int.h b4a5ed934cb3da55737c4d75cb5f26a39b17470fca67c06c7fe6878992998c99 F ext/fts5/fts5_aux.c 4584e88878e54828bf7d4d0d83deedd232ec60628b7731be02bad6adb62304b1 F ext/fts5/fts5_buffer.c 0eec58bff585f1a44ea9147eae5da2447292080ea435957f7488c70673cb6f09 -F ext/fts5/fts5_config.c 68cb87a49215f8e7028000b681df4057c430a4a6afbd676463886da94c9e1c37 +F ext/fts5/fts5_config.c 0c96490fbad746b3780174f38b2ee5e3d719f2f81ee6b58ca828772871e0f680 F ext/fts5/fts5_expr.c c7336d5f9ecc0e2b014d700be2bec0ea383b0e82c494a7c5c4ac622327c2bfad F ext/fts5/fts5_hash.c adda4272be401566a6e0ba1acbe70ee5cb97fce944bc2e04dc707152a0ec91b1 F ext/fts5/fts5_index.c eb9a0dda3bc6ef969a6be8d2746af56856e67251810ddba08622b45be8477abe -F ext/fts5/fts5_main.c 77fefb37e7931095a5ff271a28fbe4f73ec46d5492ef1f35d405d98e137ad8ed -F ext/fts5/fts5_storage.c 1d7e08d4331da2f3f7e78e70eef2ed6a013d91ba16175c651adbc5ad672235aa -F ext/fts5/fts5_tcl.c fdf7e2bb9a9186cfcaf2d2ce11d338309342b7a7593c2812bc54455db53da5d2 +F ext/fts5/fts5_main.c 
5b6f85aae5f25ee4e8762f26eb8c998c9c53443bb56483ebf712aca591bcb41e +F ext/fts5/fts5_storage.c 1d7b358af3d4a7a4c5a7258a847229ca54c1b26d4f1b9e971ea5f2539631c3d4 +F ext/fts5/fts5_tcl.c a1c307785bb505735a8d914fff7d08881e64ba28c40c406b218c591010d1bc9e F ext/fts5/fts5_test_mi.c 08c11ec968148d4cb4119d96d819f8c1f329812c568bac3684f5464be177d3ee F ext/fts5/fts5_test_tok.c 3cb0a9b508b30d17ef025ccddd26ae3dc8ddffbe76c057616e59a9aa85d36f3b F ext/fts5/fts5_tokenize.c fa5493075101540270f572038fc1723d44fcc97bfbf237c8530013b8a27860be @@ -184,6 +184,7 @@ F ext/fts5/test/fts5interrupt.test 09613247b273a99889808ef852898177e671406fe71fd F ext/fts5/test/fts5lastrowid.test f36298a1fb9f988bde060a274a7ce638faa9c38a31400f8d2d27ea9373e0c4a1 F ext/fts5/test/fts5leftjoin.test c0b4cafb9661379e576dc4405c0891d8fcc2782680740513c4d1fc114b43d4ad F ext/fts5/test/fts5limits.test 8ab67cf5d311c124b6ceb0062d0297767176df4572d955fce79fa43004dff01c +F ext/fts5/test/fts5locale.test 92c6ae79df0aa57b379c50e400151f4a9a36d292819beefc31019c749249844a F ext/fts5/test/fts5matchinfo.test 877520582feb86bbfd95ab780099bcba4526f18ac75ee34979144cf86ba3a5a3 F ext/fts5/test/fts5merge.test 2654df0bcdb2d117c2d38b6aeb0168061be01c643f9e9194b36c43a2970e8082 F ext/fts5/test/fts5merge2.test 3ebad1a59d6ad3fb66eff6523a09e95dc6367cbefb3cd73196801dea0425c8e2 @@ -2195,8 +2196,11 @@ F vsixtest/vsixtest.tcl 6a9a6ab600c25a91a7acc6293828957a386a8a93 F vsixtest/vsixtest.vcxproj.data 2ed517e100c66dc455b492e1a33350c1b20fbcdc F vsixtest/vsixtest.vcxproj.filters 37e51ffedcdb064aad6ff33b6148725226cd608e F vsixtest/vsixtest_TemporaryKey.pfx e5b1b036facdb453873e7084e1cae9102ccc67a0 -P 095c22e62248f8ef50cd8531171827f50a7bdd4fc1128bf0e616a3eb2dce980e -R 76e7f4761efc98a28727d2a1442d8c13 -U drh -Z 956d236c4ff2c748d10dde2ade86c2ae +P bcc31846964102385d5a21eb5e85d7db153b155e76b4e2847c9453d3d0e1af04 +R f9f51b6d625a93fd57b76dfee17ab828 +T *branch * fts5-locale +T *sym-fts5-locale * +T -sym-trunk * +U dan +Z 2b42cc25153434047e86f4740a226c4e # Remove this 
line to create a well-formed Fossil manifest. diff --git a/manifest.uuid b/manifest.uuid index c7d267a281..64398355c9 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -bcc31846964102385d5a21eb5e85d7db153b155e76b4e2847c9453d3d0e1af04 +8839ef7cfb49239e7f1c4812a53a93a672827c88d6921408b1d5062b352c87cc