From: dan Date: Mon, 15 Apr 2024 20:24:50 +0000 (+0000) Subject: Add experimental way to specify an alternative tokenizer when writing to or querying... X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=ff2f29aac1fb9ada55a3f23668ee3d4a699832d3;p=thirdparty%2Fsqlite.git Add experimental way to specify an alternative tokenizer when writing to or querying an fts5 table. FossilOrigin-Name: 6c51c9c6a8a6a730c1d9e0119bc39edeefbbcb3b30476347a51d2e08eb91fe36 --- diff --git a/ext/fts5/fts5.h b/ext/fts5/fts5.h index 551618e718..2772579d4e 100644 --- a/ext/fts5/fts5.h +++ b/ext/fts5/fts5.h @@ -27,6 +27,8 @@ extern "C" { #endif +#define SQLITE_FTS5_TOKENIZE_SUBTYPE ((unsigned int)'T') + /************************************************************************* ** CUSTOM AUXILIARY FUNCTIONS ** diff --git a/ext/fts5/fts5Int.h b/ext/fts5/fts5Int.h index 9beb26e056..12db44b118 100644 --- a/ext/fts5/fts5Int.h +++ b/ext/fts5/fts5Int.h @@ -143,6 +143,14 @@ struct Fts5Colset { typedef struct Fts5Config Fts5Config; +typedef struct Fts5TokenizerInst Fts5TokenizerInst; +struct Fts5TokenizerInst { + char *zSpec; + Fts5Tokenizer *pTok; + fts5_tokenizer *pTokApi; + Fts5TokenizerInst *pNext; +}; + /* ** An instance of the following structure encodes all information that can ** be gleaned from the CREATE VIRTUAL TABLE statement. @@ -184,6 +192,7 @@ typedef struct Fts5Config Fts5Config; */ struct Fts5Config { sqlite3 *db; /* Database handle */ + Fts5Global *pGlobal; /* Database wide data */ char *zDb; /* Database holding FTS index (e.g. 
"main") */ char *zName; /* Name of FTS index */ int nCol; /* Number of columns */ @@ -199,8 +208,7 @@ struct Fts5Config { int bTokendata; /* "tokendata=" option value (dflt==0) */ int eDetail; /* FTS5_DETAIL_XXX value */ char *zContentExprlist; - Fts5Tokenizer *pTok; - fts5_tokenizer *pTokApi; + Fts5TokenizerInst *pTokList; int bLock; /* True when table is preparing statement */ int ePattern; /* FTS_PATTERN_XXX constant */ @@ -258,6 +266,21 @@ int sqlite3Fts5Tokenize( int (*xToken)(void*, int, const char*, int, int, int) /* Callback */ ); +int sqlite3Fts5ConfigFindTokenizer( + Fts5Config *pConfig, + const char *z, + Fts5TokenizerInst **ppOut +); + +int sqlite3Fts5SpecTokenize( + Fts5Config *pConfig, /* FTS5 Configuration object */ + const char *zSpec, /* Tokenizer specification */ + int flags, /* FTS5_TOKENIZE_* flags */ + const char *pText, int nText, /* Text to tokenize */ + void *pCtx, /* Context passed to xToken() */ + int (*xToken)(void*, int, const char*, int, int, int) /* Callback */ +); + void sqlite3Fts5Dequote(char *z); /* Load the contents of the %_config table */ @@ -598,11 +621,8 @@ struct Fts5Table { }; int sqlite3Fts5GetTokenizer( - Fts5Global*, - const char **azArg, - int nArg, Fts5Config*, - char **pzErr + const char *zSpec ); Fts5Table *sqlite3Fts5TableFromCsrid(Fts5Global*, i64); @@ -714,6 +734,14 @@ int sqlite3Fts5StorageOptimize(Fts5Storage *p); int sqlite3Fts5StorageMerge(Fts5Storage *p, int nMerge); int sqlite3Fts5StorageReset(Fts5Storage *p); +int sqlite3Fts5UnpackTokenizeBlob( + Fts5Config *pConfig, + sqlite3_value *pVal, + Fts5TokenizerInst **ppTok, + char **pzText, + int *pbDel +); + /* ** End of interface to code in fts5_storage.c. **************************************************************************/ diff --git a/ext/fts5/fts5_config.c b/ext/fts5/fts5_config.c index d2e8309cd2..88f0a32ee8 100644 --- a/ext/fts5/fts5_config.c +++ b/ext/fts5/fts5_config.c @@ -224,6 +224,33 @@ static int fts5ConfigSetEnum( return iVal<0 ? 
SQLITE_ERROR : SQLITE_OK; } +int sqlite3Fts5ConfigFindTokenizer( + Fts5Config *pConfig, + const char *z, + Fts5TokenizerInst **ppOut +){ + Fts5TokenizerInst *pRet = 0; + int rc = SQLITE_OK; + + assert( pConfig->pzErrmsg ); + + /* Search for an existing tokenizer that matches this spec */ + for(pRet=pConfig->pTokList; pRet; pRet=pRet->pNext){ + if( strcmp(pRet->zSpec, z)==0 ) break; + } + + if( pRet==0 ){ + /* No tokenizer found - create one. */ + rc = sqlite3Fts5GetTokenizer(pConfig, z); + if( rc==SQLITE_OK ){ + pRet = pConfig->pTokList->pNext; + } + } + + if( ppOut ) *ppOut = pRet; + return rc; +} + /* ** Parse a "special" CREATE VIRTUAL TABLE directive and update ** configuration object pConfig as appropriate. @@ -296,46 +323,15 @@ static int fts5ConfigParseSpecial( } if( sqlite3_strnicmp("tokenize", zCmd, nCmd)==0 ){ - const char *p = (const char*)zArg; - sqlite3_int64 nArg = strlen(zArg) + 1; - char **azArg = sqlite3Fts5MallocZero(&rc, sizeof(char*) * nArg); - char *pDel = sqlite3Fts5MallocZero(&rc, nArg * 2); - char *pSpace = pDel; - - if( azArg && pSpace ){ - if( pConfig->pTok ){ - *pzErr = sqlite3_mprintf("multiple tokenize=... directives"); - rc = SQLITE_ERROR; - }else{ - for(nArg=0; p && *p; nArg++){ - const char *p2 = fts5ConfigSkipWhitespace(p); - if( *p2=='\'' ){ - p = fts5ConfigSkipLiteral(p2); - }else{ - p = fts5ConfigSkipBareword(p2); - } - if( p ){ - memcpy(pSpace, p2, p-p2); - azArg[nArg] = pSpace; - sqlite3Fts5Dequote(pSpace); - pSpace += (p - p2) + 1; - p = fts5ConfigSkipWhitespace(p); - } - } - if( p==0 ){ - *pzErr = sqlite3_mprintf("parse error in tokenize directive"); - rc = SQLITE_ERROR; - }else{ - rc = sqlite3Fts5GetTokenizer(pGlobal, - (const char**)azArg, (int)nArg, pConfig, - pzErr - ); - } - } + if( pConfig->pTokList ){ + *pzErr = sqlite3_mprintf("multiple tokenize=... 
directives"); + rc = SQLITE_ERROR; + }else{ + assert( pConfig->pzErrmsg==0 ); + pConfig->pzErrmsg = pzErr; + rc = sqlite3Fts5GetTokenizer(pConfig, zArg); + pConfig->pzErrmsg = 0; } - - sqlite3_free(azArg); - sqlite3_free(pDel); return rc; } @@ -412,16 +408,6 @@ static int fts5ConfigParseSpecial( return SQLITE_ERROR; } -/* -** Allocate an instance of the default tokenizer ("simple") at -** Fts5Config.pTokenizer. Return SQLITE_OK if successful, or an SQLite error -** code if an error occurs. -*/ -static int fts5ConfigDefaultTokenizer(Fts5Global *pGlobal, Fts5Config *pConfig){ - assert( pConfig->pTok==0 && pConfig->pTokApi==0 ); - return sqlite3Fts5GetTokenizer(pGlobal, 0, 0, pConfig, 0); -} - /* ** Gobble up the first bareword or quoted word from the input buffer zIn. ** Return a pointer to the character immediately following the last in @@ -555,6 +541,7 @@ int sqlite3Fts5ConfigParse( if( pRet==0 ) return SQLITE_NOMEM; memset(pRet, 0, sizeof(Fts5Config)); pRet->db = db; + pRet->pGlobal = pGlobal; pRet->iCookie = -1; nByte = nArg * (sizeof(char*) + sizeof(u8)); @@ -643,8 +630,8 @@ int sqlite3Fts5ConfigParse( /* If a tokenizer= option was successfully parsed, the tokenizer has ** already been allocated. Otherwise, allocate an instance of the default ** tokenizer (unicode61) now. */ - if( rc==SQLITE_OK && pRet->pTok==0 ){ - rc = fts5ConfigDefaultTokenizer(pGlobal, pRet); + if( rc==SQLITE_OK && pRet->pTokList==0 ){ + rc = sqlite3Fts5GetTokenizer(pRet, 0); } /* If no zContent option was specified, fill in the default values. */ @@ -682,17 +669,27 @@ int sqlite3Fts5ConfigParse( return rc; } +static void fts5ConfigFreeTokenizers(Fts5Config *pConfig){ + Fts5TokenizerInst *p = pConfig->pTokList; + while( p ){ + Fts5TokenizerInst *pNext = p->pNext; + p->pTokApi->xDelete(p->pTok); + sqlite3_free(p); + p = pNext; + } + pConfig->pTokList = 0; +} + /* ** Free the configuration object passed as the only argument. 
*/ void sqlite3Fts5ConfigFree(Fts5Config *pConfig){ if( pConfig ){ int i; - if( pConfig->pTok ){ - pConfig->pTokApi->xDelete(pConfig->pTok); - } + fts5ConfigFreeTokenizers(pConfig); sqlite3_free(pConfig->zDb); sqlite3_free(pConfig->zName); + fts5ConfigFreeTokenizers(pConfig); for(i=0; i<pConfig->nCol; i++){ sqlite3_free(pConfig->azCol[i]); } @@ -765,10 +762,30 @@ int sqlite3Fts5Tokenize( void *pCtx, /* Context passed to xToken() */ int (*xToken)(void*, int, const char*, int, int, int) /* Callback */ ){ - if( pText==0 ) return SQLITE_OK; - return pConfig->pTokApi->xTokenize( - pConfig->pTok, pCtx, flags, pText, nText, xToken - ); + if( pText ){ + Fts5TokenizerInst *p = pConfig->pTokList; + return p->pTokApi->xTokenize(p->pTok, pCtx, flags, pText, nText, xToken); + } + return SQLITE_OK; +} + +int sqlite3Fts5SpecTokenize( + Fts5Config *pConfig, /* FTS5 Configuration object */ + const char *zSpec, /* Tokenizer specification */ + int flags, /* FTS5_TOKENIZE_* flags */ + const char *pText, int nText, /* Text to tokenize */ + void *pCtx, /* Context passed to xToken() */ + int (*xToken)(void*, int, const char*, int, int, int) /* Callback */ +){ + if( pText ){ + Fts5TokenizerInst *p = pConfig->pTokList; + if( zSpec ){ + int rc = sqlite3Fts5ConfigFindTokenizer(pConfig, zSpec, &p); + if( rc!=SQLITE_OK ) return rc; + } + return p->pTokApi->xTokenize(p->pTok, pCtx, flags, pText, nText, xToken); + } + return SQLITE_OK; } /* diff --git a/ext/fts5/fts5_main.c b/ext/fts5/fts5_main.c index f609f7f34a..bcf3241732 100644 --- a/ext/fts5/fts5_main.c +++ b/ext/fts5/fts5_main.c @@ -115,7 +115,9 @@ struct Fts5TokenizerModule { struct Fts5FullTable { Fts5Table p; /* Public class members from fts5Int.h */ Fts5Storage *pStorage; /* Document store */ +#if 0 Fts5Global *pGlobal; /* Global (connection wide) data */ +#endif Fts5Cursor *pSortCsr; /* Sort data from this cursor */ int iSavepoint; /* Successful xSavepoint()+1 */ @@ -378,7 +380,6 @@ static int fts5InitVtab( } if( rc==SQLITE_OK ){ 
pTab->p.pConfig = pConfig; - pTab->pGlobal = pGlobal; } /* Open the index sub-system */ @@ -693,7 +694,7 @@ static int fts5BestIndexMethod(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){ static int fts5NewTransaction(Fts5FullTable *pTab){ Fts5Cursor *pCsr; - for(pCsr=pTab->pGlobal->pCsr; pCsr; pCsr=pCsr->pNext){ + for(pCsr=pTab->p.pConfig->pGlobal->pCsr; pCsr; pCsr=pCsr->pNext){ if( pCsr->base.pVtab==(sqlite3_vtab*)pTab ) return SQLITE_OK; } return sqlite3Fts5StorageReset(pTab->pStorage); @@ -714,7 +715,7 @@ static int fts5OpenMethod(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCsr){ nByte = sizeof(Fts5Cursor) + pConfig->nCol * sizeof(int); pCsr = (Fts5Cursor*)sqlite3_malloc64(nByte); if( pCsr ){ - Fts5Global *pGlobal = pTab->pGlobal; + Fts5Global *pGlobal = pConfig->pGlobal; memset(pCsr, 0, (size_t)nByte); pCsr->aColumnSize = (int*)&pCsr[1]; pCsr->pNext = pGlobal->pCsr; @@ -801,7 +802,7 @@ static int fts5CloseMethod(sqlite3_vtab_cursor *pCursor){ fts5FreeCursorComponents(pCsr); /* Remove the cursor from the Fts5Global.pCsr list */ - for(pp=&pTab->pGlobal->pCsr; (*pp)!=pCsr; pp=&(*pp)->pNext); + for(pp=&pTab->p.pConfig->pGlobal->pCsr; (*pp)!=pCsr; pp=&(*pp)->pNext); *pp = pCsr->pNext; sqlite3_free(pCsr); @@ -854,7 +855,7 @@ static int fts5SorterNext(Fts5Cursor *pCsr){ */ static void fts5TripCursors(Fts5FullTable *pTab){ Fts5Cursor *pCsr; - for(pCsr=pTab->pGlobal->pCsr; pCsr; pCsr=pCsr->pNext){ + for(pCsr=pTab->p.pConfig->pGlobal->pCsr; pCsr; pCsr=pCsr->pNext){ if( pCsr->ePlan==FTS5_PLAN_MATCH && pCsr->base.pVtab==(sqlite3_vtab*)pTab ){ @@ -1106,7 +1107,7 @@ static int fts5SpecialMatch( static Fts5Auxiliary *fts5FindAuxiliary(Fts5FullTable *pTab, const char *zName){ Fts5Auxiliary *pAux; - for(pAux=pTab->pGlobal->pAux; pAux; pAux=pAux->pNext){ + for(pAux=pTab->p.pConfig->pGlobal->pAux; pAux; pAux=pAux->pNext){ if( sqlite3_stricmp(zName, pAux->zFunc)==0 ) return pAux; } @@ -1277,7 +1278,14 @@ static int fts5FilterMethod( pRank = apVal[i]; break; case 'M': { - const 
char *zText = (const char*)sqlite3_value_text(apVal[i]); + Fts5TokenizerInst *pInst = 0; + char *zText = 0; + int bDel = 0; + + rc = sqlite3Fts5UnpackTokenizeBlob( + pConfig, apVal[i], &pInst, &zText, &bDel + ); + if( zText==0 ) zText = ""; iCol = 0; do{ @@ -1290,6 +1298,7 @@ static int fts5FilterMethod( ** indicates that the MATCH expression is not a full text query, ** but a request for an internal parameter. */ rc = fts5SpecialMatch(pTab, pCsr, &zText[1]); + if( bDel ) sqlite3_free(zText); goto filter_out; }else{ char **pzErr = &pTab->p.base.zErrMsg; @@ -1298,9 +1307,10 @@ static int fts5FilterMethod( rc = sqlite3Fts5ExprAnd(&pCsr->pExpr, pExpr); pExpr = 0; } - if( rc!=SQLITE_OK ) goto filter_out; } + if( bDel ) sqlite3_free(zText); + if( rc!=SQLITE_OK ) goto filter_out; break; } case 'L': @@ -2861,39 +2871,93 @@ static int fts5FindTokenizer( } int sqlite3Fts5GetTokenizer( - Fts5Global *pGlobal, - const char **azArg, - int nArg, Fts5Config *pConfig, - char **pzErr + const char *zSpec ){ - Fts5TokenizerModule *pMod; int rc = SQLITE_OK; - - pMod = fts5LocateTokenizer(pGlobal, nArg==0 ? 
0 : azArg[0]); - if( pMod==0 ){ - assert( nArg>0 ); - rc = SQLITE_ERROR; - *pzErr = sqlite3_mprintf("no such tokenizer: %s", azArg[0]); - }else{ - rc = pMod->x.xCreate( - pMod->pUserData, (azArg?&azArg[1]:0), (nArg?nArg-1:0), &pConfig->pTok - ); - pConfig->pTokApi = &pMod->x; - if( rc!=SQLITE_OK ){ - if( pzErr ) *pzErr = sqlite3_mprintf("error in tokenizer constructor"); - }else{ - pConfig->ePattern = sqlite3Fts5TokenizerPattern( - pMod->x.xCreate, pConfig->pTok - ); + char **pzErr = pConfig->pzErrmsg; + const char **azArg = 0; + char *pDel = 0; + sqlite3_int64 nArg = 0; + + assert( pzErr || (zSpec==0 && pConfig->pTokList==0) ); + if( zSpec ){ + const char *p = (const char*)zSpec; + char *pSpace = 0; + + nArg = strlen(zSpec) + 1; + pDel = sqlite3Fts5MallocZero(&rc, nArg * 2); + pSpace = pDel; + azArg = (const char**)sqlite3Fts5MallocZero(&rc, sizeof(char*) * nArg); + + if( azArg && pSpace ){ + for(nArg=0; p && *p; nArg++){ + const char *p2 = fts5ConfigSkipWhitespace(p); + if( *p2=='\'' ){ + p = fts5ConfigSkipLiteral(p2); + }else{ + p = fts5ConfigSkipBareword(p2); + } + if( p ){ + memcpy(pSpace, p2, p-p2); + azArg[nArg] = pSpace; + sqlite3Fts5Dequote(pSpace); + pSpace += (p - p2) + 1; + p = fts5ConfigSkipWhitespace(p); + } + } + if( p==0 ){ + *pzErr= sqlite3_mprintf("parse error in tokenize directive"); + rc = SQLITE_ERROR; + } } } - if( rc!=SQLITE_OK ){ - pConfig->pTokApi = 0; - pConfig->pTok = 0; + if( rc==SQLITE_OK ){ + Fts5TokenizerModule *pMod; + pMod = fts5LocateTokenizer(pConfig->pGlobal, nArg==0 ? 0 : azArg[0]); + if( pMod==0 ){ + assert( nArg>0 ); + rc = SQLITE_ERROR; + *pzErr = sqlite3_mprintf("no such tokenizer: %s", azArg[0]); + }else{ + int nSpec = zSpec ? 
strlen(zSpec) + 1 : 0; + int nByte = sizeof(Fts5TokenizerInst) + nSpec; + Fts5TokenizerInst *pNew = sqlite3Fts5MallocZero(&rc, nByte); + if( pNew ){ + if( zSpec ){ + pNew->zSpec = (char*)&pNew[1]; + memcpy(pNew->zSpec, zSpec, nSpec); + } + rc = pMod->x.xCreate( + pMod->pUserData, (azArg?&azArg[1]:0), (nArg?nArg-1:0), &pNew->pTok + ); + pNew->pTokApi = &pMod->x; + if( rc!=SQLITE_OK ){ + if( pzErr ){ + *pzErr = sqlite3_mprintf("error in tokenizer constructor"); + } + }else if( pConfig->pTokList==0 ){ + pConfig->ePattern = sqlite3Fts5TokenizerPattern( + pMod->x.xCreate, pNew->pTok + ); + } + } + if( rc==SQLITE_OK ){ + if( pConfig->pTokList ){ + pNew->pNext = pConfig->pTokList->pNext; + pConfig->pTokList->pNext = pNew; + }else{ + pConfig->pTokList = pNew; + } + }else{ + sqlite3_free(pNew); + } + } } + sqlite3_free(azArg); + sqlite3_free(pDel); return rc; } diff --git a/ext/fts5/fts5_storage.c b/ext/fts5/fts5_storage.c index a04b152fb0..5261b3c1b1 100644 --- a/ext/fts5/fts5_storage.c +++ b/ext/fts5/fts5_storage.c @@ -399,6 +399,37 @@ static int fts5StorageInsertCallback( return sqlite3Fts5IndexWrite(pIdx, pCtx->iCol, pCtx->szCol-1, pToken, nToken); } +#define IS_TOKENIZE_BLOB(pVal) ( \ + sqlite3_value_subtype(pVal)==SQLITE_FTS5_TOKENIZE_SUBTYPE \ + && sqlite3_value_type(pVal)==SQLITE_BLOB \ +) + +static const char *fts5UnpackTokenizeBlob( + sqlite3_value *pVal, + const char **pzT, + int *pnT +){ + const u8 *pBlob = sqlite3_value_blob(pVal); + int nBlob = sqlite3_value_bytes(pVal); + int ii; + + assert( sqlite3_value_subtype(pVal)==SQLITE_FTS5_TOKENIZE_SUBTYPE ); + assert( sqlite3_value_type(pVal)==SQLITE_BLOB ); + + for(ii=0; pBlob[ii]; ii++){ + if( ii==nBlob ){ + *pzT = 0; + *pnT = 0; + return 0; + } + } + + *pzT = (const char*)&pBlob[ii+1]; + *pnT = nBlob - ii - 1; + return (const char*)pBlob; +} + + /* ** If a row with rowid iDel is present in the %_content table, add the ** delete-markers to the FTS index necessary to delete it. 
Do not actually @@ -429,21 +460,27 @@ static int fts5StorageDeleteFromIndex( ctx.iCol = -1; for(iCol=1; rc==SQLITE_OK && iCol<=pConfig->nCol; iCol++){ if( pConfig->abUnindexed[iCol-1]==0 ){ - const char *zText; - int nText; + const char *zText = 0; + const char *zTok = 0; + int nText = 0; assert( pSeek==0 || apVal==0 ); assert( pSeek!=0 || apVal!=0 ); if( pSeek ){ zText = (const char*)sqlite3_column_text(pSeek, iCol); nText = sqlite3_column_bytes(pSeek, iCol); }else if( ALWAYS(apVal) ){ - zText = (const char*)sqlite3_value_text(apVal[iCol-1]); - nText = sqlite3_value_bytes(apVal[iCol-1]); + sqlite3_value *pVal = apVal[iCol-1]; + if( IS_TOKENIZE_BLOB(pVal) ){ + zTok = fts5UnpackTokenizeBlob(pVal, &zText, &nText); + }else{ + zText = (const char*)sqlite3_value_text(apVal[iCol-1]); + nText = sqlite3_value_bytes(apVal[iCol-1]); + } }else{ continue; } ctx.szCol = 0; - rc = sqlite3Fts5Tokenize(pConfig, FTS5_TOKENIZE_DOCUMENT, + rc = sqlite3Fts5SpecTokenize(pConfig, zTok, FTS5_TOKENIZE_DOCUMENT, zText, nText, (void*)&ctx, fts5StorageInsertCallback ); p->aTotalSize[iCol-1] -= (i64)ctx.szCol; @@ -752,6 +789,30 @@ static int fts5StorageNewRowid(Fts5Storage *p, i64 *piRowid){ return rc; } +int sqlite3Fts5UnpackTokenizeBlob( + Fts5Config *pConfig, + sqlite3_value *pVal, + Fts5TokenizerInst **ppTok, + char **pzText, + int *pbDel +){ + int rc = SQLITE_OK; + if( IS_TOKENIZE_BLOB(pVal) ){ + const char *zTok = 0; + const char *zText = 0; + int nText = 0; + zTok = fts5UnpackTokenizeBlob(pVal, &zText, &nText); + rc = sqlite3Fts5ConfigFindTokenizer(pConfig, zTok, ppTok); + *pzText = sqlite3Fts5Mprintf(&rc, "%.*s", nText, zText); + *pbDel = 1; + }else{ + *pzText = (char*)sqlite3_value_text(pVal); + *pbDel = 0; + *ppTok = pConfig->pTokList; + } + return rc; +} + /* ** Insert a new row into the FTS content table. 
*/ @@ -775,7 +836,15 @@ int sqlite3Fts5StorageContentInsert( int i; /* Counter variable */ rc = fts5StorageGetStmt(p, FTS5_STMT_INSERT_CONTENT, &pInsert, 0); for(i=1; rc==SQLITE_OK && i<=pConfig->nCol+1; i++){ - rc = sqlite3_bind_value(pInsert, i, apVal[i]); + sqlite3_value *pVal = apVal[i]; + if( IS_TOKENIZE_BLOB(pVal) ){ + const char *zT = 0; + int nT = 0; + fts5UnpackTokenizeBlob(pVal, &zT, &nT); + rc = sqlite3_bind_text(pInsert, i, zT, nT, SQLITE_STATIC); + }else{ + rc = sqlite3_bind_value(pInsert, i, apVal[i]); + } } if( rc==SQLITE_OK ){ sqlite3_step(pInsert); @@ -810,13 +879,20 @@ int sqlite3Fts5StorageIndexInsert( for(ctx.iCol=0; rc==SQLITE_OK && ctx.iCol<pConfig->nCol; ctx.iCol++){ ctx.szCol = 0; if( pConfig->abUnindexed[ctx.iCol]==0 ){ - const char *zText = (const char*)sqlite3_value_text(apVal[ctx.iCol+2]); - int nText = sqlite3_value_bytes(apVal[ctx.iCol+2]); - rc = sqlite3Fts5Tokenize(pConfig, - FTS5_TOKENIZE_DOCUMENT, - zText, nText, - (void*)&ctx, - fts5StorageInsertCallback + sqlite3_value *pVal = apVal[ctx.iCol+2]; + const char *zText = 0; + const char *zTok = 0; + int nText = 0; + + if( IS_TOKENIZE_BLOB(pVal) ){ + zTok = fts5UnpackTokenizeBlob(pVal, &zText, &nText); + }else{ + zText = (const char*)sqlite3_value_text(apVal[ctx.iCol+2]); + nText = sqlite3_value_bytes(apVal[ctx.iCol+2]); + } + rc = sqlite3Fts5SpecTokenize(pConfig, + zTok, FTS5_TOKENIZE_DOCUMENT, zText, nText, + (void*)&ctx, fts5StorageInsertCallback ); } sqlite3Fts5BufferAppendVarint(&rc, &buf, ctx.szCol); diff --git a/ext/fts5/fts5_tcl.c b/ext/fts5/fts5_tcl.c index c5b5f41f83..3da2eb8eca 100644 --- a/ext/fts5/fts5_tcl.c +++ b/ext/fts5/fts5_tcl.c @@ -1126,6 +1126,35 @@ static int SQLITE_TCLAPI f5tRegisterMatchinfo( return TCL_OK; } +static void f5tFree(void *p) { sqlite3_free(p); } + +static void f5tScalarFunc( + sqlite3_context *ctx, + int nArg, + sqlite3_value **apArg +){ + const char *zText = (const char*)sqlite3_value_text(apArg[0]); + int nText = sqlite3_value_bytes(apArg[0]); + const char 
*zTok = (const char*)sqlite3_value_text(apArg[1]); + int nTok = sqlite3_value_bytes(apArg[1]); + unsigned char *aBuf = 0; + int nBuf = 0; + + assert( nArg==2 ); + + if( zTok==0 ) zTok = ""; + nBuf = nTok + 1 + nText; + aBuf = (unsigned char*)sqlite3_malloc(nBuf); + if( aBuf==0 ){ + sqlite3_result_error_nomem(ctx); + }else{ + memcpy(aBuf, zTok, nTok+1); + memcpy(&aBuf[nTok+1], zText, nText); + sqlite3_result_blob(ctx, aBuf, nBuf, f5tFree); + sqlite3_result_subtype(ctx, SQLITE_FTS5_TOKENIZE_SUBTYPE); + } +} + static int SQLITE_TCLAPI f5tRegisterTok( void * clientData, Tcl_Interp *interp, @@ -1145,10 +1174,16 @@ static int SQLITE_TCLAPI f5tRegisterTok( } rc = sqlite3Fts5TestRegisterTok(db, pApi); + if( rc==SQLITE_OK ){ + rc = sqlite3_create_function(db, "fts5tokenize", 2, SQLITE_UTF8, 0, + f5tScalarFunc, 0, 0 + ); + } if( rc!=SQLITE_OK ){ Tcl_SetResult(interp, (char*)sqlite3ErrName(rc), TCL_VOLATILE); return TCL_ERROR; } + return TCL_OK; } diff --git a/manifest b/manifest index c7c6ea43af..a605fad152 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C If\sa\sbuild\sfails\sin\stestrunner.tcl,\sdo\snot\sattempt\sto\srun\sthe\sjobs\sthat\ndepend\son\sthat\sbuild.\s\sInstead,\sreport\sthose\sjobs\sas\shaving\sbeen\sskipped. -D 2024-04-12T18:46:34.309 +C Add\sexperimental\sway\sto\sspecify\san\salternative\stokenizer\swhen\swriting\sto\sor\squerying\san\sfts5\stable. 
+D 2024-04-15T20:24:50.588 F .fossil-settings/empty-dirs dbb81e8fc0401ac46a1491ab34a7f2c7c0452f2f06b54ebb845d024ca8283ef1 F .fossil-settings/ignore-glob 35175cdfcf539b2318cb04a9901442804be81cd677d8b889fcc9149c21f239ea F LICENSE.md df5091916dbb40e6e9686186587125e1b2ff51f022cc334e886c19a0e9982724 @@ -92,17 +92,17 @@ F ext/fts3/unicode/UnicodeData.txt cd07314edb62d49fde34debdaf92fa2aa69011e7 F ext/fts3/unicode/mkunicode.tcl d5aebf022fa4577ee8cdf27468f0d847879993959101f6dbd6348ef0cfc324a7 F ext/fts3/unicode/parseunicode.tcl a981bd6466d12dd17967515801c3ff23f74a281be1a03cf1e6f52a6959fc77eb F ext/fts5/extract_api_docs.tcl bc3a0ca78be7d3df08e7602c00ca48021ebae40682d75eb001bfdf6e54ffb44e -F ext/fts5/fts5.h 8856e11a5f0269cd346754cea0765efe8089635b80cad3222e8bfdb08cd5348a -F ext/fts5/fts5Int.h defa43c0932265138ee910ca416e6baccf8b774e0f3d610e74be1ab2880e9834 +F ext/fts5/fts5.h e701ea20480be693f2b50ab314ec4d002bd9b97cd89636427ed1528c690107ae +F ext/fts5/fts5Int.h 5fdd75e46cbaabf84c072907f0f3c5da8dbab76c226355a8bdf528e18a530ba8 F ext/fts5/fts5_aux.c 4584e88878e54828bf7d4d0d83deedd232ec60628b7731be02bad6adb62304b1 F ext/fts5/fts5_buffer.c 0eec58bff585f1a44ea9147eae5da2447292080ea435957f7488c70673cb6f09 -F ext/fts5/fts5_config.c 8072a207034b51ae9b7694121d1b5715c794e94b275e088f70ae532378ca5cdf +F ext/fts5/fts5_config.c 777bfe8e7131a07f5074e7fcaec91ef88580a7bde400e4561a89495e7d9bae99 F ext/fts5/fts5_expr.c e91156ebdcc08d837f4f324168f69f3c0d7fdef0e521fd561efb48ef3297b696 F ext/fts5/fts5_hash.c adda4272be401566a6e0ba1acbe70ee5cb97fce944bc2e04dc707152a0ec91b1 F ext/fts5/fts5_index.c ee0f4d50bc0c58a7c5ef7d645e7e38e1e59315b8ea9d722ae00c5f949ee65379 -F ext/fts5/fts5_main.c d68bd9533d5a638b7f6fae61c3cb0a15257dcdcccedaf3d0b3c9f55940c85048 -F ext/fts5/fts5_storage.c f9e31b0d155e9b2c92d5d3a09ad7a56b937fbf1c7f962e10f4ca6281349f3934 -F ext/fts5/fts5_tcl.c fdf7e2bb9a9186cfcaf2d2ce11d338309342b7a7593c2812bc54455db53da5d2 +F ext/fts5/fts5_main.c 
49111d5d88bd35fa3d052ec8a3047c02fed8e9a3bf3bcd2048cb3ae78dcefe65 +F ext/fts5/fts5_storage.c 768fafc623dd2d9974cc9816f5ab1006baaa105ba055d3c51578d11d73d76d24 +F ext/fts5/fts5_tcl.c 97e5e14f7d0447979f918ecfd7bcadb0e15ce15f79d007c7400190cafd265beb F ext/fts5/fts5_test_mi.c 08c11ec968148d4cb4119d96d819f8c1f329812c568bac3684f5464be177d3ee F ext/fts5/fts5_test_tok.c 3cb0a9b508b30d17ef025ccddd26ae3dc8ddffbe76c057616e59a9aa85d36f3b F ext/fts5/fts5_tokenize.c 83cfcede3898001cab84432a36ce1503e3080cf9b1c682b022ec82e267ea4c13 @@ -2184,8 +2184,11 @@ F vsixtest/vsixtest.tcl 6a9a6ab600c25a91a7acc6293828957a386a8a93 F vsixtest/vsixtest.vcxproj.data 2ed517e100c66dc455b492e1a33350c1b20fbcdc F vsixtest/vsixtest.vcxproj.filters 37e51ffedcdb064aad6ff33b6148725226cd608e F vsixtest/vsixtest_TemporaryKey.pfx e5b1b036facdb453873e7084e1cae9102ccc67a0 -P 5dede50d9e7b6942df9f7b00fbfeaa2103c36c5da01d63d88136fb0ef4b7d26d -R 4f3eb54a5cce9aa23292ec96983abdb3 -U drh -Z 1648e8675f24aa01764212367c0fba7b +P b40580be719a129ecd1aa3c69d1086c967d063920fdd48617c864e73c059abc1 +R 3c57328f56827a22357c3ccf2b463e50 +T *branch * fts5-tokenize-blob +T *sym-fts5-tokenize-blob * +T -sym-trunk * +U dan +Z 169f93ab8cae82ba28e335410f939065 # Remove this line to create a well-formed Fossil manifest. diff --git a/manifest.uuid b/manifest.uuid index b40ee8722e..f578ded6a6 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -b40580be719a129ecd1aa3c69d1086c967d063920fdd48617c864e73c059abc1 \ No newline at end of file +6c51c9c6a8a6a730c1d9e0119bc39edeefbbcb3b30476347a51d2e08eb91fe36 \ No newline at end of file