From: dan Date: Wed, 17 Apr 2024 19:48:41 +0000 (+0000) Subject: Prevent tokenize-blobs from being used with non-contentless tables. Fix some other... X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=85ced205da46d95de1cbb2ad675e49bfc321e0fb;p=thirdparty%2Fsqlite.git Prevent tokenize-blobs from being used with non-contentless tables. Fix some other issues with the new code on this branch. FossilOrigin-Name: 6a640ea4d8938e2b0c73b5aa35d1ce96eea1d1304a26c98dc054e9949d1b7c8c --- diff --git a/ext/fts5/fts5Int.h b/ext/fts5/fts5Int.h index 6f4210f9f4..6bd2f3fa67 100644 --- a/ext/fts5/fts5Int.h +++ b/ext/fts5/fts5Int.h @@ -143,9 +143,13 @@ struct Fts5Colset { typedef struct Fts5Config Fts5Config; +/* +** All instantiated tokenizers are stored in a list of the following objects, +** starting at Fts5Config.pTokList. +*/ typedef struct Fts5TokenizerInst Fts5TokenizerInst; struct Fts5TokenizerInst { - char *zSpec; + char *zSpec; /* Tokenizer specification */ Fts5Tokenizer *pTok; fts5_tokenizer *pTokApi; Fts5TokenizerInst *pNext; diff --git a/ext/fts5/fts5_config.c b/ext/fts5/fts5_config.c index 88f0a32ee8..4ecce4c2f6 100644 --- a/ext/fts5/fts5_config.c +++ b/ext/fts5/fts5_config.c @@ -224,10 +224,15 @@ static int fts5ConfigSetEnum( return iVal<0 ? SQLITE_ERROR : SQLITE_OK; } +/* +** Locate a tokenizer instance with a specification matching the second +** argument. Create a new tokenizer if one can not be found. Return SQLITE_OK +** if successful, or an SQLite error code otherwise. +*/ int sqlite3Fts5ConfigFindTokenizer( - Fts5Config *pConfig, - const char *z, - Fts5TokenizerInst **ppOut + Fts5Config *pConfig, /* Table configuration */ + const char *z, /* Requested tokenizer specification */ + Fts5TokenizerInst **ppOut /* OUT: Tokenizer instance */ ){ Fts5TokenizerInst *pRet = 0; int rc = SQLITE_OK; @@ -669,6 +674,9 @@ int sqlite3Fts5ConfigParse( return rc; } +/* +** Free all tokenizer instances in the list starting at Fts5Config.pTokList. +*/ static void fts5ConfigFreeTokenizers(Fts5Config *pConfig){ Fts5TokenizerInst *p = pConfig->pTokList; while( p ){ @@ -769,6 +777,10 @@ int sqlite3Fts5Tokenize( return SQLITE_OK; } +/* +** Like sqlite3Fts5Tokenize(), but using the tokenizer defined by +** specification zSpec. +*/ int sqlite3Fts5SpecTokenize( Fts5Config *pConfig, /* FTS5 Configuration object */ const char *zSpec, /* Tokenizer specification */ diff --git a/ext/fts5/fts5_main.c b/ext/fts5/fts5_main.c index 4217d7040b..3e9333f226 100644 --- a/ext/fts5/fts5_main.c +++ b/ext/fts5/fts5_main.c @@ -115,9 +115,6 @@ struct Fts5TokenizerModule { struct Fts5FullTable { Fts5Table p; /* Public class members from fts5Int.h */ Fts5Storage *pStorage; /* Document store */ -#if 0 - Fts5Global *pGlobal; /* Global (connection wide) data */ -#endif Fts5Cursor *pSortCsr; /* Sort data from this cursor */ int iSavepoint; /* Successful xSavepoint()+1 */ @@ -2871,6 +2868,10 @@ static int fts5FindTokenizer( return rc; } +/* +** Add a tokenizer with specification zSpec to the list at Fts5Config.pTokList. +** Return SQLITE_OK if successful, or an SQLite error code otherwise. +*/ int sqlite3Fts5GetTokenizer( Fts5Config *pConfig, const char *zSpec diff --git a/ext/fts5/fts5_storage.c b/ext/fts5/fts5_storage.c index 5261b3c1b1..4050d10e9c 100644 --- a/ext/fts5/fts5_storage.c +++ b/ext/fts5/fts5_storage.c @@ -399,11 +399,43 @@ static int fts5StorageInsertCallback( return sqlite3Fts5IndexWrite(pIdx, pCtx->iCol, pCtx->szCol-1, pToken, nToken); } -#define IS_TOKENIZE_BLOB(pVal) ( \ - sqlite3_value_subtype(pVal)==SQLITE_FTS5_TOKENIZE_SUBTYPE \ - && sqlite3_value_type(pVal)==SQLITE_BLOB \ -) +/* +** If the value passed as the third argument is a tokenizer blob, and the +** Fts5Config object indicates that the table is a contentless-table, +** return non-zero. +** +** Or, if the value passed as the third argument is a tokenizer blob but +** the table is not a contentless table, set *pRc to SQLITE_ERROR and leave +** an error message in the Fts5Config object. Return 0 in this case. +** +** Finally, if the value is not a tokenizer blob, return 0. +*/ +static int fts5IsTokenizeBlob( + int *pRc, + Fts5Config *pConfig, + sqlite3_value *pVal +){ + assert( *pRc==SQLITE_OK ); + if( sqlite3_value_subtype(pVal)==SQLITE_FTS5_TOKENIZE_SUBTYPE + && sqlite3_value_type(pVal)==SQLITE_BLOB + ){ + if( pConfig->eContent==FTS5_CONTENT_NONE ) return 1; + + *pRc = SQLITE_ERROR; + *pConfig->pzErrmsg = sqlite3_mprintf( + "table does not support alternative tokenizers" + ); + } + return 0; +} +/* +** Value pVal is guaranteed to be a tokenize-blob. This function unpacks +** the blob and returns a pointer to the nul-terminated tokenizer +** specification. It also sets output parameter (*pzT) to point to the +** start of the utf-8 text value (not nul-terminated) and (*pnT) to the +** number of valid bytes in this buffer. +*/ static const char *fts5UnpackTokenizeBlob( sqlite3_value *pVal, const char **pzT, @@ -470,7 +502,7 @@ static int fts5StorageDeleteFromIndex( nText = sqlite3_column_bytes(pSeek, iCol); }else if( ALWAYS(apVal) ){ sqlite3_value *pVal = apVal[iCol-1]; - if( IS_TOKENIZE_BLOB(pVal) ){ + if( fts5IsTokenizeBlob(&rc, pConfig, pVal) ){ zTok = fts5UnpackTokenizeBlob(pVal, &zText, &nText); }else{ zText = (const char*)sqlite3_value_text(apVal[iCol-1]); @@ -480,12 +512,14 @@ static int fts5StorageDeleteFromIndex( continue; } ctx.szCol = 0; - rc = sqlite3Fts5SpecTokenize(pConfig, zTok, FTS5_TOKENIZE_DOCUMENT, - zText, nText, (void*)&ctx, fts5StorageInsertCallback - ); - p->aTotalSize[iCol-1] -= (i64)ctx.szCol; - if( p->aTotalSize[iCol-1]<0 ){ - rc = FTS5_CORRUPT; + if( rc==SQLITE_OK ){ + rc = sqlite3Fts5SpecTokenize(pConfig, zTok, FTS5_TOKENIZE_DOCUMENT, + zText, nText, (void*)&ctx, fts5StorageInsertCallback + ); + p->aTotalSize[iCol-1] -= (i64)ctx.szCol; + if( p->aTotalSize[iCol-1]<0 ){ + rc = FTS5_CORRUPT; + } } } } @@ -789,15 +823,27 @@ static int fts5StorageNewRowid(Fts5Storage *p, i64 *piRowid){ return rc; } +/* +** This function is used to extract text from an sqlite3_value to use +** as an fts5 query string. It also finds the required tokenizer to use +** for tokenizing query terms. +** +** If successful, SQLITE_OK is returned, output variable (*ppTok) is set +** to point to the required tokenizer instance, (*pzText) points to a +** nul-terminated buffer containing the query string as utf-8 text, and +** (*pbDel) is set to true if the caller must sqlite3_free(*pzText) at +** some point in the future. Or, if an error occurs, an SQLite error +** code is returned. +*/ int sqlite3Fts5UnpackTokenizeBlob( Fts5Config *pConfig, sqlite3_value *pVal, Fts5TokenizerInst **ppTok, char **pzText, - int *pbDel + int *pbDel /* OUT: Set to true if sqlite3_free() req. */ ){ int rc = SQLITE_OK; - if( IS_TOKENIZE_BLOB(pVal) ){ + if( fts5IsTokenizeBlob(&rc, pConfig, pVal) ){ const char *zTok = 0; const char *zText = 0; int nText = 0; @@ -837,12 +883,12 @@ int sqlite3Fts5StorageContentInsert( rc = fts5StorageGetStmt(p, FTS5_STMT_INSERT_CONTENT, &pInsert, 0); for(i=1; rc==SQLITE_OK && i<=pConfig->nCol+1; i++){ sqlite3_value *pVal = apVal[i]; - if( IS_TOKENIZE_BLOB(pVal) ){ + if( fts5IsTokenizeBlob(&rc, pConfig, pVal) ){ const char *zT = 0; int nT = 0; fts5UnpackTokenizeBlob(pVal, &zT, &nT); rc = sqlite3_bind_text(pInsert, i, zT, nT, SQLITE_STATIC); - }else{ + }else if( rc==SQLITE_OK ){ rc = sqlite3_bind_value(pInsert, i, apVal[i]); } } @@ -884,16 +930,18 @@ int sqlite3Fts5StorageIndexInsert( const char *zTok = 0; int nText = 0; - if( IS_TOKENIZE_BLOB(pVal) ){ + if( fts5IsTokenizeBlob(&rc, pConfig, pVal) ){ zTok = fts5UnpackTokenizeBlob(pVal, &zText, &nText); }else{ zText = (const char*)sqlite3_value_text(apVal[ctx.iCol+2]); nText = sqlite3_value_bytes(apVal[ctx.iCol+2]); } - rc = sqlite3Fts5SpecTokenize(pConfig, - zTok, FTS5_TOKENIZE_DOCUMENT, zText, nText, - (void*)&ctx, fts5StorageInsertCallback - ); + if( rc==SQLITE_OK ){ + rc = sqlite3Fts5SpecTokenize(pConfig, + zTok, FTS5_TOKENIZE_DOCUMENT, zText, nText, + (void*)&ctx, fts5StorageInsertCallback + ); + } } sqlite3Fts5BufferAppendVarint(&rc, &buf, ctx.szCol); p->aTotalSize[ctx.iCol] += (i64)ctx.szCol; diff --git a/ext/fts5/test/fts5tokenizer3.test b/ext/fts5/test/fts5tokenizer3.test index 87e4106d2a..f8a5e89a16 100644 --- a/ext/fts5/test/fts5tokenizer3.test +++ b/ext/fts5/test/fts5tokenizer3.test @@ -132,6 +132,27 @@ do_test 1.20 { set ::constructor_count } 3 +#------------------------------------------------------------------------- +# non-contentless tables. +# +do_execsql_test 2.0 { + CREATE TABLE t1(x); + CREATE VIRTUAL TABLE x2 USING fts5(x); + CREATE VIRTUAL TABLE x3 USING fts5(x, content=t1); +} + +do_catchsql_test 2.1 { + INSERT INTO x2 VALUES( fts5tokenize('hello world', 'tcl2') ); +} {1 {table does not support alternative tokenizers}} +do_catchsql_test 2.2 { + INSERT INTO x3 VALUES( fts5tokenize('hello world', 'tcl2') ); +} {1 {table does not support alternative tokenizers}} +do_catchsql_test 2.3 { + SELECT * FROM x2( fts5tokenize('hello world', 'tcl2') ); +} {1 {table does not support alternative tokenizers}} +do_catchsql_test 2.4 { + SELECT * FROM x3( fts5tokenize('hello world', 'tcl2') ); +} {1 {table does not support alternative tokenizers}} finish_test diff --git a/manifest b/manifest index 392440b60d..d6f8a7de7c 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Add\ssome\stests\sfor\sthe\sfts5\stokenize-blob\sfunctionality\son\sthis\sbranch. -D 2024-04-16T14:23:56.964 +C Prevent\stokenize-blobs\sfrom\sbeing\sused\swith\snon-contentless\stables.\sFix\ssome\sother\sissues\swith\sthe\snew\scode\son\sthis\sbranch. +D 2024-04-17T19:48:41.847 F .fossil-settings/empty-dirs dbb81e8fc0401ac46a1491ab34a7f2c7c0452f2f06b54ebb845d024ca8283ef1 F .fossil-settings/ignore-glob 35175cdfcf539b2318cb04a9901442804be81cd677d8b889fcc9149c21f239ea F LICENSE.md df5091916dbb40e6e9686186587125e1b2ff51f022cc334e886c19a0e9982724 @@ -93,15 +93,15 @@ F ext/fts3/unicode/mkunicode.tcl d5aebf022fa4577ee8cdf27468f0d847879993959101f6d F ext/fts3/unicode/parseunicode.tcl a981bd6466d12dd17967515801c3ff23f74a281be1a03cf1e6f52a6959fc77eb F ext/fts5/extract_api_docs.tcl bc3a0ca78be7d3df08e7602c00ca48021ebae40682d75eb001bfdf6e54ffb44e F ext/fts5/fts5.h e701ea20480be693f2b50ab314ec4d002bd9b97cd89636427ed1528c690107ae -F ext/fts5/fts5Int.h 655147fa7eaba54753b9642c52d2476965be77d0da31e651989dfeaf351f6e8e +F ext/fts5/fts5Int.h 098b3fd928d10035e9b52756affe6315fe337abfbc19e80ea33d1db07d4f5f7a F ext/fts5/fts5_aux.c 4584e88878e54828bf7d4d0d83deedd232ec60628b7731be02bad6adb62304b1 F ext/fts5/fts5_buffer.c 0eec58bff585f1a44ea9147eae5da2447292080ea435957f7488c70673cb6f09 -F ext/fts5/fts5_config.c 777bfe8e7131a07f5074e7fcaec91ef88580a7bde400e4561a89495e7d9bae99 +F ext/fts5/fts5_config.c fe565c6a12d6897053a5ab7b0cc6a0691c668103e3c3f1de8dec5491a72316fb F ext/fts5/fts5_expr.c f1e9110062a9ff63007431d0af1b1506cca3e5f79e1b2f2dc47795b9e98d4b13 F ext/fts5/fts5_hash.c adda4272be401566a6e0ba1acbe70ee5cb97fce944bc2e04dc707152a0ec91b1 F ext/fts5/fts5_index.c ee0f4d50bc0c58a7c5ef7d645e7e38e1e59315b8ea9d722ae00c5f949ee65379 -F ext/fts5/fts5_main.c b2dfe719a003337c159e1bfb97fa885d0dc1b9921de2e96953e188481663ae5e -F ext/fts5/fts5_storage.c 768fafc623dd2d9974cc9816f5ab1006baaa105ba055d3c51578d11d73d76d24 +F ext/fts5/fts5_main.c 86b2c807711fc6eef3c1cf3e558093669093bf91a2014bfe5b71be8ad1ea41cb +F ext/fts5/fts5_storage.c 19fc854c3fad12e3f79ed3608b944a07fb41ffd50af493a1f521cee3a35af192 F ext/fts5/fts5_tcl.c fd485d0fb56f2c42885e68c74dd53c594a4761af6088617ce120804a6a5aca82 F ext/fts5/fts5_test_mi.c 08c11ec968148d4cb4119d96d819f8c1f329812c568bac3684f5464be177d3ee F ext/fts5/fts5_test_tok.c 3cb0a9b508b30d17ef025ccddd26ae3dc8ddffbe76c057616e59a9aa85d36f3b @@ -229,7 +229,7 @@ F ext/fts5/test/fts5tok1.test 1f7817499f5971450d8c4a652114b3d833393c8134e32422d0 F ext/fts5/test/fts5tok2.test dcacb32d4a2a3f0dd3215d4a3987f78ae4be21a2 F ext/fts5/test/fts5tokenizer.test ac3c9112b263a639fb0508ae73a3ee886bf4866d2153771a8e8a20c721305a43 F ext/fts5/test/fts5tokenizer2.test cb5428c7cfb3b6a74b7adfcde65506e329112003e8dffa7501d01c2d18d02569 -F ext/fts5/test/fts5tokenizer3.test c96e232d51d21a4deb59d797070df9087121a7f5e3dc5d1cea60c6b3d9e76e69 +F ext/fts5/test/fts5tokenizer3.test 507d50608b61031f72f8cf3c752ea8db51d3d67ae99ebe6f0d191e58455dc19c F ext/fts5/test/fts5trigram.test 6c4e37864f3e7d90673db5563d9736d7e40080ab94d10ebdffa94c1b77941da0 F ext/fts5/test/fts5trigram2.test 9fe4207f8a4241747aff1005258b564958588d21bfd240d6cd4c2e955d31c156 F ext/fts5/test/fts5ubsan.test 783d5a8d13ebfa169e634940228db54540780e3ba7a87ad1e4510e61440bf64b @@ -2185,8 +2185,8 @@ F vsixtest/vsixtest.tcl 6a9a6ab600c25a91a7acc6293828957a386a8a93 F vsixtest/vsixtest.vcxproj.data 2ed517e100c66dc455b492e1a33350c1b20fbcdc F vsixtest/vsixtest.vcxproj.filters 37e51ffedcdb064aad6ff33b6148725226cd608e F vsixtest/vsixtest_TemporaryKey.pfx e5b1b036facdb453873e7084e1cae9102ccc67a0 -P 6c51c9c6a8a6a730c1d9e0119bc39edeefbbcb3b30476347a51d2e08eb91fe36 -R e266f84378ceb47ebfd337fc2f715b86 +P c2f9d1259cc094ad1d3e5e0a50b262a248915743fed3b1a730a1d9f0f845f48b +R 1ef80b3031ba5f310c6dd6728c192f80 U dan -Z 7dd42f59f6272cbb7143aeed0598944f +Z 1752c8f42f1640cfd39a63e89bc136e9 # Remove this line to create a well-formed Fossil manifest. diff --git a/manifest.uuid b/manifest.uuid index 4ea54b4cd6..dce827b751 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -c2f9d1259cc094ad1d3e5e0a50b262a248915743fed3b1a730a1d9f0f845f48b \ No newline at end of file +6a640ea4d8938e2b0c73b5aa35d1ce96eea1d1304a26c98dc054e9949d1b7c8c \ No newline at end of file