From: dan Date: Mon, 11 Nov 2024 19:49:26 +0000 (+0000) Subject: Fix the fts5 trigram tokenizer so that it handles non-nul-terminated strings. X-Git-Tag: major-relase~172 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=0cd2ffffb71ef91c7a3d643f454dc00cf87424d4;p=thirdparty%2Fsqlite.git Fix the fts5 trigram tokenizer so that it handles non-nul-terminated strings. FossilOrigin-Name: 84f4e37178a65e3128ac0240d37ac40df08b4050ab070d10707e35d11dcbeb10 --- diff --git a/ext/fts5/fts5_tcl.c b/ext/fts5/fts5_tcl.c index 247b4f0e90..25cd5c0633 100644 --- a/ext/fts5/fts5_tcl.c +++ b/ext/fts5/fts5_tcl.c @@ -730,8 +730,9 @@ static int SQLITE_TCLAPI f5tTokenize( int objc, Tcl_Obj *CONST objv[] ){ - char *zText; - Tcl_Size nText; + char *pCopy = 0; + char *zText = 0; + Tcl_Size nText = 0; sqlite3 *db = 0; fts5_api *pApi = 0; Fts5Tokenizer *pTok = 0; @@ -778,22 +779,33 @@ static int SQLITE_TCLAPI f5tTokenize( return TCL_ERROR; } + if( nText>0 ){ + pCopy = sqlite3_malloc(nText); + if( pCopy==0 ){ + tokenizer.xDelete(pTok); + Tcl_AppendResult(interp, "error in sqlite3_malloc()", (char*)0); + return TCL_ERROR; + }else{ + memcpy(pCopy, zText, nText); + } + } + pRet = Tcl_NewObj(); Tcl_IncrRefCount(pRet); ctx.bSubst = (objc==5); ctx.pRet = pRet; - ctx.zInput = zText; + ctx.zInput = pCopy; rc = tokenizer.xTokenize( - pTok, (void*)&ctx, FTS5_TOKENIZE_DOCUMENT, zText,(int)nText, xTokenizeCb2 + pTok, (void*)&ctx, FTS5_TOKENIZE_DOCUMENT, pCopy,(int)nText, xTokenizeCb2 ); tokenizer.xDelete(pTok); + sqlite3_free(pCopy); if( rc!=SQLITE_OK ){ Tcl_AppendResult(interp, "error in tokenizer.xTokenize()", (char*)0); Tcl_DecrRefCount(pRet); return TCL_ERROR; } - Tcl_Free((void*)azArg); Tcl_SetObjResult(interp, pRet); Tcl_DecrRefCount(pRet); diff --git a/ext/fts5/fts5_tokenize.c b/ext/fts5/fts5_tokenize.c index f9581b080c..f10c2379de 100644 --- a/ext/fts5/fts5_tokenize.c +++ b/ext/fts5/fts5_tokenize.c @@ -1354,7 +1354,7 @@ static int fts5TriTokenize( int ii; const unsigned char *zIn = (const unsigned char*)pText; const unsigned char *zEof = &zIn[nText]; - u32 iCode; + u32 iCode = 0; int aStart[3]; /* Input offset of each character in aBuf[] */ UNUSED_PARAM(unusedFlags); @@ -1363,8 +1363,8 @@ static int fts5TriTokenize( for(ii=0; ii<3; ii++){ do { aStart[ii] = zIn - (const unsigned char*)pText; + if( zIn>=zEof ) return SQLITE_OK; READ_UTF8(zIn, zEof, iCode); - if( iCode==0 ) return SQLITE_OK; if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam); }while( iCode==0 ); WRITE_UTF8(zOut, iCode); @@ -1385,8 +1385,11 @@ static int fts5TriTokenize( /* Read characters from the input up until the first non-diacritic */ do { iNext = zIn - (const unsigned char*)pText; + if( zIn>=zEof ){ + iCode = 0; + break; + } READ_UTF8(zIn, zEof, iCode); - if( iCode==0 ) break; if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam); }while( iCode==0 ); diff --git a/ext/fts5/test/fts5trigram.test b/ext/fts5/test/fts5trigram.test index 5048f8beea..377e3f7813 100644 --- a/ext/fts5/test/fts5trigram.test +++ b/ext/fts5/test/fts5trigram.test @@ -350,5 +350,17 @@ do_execsql_test 11.1 { INSERT INTO t4 VALUES( str('') ); } +do_test 12.0 { + sqlite3_fts5_tokenize db trigram "abcd" +} {abc 0 3 bcd 1 4} + +do_test 12.1 { + sqlite3_fts5_tokenize db trigram "a" +} {} + +do_test 12.2 { + sqlite3_fts5_tokenize db trigram "" +} {} + finish_test diff --git a/manifest b/manifest index 7154ba0838..4b02cc873e 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Add\sthe\s".dbtotxt"\scommand\sto\sthe\sCLI. -D 2024-11-11T19:07:58.682 +C Fix\sthe\sfts5\strigram\stokenizer\sso\sthat\sit\shandles\snon-nul-terminated\sstrings. +D 2024-11-11T19:49:26.299 F .fossil-settings/empty-dirs dbb81e8fc0401ac46a1491ab34a7f2c7c0452f2f06b54ebb845d024ca8283ef1 F .fossil-settings/ignore-glob 35175cdfcf539b2318cb04a9901442804be81cd677d8b889fcc9149c21f239ea F LICENSE.md e108e1e69ae8e8a59e93c455654b8ac9356a11720d3345df2a4743e9590fb20d @@ -113,10 +113,10 @@ F ext/fts5/fts5_hash.c adda4272be401566a6e0ba1acbe70ee5cb97fce944bc2e04dc707152a F ext/fts5/fts5_index.c 368a968570ce12ba40223e284a588d9f93ee23a0133727f0df1fcd64086b1fb6 F ext/fts5/fts5_main.c 50eb059e51d730e8e0c77df4e568b018079e112a755c094488b0d5b1aa06afbb F ext/fts5/fts5_storage.c 337b05e4c66fc822d031e264d65bde807ec2fab08665ca2cc8aaf9c5fa06792c -F ext/fts5/fts5_tcl.c aee6ae6d0c6968564c392bf0d09aaabb4d8bea9ca69fd224dc9b44243324acbf +F ext/fts5/fts5_tcl.c 7fb5a3d3404099075aaa2457307cb459bbc257c0de3dbd52b1e80a5b503e0329 F ext/fts5/fts5_test_mi.c 08c11ec968148d4cb4119d96d819f8c1f329812c568bac3684f5464be177d3ee F ext/fts5/fts5_test_tok.c 3cb0a9b508b30d17ef025ccddd26ae3dc8ddffbe76c057616e59a9aa85d36f3b -F ext/fts5/fts5_tokenize.c 033e2e43b8e852c0ef6cecc611266d61e2346e52ec7dcfb76a428fe56a07efa9 +F ext/fts5/fts5_tokenize.c 87ab719f0556172da3414f1741c11bb4d333ebecde157945a55478bfe6e46c44 F ext/fts5/fts5_unicode2.c 6f9b0fb79a8facaed76628ffd4eb9c16d7f2b84b52872784f617cf3422a9b043 F ext/fts5/fts5_varint.c e64d2113f6e1bfee0032972cffc1207b77af63319746951bf1d09885d1dadf80 F ext/fts5/fts5_vocab.c e4830b00809e5da53bc10f93adc59e321407b0f801c7f4167c0e47f5552267e0 @@ -248,7 +248,7 @@ F ext/fts5/test/fts5tok2.test dcacb32d4a2a3f0dd3215d4a3987f78ae4be21a2 F ext/fts5/test/fts5tokenizer.test 7937cec672b148223fff8746d21d3e7ed0965fd7caf35ccdc888a005bb452f98 F ext/fts5/test/fts5tokenizer2.test ddb8b10fbe4b84b2a75812671f127774c1d2e3e2bf82d2e0e4f0bb1cd8a2b2d6 F ext/fts5/test/fts5tokenizer3.test eea778f7bb7024c3e904e28915f9d53286141671b138722148be22a9c758bdc3 -F ext/fts5/test/fts5trigram.test 9927c9e9b35116ea00748c8e41d9cbc2b95a6c90845cd82a59c11fedfd16404a +F ext/fts5/test/fts5trigram.test a55fde7065ae69a0f82c5a7a5bf5286a97de11ae4bff6537fd3e27ca9a01416f F ext/fts5/test/fts5trigram2.test 6fde9de7f63a6b4aa18dc731be56dbd6be4e755c9b13dcd55479e200d1df0e61 F ext/fts5/test/fts5ubsan.test 9a2dcf399dc8d0e0de661f0d93884d1d27e5b7f0693cfceb97dd24d818df5dd2 F ext/fts5/test/fts5umlaut.test a42fe2fe6387c40c49ab27ccbd070e1ae38e07f38d05926482cc0bccac9ad602 @@ -2198,8 +2198,8 @@ F tool/version-info.c 3b36468a90faf1bbd59c65fd0eb66522d9f941eedd364fabccd7227350 F tool/warnings-clang.sh bbf6a1e685e534c92ec2bfba5b1745f34fb6f0bc2a362850723a9ee87c1b31a7 F tool/warnings.sh 49a486c5069de041aedcbde4de178293e0463ae9918ecad7539eedf0ec77a139 F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f -P 073080cae3ea0d12b133d9c9ae98413bb32870a9738c98b89bc345181be81f23 -R 6bdbd63031b22370793e2437aff7acb5 -U drh -Z d185df0f51d41a6d3859bb43f39c0c62 +P b43acf5a8cd4a5efbb90b71af7710084f49bb90ffe4f56de168e8c3a6b679124 +R 823df7bdc2f581383fdd27b861d3511c +U dan +Z 55fb3f376e1035a7680545a25c6be334 # Remove this line to create a well-formed Fossil manifest. diff --git a/manifest.uuid b/manifest.uuid index b4972f3476..4728648ef5 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -b43acf5a8cd4a5efbb90b71af7710084f49bb90ffe4f56de168e8c3a6b679124 +84f4e37178a65e3128ac0240d37ac40df08b4050ab070d10707e35d11dcbeb10