From: dan Date: Sat, 2 Dec 2023 17:32:16 +0000 (+0000) Subject: Ensure that tokendata=1 queries avoid loading large doclists for queries like "common... X-Git-Tag: version-3.45.0~114^2~10 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=c44041e03bc4d7ad0a5edbe8277a325eaaf5f5e6;p=thirdparty%2Fsqlite.git Ensure that tokendata=1 queries avoid loading large doclists for queries like "common AND uncommon", just as tokendata=0 queries do. FossilOrigin-Name: 7bda09ab404a110d57449e149a3281fca8dc4cacf7bd9832ea2a1356ad20fe8e --- diff --git a/ext/fts5/fts5Int.h b/ext/fts5/fts5Int.h index cee9528858..911f547d17 100644 --- a/ext/fts5/fts5Int.h +++ b/ext/fts5/fts5Int.h @@ -397,6 +397,7 @@ struct Fts5IndexIter { #define FTS5INDEX_QUERY_NOOUTPUT 0x0020 #define FTS5INDEX_QUERY_SKIPHASH 0x0040 #define FTS5INDEX_QUERY_NOTOKENDATA 0x0080 +#define FTS5INDEX_QUERY_SCANONETERM 0x0100 /* ** Create/destroy an Fts5Index object. @@ -786,6 +787,7 @@ int sqlite3Fts5ExprPhraseCollist(Fts5Expr *, int, const u8 **, int *); int sqlite3Fts5ExprQueryToken(Fts5Expr*, int, int, const char**, int*); int sqlite3Fts5ExprInstToken(Fts5Expr*, int, int, int, int, const char**, int*); +void sqlite3Fts5ExprClearTokens(Fts5Expr*); /******************************************* ** The fts5_expr.c API above this point is used by the other hand-written diff --git a/ext/fts5/fts5_expr.c b/ext/fts5/fts5_expr.c index 89e7cbf364..95d102062d 100644 --- a/ext/fts5/fts5_expr.c +++ b/ext/fts5/fts5_expr.c @@ -3050,17 +3050,6 @@ int sqlite3Fts5ExprPopulatePoslists( sCtx.aPopulator = aPopulator; sCtx.iOff = (((i64)iCol) << 32) - 1; - /* If this is a tokendata=1 table, clear out the hash tables of - ** full-terms. */ - if( pConfig->bTokendata ){ - for(i=0; inPhrase; i++){ - Fts5ExprTerm *pT; - for(pT=&pExpr->apExprPhrase[i]->aTerm[0]; pT; pT=pT->pSynonym){ - sqlite3Fts5IndexIterClearTokendata(pT->pIter); - } - } - } - for(i=0; inPhrase; i++){ Fts5ExprNode *pNode = pExpr->apExprPhrase[i]->pNode; Fts5Colset *pColset = pNode->pNear->pColset; @@ -3225,3 +3214,17 @@ int sqlite3Fts5ExprInstToken( return sqlite3Fts5IterToken(pIter, iRowid, iCol, iOff+iToken, ppOut, pnOut); } +/* +** Clear the token mappings for all Fts5IndexIter objects mannaged by +** the expression passed as the only argument. +*/ +void sqlite3Fts5ExprClearTokens(Fts5Expr *pExpr){ + int ii; + for(ii=0; iinPhrase; ii++){ + Fts5ExprTerm *pT; + for(pT=&pExpr->apExprPhrase[ii]->aTerm[0]; pT; pT=pT->pSynonym){ + sqlite3Fts5IndexIterClearTokendata(pT->pIter); + } + } +} + diff --git a/ext/fts5/fts5_index.c b/ext/fts5/fts5_index.c index 05f4c5e6aa..94b4767677 100644 --- a/ext/fts5/fts5_index.c +++ b/ext/fts5/fts5_index.c @@ -366,6 +366,7 @@ struct Fts5Index { sqlite3_stmt *pIdxWriter; /* "INSERT ... %_idx VALUES(?,?,?,?)" */ sqlite3_stmt *pIdxDeleter; /* "DELETE FROM %_idx WHERE segid=?" */ sqlite3_stmt *pIdxSelect; + sqlite3_stmt *pIdxNextSelect; int nRead; /* Total number of blocks read */ sqlite3_stmt *pDeleteFromIdx; @@ -2660,7 +2661,7 @@ static void fts5SegIterSeekInit( fts5LeafSeek(p, bGe, pIter, pTerm, nTerm); } - if( p->rc==SQLITE_OK && bGe==0 ){ + if( p->rc==SQLITE_OK && (bGe==0 || (flags & FTS5INDEX_QUERY_SCANONETERM)) ){ pIter->flags |= FTS5_SEGITER_ONETERM; if( pIter->pLeaf ){ if( flags & FTS5INDEX_QUERY_DESC ){ @@ -2693,6 +2694,79 @@ static void fts5SegIterSeekInit( ); } + +/* +** SQL used by fts5SegIterNextInit() to find the page to open. +*/ +static sqlite3_stmt *fts5IdxNextStmt(Fts5Index *p){ + if( p->pIdxNextSelect==0 ){ + Fts5Config *pConfig = p->pConfig; + fts5IndexPrepareStmt(p, &p->pIdxNextSelect, sqlite3_mprintf( + "SELECT pgno FROM '%q'.'%q_idx' WHERE " + "segid=? AND term>? ORDER BY term ASC LIMIT 1", + pConfig->zDb, pConfig->zName + )); + + } + return p->pIdxNextSelect; +} + +/* +** This is similar to fts5SegIterSeekInit(), except that it initializes +** the segment iterator to point to the first term following the page +** with pToken/nToken on it. +*/ +static void fts5SegIterNextInit( + Fts5Index *p, + const char *pTerm, int nTerm, + Fts5StructureSegment *pSeg, /* Description of segment */ + Fts5SegIter *pIter /* Object to populate */ +){ + int iPg = -1; /* Page of segment to open */ + int bDlidx = 0; + sqlite3_stmt *pSel = 0; /* SELECT to find iPg */ + + pSel = fts5IdxNextStmt(p); + if( pSel ){ + assert( p->rc==SQLITE_OK ); + sqlite3_bind_int(pSel, 1, pSeg->iSegid); + sqlite3_bind_blob(pSel, 2, pTerm, nTerm, SQLITE_STATIC); + + if( sqlite3_step(pSel)==SQLITE_ROW ){ + i64 val = sqlite3_column_int64(pSel, 0); + iPg = (int)(val>>1); + bDlidx = (val & 0x0001); + } + p->rc = sqlite3_reset(pSel); + if( p->rc ) return; + } + + memset(pIter, 0, sizeof(*pIter)); + pIter->pSeg = pSeg; + pIter->flags |= FTS5_SEGITER_ONETERM; + if( iPg>=0 ){ + pIter->iLeafPgno = iPg - 1; + fts5SegIterNextPage(p, pIter); + fts5SegIterSetNext(p, pIter); + fts5SegIterAllocTombstone(p, pIter); + } + if( pIter->pLeaf ){ + const u8 *a = pIter->pLeaf->p; + int iTermOff = 0; + + pIter->iPgidxOff = pIter->pLeaf->szLeaf; + pIter->iPgidxOff += fts5GetVarint32(&a[pIter->iPgidxOff], iTermOff); + pIter->iLeafOffset = iTermOff; + fts5SegIterLoadTerm(p, pIter, 0); + fts5SegIterLoadNPos(p, pIter); + if( bDlidx ) fts5SegIterLoadDlidx(p, pIter); + + assert( p->rc!=SQLITE_OK || + fts5BufferCompareBlob(&pIter->term, pTerm, nTerm)>0 + ); + } +} + /* ** Initialize the object pIter to point to term pTerm/nTerm within the ** in-memory hash table. If there is no such term in the hash-table, the @@ -6346,6 +6420,7 @@ int sqlite3Fts5IndexClose(Fts5Index *p){ sqlite3_finalize(p->pIdxWriter); sqlite3_finalize(p->pIdxDeleter); sqlite3_finalize(p->pIdxSelect); + sqlite3_finalize(p->pIdxNextSelect); sqlite3_finalize(p->pDataVersion); sqlite3_finalize(p->pDeleteFromIdx); sqlite3Fts5HashFree(p->pHash); @@ -6496,7 +6571,7 @@ static Fts5TokenDataIter *fts5AppendTokendataIter( if( p->rc==SQLITE_OK ){ if( pIn==0 || pIn->nIter==pIn->nIterAlloc ){ int nAlloc = pIn ? pIn->nIterAlloc*2 : 16; - int nByte = nAlloc * sizeof(Fts5Iter*); + int nByte = nAlloc * sizeof(Fts5Iter*) + sizeof(Fts5TokenDataIter); Fts5TokenDataIter *pNew = (Fts5TokenDataIter*)sqlite3_realloc(pIn, nByte); if( pNew==0 ){ @@ -6513,6 +6588,7 @@ static Fts5TokenDataIter *fts5AppendTokendataIter( }else{ pRet->apIter[pRet->nIter++] = pAppend; } + assert( pRet==0 || pRet->nIter<=pRet->nIterAlloc ); return pRet; } @@ -6747,6 +6823,10 @@ static void fts5TokendataSetTermIfEof(Fts5Iter *pIter, Fts5Buffer *pTerm){ } } +/* +** This function sets up an iterator to use for a non-prefix query on a +** tokendata=1 table. +*/ static Fts5Iter *fts5SetupTokendataIter( Fts5Index *p, /* FTS index to query */ const u8 *pToken, /* Buffer containing query term */ @@ -6756,7 +6836,7 @@ static Fts5Iter *fts5SetupTokendataIter( Fts5Iter *pRet = 0; Fts5TokenDataIter *pSet = 0; Fts5Structure *pStruct = 0; - const int flags = FTS5INDEX_QUERY_SKIPEMPTY | FTS5INDEX_QUERY_SCAN; + const int flags = FTS5INDEX_QUERY_SCANONETERM | FTS5INDEX_QUERY_SCAN; Fts5Buffer bSeek = {0, 0, 0}; Fts5Buffer *pSmall = 0; @@ -6787,20 +6867,32 @@ static Fts5Iter *fts5SetupTokendataIter( for(iLvl=0; iLvlnLevel; iLvl++){ for(iSeg=pStruct->aLevel[iLvl].nSeg-1; iSeg>=0; iSeg--){ Fts5StructureSegment *pSeg = &pStruct->aLevel[iLvl].aSeg[iSeg]; - fts5SegIterSeekInit(p, bSeek.p, bSeek.n, flags, pSeg, pNewIter); + int bDone = 0; - pNewIter++; if( pPrevIter ){ if( fts5BufferCompare(pSmall, &pPrevIter->term) ){ - fts5SegIterSetEOF(pPrevIter); + memcpy(pNewIter, pPrevIter, sizeof(Fts5SegIter)); + memset(pPrevIter, 0, sizeof(Fts5SegIter)); + bDone = 1; + }else if( pPrevIter->pLeaf + && pPrevIter->iEndofDoclist>pPrevIter->pLeaf->szLeaf + ){ + fts5SegIterNextInit(p,(const char*)bSeek.p,bSeek.n-1,pSeg,pNewIter); + bDone = 1; } - pPrevIter++; } + + if( bDone==0 ){ + fts5SegIterSeekInit(p, bSeek.p, bSeek.n, flags, pSeg, pNewIter); + } + + pNewIter++; + if( pPrevIter ) pPrevIter++; } } fts5TokendataSetTermIfEof(pPrev, pSmall); - pNew->bSkipEmpty = (0!=(flags & FTS5INDEX_QUERY_SKIPEMPTY)); + pNew->bSkipEmpty = 1; pNew->pColset = pColset; fts5IterSetOutputCb(&p->rc, pNew); @@ -7043,7 +7135,6 @@ int sqlite3Fts5IterToken( */ void sqlite3Fts5IndexIterClearTokendata(Fts5IndexIter *pIndexIter){ Fts5Iter *pIter = (Fts5Iter*)pIndexIter; - assert( pIter->pIndex->pConfig->eDetail!=FTS5_DETAIL_FULL ); if( pIter->pTokenDataIter ){ pIter->pTokenDataIter->nMap = 0; } diff --git a/ext/fts5/fts5_main.c b/ext/fts5/fts5_main.c index 34050474f8..e911b0c0f9 100644 --- a/ext/fts5/fts5_main.c +++ b/ext/fts5/fts5_main.c @@ -916,6 +916,16 @@ static int fts5NextMethod(sqlite3_vtab_cursor *pCursor){ ); assert( !CsrFlagTest(pCsr, FTS5CSR_EOF) ); + /* If this cursor uses FTS5_PLAN_MATCH and this is a tokendata=1 table, + ** clear any token mappings accumulated at the fts5_index.c level. In + ** other cases, specifically FTS5_PLAN_SOURCE and FTS5_PLAN_SORTED_MATCH, + ** we need to retain the mappings for the entire query. */ + if( pCsr->ePlan==FTS5_PLAN_MATCH + && ((Fts5Table*)pCursor->pVtab)->pConfig->bTokendata + ){ + sqlite3Fts5ExprClearTokens(pCsr->pExpr); + } + if( pCsr->ePlan<3 ){ int bSkip = 0; if( (rc = fts5CursorReseek(pCsr, &bSkip)) || bSkip ) return rc; diff --git a/ext/fts5/test/fts5origintext2.test b/ext/fts5/test/fts5origintext2.test index 948db1c519..a27309fe0c 100644 --- a/ext/fts5/test/fts5origintext2.test +++ b/ext/fts5/test/fts5origintext2.test @@ -98,7 +98,6 @@ do_execsql_test 1.10 { INSERT INTO ft VALUES('WORLD'); } -breakpoint do_execsql_test 1.11 { SELECT rowid FROM ft('hello'); } {1 2 3} do_execsql_test 1.12 { SELECT rowid FROM ft('today'); } {4 5 6} do_execsql_test 1.13 { SELECT rowid FROM ft('world'); } {7 8 9} diff --git a/ext/fts5/test/fts5origintext3.test b/ext/fts5/test/fts5origintext3.test index 2b1e5c6387..ac00bfabc0 100644 --- a/ext/fts5/test/fts5origintext3.test +++ b/ext/fts5/test/fts5origintext3.test @@ -46,7 +46,6 @@ foreach_detail_mode $testprefix { SELECT fts5_test_poslist(ft) FROM ft('hello'); } {{0.0.0 0.0.2 0.0.4}} -breakpoint do_execsql_test 1.3 { SELECT insttoken(ft, 0, 0), @@ -63,6 +62,18 @@ breakpoint FROM ft('hello') ORDER BY rank; } {hello.Hello hello.HELLO hello} + do_execsql_test 1.5 { + CREATE VIRTUAL TABLE ft2 USING fts5( + x, tokenize="origintext unicode61", tokendata=1, detail=%DETAIL% + ); + INSERT INTO ft2(rowid, x) VALUES(1, 'ONE one two three ONE'); + INSERT INTO ft2(rowid, x) VALUES(2, 'TWO one two three TWO'); + INSERT INTO ft2(rowid, x) VALUES(3, 'THREE one two three THREE'); + } + + do_execsql_test 1.6 { + SELECT insttoken(ft2, 0, 0), rowid FROM ft2('three') ORDER BY rank; + } {three.THREE 3 three 1 three 2} } finish_test diff --git a/ext/fts5/test/fts5origintext4.test b/ext/fts5/test/fts5origintext4.test new file mode 100644 index 0000000000..8973a24b05 --- /dev/null +++ b/ext/fts5/test/fts5origintext4.test @@ -0,0 +1,66 @@ +# 2023 November 22 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# +# Tests focused on phrase queries. +# + +source [file join [file dirname [info script]] fts5_common.tcl] +set testprefix fts5origintext4 + +# If SQLITE_ENABLE_FTS5 is defined, omit this file. +ifcapable !fts5 { + finish_test + return +} + +sqlite3_fts5_register_origintext db +do_execsql_test 1.0 { + CREATE VIRTUAL TABLE ft USING fts5( + x, tokenize="origintext unicode61", tokendata=1 + ); +} + +do_execsql_test 1.1 { + BEGIN; + INSERT INTO ft SELECT 'the first thing'; + + WITH s(i) AS ( + SELECT 1 UNION ALL SELECT i+1 FROM s WHERE i<90000 + ) + INSERT INTO ft SELECT 'The second thing' FROM s; + + INSERT INTO ft SELECT 'the first thing'; + COMMIT; + INSERT INTO ft(ft) VALUES('optimize'); +} + +foreach {tn sql expr} { + 1 { SELECT rowid FROM ft('the') } {$mem > 250000} + 2 { SELECT rowid FROM ft('first') } {$mem < 50000} + 3 { SELECT rowid FROM ft('the first') } {$mem < 50000} +} { + db close + sqlite3 db test.db + sqlite3_fts5_register_origintext db + + execsql $sql + do_test 1.2.$tn { + set mem [lindex [sqlite3_db_status db CACHE_USED 0] 1] + expr $expr + } 1 +} + +proc b {x} { string map [list "\0" "."] $x } +db func b b +# execsql_pp { SELECT segid, b(term), pgno from ft_idx } + +finish_test + diff --git a/manifest b/manifest index 92cc5ca8d2..60fd22f769 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Remove\sold\scode\sfor\stokendata=1\squeries. -D 2023-12-01T20:37:11.688 +C Ensure\sthat\stokendata=1\squeries\savoid\sloading\slarge\sdoclists\sfor\squeries\slike\s"common\sAND\suncommon",\sjust\sas\stokendata=0\squeries\sdo. +D 2023-12-02T17:32:16.568 F .fossil-settings/empty-dirs dbb81e8fc0401ac46a1491ab34a7f2c7c0452f2f06b54ebb845d024ca8283ef1 F .fossil-settings/ignore-glob 35175cdfcf539b2318cb04a9901442804be81cd677d8b889fcc9149c21f239ea F LICENSE.md df5091916dbb40e6e9686186587125e1b2ff51f022cc334e886c19a0e9982724 @@ -90,14 +90,14 @@ F ext/fts3/unicode/mkunicode.tcl d5aebf022fa4577ee8cdf27468f0d847879993959101f6d F ext/fts3/unicode/parseunicode.tcl a981bd6466d12dd17967515801c3ff23f74a281be1a03cf1e6f52a6959fc77eb F ext/fts5/extract_api_docs.tcl a36e54ec777172ddd3f9a88daf593b00848368e0 F ext/fts5/fts5.h 5e5630fc81e212f658afaa5b2650dac939d2729d0723aef1eeaff908f1725648 -F ext/fts5/fts5Int.h 2dc73393460e5c5cab67adc7e32e1387cc225b57e05f629d490e65cddea1a8c5 +F ext/fts5/fts5Int.h 285118aa6dfccb382e84eaeb9f7bec334e4f7104efa9303240605447003445c9 F ext/fts5/fts5_aux.c ee770eec0af8646db9e18fc01a0dad7345b5f5e8cbba236704cfae2d777022ad F ext/fts5/fts5_buffer.c 3001fbabb585d6de52947b44b455235072b741038391f830d6b729225eeaf6a5 F ext/fts5/fts5_config.c 8072a207034b51ae9b7694121d1b5715c794e94b275e088f70ae532378ca5cdf -F ext/fts5/fts5_expr.c aac8026aedf56c9a6e32b31c89f9dd7e5548378457085093307d06be14f1a176 +F ext/fts5/fts5_expr.c f83259b52b7b3e76768b835fe155cb7e345affdfafb96574372b4127d5f5496a F ext/fts5/fts5_hash.c adda4272be401566a6e0ba1acbe70ee5cb97fce944bc2e04dc707152a0ec91b1 -F ext/fts5/fts5_index.c 2296bcd6736eaf093212892474619dfdb7ac1e262732e2b72cb528172c0b13d6 -F ext/fts5/fts5_main.c 20596de592af135f68b9be875f0a28715f6562bbdedd215e1c89eac1b42e97f9 +F ext/fts5/fts5_index.c a02b6ff2d391dd9c2119f437eba1e8af5ac4b2f1798c7c39a93d73de95ad2337 +F ext/fts5/fts5_main.c 075995302198fe6f591fdbbedd415dfac564a9bfc20aea81e6fa0503b2d94af0 F ext/fts5/fts5_storage.c 5d10b9bdcce5b90656cad13c7d12ad4148677d4b9e3fca0481fca56d6601426d F ext/fts5/fts5_tcl.c cf0fd0dbe64ec272491b749e0d594f563cda03336aeb60900129e6d18b0aefb8 F ext/fts5/fts5_test_mi.c 08c11ec968148d4cb4119d96d819f8c1f329812c568bac3684f5464be177d3ee @@ -191,8 +191,9 @@ F ext/fts5/test/fts5optimize.test 36a752d24c818792032e4ff502936fc9cc5ef938721696 F ext/fts5/test/fts5optimize2.test 93e742c36b487d8874621360af5b1ce4d39b04fb9e71ce9bc34015c5fc811785 F ext/fts5/test/fts5optimize3.test bf9c91bb927d0fb2b9a06318a217a0419183ac5913842e062c7e0b98ea5d0fca F ext/fts5/test/fts5origintext.test 6574e8d2121460cda72866afe3e582693d9992f150b0703aff5981625b527e62 -F ext/fts5/test/fts5origintext2.test 3259b331073fec918e02fd4d14d50586f9a3531da047a2a8f4624983eb654229 -F ext/fts5/test/fts5origintext3.test cb0f5835f8dff5954ee20570b68ee520cf04a08f6f9ca967b9d01d27e532da37 +F ext/fts5/test/fts5origintext2.test 43b07dd62d087743322b0003a27c8efdbda6c8659a27fde71f32ead27b5a0969 +F ext/fts5/test/fts5origintext3.test e0d47c187e7c279d25aa27aa3de8dd0d26b050a74db90670c9b20d0ecfcfb52a +F ext/fts5/test/fts5origintext4.test 296b1b1e6630d492b99db0769e8127087548f0e939376047716a68b77ca3c871 F ext/fts5/test/fts5phrase.test 13e5d8e9083077b3d9c74315b3c92ec723cc6eb37c8155e0bfe1bba00559f07b F ext/fts5/test/fts5plan.test b65cfcca9ddd6fdaa118c61e17aeec8e8433bc5b6bb307abd116514f79c49c5a F ext/fts5/test/fts5porter.test 8d08010c28527db66bc3feebd2b8767504aaeb9b101a986342fa7833d49d0d15 @@ -2148,8 +2149,8 @@ F vsixtest/vsixtest.tcl 6a9a6ab600c25a91a7acc6293828957a386a8a93 F vsixtest/vsixtest.vcxproj.data 2ed517e100c66dc455b492e1a33350c1b20fbcdc F vsixtest/vsixtest.vcxproj.filters 37e51ffedcdb064aad6ff33b6148725226cd608e F vsixtest/vsixtest_TemporaryKey.pfx e5b1b036facdb453873e7084e1cae9102ccc67a0 -P 8258967411d3ff212424b25fec79ded0d8ae83e773cd35a0bbf300c94923f25b -R d3ee6e0cbe8554b06da39b239b5142bc +P b0a489e8e1bf0290c2117ab32d78b1cc7d67bcb226b55ec044c8367ebde3815b +R aa740197e8da931b3ffaf4ddf853a1ed U dan -Z cbe01276ff2d44c618e4cc899051c453 +Z 62655ccf950056e7477fcaff4611778d # Remove this line to create a well-formed Fossil manifest. diff --git a/manifest.uuid b/manifest.uuid index 359f2cca45..7fc4017fab 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -b0a489e8e1bf0290c2117ab32d78b1cc7d67bcb226b55ec044c8367ebde3815b \ No newline at end of file +7bda09ab404a110d57449e149a3281fca8dc4cacf7bd9832ea2a1356ad20fe8e \ No newline at end of file