From: dan Date: Tue, 6 Oct 2015 20:53:26 +0000 (+0000) Subject: Optimizations for fts5 expressions that filter on column. More still to come. X-Git-Tag: version-3.9.0~34 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=a2507137f337255ccc708b7bfcda1ac54181fbcf;p=thirdparty%2Fsqlite.git Optimizations for fts5 expressions that filter on column. More still to come. FossilOrigin-Name: bf1607ac155018573ca40fb58aca62c5fea7e60b --- diff --git a/ext/fts5/fts5Int.h b/ext/fts5/fts5Int.h index 99b07bfb2f..3ab8dcdc93 100644 --- a/ext/fts5/fts5Int.h +++ b/ext/fts5/fts5Int.h @@ -81,6 +81,20 @@ extern int sqlite3_fts5_may_be_corrupt; #endif typedef struct Fts5Global Fts5Global; +typedef struct Fts5ExprColset Fts5ExprColset; + +/* If a NEAR() clump or phrase may only match a specific set of columns, +** then an object of the following type is used to record the set of columns. +** Each entry in the aiCol[] array is a column that may be matched. +** +** This object is used by fts5_expr.c and fts5_index.c. +*/ +struct Fts5ExprColset { + int nCol; + int aiCol[1]; +}; + + /************************************************************************** ** Interface to code in fts5_config.c. fts5_config.c contains contains code @@ -305,7 +319,7 @@ int sqlite3Fts5IndexClose(Fts5Index *p); /* ** for( -** pIter = sqlite3Fts5IndexQuery(p, "token", 5, 0); +** sqlite3Fts5IndexQuery(p, "token", 5, 0, 0, &pIter); ** 0==sqlite3Fts5IterEof(pIter); ** sqlite3Fts5IterNext(pIter) ** ){ @@ -321,7 +335,8 @@ int sqlite3Fts5IndexQuery( Fts5Index *p, /* FTS index to query */ const char *pToken, int nToken, /* Token (or prefix) to query for */ int flags, /* Mask of FTS5INDEX_QUERY_X flags */ - Fts5IndexIter **ppIter + Fts5ExprColset *pColset, /* Match these columns only */ + Fts5IndexIter **ppIter /* OUT: New iterator object */ ); /* @@ -567,7 +582,6 @@ typedef struct Fts5Parse Fts5Parse; typedef struct Fts5Token Fts5Token; typedef struct Fts5ExprPhrase Fts5ExprPhrase; typedef struct Fts5ExprNearset Fts5ExprNearset; -typedef struct Fts5ExprColset Fts5ExprColset; struct Fts5Token { const char *p; /* Token text (not NULL terminated) */ diff --git a/ext/fts5/fts5_expr.c b/ext/fts5/fts5_expr.c index 559f0db82b..9b0c262301 100644 --- a/ext/fts5/fts5_expr.c +++ b/ext/fts5/fts5_expr.c @@ -89,16 +89,6 @@ struct Fts5ExprPhrase { Fts5ExprTerm aTerm[1]; /* Terms that make up this phrase */ }; -/* -** If a NEAR() clump may only match a specific set of columns, then -** Fts5ExprNearset.pColset points to an object of the following type. -** Each entry in the aiCol[] array -*/ -struct Fts5ExprColset { - int nCol; - int aiCol[1]; -}; - /* ** One or more phrases that must appear within a certain token distance of ** each other within each matching document. @@ -1002,6 +992,7 @@ static int fts5ExprNearInitAll( pExpr->pIndex, p->zTerm, strlen(p->zTerm), (pTerm->bPrefix ? FTS5INDEX_QUERY_PREFIX : 0) | (pExpr->bDesc ? FTS5INDEX_QUERY_DESC : 0), + pNear->pColset, &p->pIter ); assert( rc==SQLITE_OK || p->pIter==0 ); diff --git a/ext/fts5/fts5_index.c b/ext/fts5/fts5_index.c index 490dbd7cb7..46aff77728 100644 --- a/ext/fts5/fts5_index.c +++ b/ext/fts5/fts5_index.c @@ -3942,12 +3942,81 @@ int sqlite3Fts5IndexMerge(Fts5Index *p, int nMerge){ static void fts5PoslistCallback( Fts5Index *p, - void *pCtx, + void *pContext, const u8 *pChunk, int nChunk ){ assert_nc( nChunk>=0 ); if( nChunk>0 ){ - fts5BufferAppendBlob(&p->rc, (Fts5Buffer*)pCtx, nChunk, pChunk); + fts5BufferAppendBlob(&p->rc, (Fts5Buffer*)pContext, nChunk, pChunk); + } +} + +typedef struct PoslistCallbackCtx PoslistCallbackCtx; +struct PoslistCallbackCtx { + Fts5Buffer *pBuf; /* Append to this buffer */ + Fts5ExprColset *pColset; /* Restrict matches to this column */ + int eState; /* See above */ +}; + +/* +** TODO: Make this more efficient! +*/ +static int fts5IndexColsetTest(Fts5ExprColset *pColset, int iCol){ + int i; + for(i=0; inCol; i++){ + if( pColset->aiCol[i]==iCol ) return 1; + } + return 0; +} + +static void fts5PoslistFilterCallback( + Fts5Index *p, + void *pContext, + const u8 *pChunk, int nChunk +){ + PoslistCallbackCtx *pCtx = (PoslistCallbackCtx*)pContext; + assert_nc( nChunk>=0 ); + if( nChunk>0 ){ + /* Search through to find the first varint with value 1. This is the + ** start of the next columns hits. */ + int i = 0; + int iStart = 0; + + if( pCtx->eState==2 ){ + int iCol; + fts5IndexGetVarint32(pChunk, i, iCol); + if( fts5IndexColsetTest(pCtx->pColset, iCol) ){ + pCtx->eState = 1; + fts5BufferAppendVarint(&p->rc, pCtx->pBuf, 1); + }else{ + pCtx->eState = 0; + } + } + + do { + while( ieState ){ + fts5BufferAppendBlob(&p->rc, pCtx->pBuf, i-iStart, &pChunk[iStart]); + } + if( i=nChunk ){ + pCtx->eState = 2; + }else{ + fts5IndexGetVarint32(pChunk, i, iCol); + pCtx->eState = fts5IndexColsetTest(pCtx->pColset, iCol); + if( pCtx->eState ){ + fts5BufferAppendBlob(&p->rc, pCtx->pBuf, i-iStart, &pChunk[iStart]); + iStart = i; + } + } + } + }while( irc. It is assumed ** no error has already occurred when this function is called. */ -static void fts5MultiIterPoslist( +static int fts5MultiIterPoslist( Fts5Index *p, Fts5IndexIter *pMulti, + Fts5ExprColset *pColset, int bSz, /* Append a size field before the data */ Fts5Buffer *pBuf ){ if( p->rc==SQLITE_OK ){ + int iSz; + int iData; + Fts5SegIter *pSeg = &pMulti->aSeg[ pMulti->aFirst[1].iFirst ]; assert( fts5MultiIterEof(p, pMulti)==0 ); if( bSz ){ /* WRITEPOSLISTSIZE */ + iSz = pBuf->n; fts5BufferAppendVarint(&p->rc, pBuf, pSeg->nPos*2); + iData = pBuf->n; + } + + fts5SegiterPoslist(p, pSeg, pColset, pBuf); + + if( bSz && pColset ){ + int nActual = pBuf->n - iData; + if( nActual!=pSeg->nPos ){ + /* WRITEPOSLISTSIZE */ + if( nActual==0 ){ + return 1; + }else{ + int nReq = sqlite3Fts5GetVarintLen((u32)(nActual*2)); + while( iSz<(iData-nReq) ){ pBuf->p[iSz++] = 0x80; } + sqlite3Fts5PutVarint(&pBuf->p[iSz], nActual*2); + } + } } - fts5SegiterPoslist(p, pSeg, pBuf); } + + return 0; } static void fts5DoclistIterNext(Fts5DoclistIter *pIter){ @@ -4149,7 +4251,8 @@ static void fts5SetupPrefixIter( int bDesc, /* True for "ORDER BY rowid DESC" */ const u8 *pToken, /* Buffer containing prefix to match */ int nToken, /* Size of buffer pToken in bytes */ - Fts5IndexIter **ppIter /* OUT: New iterator */ + Fts5ExprColset *pColset, /* Restrict matches to these columns */ + Fts5IndexIter **ppIter /* OUT: New iterator */ ){ Fts5Structure *pStruct; Fts5Buffer *aBuf; @@ -4192,8 +4295,14 @@ static void fts5SetupPrefixIter( } if( 0==sqlite3Fts5BufferGrow(&p->rc, &doclist, 9) ){ - fts5MergeAppendDocid(&doclist, iLastRowid, iRowid); - fts5MultiIterPoslist(p, p1, 1, &doclist); + int iSave = doclist.n; + assert( doclist.n!=0 || iLastRowid==0 ); + fts5BufferSafeAppendVarint(&doclist, iRowid - iLastRowid); + if( fts5MultiIterPoslist(p, p1, pColset, 1, &doclist) ){ + doclist.n = iSave; + }else{ + iLastRowid = iRowid; + } } } @@ -4427,6 +4536,7 @@ int sqlite3Fts5IndexQuery( Fts5Index *p, /* FTS index to query */ const char *pToken, int nToken, /* Token (or prefix) to query for */ int flags, /* Mask of FTS5INDEX_QUERY_X flags */ + Fts5ExprColset *pColset, /* Match these columns only */ Fts5IndexIter **ppIter /* OUT: New iterator object */ ){ Fts5Config *pConfig = p->pConfig; @@ -4470,7 +4580,7 @@ int sqlite3Fts5IndexQuery( }else{ int bDesc = (flags & FTS5INDEX_QUERY_DESC)!=0; buf.p[0] = FTS5_MAIN_PREFIX; - fts5SetupPrefixIter(p, bDesc, buf.p, nToken+1, &pRet); + fts5SetupPrefixIter(p, bDesc, buf.p, nToken+1, pColset, &pRet); } if( p->rc ){ @@ -4572,7 +4682,7 @@ int sqlite3Fts5IterPoslist( *pp = &pSeg->pLeaf->p[pSeg->iLeafOffset]; }else{ fts5BufferZero(&pIter->poslist); - fts5SegiterPoslist(pIter->pIndex, pSeg, &pIter->poslist); + fts5SegiterPoslist(pIter->pIndex, pSeg, 0, &pIter->poslist); *pp = pIter->poslist.p; } return fts5IndexReturn(pIter->pIndex); @@ -4588,7 +4698,7 @@ int sqlite3Fts5IterPoslistBuffer(Fts5IndexIter *pIter, Fts5Buffer *pBuf){ assert( p->rc==SQLITE_OK ); fts5BufferZero(pBuf); - fts5MultiIterPoslist(p, pIter, 0, pBuf); + fts5MultiIterPoslist(p, pIter, 0, 0, pBuf); return fts5IndexReturn(p); } @@ -4763,7 +4873,7 @@ static int fts5QueryCksum( ){ u64 cksum = *pCksum; Fts5IndexIter *pIdxIter = 0; - int rc = sqlite3Fts5IndexQuery(p, z, n, flags, &pIdxIter); + int rc = sqlite3Fts5IndexQuery(p, z, n, flags, 0, &pIdxIter); while( rc==SQLITE_OK && 0==sqlite3Fts5IterEof(pIdxIter) ){ i64 dummy; @@ -5137,7 +5247,7 @@ int sqlite3Fts5IndexIntegrityCheck(Fts5Index *p, u64 cksum){ fts5TestTerm(p, &term, z, n, cksum2, &cksum3); poslist.n = 0; - fts5MultiIterPoslist(p, pIter, 0, &poslist); + fts5MultiIterPoslist(p, pIter, 0, 0, &poslist); while( 0==sqlite3Fts5PoslistNext64(poslist.p, poslist.n, &iOff, &iPos) ){ int iCol = FTS5_POS2COLUMN(iPos); int iTokOff = FTS5_POS2OFFSET(iPos); diff --git a/ext/fts5/fts5_vocab.c b/ext/fts5/fts5_vocab.c index bdf2e36b63..1f80adc372 100644 --- a/ext/fts5/fts5_vocab.c +++ b/ext/fts5/fts5_vocab.c @@ -402,7 +402,7 @@ static int fts5VocabFilterMethod( const int flags = FTS5INDEX_QUERY_SCAN; fts5VocabResetCursor(pCsr); - rc = sqlite3Fts5IndexQuery(pCsr->pIndex, 0, 0, flags, &pCsr->pIter); + rc = sqlite3Fts5IndexQuery(pCsr->pIndex, 0, 0, flags, 0, &pCsr->pIter); if( rc==SQLITE_OK ){ rc = fts5VocabNextMethod(pCursor); } diff --git a/ext/fts5/test/fts5prefix.test b/ext/fts5/test/fts5prefix.test index 076ecaa09b..f4fcc4bfa3 100644 --- a/ext/fts5/test/fts5prefix.test +++ b/ext/fts5/test/fts5prefix.test @@ -62,6 +62,89 @@ foreach {tn q res} { do_execsql_test 2.3.$tn $q $res } +#------------------------------------------------------------------------- +# Check that prefix queries with: +# +# * a column filter, and +# * no prefix index. +# +# work Ok. +# +do_execsql_test 3.0 { + CREATE VIRTUAL TABLE t3 USING fts5(a, b, c); + INSERT INTO t3(t3, rank) VALUES('pgsz', 32); + BEGIN; + INSERT INTO t3 VALUES('acb ccc bba', 'cca bba bca', 'bbc ccc bca'); -- 1 + INSERT INTO t3 VALUES('cbb cac cab', 'abb aac bba', 'aab ccc cac'); -- 2 + INSERT INTO t3 VALUES('aac bcb aac', 'acb bcb caa', 'aca bab bca'); -- 3 + INSERT INTO t3 VALUES('aab ccb ccc', 'aca cba cca', 'aca aac cbb'); -- 4 + INSERT INTO t3 VALUES('bac aab bab', 'ccb bac cba', 'acb aba abb'); -- 5 + INSERT INTO t3 VALUES('bab abc ccb', 'acb cba abb', 'cbb aaa cab'); -- 6 + INSERT INTO t3 VALUES('cbb bbc baa', 'aab aca baa', 'bcc cca aca'); -- 7 + INSERT INTO t3 VALUES('abc bba abb', 'cac abc cba', 'acc aac cac'); -- 8 + INSERT INTO t3 VALUES('bbc bbc cab', 'bcb ccb cba', 'bcc cac acb'); -- 9 + COMMIT; +} + +foreach {tn match res} { + 1 "a : c*" {1 2 4 6 7 9} + 2 "b : c*" {1 3 4 5 6 8 9} + 3 "c : c*" {1 2 4 6 7 8 9} + 4 "a : b*" {1 3 5 6 7 8 9} + 5 "b : b*" {1 2 3 5 7 9} + 6 "c : b*" {1 3 7 9} + 7 "a : a*" {1 3 4 5 6 8} + 8 "b : a*" {2 3 4 6 7 8} + 9 "c : a*" {2 3 4 5 6 7 8 9} +} { + do_execsql_test 3.1.$tn { + SELECT rowid FROM t3($match) + } $res +} + +do_test 3.2 { + expr srand(0) + execsql { DELETE FROM t3 } + for {set i 0} {$i < 1000} {incr i} { + set a [fts5_rnddoc 3] + set b [fts5_rnddoc 8] + set c [fts5_rnddoc 20] + execsql { INSERT INTO t3 VALUES($a, $b, $c) } + } + execsql { INSERT INTO t3(t3) VALUES('integrity-check') } +} {} + +proc gmatch {col pattern} { + expr {[lsearch -glob $col $pattern]>=0} +} +db func gmatch gmatch + +for {set x 0} {$x<2} {incr x} { + foreach {tn pattern} { + 1 {xa*} + 2 {xb*} + 3 {xc*} + 4 {xd*} + 5 {xe*} + 6 {xf*} + 7 {xg*} + 8 {xh*} + 9 {xi*} + 10 {xj*} + } { + foreach col {b} { + set res [db eval "SELECT rowid FROM t3 WHERE gmatch($col, '$pattern')"] + set query "$col : $pattern" + do_execsql_test 3.3.$x.$tn.$col { + SELECT rowid FROM t3($query); + } $res + } + } + execsql { INSERT INTO t3(t3) VALUES('optimize') } + execsql { INSERT INTO t3(t3) VALUES('integrity-check') } +} + finish_test + diff --git a/ext/fts5/test/fts5simple.test b/ext/fts5/test/fts5simple.test index ecdb11080c..77407f5e2c 100644 --- a/ext/fts5/test/fts5simple.test +++ b/ext/fts5/test/fts5simple.test @@ -239,6 +239,17 @@ do_execsql_test 9.3 { SELECT rowid FROM ft2('b AND c'); } {2} +#------------------------------------------------------------------------- +# +do_execsql_test 10.0 { + CREATE VIRTUAL TABLE t3 USING fts5(a, b, c); + INSERT INTO t3 VALUES('bac aab bab', 'c bac c', 'acb aba abb'); -- 1 + INSERT INTO t3 VALUES('bab abc c', 'acb c abb', 'c aaa c'); -- 2 +} + +do_execsql_test 10.1 { + SELECT rowid FROM t3('c: c*'); +} {2} finish_test diff --git a/manifest b/manifest index f4bc8f8a8c..b5fce18292 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Fix\sthe\sLIMIT\sand\sOFFSET\shandling\sfor\sUNION\sALL\squeries\sthat\scontain\sa\s\nsubquery\swith\sORDER\sBY\son\sthe\sright-hand\sside.\s\sFix\sfor\sticket\n[b65cb2c8d91f668584]. -D 2015-10-06T17:27:18.737 +C Optimizations\sfor\sfts5\sexpressions\sthat\sfilter\son\scolumn.\sMore\sstill\sto\scome. +D 2015-10-06T20:53:26.753 F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f F Makefile.in 2143eeef6d0cc26006ae5fc4bb242a4a8b973412 F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23 @@ -106,13 +106,13 @@ F ext/fts3/unicode/mkunicode.tcl 95cf7ec186e48d4985e433ff8a1c89090a774252 F ext/fts3/unicode/parseunicode.tcl da577d1384810fb4e2b209bf3313074353193e95 F ext/fts5/extract_api_docs.tcl a36e54ec777172ddd3f9a88daf593b00848368e0 F ext/fts5/fts5.h 98f802fe41481f9d797fce496f0fefcad72c7782 -F ext/fts5/fts5Int.h ff78a77d819a7fc04a7f8b08b0e1ce361a3395e4 +F ext/fts5/fts5Int.h eba5b20f1049a8908f867ff1b59299f49bb392a4 F ext/fts5/fts5_aux.c 7a307760a9c57c750d043188ec0bad59f5b5ec7e F ext/fts5/fts5_buffer.c 54b18497395a19dfe1d00f63a3b403e5f93d4fd1 F ext/fts5/fts5_config.c 57ee5fe71578cb494574fc0e6e51acb9a22a8695 -F ext/fts5/fts5_expr.c 667faaf14a69a5683ac383acdc8d942cf32c3f93 +F ext/fts5/fts5_expr.c bd2618ceaaadadbc8a4792ba977b393d2d1d3a08 F ext/fts5/fts5_hash.c 4bf4b99708848357b8a2b5819e509eb6d3df9246 -F ext/fts5/fts5_index.c ca3912a44ef5a173ef098f3454465519bd4b8e88 +F ext/fts5/fts5_index.c 11687c48902238e1fedb0bb8e1e8b5b8f6d82e1c F ext/fts5/fts5_main.c fe5243d6bbb79217394f0ec7f4f5199ddbc9e7e8 F ext/fts5/fts5_storage.c df061a5caf9e50fbbd43113009b5b248362f4995 F ext/fts5/fts5_tcl.c 6da58d6e8f42a93c4486b5ba9b187a7f995dee37 @@ -120,7 +120,7 @@ F ext/fts5/fts5_test_mi.c e96be827aa8f571031e65e481251dc1981d608bf F ext/fts5/fts5_tokenize.c f380f46f341af9c9a9908e1aade685ba1eaa157a F ext/fts5/fts5_unicode2.c 78273fbd588d1d9bd0a7e4e0ccc9207348bae33c F ext/fts5/fts5_varint.c 3f86ce09cab152e3d45490d7586b7ed2e40c13f1 -F ext/fts5/fts5_vocab.c 4622e0b7d84a488a1585aaa56eb214ee67a988bc +F ext/fts5/fts5_vocab.c 17320c476a5296ee475ab616d95fd10515bacfec F ext/fts5/fts5parse.y 833db1101b78c0c47686ab1b84918e38c36e9452 F ext/fts5/mkportersteps.tcl 5acf962d2e0074f701620bb5308155fa1e4a63ba F ext/fts5/test/fts5_common.tcl b6e6a40ef5d069c8e86ca4fbad491e1195485dbc @@ -169,12 +169,12 @@ F ext/fts5/test/fts5optimize.test 42741e7c085ee0a1276140a752d4407d97c2c9f5 F ext/fts5/test/fts5plan.test 6a55ecbac9890765b0e16f8c421c7e0888cfe436 F ext/fts5/test/fts5porter.test 7cdc07bef301d70eebbfa75dcaf45c3680e1d0e1 F ext/fts5/test/fts5porter2.test 2e65633d58a1c525d5af0f6c01e5a59155bb3487 -F ext/fts5/test/fts5prefix.test 552a462f0e8595676611f41643de217fb4ac2808 +F ext/fts5/test/fts5prefix.test 5d4fd42696789843ff98a62f4b84e3f66ecad9d6 F ext/fts5/test/fts5rank.test 11dcebba31d822f7e99685b4ea2c2ae3ec0b16f1 F ext/fts5/test/fts5rebuild.test 03935f617ace91ed23a6099c7c74d905227ff29b F ext/fts5/test/fts5restart.test c17728fdea26e7d0f617d22ad5b4b2862b994c17 F ext/fts5/test/fts5rowid.test 400384798349d658eaf06aefa1e364957d5d4821 -F ext/fts5/test/fts5simple.test 06d4afbecc37f6f490a58ece4f2f7324cf2b2024 +F ext/fts5/test/fts5simple.test 84d22123e0a7584f1ffb6efcd37eee46f317ab90 F ext/fts5/test/fts5synonym.test cf88c0a56d5ea9591e3939ef1f6e294f7f2d0671 F ext/fts5/test/fts5tokenizer.test ea4df698b35cc427ebf2ba22829d0e28386d8c89 F ext/fts5/test/fts5unicode.test fbef8d8a3b4b88470536cc57604a82ca52e51841 @@ -1392,7 +1392,7 @@ F tool/vdbe_profile.tcl 67746953071a9f8f2f668b73fe899074e2c6d8c1 F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4 F tool/warnings.sh 48bd54594752d5be3337f12c72f28d2080cb630b F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f -P 3168326ebfa1c961d8fc6435453b02be23d910cc -R 3e47dd30d61b7e1fff555418e15c6a1d -U drh -Z e6b33681aca5812f13faee0e253c4fc3 +P 4b631364354068af95a01630469cb6fbfe8b52fd +R 8d25a208807a117d422df371476abd9e +U dan +Z f7ba9c7b6fd75653c93e200489a4e473 diff --git a/manifest.uuid b/manifest.uuid index b73745c378..709fd4d260 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -4b631364354068af95a01630469cb6fbfe8b52fd \ No newline at end of file +bf1607ac155018573ca40fb58aca62c5fea7e60b \ No newline at end of file