From: dan Date: Wed, 8 Jun 2011 18:39:07 +0000 (+0000) Subject: Fix various issues to do with deferred tokens, NEAR expressions and matchinfo(). X-Git-Tag: version-3.7.7~62^2~9 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=abf2545ed9c058bd62502b858adbdddcdabef446;p=thirdparty%2Fsqlite.git Fix various issues to do with deferred tokens, NEAR expressions and matchinfo(). FossilOrigin-Name: 3972a787df5ec253b99b148385655e7b68d851fa --- diff --git a/ext/fts3/fts3.c b/ext/fts3/fts3.c index d3a49f9df6..5536a684ae 100644 --- a/ext/fts3/fts3.c +++ b/ext/fts3/fts3.c @@ -1122,9 +1122,7 @@ static int fts3InitVtab( } /* Figure out the page-size for the database. This is required in order to - ** estimate the cost of loading large doclists from the database (see - ** function sqlite3Fts3SegReaderCost() for details). - */ + ** estimate the cost of loading large doclists from the database. */ fts3DatabasePageSize(&rc, p); p->nNodeSize = p->nPgsz-35; @@ -1965,9 +1963,9 @@ static void fts3PutDeltaVarint3( static int fts3DoclistOrMerge( int bDescIdx, /* True if arguments are desc */ - u8 *a1, int n1, /* First doclist */ - u8 *a2, int n2, /* Second doclist */ - u8 **paOut, int *pnOut /* OUT: Malloc'd doclist */ + char *a1, int n1, /* First doclist */ + char *a2, int n2, /* Second doclist */ + char **paOut, int *pnOut /* OUT: Malloc'd doclist */ ){ sqlite3_int64 i1 = 0; sqlite3_int64 i2 = 0; @@ -1977,7 +1975,6 @@ static int fts3DoclistOrMerge( char *p1 = a1; char *p2 = a2; char *p; - int nOut; char *aOut; int bFirstOut = 0; @@ -2016,8 +2013,8 @@ static int fts3DoclistOrMerge( static void fts3DoclistPhraseMerge( int bDescIdx, /* True if arguments are desc */ int nDist, /* Distance from left to right (1=adjacent) */ - u8 *aLeft, int nLeft, /* Left doclist */ - u8 *aRight, int *pnRight /* IN/OUT: Right/output doclist */ + char *aLeft, int nLeft, /* Left doclist */ + char *aRight, int *pnRight /* IN/OUT: Right/output doclist */ ){ sqlite3_int64 i1 = 0; sqlite3_int64 i2 = 0; @@ -2063,83 +2060,6 @@ static void fts3DoclistPhraseMerge( *pnRight = p - aOut; } -/* -** This function merges two doclists according to the requirements of a -** NEAR operator. -*/ -static int fts3DoclistNearMerge( - int bDescIdx, - int nNear, /* Parameter to NEAR operator */ - int nTokenLeft, /* Number of tokens in LHS phrase arg */ - char *aLeft, /* Doclist for LHS (incl. positions) */ - int nLeft, /* Size of LHS doclist in bytes */ - int nTokenRight, /* As nTokenLeft */ - char *aRight, /* As aLeft */ - int nRight, /* As nRight */ - char **paOut, /* OUT: Results of merge (malloced) */ - int *pnOut /* OUT: Sized of output buffer */ -){ - char *aOut; /* Buffer to write output doclist to */ - char *aTmp; /* Temp buffer used by PoslistNearMerge() */ - - sqlite3_int64 i1 = 0; - sqlite3_int64 i2 = 0; - sqlite3_int64 iPrev = 0; - int bFirstOut = 0; - - char *pEnd1 = &aLeft[nLeft]; - char *pEnd2 = &aRight[nRight]; - char *p1 = aLeft; - char *p2 = aRight; - char *p; - - int nParam1 = nNear+nTokenRight; - int nParam2 = nNear+nTokenLeft; - - p = aOut = sqlite3_malloc(nLeft+nRight+1); - aTmp = sqlite3_malloc(2*(nLeft+nRight+1)); - if( !aOut || !aTmp ){ - sqlite3_free(aOut); - sqlite3_free(aTmp); - *paOut = 0; - *pnOut = 0; - return SQLITE_NOMEM; - } - - fts3GetDeltaVarint3(&p1, pEnd1, 0, &i1); - fts3GetDeltaVarint3(&p2, pEnd2, 0, &i2); - - while( p1 && p2 ){ - sqlite3_int64 iDiff = COMPARE_DOCID(i1, i2); - if( iDiff==0 ){ - char *pSave = p; - sqlite3_int64 iPrevSave = iPrev; - int bFirstOutSave = bFirstOut; - fts3PutDeltaVarint3(&p, bDescIdx, &iPrev, &bFirstOut, i1); - if( !fts3PoslistNearMerge(&p, aTmp, nParam1, nParam2, &p1, &p2) ){ - p = pSave; - iPrev = iPrevSave; - bFirstOut = bFirstOutSave; - } - - fts3GetDeltaVarint3(&p1, pEnd1, bDescIdx, &i1); - fts3GetDeltaVarint3(&p2, pEnd2, bDescIdx, &i2); - }else if( iDiff<0 ){ - fts3PoslistCopy(0, &p1); - fts3GetDeltaVarint3(&p1, pEnd1, bDescIdx, &i1); - }else{ - fts3PoslistCopy(0, &p2); - fts3GetDeltaVarint3(&p2, pEnd2, bDescIdx, &i2); - } - } - - sqlite3_free(aTmp); - *paOut = aOut; - *pnOut = p - aOut; - return SQLITE_OK; -} - - /* ** Merge all doclists in the TermSelect.aaOutput[] array into a single @@ -2166,7 +2086,7 @@ static int fts3TermSelectMerge(Fts3Table *p, TermSelect *pTS){ pTS->aaOutput[i] = 0; }else{ int nNew; - u8 *aNew; + char *aNew; int rc = fts3DoclistOrMerge(p->bDescIdx, pTS->aaOutput[i], pTS->anOutput[i], aOut, nOut, &aNew, &nNew @@ -2231,7 +2151,7 @@ static int fts3TermSelectCb( pTS->anOutput[iOut] = nMerge; break; }else{ - u8 *aNew; + char *aNew; int nNew; int rc = fts3DoclistOrMerge(p->bDescIdx, aMerge, nMerge, @@ -2403,7 +2323,6 @@ int sqlite3Fts3TermSegReaderCursor( pSegcsr = sqlite3_malloc(sizeof(Fts3MultiSegReader)); if( pSegcsr ){ int i; - int nCost = 0; int bFound = 0; /* True once an index has been found */ Fts3Table *p = (Fts3Table *)pCsr->base.pVtab; @@ -2436,10 +2355,6 @@ int sqlite3Fts3TermSegReaderCursor( ); pSegcsr->bLookup = !isPrefix; } - for(i=0; rc==SQLITE_OK && inSegment; i++){ - rc = sqlite3Fts3SegReaderCost(pCsr, pSegcsr->apSegment[i], &nCost); - } - pSegcsr->nCost = nCost; } *ppSegcsr = pSegcsr; @@ -3053,11 +2968,10 @@ static int fts3RenameMethod( } static int fts3SavepointMethod(sqlite3_vtab *pVtab, int iSavepoint){ - Fts3Table *p = (Fts3Table*)pVtab; UNUSED_PARAMETER(iSavepoint); - assert( p->inTransaction ); - assert( p->mxSavepoint < iSavepoint ); - TESTONLY( p->mxSavepoint = iSavepoint ); + assert( ((Fts3Table *)pVtab)->inTransaction ); + assert( ((Fts3Table *)pVtab)->mxSavepoint < iSavepoint ); + TESTONLY( ((Fts3Table *)pVtab)->mxSavepoint = iSavepoint ); return fts3SyncMethod(pVtab); } static int fts3ReleaseMethod(sqlite3_vtab *pVtab, int iSavepoint){ @@ -3328,7 +3242,6 @@ static int fts3EvalPhraseLoad( } static int fts3EvalDeferredPhrase(Fts3Cursor *pCsr, Fts3Phrase *pPhrase){ - Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; int iToken; int rc = SQLITE_OK; @@ -3450,12 +3363,10 @@ static int fts3EvalDeferredPhrase(Fts3Cursor *pCsr, Fts3Phrase *pPhrase){ */ static int fts3EvalPhraseStart(Fts3Cursor *pCsr, int bOptOk, Fts3Phrase *p){ int rc; - Fts3Doclist *pList = &p->doclist; Fts3PhraseToken *pFirst = &p->aToken[0]; Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; - assert( pList->aAll==0 ); - + assert( p->doclist.aAll==0 ); if( pCsr->bDesc==pTab->bDescIdx && bOptOk==1 && p->nToken==1 && pFirst->pSegcsr && pFirst->pSegcsr->bLookup ){ @@ -3565,14 +3476,15 @@ static int fts3EvalPhraseNext( ); pDL->pList = pDL->pNextDocid; }else{ - char *pIter; + char *pIter; /* Used to iterate through aAll */ + char *pEnd = &pDL->aAll[pDL->nAll]; /* 1 byte past end of aAll */ if( pDL->pNextDocid ){ pIter = pDL->pNextDocid; }else{ pIter = pDL->aAll; } - if( pIter>=&pDL->aAll[pDL->nAll] ){ + if( pIter>=pEnd ){ /* We have already reached the end of this doclist. EOF. */ *pbEof = 1; }else{ @@ -3586,7 +3498,17 @@ static int fts3EvalPhraseNext( pDL->pList = pIter; fts3PoslistCopy(0, &pIter); pDL->nList = (pIter - pDL->pList); + + /* pIter now points just past the 0x00 that terminates the position- + ** list for document pDL->iDocid. However, if this position-list was + ** edited in place by fts3EvalNearTrim2(), then pIter may not actually + ** point to the start of the next docid value. The following line deals + ** with this case by advancing pIter past the zero-padding added by + ** fts3EvalNearTrim2(). */ + while( pIterpNextDocid = pIter; + assert( *pIter || pIter>=&pDL->aAll[pDL->nAll] ); *pbEof = 0; } } @@ -3617,90 +3539,6 @@ static void fts3EvalStartReaders( } } -static void fts3EvalNearMerge( - int bDescIdx, - Fts3Expr *p1, - Fts3Expr *p2, - int nNear, - int *pRc -){ - if( *pRc==SQLITE_OK ){ - int rc; /* Return code */ - Fts3Phrase *pLeft = p1->pPhrase; - Fts3Phrase *pRight = p2->pPhrase; - - assert( p2->eType==FTSQUERY_PHRASE && pLeft ); - assert( p2->eType==FTSQUERY_PHRASE && pRight ); - - if( pLeft->doclist.aAll==0 ){ - sqlite3_free(pRight->doclist.aAll); - pRight->doclist.aAll = 0; - pRight->doclist.nAll = 0; - }else if( pRight->doclist.aAll ){ - char *aOut; /* Buffer in which to assemble new doclist */ - int nOut; /* Size of buffer aOut in bytes */ - - *pRc = fts3DoclistNearMerge(bDescIdx, nNear, - pLeft->nToken, pLeft->doclist.aAll, pLeft->doclist.nAll, - pRight->nToken, pRight->doclist.aAll, pRight->doclist.nAll, - &aOut, &nOut - ); - sqlite3_free(pRight->doclist.aAll); - pRight->doclist.aAll = aOut; - pRight->doclist.nAll = nOut; - } - } -} - -static void fts3EvalNearTrim(Fts3Cursor *pCsr, Fts3Expr *pExpr, int *pRc){ - - if( pExpr && SQLITE_OK==*pRc ){ - if( pExpr->eType==FTSQUERY_NEAR ){ - Fts3Expr *pLeft = pExpr->pLeft; - int nPhrase = 2; - Fts3Expr **aPhrase; - - assert( pLeft ); - assert( pExpr->pRight ); - assert( pExpr->pRight->eType==FTSQUERY_PHRASE ); - - while( pLeft->eType!=FTSQUERY_PHRASE ){ - assert( pLeft->eType==FTSQUERY_NEAR ); - assert( pLeft->pRight->eType==FTSQUERY_PHRASE ); - pLeft = pLeft->pLeft; - nPhrase++; - } - - aPhrase = (Fts3Expr **)sqlite3_malloc(sizeof(Fts3Expr *) * nPhrase); - if( !aPhrase ){ - *pRc = SQLITE_NOMEM; - }else{ - Fts3Table *p = (Fts3Table *)pCsr->base.pVtab; - int i = 1; - aPhrase[0] = pLeft; - do { - pLeft = pLeft->pParent; - aPhrase[i++] = pLeft->pRight; - }while( pLeft!=pExpr ); - - for(i=0; i<(nPhrase-1); i++){ - int nNear = aPhrase[i+1]->pParent->nNear; - fts3EvalNearMerge(p->bDescIdx, aPhrase[i], aPhrase[i+1], nNear, pRc); - } - for(i=nPhrase-2; i>=0; i--){ - int nNear = aPhrase[i+1]->pParent->nNear; - fts3EvalNearMerge(p->bDescIdx, aPhrase[i+1], aPhrase[i], nNear, pRc); - } - - sqlite3_free(aPhrase); - } - - }else{ - fts3EvalNearTrim(pCsr, pExpr->pLeft, pRc); - fts3EvalNearTrim(pCsr, pExpr->pRight, pRc); - } - } -} typedef struct Fts3TokenAndCost Fts3TokenAndCost; struct Fts3TokenAndCost { @@ -3777,6 +3615,7 @@ static int fts3EvalAverageDocsize(Fts3Cursor *pCsr, int *pnPage){ return SQLITE_CORRUPT_VTAB; } + pCsr->nDoc = nDoc; pCsr->nRowAvg = (int)(((nByte / nDoc) + p->nPgsz) / p->nPgsz); assert( pCsr->nRowAvg>0 ); rc = sqlite3_reset(pStmt); @@ -3902,7 +3741,6 @@ int sqlite3Fts3EvalStart(Fts3Cursor *pCsr, Fts3Expr *pExpr, int bOptOk){ rc = SQLITE_NOMEM; }else{ int ii; - int nDocSize; Fts3TokenAndCost *pTC = aTC; Fts3Expr **ppOr = apOr; @@ -3910,55 +3748,12 @@ int sqlite3Fts3EvalStart(Fts3Cursor *pCsr, Fts3Expr *pExpr, int bOptOk){ nToken = pTC-aTC; nOr = ppOr-apOr; - rc = fts3EvalSelectDeferred(pCsr, 0, aTC, nToken); - for(ii=0; rc==SQLITE_OK && iinOvfl) ){ - pTC = &aTC[jj]; - } - } - assert( pTC ); - - - /* At this point pTC points to the cheapest remaining token. */ - if( ii==0 ){ - if( pTC->nOvfl ){ - nDocEst = (pTC->nOvfl * pTab->nPgsz + pTab->nPgsz) / 10; - }else{ - /* TODO: Fix this so that the doclist need not be read twice. */ - Fts3PhraseToken *pToken = pTC->pToken; - int nList = 0; - char *pList = 0; - rc = fts3TermSelect(pTab, pToken, pTC->iCol, 1, &nList, &pList); - if( rc==SQLITE_OK ){ - nDocEst = fts3DoclistCountDocids(1, pList, nList); - } - sqlite3_free(pList); - if( rc==SQLITE_OK ){ - rc = sqlite3Fts3TermSegReaderCursor(pCsr, - pToken->z, pToken->n, pToken->isPrefix, &pToken->pSegcsr - ); - } - } - }else{ - if( pTC->nOvfl>=(nDocEst*nDocSize) ){ - Fts3PhraseToken *pToken = pTC->pToken; - rc = sqlite3Fts3DeferToken(pCsr, pToken, pTC->iCol); - fts3SegReaderCursorFree(pToken->pSegcsr); - pToken->pSegcsr = 0; - } - nDocEst = 1 + (nDocEst/4); + if( rc==SQLITE_OK ){ + rc = fts3EvalSelectDeferred(pCsr, 0, aTC, nToken); + for(ii=0; rc==SQLITE_OK && iipToken = 0; } -#endif sqlite3_free(aTC); } @@ -3986,6 +3781,7 @@ static int fts3EvalNearTrim2( ){ int nParam1 = nNear + pPhrase->nToken; int nParam2 = nNear + *pnToken; + int nNew; char *p2; char *pOut; int res; @@ -3994,9 +3790,15 @@ static int fts3EvalNearTrim2( res = fts3PoslistNearMerge( &pOut, aTmp, nParam1, nParam2, paPoslist, &p2 ); - pPhrase->doclist.nList = pOut - pPhrase->doclist.pList; - *paPoslist = pPhrase->doclist.pList; - *pnToken = pPhrase->nToken; + if( res ){ + nNew = (pOut - pPhrase->doclist.pList) - 1; + assert( pPhrase->doclist.pList[nNew]=='\0' ); + assert( nNew<=pPhrase->doclist.nList && nNew>0 ); + memset(&pPhrase->doclist.pList[nNew], 0, pPhrase->doclist.nList - nNew); + pPhrase->doclist.nList = nNew; + *paPoslist = pPhrase->doclist.pList; + *pnToken = pPhrase->nToken; + } return res; } @@ -4305,75 +4107,202 @@ int sqlite3Fts3EvalNext(Fts3Cursor *pCsr){ return rc; } +static void fts3EvalRestart( + Fts3Cursor *pCsr, + Fts3Expr *pExpr, + int *pRc +){ + if( pExpr && *pRc==SQLITE_OK ){ + Fts3Phrase *pPhrase = pExpr->pPhrase; + + if( pPhrase ){ + fts3EvalFreeDeferredDoclist(pPhrase); + if( pPhrase->bIncr ){ + sqlite3Fts3EvalPhraseCleanup(pPhrase); + memset(&pPhrase->doclist, 0, sizeof(Fts3Doclist)); + *pRc = sqlite3Fts3EvalStart(pCsr, pExpr, 0); + }else{ + pPhrase->doclist.pNextDocid = 0; + pPhrase->doclist.iDocid = 0; + } + } + + pExpr->iDocid = 0; + pExpr->bEof = 0; + pExpr->bStart = 0; + + fts3EvalRestart(pCsr, pExpr->pLeft, pRc); + fts3EvalRestart(pCsr, pExpr->pRight, pRc); + } +} + +static void fts3EvalUpdateCounts( + Fts3Cursor *pCsr, + Fts3Expr *pExpr, + int *pRc +){ + if( pExpr && *pRc==SQLITE_OK ){ + Fts3Phrase *pPhrase = pExpr->pPhrase; + if( pPhrase && pPhrase->doclist.pList ){ + int iCol = 0; + char *p = pPhrase->doclist.pList; + + assert( *p ); + while( 1 ){ + u8 c = 0; + int iCnt = 0; + while( 0xFE & (*p | c) ){ + if( (c&0x80)==0 ) iCnt++; + c = *p++ & 0x80; + } + + /* aMI[iCol*3 + 1] = Number of occurrences + ** aMI[iCol*3 + 2] = Number of rows containing at least one instance + */ + pExpr->aMI[iCol*3 + 1] += iCnt; + pExpr->aMI[iCol*3 + 2] += (iCnt>0); + if( *p==0x00 ) break; + p++; + p += sqlite3Fts3GetVarint32(p, &iCol); + } + } + + fts3EvalUpdateCounts(pCsr, pExpr->pLeft, pRc); + fts3EvalUpdateCounts(pCsr, pExpr->pRight, pRc); + } +} + +static int fts3EvalNearStats( + Fts3Cursor *pCsr, + Fts3Expr *pExpr +){ + int rc = SQLITE_OK; /* Return code */ + + assert( pExpr->eType==FTSQUERY_PHRASE ); + if( pExpr->aMI==0 ){ + Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; + Fts3Expr *pRoot; /* Root of NEAR expression */ + Fts3Expr *p; /* Iterator used for several purposes */ + + sqlite3_int64 iPrevId = pCsr->iPrevId; + sqlite3_int64 iDocid; + u8 bEof; + + /* Find the root of the NEAR expression */ + pRoot = pExpr; + while( pRoot->pParent && pRoot->pParent->eType==FTSQUERY_NEAR ){ + pRoot = pRoot->pParent; + } + iDocid = pRoot->iDocid; + bEof = pRoot->bEof; + + /* Allocate space for the aMSI[] array of each FTSQUERY_PHRASE node */ + for(p=pRoot; p; p=p->pLeft){ + Fts3Expr *pE = (p->eType==FTSQUERY_PHRASE?p:p->pRight); + assert( pE->aMI==0 ); + pE->aMI = (u32 *)sqlite3_malloc(pTab->nColumn * 3 * sizeof(u32)); + if( !pE->aMI ) return SQLITE_NOMEM; + memset(pE->aMI, 0, pTab->nColumn * 3 * sizeof(u32)); + } + + fts3EvalRestart(pCsr, pRoot, &rc); + + while( pCsr->isEof==0 && rc==SQLITE_OK ){ + + do { + /* Ensure the %_content statement is reset. */ + if( pCsr->isRequireSeek==0 ) sqlite3_reset(pCsr->pStmt); + assert( sqlite3_data_count(pCsr->pStmt)==0 ); + + /* Advance to the next document */ + fts3EvalNext(pCsr, pRoot, &rc); + pCsr->isEof = pRoot->bEof; + pCsr->isRequireSeek = 1; + pCsr->isMatchinfoNeeded = 1; + pCsr->iPrevId = pRoot->iDocid; + }while( pCsr->isEof==0 + && pRoot->eType==FTSQUERY_NEAR + && fts3EvalLoadDeferred(pCsr, &rc) + ); + + if( pCsr->isEof==0 ){ + fts3EvalUpdateCounts(pCsr, pRoot, &rc); + } + } + + pCsr->isEof = 0; + pCsr->iPrevId = iPrevId; + + if( bEof ){ + pRoot->bEof = bEof; + }else{ + fts3EvalRestart(pCsr, pRoot, &rc); + while( pRoot->iDocidbEof==0 ); + } + fts3EvalLoadDeferred(pCsr, &rc); + } + } + return rc; +} + /* -** Return a pointer to the entire doclist, including positions, associated -** with the phrase passed as the second argument. It is illegal to call -** this function if the phrase consists entirely of deferred tokens. +** This function is used by the matchinfo() module to query a phrase +** expression node for the following information: +** +** 1. The total number of occurrences of the phrase in each column of +** the FTS table (considering all rows), and ** -** TODO: This function is only used by the code for the matchinfo('x') -** auxiliary function - to obtain the following two values: +** 2. For each column, the number of rows in the table for which the +** column contains at least one instance of the phrase. ** -** 1. The total number of times the phrase appears in each column in all -** rows in the FTS table. +** If no error occurs, SQLITE_OK is returned and the values for each column +** written into the array aiOut as follows: ** -** 2. For each column, the total number of rows in the FTS table for which -** the phrase appears at least once in the column. +** aiOut[iCol*3 + 1] = Number of occurrences +** aiOut[iCol*3 + 2] = Number of rows containing at least one instance ** -** It would be better if there was an sqlite3Fts3EvalXXX() function -** specifically to retrieve these values. If that were done, the concept -** of which tokens are deferred or incremental would be entirely encapsulated -** within the sqlite3Fts3EvalXXX()/fts3EvalXXX() functions in this file. +** Caveats: +** +** * If a phrase consists entirely of deferred tokens, then all output +** values are set to the number of documents in the table. In other +** words we assume that very common tokens occur exactly once in each +** column of each row of the table. +** +** * If a phrase contains some deferred tokens (and some non-deferred +** tokens), count the potential occurrence identified by considering +** the non-deferred tokens instead of actual phrase occurrences. +** +** * If the phrase is part of a NEAR expression, then only phrase instances +** that meet the NEAR constraint are included in the counts. */ -int sqlite3Fts3EvalPhraseDoclist( - Fts3Cursor *pCsr, /* FTS3 cursor object */ - Fts3Expr *pExpr, /* Phrase to return doclist for */ - const char **ppList, /* OUT: Buffer containing doclist */ - int *pnList /* OUT: Size of returned buffer, in bytes */ +int sqlite3Fts3EvalPhraseStats( + Fts3Cursor *pCsr, /* FTS cursor handle */ + Fts3Expr *pExpr, /* Phrase expression */ + u32 *aiOut /* Array to write results into (see above) */ ){ + Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; int rc = SQLITE_OK; - Fts3Phrase *pPhrase = pExpr->pPhrase; - - /* It is illegal to call this function if the phrase is entirely deferred - ** (it may contain some deferred tokens, but must also contain at least - ** one token for which the doclist may be read from the full-text index). - */ - assert( !pExpr->bDeferred ); - - if( pPhrase->bIncr ){ - /* This phrase was being loaded from disk incrementally. But the - ** matchinfo() function requires that the entire doclist be loaded into - ** memory. This block loads the doclist into memory and modifies the - ** Fts3Phrase structure so that it does not use the incremental strategy. - */ - TESTONLY( int bEof = pExpr->bEof; ) - TESTONLY( int bStart = pExpr->bStart; ) - sqlite3_int64 iDocid = pExpr->iDocid; - - sqlite3Fts3EvalPhraseCleanup(pPhrase); - pExpr->iDocid = 0; + int iCol; - rc = sqlite3Fts3EvalStart(pCsr, pExpr, 0); - assert( pExpr->bEof==bEof ); - assert( pExpr->bStart==bStart ); - assert( rc!=SQLITE_OK || pPhrase->bIncr==0 ); - if( pExpr->bStart && !pExpr->bEof ){ - pExpr->bStart = 0; - while( rc==SQLITE_OK && (pExpr->bStart==0 || pExpr->iDocid!=iDocid) ){ - fts3EvalNext(pCsr, pExpr, &rc); - assert( !pExpr->bEof ); + if( pExpr->bDeferred ){ + assert( pCsr->nDoc>0 ); + for(iCol=0; iColnColumn; iCol++){ + aiOut[iCol*3 + 1] = pCsr->nDoc; + aiOut[iCol*3 + 2] = pCsr->nDoc; + } + }else{ + rc = fts3EvalNearStats(pCsr, pExpr); + if( rc==SQLITE_OK ){ + assert( pExpr->aMI ); + for(iCol=0; iColnColumn; iCol++){ + aiOut[iCol*3 + 1] = pExpr->aMI[iCol*3 + 1]; + aiOut[iCol*3 + 2] = pExpr->aMI[iCol*3 + 2]; } } } - if( rc==SQLITE_OK - && pExpr->pParent - && pExpr->pParent->eType==FTSQUERY_NEAR - ){ - - } - - *pnList = pPhrase->doclist.nAll; - *ppList = pPhrase->doclist.aAll; return rc; } diff --git a/ext/fts3/fts3Int.h b/ext/fts3/fts3Int.h index c6329c30d0..80604d6fa3 100644 --- a/ext/fts3/fts3Int.h +++ b/ext/fts3/fts3Int.h @@ -240,6 +240,7 @@ struct Fts3Cursor { u8 bDesc; /* True to sort in descending order */ int eEvalmode; /* An FTS3_EVAL_XX constant */ int nRowAvg; /* Average size of database rows, in pages */ + int nDoc; /* Documents in table */ int isMatchinfoNeeded; /* True when aMatchinfo[] needs filling in */ u32 *aMatchinfo; /* Information about most recent match */ @@ -323,9 +324,17 @@ struct Fts3Phrase { ** "Length" field found in doclists stored on disk is omitted from this ** buffer. ** -** Variable pCurrent always points to the start of a docid field within -** aDoclist. Since the doclist is usually scanned in docid order, this can -** be used to accelerate seeking to the required docid within the doclist. +** Variable aMI is used only for FTSQUERY_NEAR nodes to store the global +** matchinfo data. If it is not NULL, it points to an array of size nCol*3, +** where nCol is the number of columns in the queried FTS table. The array +** is populated as follows: +** +** aMI[iCol*3 + 0] = Undefined +** aMI[iCol*3 + 1] = Number of occurrences +** aMI[iCol*3 + 2] = Number of rows containing at least one instance +** +** The aMI array is allocated using sqlite3_malloc(). It should be freed +** when the expression node is. */ struct Fts3Expr { int eType; /* One of the FTSQUERY_XXX values defined below */ @@ -340,6 +349,8 @@ struct Fts3Expr { u8 bEof; /* True this expression is at EOF already */ u8 bStart; /* True if iDocid is valid */ u8 bDeferred; /* True if this expression is entirely deferred */ + + u32 *aMI; }; /* @@ -370,7 +381,6 @@ int sqlite3Fts3SegReaderNew(int, sqlite3_int64, int sqlite3Fts3SegReaderPending( Fts3Table*,int,const char*,int,int,Fts3SegReader**); void sqlite3Fts3SegReaderFree(Fts3SegReader *); -int sqlite3Fts3SegReaderCost(Fts3Cursor *, Fts3SegReader *, int *); int sqlite3Fts3AllSegdirs(Fts3Table*, int, int, sqlite3_stmt **); int sqlite3Fts3ReadLock(Fts3Table *); int sqlite3Fts3ReadBlock(Fts3Table*, sqlite3_int64, char **, int*, int*); @@ -382,7 +392,6 @@ void sqlite3Fts3FreeDeferredTokens(Fts3Cursor *); int sqlite3Fts3DeferToken(Fts3Cursor *, Fts3PhraseToken *, int); int sqlite3Fts3CacheDeferredDoclists(Fts3Cursor *); void sqlite3Fts3FreeDeferredDoclists(Fts3Cursor *); -char *sqlite3Fts3DeferredDoclist(Fts3DeferredToken *, int *); void sqlite3Fts3SegmentsClose(Fts3Table *); /* Special values interpreted by sqlite3SegReaderCursor() */ @@ -441,8 +450,7 @@ int sqlite3Fts3VarintLen(sqlite3_uint64); void sqlite3Fts3Dequote(char *); void sqlite3Fts3DoclistPrev(int,char*,int,char**,sqlite3_int64*,int*,u8*); -int sqlite3Fts3ExprLoadDoclist(Fts3Cursor *, Fts3Expr *); -int sqlite3Fts3ExprNearTrim(Fts3Expr *, Fts3Expr *, int); +int sqlite3Fts3EvalPhraseStats(Fts3Cursor *, Fts3Expr *, u32 *); /* fts3_tokenizer.c */ const char *sqlite3Fts3NextToken(const char *, int *); @@ -480,9 +488,6 @@ int sqlite3Fts3TermSegReaderCursor( Fts3MultiSegReader **ppSegcsr /* OUT: Allocated seg-reader cursor */ ); -int sqlite3Fts3EvalPhraseCache(Fts3Cursor *, Fts3Phrase *); -sqlite3_int64 sqlite3Fts3EvalDocid(Fts3Cursor *, Fts3Expr *); -int sqlite3Fts3EvalPhraseDoclist(Fts3Cursor*, Fts3Expr*, const char**,int*); void sqlite3Fts3EvalPhraseCleanup(Fts3Phrase *); int sqlite3Fts3EvalStart(Fts3Cursor *, Fts3Expr *, int); diff --git a/ext/fts3/fts3_expr.c b/ext/fts3/fts3_expr.c index be40a9cfc2..0383d1a276 100644 --- a/ext/fts3/fts3_expr.c +++ b/ext/fts3/fts3_expr.c @@ -769,6 +769,7 @@ void sqlite3Fts3ExprFree(Fts3Expr *p){ sqlite3Fts3ExprFree(p->pLeft); sqlite3Fts3ExprFree(p->pRight); sqlite3Fts3EvalPhraseCleanup(p->pPhrase); + sqlite3_free(p->aMI); sqlite3_free(p); } } diff --git a/ext/fts3/fts3_snippet.c b/ext/fts3/fts3_snippet.c index 45e3c32f07..5ae3a16fc4 100644 --- a/ext/fts3/fts3_snippet.c +++ b/ext/fts3/fts3_snippet.c @@ -720,26 +720,6 @@ static int fts3ColumnlistCount(char **ppCollist){ return nEntry; } -static void fts3LoadColumnlistCounts(char **pp, u32 *aOut, int isGlobal){ - char *pCsr = *pp; - while( *pCsr ){ - int nHit; - sqlite3_int64 iCol = 0; - if( *pCsr==0x01 ){ - pCsr++; - pCsr += sqlite3Fts3GetVarint(pCsr, &iCol); - } - nHit = fts3ColumnlistCount(&pCsr); - assert( nHit>0 ); - if( isGlobal ){ - aOut[iCol*3+1]++; - } - aOut[iCol*3] += nHit; - } - pCsr++; - *pp = pCsr; -} - /* ** fts3ExprIterate() callback used to collect the "global" matchinfo stats ** for a single query. @@ -773,32 +753,9 @@ static int fts3ExprGlobalHitsCb( void *pCtx /* Pointer to MatchInfo structure */ ){ MatchInfo *p = (MatchInfo *)pCtx; - u32 *aOut = &p->aMatchinfo[3*iPhrase*p->nCol]; - - if( pExpr->bDeferred ){ - int iCol; /* Column index */ - for(iCol=0; iColnCol; iCol++){ - aOut[iCol*3 + 1] = (u32)p->nDoc; - aOut[iCol*3 + 2] = (u32)p->nDoc; - } - }else{ - char *pIter; - char *pEnd; - int n; - int rc = sqlite3Fts3EvalPhraseDoclist( - p->pCursor, pExpr, (const char **)&pIter, &n - ); - if( rc!=SQLITE_OK ) return rc; - pEnd = &pIter[n]; - - /* Fill in the global hit count matrix row for this phrase. */ - while( pIterpCursor, pExpr, &p->aMatchinfo[3*iPhrase*p->nCol] + ); } /* diff --git a/ext/fts3/fts3_write.c b/ext/fts3/fts3_write.c index a72556384c..30c0a1a15b 100644 --- a/ext/fts3/fts3_write.c +++ b/ext/fts3/fts3_write.c @@ -94,7 +94,6 @@ struct Fts3DeferredToken { ** ** sqlite3Fts3SegReaderNew() ** sqlite3Fts3SegReaderFree() -** sqlite3Fts3SegReaderCost() ** sqlite3Fts3SegReaderIterate() ** ** Methods used to manipulate Fts3SegReader structures: @@ -1295,95 +1294,6 @@ static int fts3SegReaderNextDocid( return SQLITE_OK; } -/* -** This function is called to estimate the amount of data that will be -** loaded from the disk If SegReaderIterate() is called on this seg-reader, -** in units of average document size. -** -** This can be used as follows: If the caller has a small doclist that -** contains references to N documents, and is considering merging it with -** a large doclist (size X "average documents"), it may opt not to load -** the large doclist if X>N. -*/ -int sqlite3Fts3SegReaderCost( - Fts3Cursor *pCsr, /* FTS3 cursor handle */ - Fts3SegReader *pReader, /* Segment-reader handle */ - int *pnCost /* IN/OUT: Number of bytes read */ -){ - Fts3Table *p = (Fts3Table*)pCsr->base.pVtab; - int rc = SQLITE_OK; /* Return code */ - int nCost = 0; /* Cost in bytes to return */ - int pgsz = p->nPgsz; /* Database page size */ - - assert( pgsz>0 ); - - /* If this seg-reader is reading the pending-terms table, or if all data - ** for the segment is stored on the root page of the b-tree, then the cost - ** is zero. In this case all required data is already in main memory. - */ - if( p->bHasStat - && !fts3SegReaderIsPending(pReader) - && !fts3SegReaderIsRootOnly(pReader) - ){ - int nBlob = 0; - sqlite3_int64 iBlock; - - if( pCsr->nRowAvg==0 ){ - /* The average document size, which is required to calculate the cost - ** of each doclist, has not yet been determined. Read the required - ** data from the %_stat table to calculate it. - ** - ** Entry 0 of the %_stat table is a blob containing (nCol+1) FTS3 - ** varints, where nCol is the number of columns in the FTS3 table. - ** The first varint is the number of documents currently stored in - ** the table. The following nCol varints contain the total amount of - ** data stored in all rows of each column of the table, from left - ** to right. - */ - sqlite3_stmt *pStmt; - sqlite3_int64 nDoc = 0; - sqlite3_int64 nByte = 0; - const char *pEnd; - const char *a; - - rc = sqlite3Fts3SelectDoctotal(p, &pStmt); - if( rc!=SQLITE_OK ) return rc; - a = sqlite3_column_blob(pStmt, 0); - assert( a ); - - pEnd = &a[sqlite3_column_bytes(pStmt, 0)]; - a += sqlite3Fts3GetVarint(a, &nDoc); - while( anRowAvg = (int)(((nByte / nDoc) + pgsz) / pgsz); - assert( pCsr->nRowAvg>0 ); - rc = sqlite3_reset(pStmt); - if( rc!=SQLITE_OK ) return rc; - } - - /* Assume that a blob flows over onto overflow pages if it is larger - ** than (pgsz-35) bytes in size (the file-format documentation - ** confirms this). - */ - for(iBlock=pReader->iStartBlock; iBlock<=pReader->iLeafEndBlock; iBlock++){ - rc = sqlite3Fts3ReadBlock(p, iBlock, 0, &nBlob, 0); - if( rc!=SQLITE_OK ) break; - if( (nBlob+35)>pgsz ){ - int nOvfl = (nBlob + 34)/pgsz; - nCost += ((nOvfl + pCsr->nRowAvg - 1)/pCsr->nRowAvg); - } - } - } - - *pnCost += nCost; - return rc; -} int sqlite3Fts3MsrOvfl( Fts3Cursor *pCsr, @@ -2416,7 +2326,6 @@ int sqlite3Fts3MsrIncrNext( } while( 1 ){ - int nSort; Fts3SegReader *pSeg; pSeg = pMsr->apSegment[0]; @@ -2958,20 +2867,6 @@ static int fts3SpecialInsert(Fts3Table *p, sqlite3_value *pVal){ return rc; } -/* -** Return the deferred doclist associated with deferred token pDeferred. -** This function assumes that sqlite3Fts3CacheDeferredDoclists() has already -** been called to allocate and populate the doclist. -*/ -char *sqlite3Fts3DeferredDoclist(Fts3DeferredToken *pDeferred, int *pnByte){ - if( pDeferred->pList ){ - *pnByte = pDeferred->pList->nData; - return pDeferred->pList->aData; - } - *pnByte = 0; - return 0; -} - /* ** Delete all cached deferred doclists. Deferred doclists are cached ** (allocated) by the sqlite3Fts3CacheDeferredDoclists() function. diff --git a/manifest b/manifest index a97b87491f..c0c0a4d2ec 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Have\sNEAR\squeries\suse\sincremental\smerging.\sFix\sissues\ssurrounding\sthe\sdeferred\stoken\soptimization. -D 2011-06-07T18:35:45.780 +C Fix\svarious\sissues\sto\sdo\swith\sdeferred\stokens,\sNEAR\sexpressions\sand\smatchinfo(). +D 2011-06-08T18:39:07.487 F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f F Makefile.in 11dcc00a8d0e5202def00e81732784fb0cc4fe1d F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23 @@ -61,21 +61,21 @@ F ext/fts2/mkfts2amal.tcl 974d5d438cb3f7c4a652639262f82418c1e4cff0 F ext/fts3/README.syntax a19711dc5458c20734b8e485e75fb1981ec2427a F ext/fts3/README.tokenizers 998756696647400de63d5ba60e9655036cb966e9 F ext/fts3/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d -F ext/fts3/fts3.c 9d2d2cab4d64f0769046d88b6740c6e1f229d1e3 +F ext/fts3/fts3.c b44083cafb9840be0927f8b9fb2ab4f373167f77 F ext/fts3/fts3.h 3a10a0af180d502cecc50df77b1b22df142817fe -F ext/fts3/fts3Int.h d76b021d5b7061eff7aa4055b5938eebef2bdb6a +F ext/fts3/fts3Int.h a999cfbf605efec293a88519f74192f5204c84d6 F ext/fts3/fts3_aux.c baed9dab7fb4604ae8cafdb2d7700abe93beffbe -F ext/fts3/fts3_expr.c 0ae554230ada457e61e8184b24faac96aad78f6b +F ext/fts3/fts3_expr.c b95f0d76bcf4507c73a838f3178c4ed8c42dc2bb F ext/fts3/fts3_hash.c 3c8f6387a4a7f5305588b203fa7c887d753e1f1c F ext/fts3/fts3_hash.h 8331fb2206c609f9fc4c4735b9ab5ad6137c88ec F ext/fts3/fts3_icu.c ac494aed69835008185299315403044664bda295 F ext/fts3/fts3_porter.c d61cfd81fb0fd8fbcb25adcaee0ba671aefaa5c2 -F ext/fts3/fts3_snippet.c 0485969cce410760b50d587a77186f9c7f7e96be +F ext/fts3/fts3_snippet.c 82e2c1e420c871c02f6e85ea438570118d7105c8 F ext/fts3/fts3_term.c 6c7f33ab732a2a0f281898685650e3a492e1e2f1 F ext/fts3/fts3_tokenizer.c 055f3dc7369585350b28db1ee0f3b214dca6724d F ext/fts3/fts3_tokenizer.h 13ffd9fcb397fec32a05ef5cd9e0fa659bf3dbd3 F ext/fts3/fts3_tokenizer1.c 6e5cbaa588924ac578263a598e4fb9f5c9bb179d -F ext/fts3/fts3_write.c ed525afd524d713abe7da174d56ad935dfc26008 +F ext/fts3/fts3_write.c bc24cec303d86aeb4b40fcbdf9f252f93ef78fc7 F ext/fts3/fts3speed.tcl b54caf6a18d38174f1a6e84219950d85e98bb1e9 F ext/fts3/mkfts3amal.tcl 252ecb7fe6467854f2aa237bf2c390b74e71f100 F ext/icu/README.txt bf8461d8cdc6b8f514c080e4e10dc3b2bbdfefa9 @@ -471,7 +471,7 @@ F test/fts3expr2.test 18da930352e5693eaa163a3eacf96233b7290d1a F test/fts3fault.test f83e556465bb69dc8bc676339eca408dce4ca246 F test/fts3fault2.test dc96203af6ba31ce20163fc35460e1556e8edf4d F test/fts3malloc.test 9c8cc3f885bb4dfc66d0460c52f68f45e4710d1b -F test/fts3matchinfo.test f424597b6843659ecbc2009e8823380233ebf375 +F test/fts3matchinfo.test 08a82d18cc08abb28aec41d412b4c2ef25ba6a5f F test/fts3near.test 2e318ee434d32babd27c167142e2b94ddbab4844 F test/fts3prefix.test 36246609111ec1683f7ea5ed27666ce2cefb5676 F test/fts3query.test ef79d31fdb355d094baec1c1b24b60439a1fb8a2 @@ -943,7 +943,7 @@ F tool/split-sqlite3c.tcl d9be87f1c340285a3e081eb19b4a247981ed290c F tool/symbols.sh bc2a3709940d47c8ac8e0a1fdf17ec801f015a00 F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f F tool/warnings.sh 347d974d143cf132f953b565fbc03026f19fcb4d -P 567dd84359218245d4e6887547e2a48881f2c8e0 -R 020efe4a51ef4472e0e5c3f4175d0de6 +P 9d10a6846b12a9cc8fd4fdc3affd931a27218b5a +R 37e4da2cb9907d0ccf1d8076445165fd U dan -Z 740d1ddba83232619fca71041707ab60 +Z 147c4bbcabf01e6d99dff7a301984a70 diff --git a/manifest.uuid b/manifest.uuid index 4da09088b8..f3e1134a0a 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -9d10a6846b12a9cc8fd4fdc3affd931a27218b5a \ No newline at end of file +3972a787df5ec253b99b148385655e7b68d851fa \ No newline at end of file diff --git a/test/fts3matchinfo.test b/test/fts3matchinfo.test index e26bcf1052..40366b6aef 100644 --- a/test/fts3matchinfo.test +++ b/test/fts3matchinfo.test @@ -244,9 +244,13 @@ do_matchinfo_test 4.2.6 t5 {t5 MATCH 'a OR b'} { s {1 2 1} } do_execsql_test 4.3.0 "INSERT INTO t5 VALUES('x y [string repeat {b } 50000]')"; -do_matchinfo_test 4.3.1 t5 {t5 MATCH 'a a'} { - x {{5 8 2 5 5 5} {3 8 2 3 5 5}} - s {2 1} +# It used to be that the second 'a' token would be deferred. That doesn't +# work any longer. +if 0 { + do_matchinfo_test 4.3.1 t5 {t5 MATCH 'a a'} { + x {{5 8 2 5 5 5} {3 8 2 3 5 5}} + s {2 1} + } } do_matchinfo_test 4.3.2 t5 {t5 MATCH 'a b'} { s {2} }