From: dan Date: Mon, 11 Jan 2010 12:00:47 +0000 (+0000) Subject: Modify snippets code to run more efficiently. And to avoid a bug relating to snippets... X-Git-Tag: version-3.7.2~650 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=9a34020f05ca1dcbe930cf8b5440c6d09f1451f3;p=thirdparty%2Fsqlite.git Modify snippets code to run more efficiently. And to avoid a bug relating to snippets based on full-text queries that contain duplicate terms. FossilOrigin-Name: a2b1183d9e9898d06d623b342bbb552e85a9b3f6 --- diff --git a/ext/fts3/fts3.c b/ext/fts3/fts3.c index 8eeec364fd..cd6144160b 100644 --- a/ext/fts3/fts3.c +++ b/ext/fts3/fts3.c @@ -2113,7 +2113,7 @@ int sqlite3Fts3ExprLoadDoclist(Fts3Table *pTab, Fts3Expr *pExpr){ /* ** After ExprLoadDoclist() (see above) has been called, this function is -** used to iterate through the position lists that make up the doclist +** used to iterate/search through the position lists that make up the doclist ** stored in pExpr->aDoclist. */ char *sqlite3Fts3FindPositions( diff --git a/ext/fts3/fts3_snippet.c b/ext/fts3/fts3_snippet.c index 1f110da9c4..20b636305b 100644 --- a/ext/fts3/fts3_snippet.c +++ b/ext/fts3/fts3_snippet.c @@ -28,6 +28,27 @@ static void fts3GetDeltaPosition(char **pp, int *piPos){ *piPos += (iVal-2); } +static int fts3ExprIterate2( + Fts3Expr *pExpr, /* Expression to iterate phrases of */ + int *piPhrase, /* Pointer to phrase counter */ + int (*x)(Fts3Expr*,int,void*), /* Callback function to invoke for phrases */ + void *pCtx /* Second argument to pass to callback */ +){ + int rc; + int eType = pExpr->eType; + if( eType!=FTSQUERY_PHRASE ){ + assert( pExpr->pLeft && pExpr->pRight ); + rc = fts3ExprIterate2(pExpr->pLeft, piPhrase, x, pCtx); + if( rc==SQLITE_OK && eType!=FTSQUERY_NOT ){ + rc = fts3ExprIterate2(pExpr->pRight, piPhrase, x, pCtx); + } + }else{ + rc = x(pExpr, *piPhrase, pCtx); + (*piPhrase)++; + } + return rc; +} + /* ** Iterate through all phrase nodes in an FTS3 query, except those that ** are part of a sub-tree that is the right-hand-side of a NOT operator. @@ -40,21 +61,11 @@ static void fts3GetDeltaPosition(char **pp, int *piPos){ */ static int fts3ExprIterate( Fts3Expr *pExpr, /* Expression to iterate phrases of */ - int (*x)(Fts3Expr *, void *), /* Callback function to invoke for phrases */ + int (*x)(Fts3Expr*,int,void*), /* Callback function to invoke for phrases */ void *pCtx /* Second argument to pass to callback */ ){ - int rc; - int eType = pExpr->eType; - if( eType!=FTSQUERY_PHRASE ){ - assert( pExpr->pLeft && pExpr->pRight ); - rc = fts3ExprIterate(pExpr->pLeft, x, pCtx); - if( rc==SQLITE_OK && eType!=FTSQUERY_NOT ){ - rc = fts3ExprIterate(pExpr->pRight, x, pCtx); - } - }else{ - rc = x(pExpr, pCtx); - } - return rc; + int iPhrase = 0; + return fts3ExprIterate2(pExpr, &iPhrase, x, pCtx); } typedef struct LoadDoclistCtx LoadDoclistCtx; @@ -95,7 +106,7 @@ static int fts3ExprNearTrim(Fts3Expr *pExpr){ return rc; } -static int fts3ExprLoadDoclistsCb1(Fts3Expr *pExpr, void *ctx){ +static int fts3ExprLoadDoclistsCb1(Fts3Expr *pExpr, int iPhrase, void *ctx){ int rc = SQLITE_OK; LoadDoclistCtx *p = (LoadDoclistCtx *)ctx; @@ -113,7 +124,7 @@ static int fts3ExprLoadDoclistsCb1(Fts3Expr *pExpr, void *ctx){ return rc; } -static int fts3ExprLoadDoclistsCb2(Fts3Expr *pExpr, void *ctx){ +static int fts3ExprLoadDoclistsCb2(Fts3Expr *pExpr, int iPhrase, void *ctx){ if( pExpr->aDoclist ){ pExpr->pCurrent = pExpr->aDoclist; pExpr->iCurrent = 0; @@ -140,131 +151,173 @@ static int fts3ExprLoadDoclists( } /* -** Each call to this function populates a chunk of a snippet-buffer -** SNIPPET_BUFFER_CHUNK bytes in size. -** -** Return true if the end of the data has been reached (and all subsequent -** calls to fts3LoadSnippetBuffer() with the same arguments will be no-ops), -** or false otherwise. +** The following types are used as part of the implementation of the +** fts3BestSnippet() routine. */ -static int fts3LoadSnippetBuffer( - int iPos, /* Document token offset to load data for */ - u8 *aBuffer, /* Circular snippet buffer to populate */ - int nList, /* Number of position lists in appList */ - char **apList, /* IN/OUT: nList position list pointers */ - int *aiPrev /* IN/OUT: Previous positions read */ -){ - int i; - int nFin = 0; - - assert( (iPos&(SNIPPET_BUFFER_CHUNK-1))==0 ); - - memset(&aBuffer[iPos&SNIPPET_BUFFER_MASK], 0, SNIPPET_BUFFER_CHUNK); - - for(i=0; i=iPos ); - aBuffer[iPrev&SNIPPET_BUFFER_MASK] = i+1; - if( 0==((*pList)&0xFE) ){ - iPrev = -1; +/* +** Advance the position list iterator specified by the first two +** arguments so that it points to the first element with a value greater +** than or equal to parameter iNext. +*/ +static void fts3SnippetAdvance(char **ppIter, int *piIter, int iNext){ + char *pIter = *ppIter; + if( pIter ){ + int iIter = *piIter; + + while( iIteriPhrase++; - char *pCsr; + if( pIter->iCurrent<0 ){ + /* The SnippetCtx object has just been initialized. The first snippet + ** candidate always starts at offset 0 (even if this candidate has a + ** score of 0.0). + */ + pIter->iCurrent = 0; - p->anToken[iPhrase] = pExpr->pPhrase->nToken; - pCsr = sqlite3Fts3FindPositions(pExpr, p->pCsr->iPrevId, p->iCol); + /* Advance the 'head' iterator of each phrase to the first offset that + ** is greater than or equal to (iNext+nSnippet). + */ + for(i=0; inPhrase; i++){ + SnippetPhrase *pPhrase = &pIter->aPhrase[i]; + fts3SnippetAdvance(&pPhrase->pHead, &pPhrase->iHead, pIter->nSnippet); + } + }else{ + int iStart; + int iEnd = 0x7FFFFFFF; - if( pCsr ){ - int iVal; - pCsr += sqlite3Fts3GetVarint32(pCsr, &iVal); - p->apList[iPhrase] = pCsr; - p->aiPrev[iPhrase] = iVal-2; + for(i=0; inPhrase; i++){ + SnippetPhrase *pPhrase = &pIter->aPhrase[i]; + if( pPhrase->pHead && pPhrase->iHeadiHead; + } + } + if( iEnd==0x7FFFFFFF ){ + return 1; + } + + pIter->iCurrent = iStart = iEnd - pIter->nSnippet + 1; + for(i=0; inPhrase; i++){ + SnippetPhrase *pPhrase = &pIter->aPhrase[i]; + fts3SnippetAdvance(&pPhrase->pHead, &pPhrase->iHead, iEnd+1); + fts3SnippetAdvance(&pPhrase->pTail, &pPhrase->iTail, iStart); + } } - return SQLITE_OK; + + return 0; } -static void fts3SnippetCnt( - int iIdx, - int nSnippet, - int *anCnt, - u8 *aBuffer, - int *anToken, - u64 *pHlmask +static void fts3SnippetDetails( + SnippetCtx *pIter, /* Snippet iterator */ + u64 mCovered, /* Bitmask of phrases already covered */ + int *piToken, /* OUT: First token of proposed snippet */ + int *piScore, /* OUT: "Score" for this snippet */ + u64 *pmCover, /* OUT: Bitmask of phrases covered */ + u64 *pmHighlight /* OUT: Bitmask of terms to highlight */ ){ - int iSub = (iIdx-1)&SNIPPET_BUFFER_MASK; - int iAdd = (iIdx+nSnippet-1)&SNIPPET_BUFFER_MASK; + int iStart = pIter->iCurrent; /* First token of snippet */ - u64 h = *pHlmask; + int iScore = 0; + int i; + u64 mCover = 0; + u64 mHighlight = 0; + + for(i=0; inPhrase; i++){ + SnippetPhrase *pPhrase = &pIter->aPhrase[i]; + if( pPhrase->pTail ){ + char *pCsr = pPhrase->pTail; + int iCsr = pPhrase->iTail; + + while( iCsr<(iStart+pIter->nSnippet) ){ + int j; + u64 mPhrase = (u64)1 << i; + u64 mPos = (u64)1 << (iCsr - iStart); + assert( iCsr>=iStart ); + if( (mCover|mCovered)&mPhrase ){ + iScore++; + }else{ + iScore += 1000; + } + mCover |= mPhrase; - anCnt[ aBuffer[iSub] ]--; - anCnt[ aBuffer[iAdd] ]++; + for(j=0; jnToken; j++){ + mHighlight |= (mPos>>j); + } - h = h >> 1; - if( aBuffer[iAdd] ){ - int j; - for(j=anToken[aBuffer[iAdd]-1]; j>=1; j--){ - h |= (u64)1 << (nSnippet-j); + if( 0==(*pCsr & 0x0FE) ) break; + fts3GetDeltaPosition(&pCsr, &iCsr); + } } } - *pHlmask = h; -} -static int fts3SnippetScore(int n, int *anCnt, u64 covmask){ - int j; - int iScore = 0; - for(j=1; j<=n; j++){ - int nCnt = anCnt[j]; - iScore += nCnt; - if( nCnt && 0==(covmask & ((u64)1 << (j-1))) ){ - iScore += 1000; - } - } - return iScore; + *piToken = iStart; + *piScore = iScore; + *pmCover = mCover; + *pmHighlight = mHighlight; } -static u64 fts3SnippetMask(int n, int *anCnt){ - int j; - u64 mask = 0; +/* +** This function is an fts3ExprIterate() callback used by fts3BestSnippet(). +** Each invocation populates an element of the SnippetCtx.aPhrase[] array. +*/ +static int fts3SnippetFindPositions(Fts3Expr *pExpr, int iPhrase, void *ctx){ + SnippetCtx *p = (SnippetCtx *)ctx; + SnippetPhrase *pPhrase = &p->aPhrase[iPhrase]; + char *pCsr; - if( n>64 ) n = 64; - for(j=1; j<=n; j++){ - if( anCnt[j] ) mask |= ((u64)1)<<(j-1); + pPhrase->nToken = pExpr->pPhrase->nToken; + + pCsr = sqlite3Fts3FindPositions(pExpr, p->pCsr->iPrevId, p->iCol); + if( pCsr ){ + int iFirst = 0; + pPhrase->pList = pCsr; + fts3GetDeltaPosition(&pCsr, &iFirst); + pPhrase->pHead = pCsr; + pPhrase->pTail = pCsr; + pPhrase->iHead = iFirst; + pPhrase->iTail = iFirst; + }else{ + assert( pPhrase->pList==0 && pPhrase->pHead==0 && pPhrase->pTail==0 ); } - return mask; + + return SQLITE_OK; } +#define BITMASK_SIZE 64 + typedef struct SnippetFragment SnippetFragment; struct SnippetFragment { int iCol; /* Column snippet is extracted from */ @@ -283,21 +336,13 @@ static int fts3BestSnippet( int *piScore /* OUT: Score of snippet pFragment */ ){ int rc; /* Return Code */ - u8 aBuffer[SNIPPET_BUFFER_SIZE];/* Circular snippet buffer */ - int *aiPrev; /* Used by fts3LoadSnippetBuffer() */ - int *anToken; /* Number of tokens in each phrase */ - char **apList; /* Array of position lists */ - int *anCnt; /* Running totals of phrase occurences */ int nList; /* Number of phrases in expression */ - int nByte; /* Bytes of dynamic space required */ - int i; /* Loop counter */ - u64 hlmask = 0; /* Current mask of highlighted terms */ - u64 besthlmask = 0; /* Mask of highlighted terms for iBestPos */ - u64 bestcovmask = 0; /* Mask of terms with at least one hit */ - int iBestPos = 0; /* Starting position of 'best' snippet */ - int iBestScore = 0; /* Score of best snippet higher->better */ - int iEnd = 0x7FFFFFFF; - SnippetCtx sCtx; + SnippetCtx sCtx; /* Snippet context object */ + int nByte; /* Number of bytes of space to allocate */ + int iBestScore = -1; + int i; + + memset(&sCtx, 0, sizeof(sCtx)); /* Iterate through the phrases in the expression to count them. The same ** callback makes sure the doclists are loaded for each phrase. @@ -308,85 +353,54 @@ static int fts3BestSnippet( } /* Now that it is known how many phrases there are, allocate and zero - ** the required arrays using malloc(). + ** the required space using malloc(). */ - nByte = sizeof(u8*)*nList + /* apList */ - sizeof(int)*(nList) + /* anToken */ - sizeof(int)*nList + /* aiPrev */ - sizeof(int)*(nList+1); /* anCnt */ - apList = (char **)sqlite3_malloc(nByte); - if( !apList ){ + nByte = sizeof(SnippetPhrase) * nList; + sCtx.aPhrase = (SnippetPhrase *)sqlite3_malloc(nByte); + if( !sCtx.aPhrase ){ return SQLITE_NOMEM; } - memset(apList, 0, nByte); - anToken = (int *)&apList[nList]; - aiPrev = &anToken[nList]; - anCnt = &aiPrev[nList]; + memset(sCtx.aPhrase, 0, nByte); - /* Initialize the contents of the aiPrev and aiList arrays. */ + /* Initialize the contents of the SnippetCtx object. Then iterate through + ** the set of phrases in the expression to populate the aPhrase[] array. + */ sCtx.pCsr = pCsr; sCtx.iCol = iCol; - sCtx.apList = apList; - sCtx.aiPrev = aiPrev; - sCtx.anToken = anToken; - sCtx.iPhrase = 0; + sCtx.nSnippet = nSnippet; + sCtx.nPhrase = nList; + sCtx.iCurrent = -1; (void)fts3ExprIterate(pCsr->pExpr, fts3SnippetFindPositions, (void *)&sCtx); for(i=0; i=0 ){ + if( sCtx.aPhrase[i].pHead ){ *pmSeen |= (u64)1 << i; } } - /* Load the first two chunks of data into the buffer. */ - memset(aBuffer, 0, SNIPPET_BUFFER_SIZE); - fts3LoadSnippetBuffer(0, aBuffer, nList, apList, aiPrev); - fts3LoadSnippetBuffer(SNIPPET_BUFFER_CHUNK, aBuffer, nList, apList, aiPrev); - - /* Set the initial contents of the highlight-mask and anCnt[] array. */ - for(i=1-nSnippet; i<=0; i++){ - fts3SnippetCnt(i, nSnippet, anCnt, aBuffer, anToken, &hlmask); - } - iBestScore = fts3SnippetScore(nList, anCnt, mCovered); - besthlmask = hlmask; - iBestPos = 0; - bestcovmask = fts3SnippetMask(nList, anCnt); - - for(i=1; iiCol = iCol; + while( !fts3SnippetNextCandidate(&sCtx) ){ + int iPos; int iScore; + u64 mCover; + u64 mHighlight; + fts3SnippetDetails(&sCtx, mCovered, &iPos, &iScore, &mCover, &mHighlight); - if( 0==(i&(SNIPPET_BUFFER_CHUNK-1)) ){ - int iLoad = i + SNIPPET_BUFFER_CHUNK; - if( fts3LoadSnippetBuffer(iLoad, aBuffer, nList, apList, aiPrev) ){ - iEnd = iLoad; - } - } - - /* Figure out how highly a snippet starting at token offset i scores - ** according to fts3SnippetScore(). If it is higher than any previously - ** considered position, save the current position, score and hlmask as - ** the best snippet candidate found so far. - */ - fts3SnippetCnt(i, nSnippet, anCnt, aBuffer, anToken, &hlmask); - iScore = fts3SnippetScore(nList, anCnt, mCovered); + assert( iScore>=0 ); if( iScore>iBestScore ){ - iBestPos = i; + pFragment->iPos = iPos; + pFragment->hlmask = mHighlight; + pFragment->covered = mCover; iBestScore = iScore; - besthlmask = hlmask; - bestcovmask = fts3SnippetMask(nList, anCnt); } } - sqlite3_free(apList); - - pFragment->iPos = iBestPos; - pFragment->hlmask = besthlmask; - pFragment->iCol = iCol; - pFragment->covered = bestcovmask; + sqlite3_free(sCtx.aPhrase); *piScore = iBestScore; return SQLITE_OK; } + typedef struct StrBuffer StrBuffer; struct StrBuffer { char *z; @@ -639,6 +653,7 @@ static void fts3LoadColumnlistCounts(char **pp, u32 *aOut){ */ static int fts3ExprGlobalMatchinfoCb( Fts3Expr *pExpr, /* Phrase expression node */ + int iPhrase, void *pCtx /* Pointer to MatchInfo structure */ ){ MatchInfo *p = (MatchInfo *)pCtx; @@ -662,10 +677,11 @@ static int fts3ExprGlobalMatchinfoCb( static int fts3ExprLocalMatchinfoCb( Fts3Expr *pExpr, /* Phrase expression node */ + int iPhrase, void *pCtx /* Pointer to MatchInfo structure */ ){ MatchInfo *p = (MatchInfo *)pCtx; - int iPhrase = p->iPhrase++; + p->iPhrase++; if( pExpr->aDoclist ){ char *pCsr; @@ -836,7 +852,7 @@ struct TermOffsetCtx { /* ** This function is an fts3ExprIterate() callback used by sqlite3Fts3Offsets(). */ -static int fts3ExprTermOffsetInit(Fts3Expr *pExpr, void *ctx){ +static int fts3ExprTermOffsetInit(Fts3Expr *pExpr, int iPhrase, void *ctx){ TermOffsetCtx *p = (TermOffsetCtx *)ctx; int nTerm; /* Number of tokens in phrase */ int iTerm; /* For looping through nTerm phrase terms */ diff --git a/manifest b/manifest index 3d453c02a1..6151352c0e 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Fix\shandling\sof\san\sOOM\serror\sin\sthe\sfts3\soffsets()\sfunction.\sFix\sa\scouple\sof\ssnippet\srelated\stest\scases\sin\se_fts3.test. -D 2010-01-09T07:33:54 +C Modify\ssnippets\scode\sto\srun\smore\sefficiently.\sAnd\sto\savoid\sa\sbug\srelating\sto\ssnippets\sbased\son\sfull-text\squeries\sthat\scontain\sduplicate\sterms. +D 2010-01-11T12:00:48 F Makefile.arm-wince-mingw32ce-gcc fcd5e9cd67fe88836360bb4f9ef4cb7f8e2fb5a0 F Makefile.in c5827ead754ab32b9585487177c93bb00b9497b3 F Makefile.linux-gcc d53183f4aa6a9192d249731c90dbdffbd2c68654 @@ -56,7 +56,7 @@ F ext/fts2/mkfts2amal.tcl 974d5d438cb3f7c4a652639262f82418c1e4cff0 F ext/fts3/README.syntax a19711dc5458c20734b8e485e75fb1981ec2427a F ext/fts3/README.tokenizers 998756696647400de63d5ba60e9655036cb966e9 F ext/fts3/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d -F ext/fts3/fts3.c 451eb6554f3fce20e39ad6e3aea8b73e570582eb +F ext/fts3/fts3.c 699abb6cc18a3d923daca5866548de09c52ff59f F ext/fts3/fts3.h 3a10a0af180d502cecc50df77b1b22df142817fe F ext/fts3/fts3Int.h 45bc7e284806042119722c8f4127ee944b77f0dd F ext/fts3/fts3_expr.c f4ff02ebe854e97ac03ff00b38b728a9ab57fd4b @@ -64,7 +64,7 @@ F ext/fts3/fts3_hash.c 3c8f6387a4a7f5305588b203fa7c887d753e1f1c F ext/fts3/fts3_hash.h 8331fb2206c609f9fc4c4735b9ab5ad6137c88ec F ext/fts3/fts3_icu.c ac494aed69835008185299315403044664bda295 F ext/fts3/fts3_porter.c a651e287e02b49b565a6ccf9441959d434489156 -F ext/fts3/fts3_snippet.c a521f904baca19e0ed7025bc0e38d24fc05d21bf +F ext/fts3/fts3_snippet.c 99bc7e7356ff17667afa5ef744cae5c8dc71c7d6 F ext/fts3/fts3_tokenizer.c 1a49ee3d79cbf0b9386250370d9cbfe4bb89c8ff F ext/fts3/fts3_tokenizer.h 13ffd9fcb397fec32a05ef5cd9e0fa659bf3dbd3 F ext/fts3/fts3_tokenizer1.c 11a604a53cff5e8c28882727bf794e5252e5227b @@ -785,7 +785,7 @@ F tool/speedtest2.tcl ee2149167303ba8e95af97873c575c3e0fab58ff F tool/speedtest8.c 2902c46588c40b55661e471d7a86e4dd71a18224 F tool/speedtest8inst1.c 293327bc76823f473684d589a8160bde1f52c14e F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f -P e424a0307359fee6875424c10ecad1a10acfba0e -R 172138024910e4c6fdf1759b2e1ab812 +P 14dc46a74aafe44c0bf7dffd26268395b2c5edb2 +R 400acb68bff0c67f9d3c184630804917 U dan -Z 370e96d093c040b65438f7314092ae6c +Z d8fdb287511f28eb0b92ee0535b223e5 diff --git a/manifest.uuid b/manifest.uuid index f9c4bf7331..56e8c15806 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -14dc46a74aafe44c0bf7dffd26268395b2c5edb2 \ No newline at end of file +a2b1183d9e9898d06d623b342bbb552e85a9b3f6 \ No newline at end of file