From: dan Date: Tue, 17 Nov 2009 12:52:10 +0000 (+0000) Subject: Improvements to the way fts3 reads the full-text index. X-Git-Tag: fts3-refactor~4 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=f13b704ee6266ede37f4767302b8257293cbf3c6;p=thirdparty%2Fsqlite.git Improvements to the way fts3 reads the full-text index. FossilOrigin-Name: 45c051e78651d8204c17cecdda2bde705698881f --- diff --git a/ext/fts3/fts3.c b/ext/fts3/fts3.c index 93ad7a9f8a..08d4311163 100644 --- a/ext/fts3/fts3.c +++ b/ext/fts3/fts3.c @@ -905,7 +905,7 @@ static int fts3NextMethod(sqlite3_vtab_cursor *pCursor){ static int fts3ReadBlock( Fts3Table *p, sqlite3_int64 iBlock, - char **pzBlock, + char const **pzBlock, int *pnBlock ){ sqlite3_stmt *pStmt; @@ -928,27 +928,28 @@ static int fts3ReadBlock( } /* -** The buffer pointed to by argument zNode (size nNode bytes) contains a -** b-tree segment interior node. This function inspects the sub-tree headed -** by the node to determine the range of leaf-nodes (if any) that may -** contain a term that matches the contents of buffer zTerm (size nTerm -** bytes). If the isPrefix parameter is true, then the range of leaves -** returned are those that may contain any term for which zTerm/nTerm is -** a prefix. -** -** If successful, SQLITE_OK is returned. The blockid of the first leaf in the -** selected range is written to piStart before returning. The blockid of the -** final leaf in the selected range is written to *piEnd. +** The buffer pointed to by argument zNode (size nNode bytes) contains the +** root node of a b-tree segment. The segment is guaranteed to be at least +** one level high (i.e. the root node is not also a leaf). If successful, +** this function locates the leaf node of the segment that may contain the +** term specified by arguments zTerm and nTerm and writes its block number +** to *piLeaf. +** +** It is possible that the returned leaf node does not contain the specified +** term. 
However, if the segment does contain said term, it is stored on +** the identified leaf node. Because this function only inspects interior +** segment nodes (and never loads leaf nodes into memory), it is not possible +** to be sure. +** +** If an error occurs, an error code other than SQLITE_OK is returned. */ -static int fts3SelectLeaves( +static int fts3SelectLeaf( Fts3Table *p, /* Virtual table handle */ const char *zTerm, /* Term to select leaves for */ int nTerm, /* Size of term zTerm in bytes */ - int isPrefix, /* True for a prefix search */ const char *zNode, /* Buffer containing segment interior node */ int nNode, /* Size of buffer at zNode */ - sqlite3_int64 *piStart, /* First selected leaf */ - sqlite3_int64 *piEnd /* Second selected leaf */ + sqlite3_int64 *piLeaf /* Selected leaf node */ ){ int rc = SQLITE_OK; /* Return code */ const char *zCsr = zNode; /* Cursor to iterate through node */ @@ -956,79 +957,67 @@ static int fts3SelectLeaves( char *zBuffer = 0; /* Buffer to load terms into */ int nAlloc = 0; /* Size of allocated buffer */ - int iHeight; /* Height of this node in tree */ - sqlite3_int64 iChild; - sqlite3_int64 iStart = 0; - sqlite3_int64 iEnd; - - zCsr += sqlite3Fts3GetVarint32(zCsr, &iHeight); - zCsr += sqlite3Fts3GetVarint(zCsr, &iChild); - - while( zCsrnAlloc ){ - char *zNew; - nAlloc = (nPrefix+nSuffix) * 2; - zNew = (char *)sqlite3_realloc(zBuffer, nAlloc); - if( !zNew ){ - sqlite3_free(zBuffer); - return SQLITE_NOMEM; + while( 1 ){ + int iHeight; /* Height of this node in tree */ + sqlite3_int64 iChild; /* Block id of child node to descend to */ + int nBlock; /* Size of child node in bytes */ + + zCsr += sqlite3Fts3GetVarint32(zCsr, &iHeight); + zCsr += sqlite3Fts3GetVarint(zCsr, &iChild); + + while( zCsrnAlloc ){ + char *zNew; + nAlloc = (nPrefix+nSuffix) * 2; + zNew = (char *)sqlite3_realloc(zBuffer, nAlloc); + if( !zNew ){ + sqlite3_free(zBuffer); + return SQLITE_NOMEM; + } + zBuffer = zNew; + } + memcpy(&zBuffer[nPrefix], zCsr, 
nSuffix); + nBuffer = nPrefix + nSuffix; + zCsr += nSuffix; + + /* Compare the term we are searching for with the term just loaded from + ** the interior node. If the specified term is greater than or equal + ** to the term from the interior node, then all terms on the sub-tree + ** headed by node iChild are smaller than zTerm. No need to search + ** iChild. + ** + ** If the interior node term is larger than the specified term, then + ** the tree headed by iChild may contain the specified term. + */ + nMin = (nBuffer>nTerm ? nTerm : nBuffer); + if( memcmp(zTerm, zBuffer, nMin)<0 ) break; + iChild++; + }; + + /* If (iHeight==1), the children of this interior node are leaves. The + ** specified term may be present on leaf node iChild. */ - nMin = (nBuffer>nTerm ? nTerm : nBuffer); - cmp = memcmp(zTerm, zBuffer, nMin); - if( isPrefix && cmp==0 && iStart==0 ){ - iStart = iChild; + if( iHeight==1 ){ + *piLeaf = iChild; + break; } - if( cmp<0 ) break; - iChild++; - }; - iEnd = iChild; - if( iStart==0 ) iStart = iChild; - sqlite3_free(zBuffer); - if( iHeight==1 ){ - if( piEnd ) *piEnd = iEnd; - if( piStart ) *piStart = iStart; - }else{ - char *zBlock; - int nBlock; - if( piEnd ){ - rc = fts3ReadBlock(p, iEnd, &zBlock, &nBlock); - if( rc==SQLITE_OK ){ - rc = fts3SelectLeaves(p,zTerm,nTerm,isPrefix,zBlock,nBlock,0,piEnd); - } - } - if( piStart && rc==SQLITE_OK ){ - rc = fts3ReadBlock(p, iStart, &zBlock, &nBlock); - if( rc==SQLITE_OK ){ - rc = fts3SelectLeaves(p,zTerm,nTerm,isPrefix,zBlock,nBlock,piStart,0); - } - } + /* Descend to interior node iChild. */ + rc = fts3ReadBlock(p, iChild, &zCsr, &nBlock); + if( rc!=SQLITE_OK ) break; + zEnd = &zCsr[nBlock]; } - + sqlite3_free(zBuffer); return rc; } @@ -1409,7 +1398,6 @@ static int fts3DoclistMerge( return SQLITE_NOMEM; } } - (mergetype==MERGE_NEAR ? 
0 : &p); while( p1 && p2 ){ if( i1==i2 ){ @@ -1444,58 +1432,54 @@ static int fts3DoclistMerge( return SQLITE_OK; } +/* +** A pointer to an instance of this structure is used as the context +** argument to sqlite3Fts3SegReaderIterate() +*/ typedef struct TermSelect TermSelect; struct TermSelect { - char const *zTerm; - int nTerm; - int isPrefix; int isReqPos; char *aOutput; /* Malloc'd output buffer */ int nOutput; /* Size of output in bytes */ }; static int fts3TermSelectCb( - Fts3Table *p, - void *pContext, + Fts3Table *p, /* Virtual table object */ + void *pContext, /* Pointer to TermSelect structure */ char *zTerm, int nTerm, char *aDoclist, int nDoclist ){ TermSelect *pTS = (TermSelect *)pContext; + int nNew = pTS->nOutput + nDoclist; - if( (pTS->nTerm==nTerm || (pTS->isPrefix && pTS->nTermzTerm, pTS->nTerm) - ){ - int nNew = pTS->nOutput + nDoclist; - char *aNew = sqlite3_malloc(nNew); - if( !aNew ){ - return SQLITE_NOMEM; - } - - if( pTS->nOutput==0 ){ - /* If this is the first term selected, copy the doclist to the output - ** buffer using memcpy(). TODO: Add a way to transfer control of the - ** aDoclist buffer from the caller so as to avoid the memcpy(). - */ - memcpy(aNew, aDoclist, nDoclist); - }else{ - /* The output buffer is not empty. Merge doclist aDoclist with the - ** existing output. This can only happen with prefix-searches (as - ** searches for exact terms return exactly one doclist). - */ - int mergetype = (pTS->isReqPos ? MERGE_POS_OR : MERGE_OR); - assert( pTS->isPrefix ); - fts3DoclistMerge(mergetype, 0, 0, - aNew, &nNew, pTS->aOutput, pTS->nOutput, aDoclist, nDoclist - ); - } + char *aNew = sqlite3_malloc(nNew); + if( !aNew ){ + return SQLITE_NOMEM; + } - sqlite3_free(pTS->aOutput); - pTS->aOutput = aNew; - pTS->nOutput = nNew; + if( pTS->nOutput==0 ){ + /* If this is the first term selected, copy the doclist to the output + ** buffer using memcpy(). 
TODO: Add a way to transfer control of the + ** aDoclist buffer from the caller so as to avoid the memcpy(). + */ + memcpy(aNew, aDoclist, nDoclist); + }else{ + /* The output buffer is not empty. Merge doclist aDoclist with the + ** existing output. This can only happen with prefix-searches (as + ** searches for exact terms return exactly one doclist). + */ + int mergetype = (pTS->isReqPos ? MERGE_POS_OR : MERGE_OR); + fts3DoclistMerge(mergetype, 0, 0, + aNew, &nNew, pTS->aOutput, pTS->nOutput, aDoclist, nDoclist + ); } + sqlite3_free(pTS->aOutput); + pTS->aOutput = aNew; + pTS->nOutput = nNew; + return SQLITE_OK; } @@ -1522,13 +1506,13 @@ static int fts3TermSelect( ){ int i; TermSelect tsc; + Fts3SegFilter filter; /* Segment term filter configuration */ Fts3SegReader **apSegment = 0; /* Array of segments to read data from */ int nSegment = 0; /* Size of apSegment array */ int nAlloc = 0; /* Allocated size of segment array */ int rc; /* Return code */ sqlite3_stmt *pStmt; /* SQL statement to scan %_segdir table */ int iAge = 0; /* Used to assign ages to segments */ - int flags; /* Loop through the entire %_segdir table. For each segment, create a ** Fts3SegReader to iterate through the subset of the segment leaves @@ -1552,10 +1536,10 @@ static int fts3TermSelect( */ rc = sqlite3Fts3SegReaderNew(p, iAge, 0, 0, 0, zRoot, nRoot, &pNew); }else{ - sqlite3_int64 i1, i2; - rc = fts3SelectLeaves(p, zTerm, nTerm, isPrefix, zRoot, nRoot, &i1, &i2); + sqlite3_int64 i1; + rc = fts3SelectLeaf(p, zTerm, nTerm, zRoot, nRoot, &i1); if( rc==SQLITE_OK ){ - assert( i1 && i2 ); + sqlite3_int64 i2 = sqlite3_column_int64(pStmt, 3); rc = sqlite3Fts3SegReaderNew(p, iAge, i1, i2, 0, 0, 0, &pNew); } } @@ -1585,16 +1569,17 @@ static int fts3TermSelect( } memset(&tsc, 0, sizeof(TermSelect)); - tsc.zTerm = zTerm; - tsc.nTerm = nTerm; - tsc.isPrefix = isPrefix; tsc.isReqPos = isReqPos; - flags = FTS3_SEGMENT_IGNORE_EMPTY + filter.flags = FTS3_SEGMENT_IGNORE_EMPTY + | (isPrefix ? 
FTS3_SEGMENT_PREFIX : 0) | (isReqPos ? FTS3_SEGMENT_REQUIRE_POS : 0) | (iColumn<p->nColumn ? FTS3_SEGMENT_COLUMN_FILTER : 0); - rc = sqlite3Fts3SegReaderIterate(p, apSegment, nSegment, flags, - iColumn, fts3TermSelectCb, (void *)&tsc + filter.iCol = iColumn; + filter.zTerm = zTerm; + filter.nTerm = nTerm; + rc = sqlite3Fts3SegReaderIterate(p, apSegment, nSegment, &filter, + fts3TermSelectCb, (void *)&tsc ); if( rc==SQLITE_OK ){ diff --git a/ext/fts3/fts3Int.h b/ext/fts3/fts3Int.h index ff285e1992..4c66825642 100644 --- a/ext/fts3/fts3Int.h +++ b/ext/fts3/fts3Int.h @@ -58,6 +58,7 @@ typedef struct Fts3Cursor Fts3Cursor; typedef struct Fts3Expr Fts3Expr; typedef struct Fts3Phrase Fts3Phrase; typedef struct Fts3SegReader Fts3SegReader; +typedef struct Fts3SegFilter Fts3SegFilter; /* ** A connection to a fulltext index is an instance of the following @@ -123,14 +124,14 @@ struct Fts3Cursor { ** of tokens in the string. */ struct Fts3Phrase { - int nToken; /* Number of tokens in the phrase */ - int iColumn; /* Index of column this phrase must match */ - int isNot; /* Phrase prefixed by unary not (-) operator */ + int nToken; /* Number of tokens in the phrase */ + int iColumn; /* Index of column this phrase must match */ + int isNot; /* Phrase prefixed by unary not (-) operator */ struct PhraseToken { - char *z; /* Text of the token */ - int n; /* Number of bytes in buffer pointed to by z */ - int isPrefix; /* True if token ends in with a "*" character */ - } aToken[1]; /* One entry for each token in the phrase */ + char *z; /* Text of the token */ + int n; /* Number of bytes in buffer pointed to by z */ + int isPrefix; /* True if token ends in with a "*" character */ + } aToken[1]; /* One entry for each token in the phrase */ }; /* @@ -178,12 +179,21 @@ int sqlite3Fts3Optimize(Fts3Table *); #define FTS3_SEGMENT_REQUIRE_POS 0x00000001 #define FTS3_SEGMENT_IGNORE_EMPTY 0x00000002 #define FTS3_SEGMENT_COLUMN_FILTER 0x00000004 +#define FTS3_SEGMENT_PREFIX 0x00000008 + +struct
Fts3SegFilter { + const char *zTerm; + int nTerm; + int iCol; + int flags; +}; int sqlite3Fts3SegReaderNew(Fts3Table *,int, sqlite3_int64, sqlite3_int64, sqlite3_int64, const char *, int, Fts3SegReader**); void sqlite3Fts3SegReaderFree(Fts3SegReader *); + int sqlite3Fts3SegReaderIterate( - Fts3Table *, Fts3SegReader **, int, int, int, + Fts3Table *, Fts3SegReader **, int, Fts3SegFilter *, int (*)(Fts3Table *, void *, char *, int, char *, int), void * ); diff --git a/ext/fts3/fts3_write.c b/ext/fts3/fts3_write.c index cd83e692e2..f4b8a9b2ec 100644 --- a/ext/fts3/fts3_write.c +++ b/ext/fts3/fts3_write.c @@ -873,6 +873,33 @@ static int fts3SegReaderCmp2(Fts3SegReader *pLhs, Fts3SegReader *pRhs){ return rc; } +/* +** Compare the term that the Fts3SegReader object passed as the first argument +** points to with the term specified by arguments zTerm and nTerm. +** +** If the pSeg iterator is already at EOF, return 0. Otherwise, return +** -ve if the pSeg term is less than zTerm/nTerm, 0 if the two terms are +** equal, or +ve if the pSeg term is greater than zTerm/nTerm. +*/ +static int fts3SegReaderTermCmp( + Fts3SegReader *pSeg, /* Segment reader object */ + const char *zTerm, /* Term to compare to */ + int nTerm /* Size of term zTerm in bytes */ +){ + int res = 0; + if( pSeg->aNode ){ + if( pSeg->nTerm>nTerm ){ + res = memcmp(pSeg->zTerm, zTerm, nTerm); + }else{ + res = memcmp(pSeg->zTerm, zTerm, pSeg->nTerm); + } + if( res==0 ){ + res = pSeg->nTerm-nTerm; + } + } + return res; +} + /* ** Argument apSegment is an array of nSegment elements. 
It is known that ** the final (nSegment-nSuspect) members are already in sorted order @@ -1427,8 +1454,9 @@ static void fts3ColumnFilter(int iCol, char **ppList, int *pnList){ int nList = *pnList; char *pEnd = &pList[nList]; int iCurrent = 0; - char *p = pList; + + assert( iCol>=0 ); while( 1 ){ char c = 0; while( pflags & FTS3_SEGMENT_IGNORE_EMPTY); + int isRequirePos = (pFilter->flags & FTS3_SEGMENT_REQUIRE_POS); + int isColFilter = (pFilter->flags & FTS3_SEGMENT_COLUMN_FILTER); + int isPrefix = (pFilter->flags & FTS3_SEGMENT_PREFIX); + + /* If the Fts3SegFilter defines a specific term (or term prefix) to search + ** for, then advance each segment iterator until it points to a term of + ** equal or greater value than the specified term. This prevents many + ** unnecessary merge/sort operations for the case where single segment + ** b-tree leaf nodes contain more than one term. + */ + if( pFilter->zTerm ){ + int nTerm = pFilter->nTerm; + char *zTerm = pFilter->zTerm; + for(i=0; iaNode ){ @@ -1487,6 +1533,22 @@ int sqlite3Fts3SegReaderIterate( char *zTerm = apSegment[0]->zTerm; int nMerge = 1; + /* If this is a prefix-search, and if the term that apSegment[0] points + ** to does not share a suffix with pFilter->zTerm/nTerm, then all + ** required callbacks have been made. In this case exit early. + ** + ** Similarly, if this is a search for an exact match, and the first term + ** of segment apSegment[0] is not a match, exit early. 
+ */ + if( pFilter->zTerm ){ + if( nTerm<pFilter->nTerm + || (!isPrefix && nTerm>pFilter->nTerm) + || memcmp(zTerm, pFilter->zTerm, pFilter->nTerm) + ){ + goto finished; + } + } + while( nMerge<nSegment && apSegment[nMerge]->aNode && apSegment[nMerge]->nTerm==nTerm @@ -1527,9 +1589,8 @@ int sqlite3Fts3SegReaderIterate( j++; } - assert( iCol>=0 || isColFilter==0 ); if( isColFilter ){ - fts3ColumnFilter(iCol, &pList, &nList); + fts3ColumnFilter(pFilter->iCol, &pList, &nList); } if( !isIgnoreEmpty || nList>0 ){ @@ -1562,6 +1623,14 @@ int sqlite3Fts3SegReaderIterate( } } + /* If there is a term specified to filter on, and this is not a prefix + ** search, return now. The callback that corresponds to the required + ** term (if such a term exists in the index) has already been made. + */ + if( pFilter->zTerm && !isPrefix ){ + goto finished; + } + for(i=0; i