From: drh Date: Sun, 10 Sep 2006 03:34:06 +0000 (+0000) Subject: Add some simple test cases for the OR and NOT logic of the fts1 module. X-Git-Tag: version-3.6.10~2767 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a2a9d188691e52735b077f283e63879bf42636bc;p=thirdparty%2Fsqlite.git Add some simple test cases for the OR and NOT logic of the fts1 module. Fix lots of bugs discovered while developing these test cases. (CVS 3400) FossilOrigin-Name: 70bcff024b44d1b40afac6eba959fa89fb993147 --- diff --git a/ext/fts1/fts1.c b/ext/fts1/fts1.c index b0b99769e2..0c973a6dd1 100644 --- a/ext/fts1/fts1.c +++ b/ext/fts1/fts1.c @@ -283,7 +283,7 @@ static sqlite_int64 peekDocid(DocListReader *pReader){ return ret; } -/* Read the next docid. +/* Read the next docid. See also nextValidDocid(). */ static sqlite_int64 readDocid(DocListReader *pReader){ sqlite_int64 ret; @@ -302,8 +302,10 @@ static int readPosition(DocListReader *pReader){ int i; int iType = pReader->pDoclist->iType; + if( pReader->iLastPos==-1 ){ + return -1; + } assert( !atEnd(pReader) ); - assert( pReader->iLastPos!=-1 ); if( iTypeiType>=DL_POSITIONS ){ - docListAddPos(pOut, iLeftPos); + docListAddPos(pOut, iRightPos); } iLeftPos = readPosition(pLeft); iRightPos = readPosition(pRight); - }else if( iRightPos=0 ) skipPositionList(pRight); } -/* -** Read the next non-deleted docid off of pIn. Return -** 0 if we reach the end of pDoclist. -*/ -static sqlite_int64 nextValidDocid(DocListReader *pIn){ - sqlite_int64 docid = 0; - while( !atEnd(pIn) && (docid = readDocid(pIn))==0 ){ - skipPositionList(pIn); - } - return docid; -} - /* We have two doclists: pLeft and pRight. -** Write the intersection of these two doclists into pOut. +** Write the phrase intersection of these two doclists into pOut. +** +** A phrase intersection means that two documents only match +** if pLeft.iPos+1==pRight.iPos. ** -** nLeftPhrase is the number of words of a phrase that have -** contributed to pLeft. +** The output pOut may or may not contain positions. If pOut +** does contain positions, they are the positions of pRight. */ -static void mergeBlockAnd( +static void docListPhraseMerge( DocList *pLeft, /* Doclist resulting from the words on the left */ DocList *pRight, /* Doclist for the next word to the right */ - int nLeftPhrase, /* Number of matching phrase terms in pLeft */ DocList *pOut /* Write the combined doclist here */ ){ DocListReader left, right; @@ -554,20 +560,11 @@ static void mergeBlockAnd( while( docidLeft>0 && docidRight>0 ){ if( docidLeftiType>=DL_POSITIONS ){ - mergePosList(&left, &right, docidLeft, nLeftPhrase, pOut); - }else{ - docListAddDocid(pOut, docidLeft); - skipPositionList(&left); - skipPositionList(&right); - } + mergePosList(&left, &right, docidLeft, pOut); docidLeft = nextValidDocid(&left); docidRight = nextValidDocid(&right); } @@ -575,35 +572,34 @@ static void mergeBlockAnd( } /* We have two doclists: pLeft and pRight. -** Write into pOut all documents that occur in pLeft but not -** in pRight. +** Write the intersection of these two doclists into pOut. +** Only docids are matched. Position information is ignored. ** ** The output pOut never holds positions. */ -static void mergeBlockExcept( +static void docListAndMerge( DocList *pLeft, /* Doclist resulting from the words on the left */ DocList *pRight, /* Doclist for the next word to the right */ DocList *pOut /* Write the combined doclist here */ ){ DocListReader left, right; - sqlite_int64 docidLeft, docidRight, priorLeft; + sqlite_int64 docidLeft, docidRight; + + assert( pOut->iType0 ){ - priorLeft = docidLeft; - if( docidRight==0 || docidLeft0 && docidRight>0 ){ + if( docidLeft0 && docidRight<=priorLeft ){ - skipPositionList(&right); docidRight = nextValidDocid(&right); } } @@ -611,10 +607,11 @@ static void mergeBlockExcept( /* We have two doclists: pLeft and pRight. ** Write the union of these two doclists into pOut. +** Only docids are matched. Position information is ignored. ** ** The output pOut never holds positions. */ -static void mergeBlockOr( +static void docListOrMerge( DocList *pLeft, /* Doclist resulting from the words on the left */ DocList *pRight, /* Doclist for the next word to the right */ DocList *pOut /* Write the combined doclist here */ @@ -635,26 +632,61 @@ static void mergeBlockOr( } priorLeft = docidLeft; if( docidLeft<=docidRight ){ - skipPositionList(&left); docidLeft = nextValidDocid(&left); } if( docidRight>0 && docidRight<=priorLeft ){ - skipPositionList(&right); docidRight = nextValidDocid(&right); } } while( docidLeft>0 ){ docListAddDocid(pOut, docidLeft); - skipPositionList(&left); docidLeft = nextValidDocid(&left); } while( docidRight>0 ){ docListAddDocid(pOut, docidRight); - skipPositionList(&right); docidRight = nextValidDocid(&right); } } +/* We have two doclists: pLeft and pRight. +** Write into pOut all documents that occur in pLeft but not +** in pRight. +** +** Only docids are matched. Position information is ignored. +** +** The output pOut never holds positions. +*/ +static void docListExceptMerge( + DocList *pLeft, /* Doclist resulting from the words on the left */ + DocList *pRight, /* Doclist for the next word to the right */ + DocList *pOut /* Write the combined doclist here */ +){ + DocListReader left, right; + sqlite_int64 docidLeft, docidRight, priorLeft; + + readerInit(&left, pLeft); + readerInit(&right, pRight); + docidLeft = nextValidDocid(&left); + docidRight = nextValidDocid(&right); + + while( docidLeft>0 && docidRight>0 ){ + priorLeft = docidLeft; + if( docidLeft0 && docidRight<=priorLeft ){ + docidRight = nextValidDocid(&right); + } + } + while( docidLeft>0 ){ + docListAddDocid(pOut, docidLeft); + docidLeft = nextValidDocid(&left); + } +} + /* Duplicate a string; the caller must free() the returned string. * (We don't use strdup() since it's not part of the standard C library and * may not be available everywhere.) */ @@ -1263,11 +1295,11 @@ static int fulltextNext(sqlite3_vtab_cursor *pCursor){ rc = sqlite3_reset(c->pStmt); if( rc!=SQLITE_OK ) return rc; - if( atEnd(&c->result)){ + iDocid = nextValidDocid(&c->result); + if( iDocid==0 ){ c->eof = 1; return SQLITE_OK; } - iDocid = readDocid(&c->result); rc = sqlite3_bind_int64(c->pStmt, 1, iDocid); if( rc!=SQLITE_OK ) return rc; /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ @@ -1284,69 +1316,55 @@ static int fulltextNext(sqlite3_vtab_cursor *pCursor){ } } -/* -** Different kinds of allowed document merge operations. -*/ -#define MERGE_AND 1 /* Intersection of left and right */ -#define MERGE_OR 2 /* Union of left and right */ -#define MERGE_EXCEPT 3 /* Documents in left but not in right */ - -/* Read the posting list for [pTerm]; AND it with the doclist [pIn] to - * produce the doclist [out], using the given phrase position [iPhrasePos]. - * (*pSelect) is used to hold an SQLite statement used inside this function; - * the caller should initialize *pSelect to NULL before the first call. - */ -static int mergeQuery( - fulltext_vtab *v, /* The full text index virtual table */ - const char *pTerm, int nTerm, /* Term we are searching for */ - DocList *pIn, /* Prior search results. NULL for first term */ - int iPhrasePos, /* Offset to first term of phrase search */ - int eOp, /* MERGE_AND, MERGE_OR, or MERGE_EXCEPT */ - DocList *out /* Write results here */ -){ - int rc; - DocList doclist; - - /* If [pIn] is already empty, there's no point in reading the - * posting list to AND it in; return immediately. */ - if( pIn!=NULL && eOp==MERGE_AND && !pIn->nData ) return SQLITE_OK; - - rc = term_select_all(v, pTerm, nTerm, &doclist); - if( rc!=SQLITE_OK ) return rc; - - /* If there is no input and the output wants to contain position - ** information, then make the result the doclist for pTerm. - */ - if( pIn==0 && out->iType>=DL_POSITIONS ){ - docListDestroy(out); - *out = doclist; - return SQLITE_OK; - } - - if( eOp==MERGE_AND && pIn!=0 ){ - mergeBlockAnd(pIn, &doclist, iPhrasePos, out); - }else if( eOp==MERGE_OR || pIn==0 ){ - mergeBlockOr(pIn, &doclist, out); - }else if( eOp==MERGE_EXCEPT ){ - mergeBlockExcept(pIn, &doclist, out); - } - docListDestroy(&doclist); - - return SQLITE_OK; -} - /* A single term in a query is represented by an instances of ** the following structure. */ typedef struct QueryTerm { - int firstInPhrase; /* true if this term begins a new phrase */ + int nPhrase; /* How many following terms are part of the same phrase */ int isOr; /* this term is preceded by "OR" */ int isNot; /* this term is preceded by "-" */ char *pTerm; /* text of the term. '\000' terminated. malloced */ int nTerm; /* Number of bytes in pTerm[] */ } QueryTerm; -/* A parsed query. + +/* Return a DocList corresponding to the query term *pTerm. If *pTerm +** is the first term of a phrase query, go ahead and evaluate the phrase +** query and return the doclist for the entire phrase query. +** +** The result is stored in pTerm->doclist. +*/ +static int docListOfTerm( + fulltext_vtab *v, /* The full text index */ + QueryTerm *pQTerm, /* Term we are looking for, or 1st term of a phrase */ + DocList **ppResult /* Write the result here */ +){ + DocList *pLeft, *pRight, *pNew; + int i, rc; + + pLeft = docListNew(DL_POSITIONS); + rc = term_select_all(v, pQTerm->pTerm, pQTerm->nTerm, pLeft); + if( rc ) return rc; + for(i=1; i<=pQTerm->nPhrase; i++){ + pRight = docListNew(DL_POSITIONS); + rc = term_select_all(v, pQTerm[i].pTerm, pQTerm[i].nTerm, pRight); + if( rc ){ + docListDelete(pLeft); + return rc; + } + pNew = docListNew(inPhrase ? DL_POSITIONS : DL_DOCIDS); + docListPhraseMerge(pLeft, pRight, pNew); + docListDelete(pLeft); + docListDelete(pRight); + pLeft = pNew; + } + *ppResult = pLeft; + return SQLITE_OK; +} + + + +/* Parse a query string into a Query structure. * * We could, in theory, allow query strings to be complicated * nested expressions with precedence determined by parentheses. @@ -1377,12 +1395,12 @@ typedef struct QueryTerm { typedef struct Query { int nTerms; /* Number of terms in the query */ QueryTerm *pTerms; /* Array of terms. Space obtained from malloc() */ + int nextIsOr; /* Set the isOr flag on the next inserted term */ } Query; -/* Add a new term pTerm[0..nTerm-1] to the query *q. The new term is -** the beginning of a phrase is firstInPhrase is true. +/* Add a new term pTerm[0..nTerm-1] to the query *q. */ -static void queryAdd(Query *q, int firstInPhrase, const char *pTerm, int nTerm){ +static void queryAdd(Query *q, const char *pTerm, int nTerm){ QueryTerm *t; ++q->nTerms; q->pTerms = realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0])); @@ -1392,11 +1410,12 @@ static void queryAdd(Query *q, int firstInPhrase, const char *pTerm, int nTerm){ } t = &q->pTerms[q->nTerms - 1]; memset(t, 0, sizeof(*t)); - t->firstInPhrase = firstInPhrase; t->pTerm = malloc(nTerm+1); memcpy(t->pTerm, pTerm, nTerm); t->pTerm[nTerm] = 0; t->nTerm = nTerm; + t->isOr = q->nextIsOr; + q->nextIsOr = 0; } /* Free all of the memory that was malloced in order to build *q. @@ -1414,18 +1433,20 @@ static void queryDestroy(Query *q){ ** to the query being assemblied in pQuery. ** ** inPhrase is true if pSegment[0..nSegement-1] is contained within -** double-quotes. If inPhrase is true, then only the first term -** is marked with firstInPhrase and OR and "-" syntax is ignored. -** If inPhrase is false, then every term found is marked with -** firstInPhrase and OR and "-" syntax is significant. +** double-quotes. If inPhrase is true, then the first term +** is marked with the number of terms in the phrase less one and +** OR and "-" syntax is ignored. If inPhrase is false, then every +** term found is marked with nPhrase=0 and OR and "-" syntax is significant. */ -static int tokenizeSegment(sqlite3_tokenizer *pTokenizer, - const char *pSegment, int nSegment, int inPhrase, - Query *pQuery){ +static int tokenizeSegment( + sqlite3_tokenizer *pTokenizer, /* The tokenizer to use */ + const char *pSegment, int nSegment, /* Query expression being parsed */ + int inPhrase, /* True if within "..." */ + Query *pQuery /* Append results here */ +){ const sqlite3_tokenizer_module *pModule = pTokenizer->pModule; sqlite3_tokenizer_cursor *pCursor; - int is_first = 1; - int isOr = 0; + int firstIndex = pQuery->nTerms; int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor); if( rc!=SQLITE_OK ) return rc; @@ -1441,19 +1462,17 @@ static int tokenizeSegment(sqlite3_tokenizer *pTokenizer, if( rc!=SQLITE_OK ) break; if( !inPhrase && pQuery->nTerms>0 && nToken==2 && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){ - isOr = 1; + pQuery->nextIsOr = 1; continue; } - queryAdd(pQuery, !inPhrase || is_first, pToken, nToken); - if( !inPhrase ){ - if( isOr ){ - pQuery->pTerms[pQuery->nTerms-1].isOr = 1; - }else if( iBegin>0 && pSegment[iBegin-1]=='-' ){ - pQuery->pTerms[pQuery->nTerms-1].isNot = 1; - } + queryAdd(pQuery, pToken, nToken); + if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){ + pQuery->pTerms[pQuery->nTerms-1].isNot = 1; } - is_first = 0; - isOr = 0; + } + + if( inPhrase && pQuery->nTerms>firstIndex ){ + pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1; } return pModule->xClose(pCursor); @@ -1468,6 +1487,7 @@ static int parseQuery(fulltext_vtab *v, const char *pInput, int nInput, if( nInput<0 ) nInput = strlen(pInput); pQuery->nTerms = 0; pQuery->pTerms = NULL; + pQuery->nextIsOr = 0; for(iInput=0; iInput