From: dan Date: Wed, 17 Aug 2016 19:05:12 +0000 (+0000) Subject: Bias the fts5 snippet() function to return snippets that look like they start at... X-Git-Tag: version-3.15.0~123^2~5 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=e7036202bcc2bae5420332388dee8a74fce969a1;p=thirdparty%2Fsqlite.git Bias the fts5 snippet() function to return snippets that look like they start at the start of sentences. FossilOrigin-Name: 60de159476edbd48dc363f7f77f09c32ea68422f --- diff --git a/ext/fts5/fts5_aux.c b/ext/fts5/fts5_aux.c index d8147f8111..ca55fd6b01 100644 --- a/ext/fts5/fts5_aux.c +++ b/ext/fts5/fts5_aux.c @@ -246,6 +246,115 @@ static void fts5HighlightFunction( ** End of highlight() implementation. **************************************************************************/ +/* +** Context object passed to the fts5SentenceFinderCb() function. +*/ +typedef struct Fts5SFinder Fts5SFinder; +struct Fts5SFinder { + int iPos; /* Current token position */ + int nFirstAlloc; /* Allocated size of aFirst[] */ + int nFirst; /* Number of entries in aFirst[] */ + int *aFirst; /* Array of first token in each sentence */ + const char *zDoc; /* Document being tokenized */ +}; + +/* +** Add an entry to the Fts5SFinder.aFirst[] array. Grow the array if +** necessary. Return SQLITE_OK if successful, or SQLITE_NOMEM if an +** error occurs. +*/ +static int fts5SentenceFinderAdd(Fts5SFinder *p, int iAdd){ + if( p->nFirstAlloc==p->nFirst ){ + int nNew = p->nFirstAlloc ? p->nFirstAlloc*2 : 64; + int *aNew; + + aNew = (int*)sqlite3_realloc(p->aFirst, nNew*sizeof(int)); + if( aNew==0 ) return SQLITE_NOMEM; + p->aFirst = aNew; + p->nFirstAlloc = nNew; + } + p->aFirst[p->nFirst++] = iAdd; + return SQLITE_OK; +} + +/* +** This function is an xTokenize() callback used by the auxiliary snippet() +** function. Its job is to identify tokens that are the first in a sentence. +** For each such token, an entry is added to the SFinder.aFirst[] array. +*/ +static int fts5SentenceFinderCb( + void *pContext, /* Pointer to HighlightContext object */ + int tflags, /* Mask of FTS5_TOKEN_* flags */ + const char *pToken, /* Buffer containing token */ + int nToken, /* Size of token in bytes */ + int iStartOff, /* Start offset of token */ + int iEndOff /* End offset of token */ +){ + int rc = SQLITE_OK; + + if( (tflags & FTS5_TOKEN_COLOCATED)==0 ){ + Fts5SFinder *p = (Fts5SFinder*)pContext; + if( p->iPos>0 ){ + int i; + char c = 0; + for(i=iStartOff-1; i>=0; i--){ + c = p->zDoc[i]; + if( c!=' ' && c!='\t' && c!='\n' && c!='\r' ) break; + } + if( i!=iStartOff-1 && (c=='.' || c==':') ){ + rc = fts5SentenceFinderAdd(p, p->iPos); + } + }else{ + rc = fts5SentenceFinderAdd(p, 0); + } + p->iPos++; + } + return rc; +} + +static int fts5SnippetScore( + const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ + Fts5Context *pFts, /* First arg to pass to pApi functions */ + int nDocsize, /* Size of column in tokens */ + unsigned char *aSeen, /* Array with one element per query phrase */ + int iCol, /* Column to score */ + int iPos, /* Starting offset to score */ + int nToken, /* Max tokens per snippet */ + int *pnScore, /* OUT: Score */ + int *piPos /* OUT: Adjusted offset */ +){ + int rc; + int i; + int ip = 0; + int ic = 0; + int iOff = 0; + int iFirst = -1; + int nInst; + int nScore = 0; + + rc = pApi->xInstCount(pFts, &nInst); + for(i=0; ixInst(pFts, i, &ip, &ic, &iOff); + if( rc==SQLITE_OK && ic==iCol && iOff>=iPos && iOff<(iPos+nToken) ){ + nScore += (aSeen[ip] ? 1 : 1000); + aSeen[ip] = 1; + if( iFirst<0 ) iFirst = iOff; + } + } + + *pnScore = nScore; + if( piPos ){ + int iLast = iOff + pApi->xPhraseSize(pFts, ip); + int iAdj = iFirst - (nToken - (iLast-iFirst)) / 2; + + if( (iAdj+nToken)>nDocsize ) iAdj = nDocsize - nToken; + if( iAdj<0 ) iAdj = 0; + *piPos = iAdj; + } + + return rc; +} + /* ** Implementation of snippet() function. */ @@ -267,9 +376,10 @@ static void fts5SnippetFunction( unsigned char *aSeen; /* Array of "seen instance" flags */ int iBestCol; /* Column containing best snippet */ int iBestStart = 0; /* First token of best snippet */ - int iBestLast; /* Last token of best snippet */ int nBestScore = 0; /* Score of best snippet */ int nColSize = 0; /* Total size of iBestCol in tokens */ + Fts5SFinder sFinder; /* Used to find the beginnings of sentences */ + int nCol; if( nVal!=5 ){ const char *zErr = "wrong number of arguments to function snippet()"; @@ -277,13 +387,13 @@ static void fts5SnippetFunction( return; } + nCol = pApi->xColumnCount(pFts); memset(&ctx, 0, sizeof(HighlightContext)); iCol = sqlite3_value_int(apVal[0]); ctx.zOpen = (const char*)sqlite3_value_text(apVal[1]); ctx.zClose = (const char*)sqlite3_value_text(apVal[2]); zEllips = (const char*)sqlite3_value_text(apVal[3]); nToken = sqlite3_value_int(apVal[4]); - iBestLast = nToken-1; iBestCol = (iCol>=0 ? iCol : 0); nPhrase = pApi->xPhraseCount(pFts); @@ -291,43 +401,64 @@ static void fts5SnippetFunction( if( aSeen==0 ){ rc = SQLITE_NOMEM; } - if( rc==SQLITE_OK ){ rc = pApi->xInstCount(pFts, &nInst); } - for(i=0; rc==SQLITE_OK && ixInst(pFts, i, &ip, &iSnippetCol, &iStart); - if( rc==SQLITE_OK && (iCol<0 || iSnippetCol==iCol) ){ - int nScore = 1000; - int iLast = iStart - 1 + pApi->xPhraseSize(pFts, ip); - int j; - aSeen[ip] = 1; - for(j=i+1; rc==SQLITE_OK && jxInst(pFts, j, &ip, &ic, &io); - iFinal = io + pApi->xPhraseSize(pFts, ip) - 1; - if( rc==SQLITE_OK && ic==iSnippetCol && iLastiLast ) iLast = iFinal; + memset(&sFinder, 0, sizeof(Fts5SFinder)); + for(i=0; ixColumnText(pFts, i, &sFinder.zDoc, &nDoc); + if( rc!=SQLITE_OK ) break; + rc = pApi->xTokenize(pFts, + sFinder.zDoc, nDoc, (void*)&sFinder,fts5SentenceFinderCb + ); + if( rc!=SQLITE_OK ) break; + rc = pApi->xColumnSize(pFts, i, &nDocsize); + if( rc!=SQLITE_OK ) break; + + for(ii=0; rc==SQLITE_OK && iinBestScore ){ + nBestScore = nScore; + iBestCol = i; + iBestStart = sFinder.aFirst[ii]; + nColSize = nDocsize; } } - if( rc==SQLITE_OK && nScore>nBestScore ){ - iBestCol = iSnippetCol; - iBestStart = iStart; - iBestLast = iLast; - nBestScore = nScore; + for(ii=0; rc==SQLITE_OK && iixInst(pFts, ii, &ip, &ic, &io); + if( ic!=i || rc!=SQLITE_OK ) continue; + memset(aSeen, 0, nPhrase); + rc = fts5SnippetScore(pApi, pFts, nDocsize, aSeen, i, + io, nToken, &nScore, &io + ); + if( rc==SQLITE_OK && nScore>nBestScore ){ + nBestScore = nScore; + iBestCol = i; + iBestStart = io; + nColSize = nDocsize; + } } } } - if( rc==SQLITE_OK ){ - rc = pApi->xColumnSize(pFts, iBestCol, &nColSize); - } if( rc==SQLITE_OK ){ rc = pApi->xColumnText(pFts, iBestCol, &ctx.zIn, &ctx.nIn); } @@ -336,14 +467,6 @@ static void fts5SnippetFunction( rc = fts5CInstIterInit(pApi, pFts, iBestCol, &ctx.iter); } - if( (iBestStart+nToken-1)>iBestLast ){ - iBestStart -= (iBestStart+nToken-1-iBestLast) / 2; - } - if( iBestStart+nToken>nColSize ){ - iBestStart = nColSize - nToken; - } - if( iBestStart<0 ) iBestStart = 0; - ctx.iRangeStart = iBestStart; ctx.iRangeEnd = iBestStart + nToken - 1; @@ -365,15 +488,15 @@ static void fts5SnippetFunction( }else{ fts5HighlightAppend(&rc, &ctx, zEllips, -1); } - - if( rc==SQLITE_OK ){ - sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT); - }else{ - sqlite3_result_error_code(pCtx, rc); - } - sqlite3_free(ctx.zOut); } + if( rc==SQLITE_OK ){ + sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT); + }else{ + sqlite3_result_error_code(pCtx, rc); + } + sqlite3_free(ctx.zOut); sqlite3_free(aSeen); + sqlite3_free(sFinder.aFirst); } /************************************************************************/ diff --git a/ext/fts5/test/fts5af.test b/ext/fts5/test/fts5af.test index 21854662d3..903dd2910f 100644 --- a/ext/fts5/test/fts5af.test +++ b/ext/fts5/test/fts5af.test @@ -59,7 +59,6 @@ proc do_snippet_test {tn doc match res} { foreach {tn doc res} { - 1.1 {X o o o o o o} {[X] o o o o o o} 1.2 {o X o o o o o} {o [X] o o o o o} 1.3 {o o X o o o o} {o o [X] o o o o} @@ -111,6 +110,10 @@ foreach {tn doc res} { 8.1 {o o o o X o o o o o o o o o o o o o o o o o o o o o X X X o o o} {...o o [X] [X] [X] o o...} + 8.2 {o o o o. X o o o o o o o o o o o o o o o o o o o o o X X X o o o} + {...[X] o o o o o o...} + + } { do_snippet_test 1.$tn $doc X $res } diff --git a/ext/fts5/test/fts5unicode2.test b/ext/fts5/test/fts5unicode2.test index d3ff5128da..9fead57c07 100644 --- a/ext/fts5/test/fts5unicode2.test +++ b/ext/fts5/test/fts5unicode2.test @@ -160,11 +160,11 @@ foreach {tn query snippet} { the maximum x value. } 4 "rollback" { - ...[ROLLBACK]. Instead, the pending statement + ...Instead, the pending statement will return SQLITE_ABORT upon next access after the [ROLLBACK]. } 5 "rOllback" { - ...[ROLLBACK]. Instead, the pending statement + ...Instead, the pending statement will return SQLITE_ABORT upon next access after the [ROLLBACK]. } 6 "lang*" { diff --git a/manifest b/manifest index 2dccb1db29..4fbf53dc5a 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Fix\sa\sproblem\sin\sthe\sfts5\ssnippet()\sauxiliary\sfunction. -D 2016-08-17T11:14:39.949 +C Bias\sthe\sfts5\ssnippet()\sfunction\sto\sreturn\ssnippets\sthat\slook\slike\sthey\sstart\sat\sthe\sstart\sof\ssentences. +D 2016-08-17T19:05:12.783 F Makefile.in cfd8fb987cd7a6af046daa87daa146d5aad0e088 F Makefile.linux-gcc 7bc79876b875010e8c8f9502eb935ca92aa3c434 F Makefile.msc d66d0395c38571aab3804f8db0fa20707ae4609a @@ -99,7 +99,7 @@ F ext/fts3/unicode/parseunicode.tcl da577d1384810fb4e2b209bf3313074353193e95 F ext/fts5/extract_api_docs.tcl a36e54ec777172ddd3f9a88daf593b00848368e0 F ext/fts5/fts5.h 62f3e33ceeb9a428db139f9c012186b371da1cc7 F ext/fts5/fts5Int.h b2eda36e0f224365c8e23dc8f559311834f1c13f -F ext/fts5/fts5_aux.c e4bec077c5190946dbaac72c6555defd823724ca +F ext/fts5/fts5_aux.c 5921bbef4c78b86159111335135837a867f1ff8a F ext/fts5/fts5_buffer.c 4c1502d4c956cd092c89ce4480867f9d8bf325cd F ext/fts5/fts5_config.c 5af9c360e99669d29f06492c370892394aba0857 F ext/fts5/fts5_expr.c 1ee97156421919e497595bfa962bb88ad1665401 @@ -122,7 +122,7 @@ F ext/fts5/test/fts5ab.test 30325a89453280160106be411bba3acf138e6d1b F ext/fts5/test/fts5ac.test 55cad4275a1f5acabfe14d8442a8046b47e49e5f F ext/fts5/test/fts5ad.test 36995f0586f30f5602074e012b9224c71ec5171c F ext/fts5/test/fts5ae.test 612dcb51f4069226791ff14c17dbfb3138c56f20 -F ext/fts5/test/fts5af.test b6afd7c28ad62d546c30f387fb971f3aaebaac0d +F ext/fts5/test/fts5af.test 38bfc8c3f2fd4687a37ae3a0d4d64e029273df36 F ext/fts5/test/fts5ag.test 27180de76c03036be75ee80b93d8c5f540014071 F ext/fts5/test/fts5ah.test dfb7897711dbcda1dacb038aec310daca139fcf5 F ext/fts5/test/fts5ai.test 3909d0b949b2afcaae4d5795cd79153da75381df @@ -190,7 +190,7 @@ F ext/fts5/test/fts5tok1.test beb894c6f3468f10a574302f69ebe4436b0287c7 F ext/fts5/test/fts5tok2.test dcacb32d4a2a3f0dd3215d4a3987f78ae4be21a2 F ext/fts5/test/fts5tokenizer.test ea4df698b35cc427ebf2ba22829d0e28386d8c89 F ext/fts5/test/fts5unicode.test fbef8d8a3b4b88470536cc57604a82ca52e51841 -F ext/fts5/test/fts5unicode2.test c1dd890ba32b7609adba78e420faa847abe43b59 +F ext/fts5/test/fts5unicode2.test cb4918278514eeb6f398a28ce71ba52934f50ce5 F ext/fts5/test/fts5unicode3.test 35c3d02aa7acf7d43d8de3bfe32c15ba96e8928e F ext/fts5/test/fts5unindexed.test e9539d5b78c677315e7ed8ea911d4fd25437c680 F ext/fts5/test/fts5update.test 57c7012a7919889048947addae10e0613df45529 @@ -1510,7 +1510,7 @@ F vsixtest/vsixtest.tcl 6a9a6ab600c25a91a7acc6293828957a386a8a93 F vsixtest/vsixtest.vcxproj.data 2ed517e100c66dc455b492e1a33350c1b20fbcdc F vsixtest/vsixtest.vcxproj.filters 37e51ffedcdb064aad6ff33b6148725226cd608e F vsixtest/vsixtest_TemporaryKey.pfx e5b1b036facdb453873e7084e1cae9102ccc67a0 -P f9035b8e2ea331801402bcb62b203ab092949770 -R c6d38a9772924d9b143e2aa32916b131 +P a861713cc6a3868a1c89240e8340bc7b2b9559da +R ddec9217c48a760254ac22da18301cf5 U dan -Z c5e29b567d02bec227e0b49d5b8a188c +Z 28e0d85dd23359fe591994acb5271e6f diff --git a/manifest.uuid b/manifest.uuid index 77bfe841ee..cfefaabd19 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -a861713cc6a3868a1c89240e8340bc7b2b9559da \ No newline at end of file +60de159476edbd48dc363f7f77f09c32ea68422f \ No newline at end of file