}
}
-typedef struct Fts5GatherCtx Fts5GatherCtx;
+
+/*
+** Context object passed by fts5GatherTotals() to xQueryPhrase callback
+** fts5GatherCallback().
+*/
struct Fts5GatherCtx {
- int nCol;
- int iPhrase;
- int *anVal;
+ int nCol; /* Number of columns in FTS table */
+ int iPhrase; /* Phrase currently under investigation */
+ int *anVal; /* Array to populate */
};
+/*
+** Callback used by fts5GatherTotals() with the xQueryPhrase() API.
+*/
static int fts5GatherCallback(
const Fts5ExtensionApi *pApi,
Fts5Context *pFts,
- void *pUserData
+ void *pUserData /* Pointer to Fts5GatherCtx object */
){
- Fts5GatherCtx *p = (Fts5GatherCtx*)pUserData;
+ struct Fts5GatherCtx *p = (struct Fts5GatherCtx*)pUserData;
int i = 0;
int iPrev = -1;
i64 iPos = 0;
int nPhrase = pApi->xPhraseCount(pFts);
int nCol = pApi->xColumnCount(pFts);
int nByte = nCol * nPhrase * sizeof(int);
- Fts5GatherCtx sCtx;
+ struct Fts5GatherCtx sCtx;
sCtx.nCol = nCol;
anVal = sCtx.anVal = (int*)sqlite3_malloc(nByte);
typedef struct Fts5Bm25Context Fts5Bm25Context;
struct Fts5Bm25Context {
- int nPhrase;
- int nCol;
+ int nPhrase; /* Number of phrases in query */
+ int nCol; /* Number of columns in FTS table */
double *aIDF; /* Array of IDF values */
double *aAvg; /* Average size of each column in tokens */
};
-static void fts5Bm25Function(
+static int fts5Bm25GetContext(
const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
Fts5Context *pFts, /* First arg to pass to pApi functions */
- sqlite3_context *pCtx, /* Context for returning result/error */
- int nVal, /* Number of values in apVal[] array */
- sqlite3_value **apVal /* Array of trailing arguments */
+ Fts5Bm25Context **pp /* OUT: Context object */
){
- const double k1 = 1.2;
- const double B = 0.75;
-
- int rc = SQLITE_OK;
Fts5Bm25Context *p;
+ int rc = SQLITE_OK;
p = pApi->xGetAuxdata(pFts, 0);
if( p==0 ){
memset(p, 0, nByte);
p->aAvg = (double*)&p[1];
p->aIDF = (double*)&p->aAvg[nCol];
+ p->nCol = nCol;
+ p->nPhrase = nPhrase;
}
if( rc==SQLITE_OK ){
rc = pApi->xRowCount(pFts, &nRow);
assert( nRow>0 || rc!=SQLITE_OK );
+ if( nRow<2 ) nRow = 2;
}
for(ic=0; rc==SQLITE_OK && ic<nCol; ic++){
}
for(ic=0; ic<nCol; ic++){
for(ip=0; rc==SQLITE_OK && ip<nPhrase; ip++){
- int idx = ip * nCol + ic;
- p->aIDF[idx] = log( (0.5 + nRow - anVal[idx]) / (0.5 + anVal[idx]) );
- if( p->aIDF[idx]<0.0 ) p->aIDF[idx] = 0.0;
+ /* Calculate the IDF (Inverse Document Frequency) for phrase ip
+ ** in column ic. This is done using the standard BM25 formula as
+ ** found on wikipedia:
+ **
+ ** IDF = log( (N - nHit + 0.5) / (nHit + 0.5) )
+ **
+ ** where "N" is the total number of documents in the set and nHit
+ ** is the number that contain at least one instance of the phrase
+ ** under consideration.
+ **
+ ** The problem with this is that if (N < 2*nHit), the IDF is
+ ** negative, which is undesirable. So the minimum allowable IDF is
+ ** (1e-6) - roughly the same as a term that appears in just over
+ ** half of a set of 5,000,000 documents. */
+ int idx = ip * nCol + ic; /* Index in aIDF[] and anVal[] arrays */
+ int nHit = anVal[idx]; /* Number of docs matching "ic: ip" */
+
+ p->aIDF[idx] = log( (0.5 + nRow - nHit) / (0.5 + nHit) );
+ if( p->aIDF[idx]<=0.0 ) p->aIDF[idx] = 1e-6;
+ assert( p->aIDF[idx]>=0.0 );
}
}
}
if( rc!=SQLITE_OK ){
sqlite3_free(p);
+ p = 0;
}
}
+ *pp = p;
+ return rc;
+}
+/*
+** Append a human-readable rendering of the Fts5Bm25Context object p
+** to buffer pBuf. The text has two sections:
+**
+**   "idf "    followed by the p->aIDF[] values, one per (phrase, column)
+**             pair, printed with "%f" and separated by single spaces.
+**   " avgdl " followed by the p->aAvg[] values, one per column.
+**
+** Curly braces are added for grouping only when there is more than one
+** value to group: the per-phrase and avgdl lists are wrapped when
+** p->nCol>1, and the whole idf section when p->nPhrase>1 or p->nCol>1.
+**
+** pRc is forwarded unchanged to every sqlite3Fts5Buffer*() call; this
+** function never reads or writes *pRc itself. NOTE(review): presumably
+** the buffer routines become no-ops once *pRc holds an error - confirm
+** against fts5_buffer.c.
+*/
+static void fts5Bm25DebugContext(
+ int *pRc, /* IN/OUT: Return code */
+ Fts5Buffer *pBuf, /* Buffer to populate */
+ Fts5Bm25Context *p /* Context object to decode */
+){
+ int ip; /* Current phrase index */
+ int ic; /* Current column index */
+ /* IDF section. aIDF[] is indexed as (ip * nCol + ic), so values are
+ ** emitted phrase by phrase, and column by column within a phrase. */
+ sqlite3Fts5BufferAppendString(pRc, pBuf, "idf ");
+ if( p->nPhrase>1 || p->nCol>1 ){
+ sqlite3Fts5BufferAppendString(pRc, pBuf, "{");
+ }
+ for(ip=0; ip<p->nPhrase; ip++){
+ if( ip>0 ) sqlite3Fts5BufferAppendString(pRc, pBuf, " ");
+ if( p->nCol>1 ) sqlite3Fts5BufferAppendString(pRc, pBuf, "{");
+ for(ic=0; ic<p->nCol; ic++){
+ if( ic>0 ) sqlite3Fts5BufferAppendString(pRc, pBuf, " ");
+ sqlite3Fts5BufferAppendPrintf(pRc, pBuf, "%f", p->aIDF[ip*p->nCol+ic]);
+ }
+ if( p->nCol>1 ) sqlite3Fts5BufferAppendString(pRc, pBuf, "}");
+ }
+ if( p->nPhrase>1 || p->nCol>1 ){
+ sqlite3Fts5BufferAppendString(pRc, pBuf, "}");
+ }
+ /* Average-size section: one aAvg[] entry per column. */
+ sqlite3Fts5BufferAppendString(pRc, pBuf, " avgdl ");
+ if( p->nCol>1 ) sqlite3Fts5BufferAppendString(pRc, pBuf, "{");
+ for(ic=0; ic<p->nCol; ic++){
+ if( ic>0 ) sqlite3Fts5BufferAppendString(pRc, pBuf, " ");
+ sqlite3Fts5BufferAppendPrintf(pRc, pBuf, "%f", p->aAvg[ic]);
+ }
+ if( p->nCol>1 ) sqlite3Fts5BufferAppendString(pRc, pBuf, "}");
+}
+/*
+** Stub for the per-row part of the bm25debug() output. The body is
+** currently empty: nothing is appended to pBuf, *pRc is left untouched
+** and none of the other arguments are used.
+**
+** NOTE(review): judging by the name and argument list this is meant to
+** describe how the current row contributes to the score - confirm the
+** intended output format before filling it in.
+*/
+static void fts5Bm25DebugRow(
+ int *pRc,
+ Fts5Buffer *pBuf,
+ Fts5Bm25Context *p,
+ const Fts5ExtensionApi *pApi,
+ Fts5Context *pFts
+){
+}
+
+static void fts5Bm25Function(
+ const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
+ Fts5Context *pFts, /* First arg to pass to pApi functions */
+ sqlite3_context *pCtx, /* Context for returning result/error */
+ int nVal, /* Number of values in apVal[] array */
+ sqlite3_value **apVal /* Array of trailing arguments */
+){
+ const double k1 = 1.2;
+ const double B = 0.75;
+ int rc = SQLITE_OK;
+ Fts5Bm25Context *p;
+
+ rc = fts5Bm25GetContext(pApi, pFts, &p);
+
if( rc==SQLITE_OK ){
+ /* If the bDebug flag is set, instead of returning a numeric rank, this
+ ** function returns a text value showing how the rank is calculated. */
+ Fts5Buffer debug;
+ int bDebug = (pApi->xUserData(pFts)!=0);
+ memset(&debug, 0, sizeof(Fts5Buffer));
+
int ip;
double score = 0.0;
+ if( bDebug ){
+ fts5Bm25DebugContext(&rc, &debug, p);
+ fts5Bm25DebugRow(&rc, &debug, p, pApi, pFts);
+ }
+
for(ip=0; rc==SQLITE_OK && ip<p->nPhrase; ip++){
int iPrev = 0;
int nHit = 0;
int i = 0;
i64 iPos = 0;
- while( rc==SQLITE_OK && 0==pApi->xPoslist(pFts, ip, &i, &iPos) ){
+ while( rc==SQLITE_OK ){
+ int bDone = pApi->xPoslist(pFts, ip, &i, &iPos);
int iCol = FTS5_POS2COLUMN(iPos);
- if( iCol!=iPrev && nHit>0 ){
+ if( (iCol!=iPrev || bDone) && nHit>0 ){
int sz = 0;
int idx = ip * p->nCol + iPrev;
+ double bm25;
rc = pApi->xColumnSize(pFts, iPrev, &sz);
- score += p->aIDF[idx] * nHit * (k1+1.0) /
- (nHit + k1 * (1.0 - B + B * sz / p->aAvg[iCol]));
+ bm25 = (p->aIDF[idx] * nHit * (k1+1.0)) /
+ (nHit + k1 * (1.0 - B + B * sz / p->aAvg[iPrev]));
+
+
+ score = score + bm25;
nHit = 0;
}
+ if( bDone ) break;
nHit++;
iPrev = iCol;
}
}
-
+
if( rc==SQLITE_OK ){
- sqlite3_result_double(pCtx, score);
+ if( bDebug ){
+ sqlite3_result_text(pCtx, (const char*)debug.p, -1, SQLITE_TRANSIENT);
+ }else{
+ sqlite3_result_double(pCtx, score);
+ }
}
-
+ sqlite3_free(debug.p);
}
if( rc!=SQLITE_OK ){
void (*xDestroy)(void*); /* Destructor function */
} aBuiltin [] = {
{ "bm25", 0, fts5Bm25Function, 0 },
+ { "bm25debug", (void*)1, fts5Bm25Function, 0 },
{ "snippet", 0, fts5SnippetFunction, 0 },
{ "fts5_test", 0, fts5TestFunction, 0 },
};
-C Add\sextension\sapis\sxRowCount,\sxQueryPhrase,\sxSetAuxdata\sand\sxGetAuxdata.\sAnd\sa\sranking\sfunction\sthat\suses\sall\sof\sthe\sabove.
-D 2014-07-25T20:30:47.445
+C Add\stests\sand\sfixes\sfor\sbm25()\sfunction.
+D 2014-07-26T18:38:51.294
F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
F Makefile.in b03432313a3aad96c706f8164fb9f5307eaf19f5
F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
F ext/fts5/fts5.c 1496aff16dd9b0a013d14b6c8cf5b7df8c170abe
F ext/fts5/fts5.h 8ace10d5b249a3baa983c79e7a1306d2a79cfd6a
F ext/fts5/fts5Int.h 92fb9c4f759674ef569aebc338f363e167a8933c
-F ext/fts5/fts5_aux.c f8bed7a86b65cb07cffdafbf4f0611f127b36274
+F ext/fts5/fts5_aux.c 78adc5db0ff4d6834df220ba6b3caa351d98b971
F ext/fts5/fts5_buffer.c 248c61ac9fec001602efc72a45704f3b8d367c00
F ext/fts5/fts5_config.c 94f1b4cb4de6a7cd5780c14adb0198e289df8cef
F ext/fts5/fts5_expr.c 65c1918002f2ec1755e4c0c28bf007659409fbd8
F test/fts5ab.test dc04ed48cf93ca957d174406e6c192f2ff4f3397
F test/fts5ac.test 9be418d037763f4cc5d86f4239db41fc86bb4f85
F test/fts5ad.test 2ed38bbc865678cb2905247120d02ebba7f20e07
-F test/fts5ae.test 1424ec557d543ace1f3cf6d231b247bc7b9f337c
+F test/fts5ae.test 24b337571c51a10da1ae439b96b70317813a2fd4
F test/fts5af.test 5f53d0a52280b63caf5a519d6994c4d428835155
F test/fts5ea.test ff43b40f8879ba50b82def70f2ab67c195d1a1d4
F test/full.test 6b3c8fb43c6beab6b95438c1675374b95fab245d
F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4
F tool/warnings.sh 0abfd78ceb09b7f7c27c688c8e3fe93268a13b32
F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f
-P bdc58fd28a63ac9632c3df6c7768a9a236566605
-R 2e8cb20122478987f116ef8ff9f6144b
+P c4d50428ab97f77e6721c4f8d03eaaf3ea91f3eb
+R 3301ccb2b839356242606883792ca77e
U dan
-Z 5dd5c36b8a0e52d63a87d23e7179571f
+Z 456b4a2f1abc554b124e25c35490489e