From: shess Date: Fri, 7 Dec 2007 23:47:53 +0000 (+0000) Subject: Change prefix search from O(N*M) to O(NlogM). The previous code X-Git-Tag: version-3.6.10~1578 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=b6a75606ed995f51626c1d0f542f5f0377e1adea;p=thirdparty%2Fsqlite.git Change prefix search from O(N*M) to O(NlogM). The previous code linearly merged the doclists, so as the accumulated list got large, things got slow (the M term, a function of the number of documents in the index). This change does pairwise merges until a single doclist remains. A test search of 't*' against a database of RFC text improves from 1m16s to 4.75s. (CVS 4599) FossilOrigin-Name: feef1b15d645d638b4a05742f214b0445fa7e176 --- diff --git a/ext/fts3/fts3.c b/ext/fts3/fts3.c index 2c53cb591e..71df92253f 100644 --- a/ext/fts3/fts3.c +++ b/ext/fts3/fts3.c @@ -437,6 +437,7 @@ static int fts3GetVarint32(const char *p, int *pi){ ** dataBufferInit - create a buffer with given initial capacity. ** dataBufferReset - forget buffer's data, retaining capacity. ** dataBufferDestroy - free buffer's data. +** dataBufferSwap - swap contents of two buffers. ** dataBufferExpand - expand capacity without adding data. ** dataBufferAppend - append data. ** dataBufferAppend2 - append two pieces of data at once. @@ -461,6 +462,11 @@ static void dataBufferDestroy(DataBuffer *pBuffer){ if( pBuffer->pData!=NULL ) sqlite3_free(pBuffer->pData); SCRAMBLE(pBuffer); } +static void dataBufferSwap(DataBuffer *pBuffer1, DataBuffer *pBuffer2){ + DataBuffer tmp = *pBuffer1; + *pBuffer1 = *pBuffer2; + *pBuffer2 = tmp; +} static void dataBufferExpand(DataBuffer *pBuffer, int nAddCapacity){ assert( nAddCapacity>0 ); /* TODO(shess) Consider expanding more aggressively. Note that the @@ -5555,6 +5561,26 @@ static int segmentMerge(fulltext_vtab *v, int iLevel){ return rc; } +/* Accumulate the union of *acc and *pData into *acc. 
*/ +static void docListAccumulateUnion(DataBuffer *acc, + const char *pData, int nData) { + DataBuffer tmp = *acc; + dataBufferInit(acc, tmp.nData+nData); + docListUnion(tmp.pData, tmp.nData, pData, nData, acc); + dataBufferDestroy(&tmp); +} + +/* TODO(shess) It might be interesting to explore different merge +** strategies, here. For instance, since this is a sorted merge, we +** could easily merge many doclists in parallel. With some +** comprehension of the storage format, we could merge all of the +** doclists within a leaf node directly from the leaf node's storage. +** It may be worthwhile to merge smaller doclists before larger +** doclists, since they can be traversed more quickly - but the +** results may have less overlap, making them more expensive in a +** different way. +*/ + /* Scan pReader for pTerm/nTerm, and merge the term's doclist over ** *out (any doclists with duplicate docids overwrite those in *out). ** Internal function for loadSegmentLeaf(). @@ -5562,39 +5588,116 @@ static int segmentMerge(fulltext_vtab *v, int iLevel){ static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader, const char *pTerm, int nTerm, int isPrefix, DataBuffer *out){ + /* doclist data is accumulated into pBuffers similar to how one does + ** increment in binary arithmetic. If index 0 is empty, the data is + ** stored there. If there is data there, it is merged and the + ** results carried into position 1, with further merge-and-carry + ** until an empty position is found. + */ + DataBuffer *pBuffers = NULL; + int nBuffers = 0, nMaxBuffers = 0, rc; + assert( nTerm>0 ); - /* Process while the prefix matches. */ - while( !leavesReaderAtEnd(pReader) ){ + for(rc=SQLITE_OK; rc==SQLITE_OK && !leavesReaderAtEnd(pReader); + rc=leavesReaderStep(v, pReader)){ /* TODO(shess) Really want leavesReaderTermCmp(), but that name is ** already taken to compare the terms of two LeavesReaders. Think ** on a better name. 
[Meanwhile, break encapsulation rather than ** use a confusing name.] */ - int rc; int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix); + if( c>0 ) break; /* Past any possible matches. */ if( c==0 ){ const char *pData = leavesReaderData(pReader); - int nData = leavesReaderDataBytes(pReader); - if( out->nData==0 ){ - dataBufferReplace(out, pData, nData); + int iBuffer, nData = leavesReaderDataBytes(pReader); + + /* Find the first empty buffer. */ + for(iBuffer=0; iBuffer0 ){ + assert(pBuffers!=NULL); + memcpy(p, pBuffers, nBuffers*sizeof(*pBuffers)); + sqlite3_free(pBuffers); + } + pBuffers = p; + } + dataBufferInit(&(pBuffers[nBuffers]), 0); + nBuffers++; + } + + /* At this point, must have an empty at iBuffer. */ + assert(iBuffernData+nData); - docListUnion(out->pData, out->nData, pData, nData, &result); - dataBufferDestroy(out); - *out = result; - /* TODO(shess) Rather than destroy out, we could retain it for - ** later reuse. + /* pAcc is the empty buffer the merged data will end up in. */ + DataBuffer *pAcc = &(pBuffers[iBuffer]); + DataBuffer *p = &(pBuffers[0]); + + /* Handle position 0 specially to avoid need to prime pAcc + ** with pData/nData. */ + dataBufferSwap(p, pAcc); + docListAccumulateUnion(pAcc, pData, nData); + + /* Accumulate remaining doclists into pAcc. */ + for(++p; ppData, p->nData); + + /* dataBufferReset() could allow a large doclist to blow up + ** our memory requirements. + */ + if( p->nCapacity<1024 ){ + dataBufferReset(p); + }else{ + dataBufferDestroy(p); + dataBufferInit(p, 0); + } + } } } - if( c>0 ) break; /* Past any possible matches. */ + } - rc = leavesReaderStep(v, pReader); - if( rc!=SQLITE_OK ) return rc; + /* Union all the doclists together into *out. */ + /* TODO(shess) What if *out is big? Sigh. 
*/ + if( rc==SQLITE_OK && nBuffers>0 ){ + int iBuffer; + for(iBuffer=0; iBuffer0 ){ + if( out->nData==0 ){ + dataBufferSwap(out, &(pBuffers[iBuffer])); + }else{ + docListAccumulateUnion(out, pBuffers[iBuffer].pData, + pBuffers[iBuffer].nData); + } + } + } } - return SQLITE_OK; + + while( nBuffers-- ){ + dataBufferDestroy(&(pBuffers[nBuffers])); + } + if( pBuffers!=NULL ) sqlite3_free(pBuffers); + + return rc; } /* Call loadSegmentLeavesInt() with pData/nData as input. */ diff --git a/manifest b/manifest index ee238a3dae..e49027e4a6 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C In\sshared-cache\smode,\smake\ssure\sthe\sbusy\shander\sinvoked\sis\sthe\nbusy\shandler\sassociated\swith\sthe\sdatabase\sconnection\sthat\scaused\nthe\slock\scontention\sin\sthe\sfirst\splace.\s(CVS\s4598) -D 2007-12-07T18:55:28 +C Change\sprefix\ssearch\sfrom\sO(N*M)\sto\sO(NlogM).\s\sThe\sprevious\scode\nlinearly\smerged\sthe\sdoclists,\sso\sas\sthe\saccumulated\slist\sgot\slarge,\nthings\sgot\sslow\s(the\sM\sterm,\sa\sfucntion\sof\sthe\snumber\sof\sdocuments\sin\nthe\sindex).\s\sThis\schange\sdoes\spairwise\smerges\suntil\sa\ssingle\sdoclist\nremains.\s\sA\stest\ssearch\sof\s't*'\sagainst\sa\sdatabase\sof\sRFC\stext\nimproves\sfrom\s1m16s\sto\s4.75s.\s(CVS\s4599) +D 2007-12-07T23:47:53 F Makefile.arm-wince-mingw32ce-gcc ac5f7b2cef0cd850d6f755ba6ee4ab961b1fadf7 F Makefile.in 30789bf70614bad659351660d76b8e533f3340e9 F Makefile.linux-gcc d53183f4aa6a9192d249731c90dbdffbd2c68654 @@ -50,7 +50,7 @@ F ext/fts2/fts2_tokenizer1.c 8a545c232bdffafd117c4eeaf59789691909f26a F ext/fts2/mkfts2amal.tcl 974d5d438cb3f7c4a652639262f82418c1e4cff0 F ext/fts3/README.tokenizers a97c9a55b3422f6cb04af9de9296fe2447ea4a78 F ext/fts3/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d -F ext/fts3/fts3.c b95b4b62211335cf74d2485a7c17925a9f8338f8 +F ext/fts3/fts3.c 0992fca534de44c1f72efb080c6cd48726906eab F ext/fts3/fts3.h 3a10a0af180d502cecc50df77b1b22df142817fe F ext/fts3/fts3_hash.c 
83e7bb4042106b32811681dd2859b4577a7a6b35 F ext/fts3/fts3_hash.h 004b759e1602ff16dfa02fea3ca1c77336ad6798 @@ -597,7 +597,7 @@ F www/tclsqlite.tcl 8be95ee6dba05eabcd27a9d91331c803f2ce2130 F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0 F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b F www/whentouse.tcl fc46eae081251c3c181bd79c5faef8195d7991a5 -P 754298a74e3d889f3767daba058262613d20a601 -R 0ae36f66eb071af9c23c05039a009307 -U drh -Z 974b15de4ea104f24852fc03b178d3ac +P c9eb65912f61ce0a6b66fe253652a1827e46b12a +R 0fafe177127549f06d346814ea671cc8 +U shess +Z d10cd33d2eea817a0c124609ae700d66 diff --git a/manifest.uuid b/manifest.uuid index 97615b565b..9ce332a565 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -c9eb65912f61ce0a6b66fe253652a1827e46b12a \ No newline at end of file +feef1b15d645d638b4a05742f214b0445fa7e176 \ No newline at end of file