From: shess Date: Tue, 22 Jul 2008 23:08:40 +0000 (+0000) Subject: Change prefix search from O(N*M) to O(NlogM). X-Git-Tag: version-3.6.10~722 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=deca811cb5f3de24b707eadb21376c8acccab317;p=thirdparty%2Fsqlite.git Change prefix search from O(N*M) to O(NlogM). Backports (4599) from fts3. (CVS 5455) FossilOrigin-Name: 3f614453d2d7c753a5963b027fe8618b50b4f6b9 --- diff --git a/ext/fts2/fts2.c b/ext/fts2/fts2.c index 2a995b0214..3279f655d8 100644 --- a/ext/fts2/fts2.c +++ b/ext/fts2/fts2.c @@ -455,6 +455,7 @@ static int getVarint32(const char *p, int *pi){ ** dataBufferInit - create a buffer with given initial capacity. ** dataBufferReset - forget buffer's data, retaining capacity. ** dataBufferDestroy - free buffer's data. +** dataBufferSwap - swap contents of two buffers. ** dataBufferExpand - expand capacity without adding data. ** dataBufferAppend - append data. ** dataBufferAppend2 - append two pieces of data at once. @@ -479,6 +480,11 @@ static void dataBufferDestroy(DataBuffer *pBuffer){ if( pBuffer->pData!=NULL ) sqlite3_free(pBuffer->pData); SCRAMBLE(pBuffer); } +static void dataBufferSwap(DataBuffer *pBuffer1, DataBuffer *pBuffer2){ + DataBuffer tmp = *pBuffer1; + *pBuffer1 = *pBuffer2; + *pBuffer2 = tmp; +} static void dataBufferExpand(DataBuffer *pBuffer, int nAddCapacity){ assert( nAddCapacity>0 ); /* TODO(shess) Consider expanding more aggressively. Note that the @@ -5219,6 +5225,26 @@ static int segmentMerge(fulltext_vtab *v, int iLevel){ return rc; } +/* Accumulate the union of *acc and *pData into *acc. */ +static void docListAccumulateUnion(DataBuffer *acc, + const char *pData, int nData) { + DataBuffer tmp = *acc; + dataBufferInit(acc, tmp.nData+nData); + docListUnion(tmp.pData, tmp.nData, pData, nData, acc); + dataBufferDestroy(&tmp); +} + +/* TODO(shess) It might be interesting to explore different merge +** strategies, here. For instance, since this is a sorted merge, we +** could easily merge many doclists in parallel. With some +** comprehension of the storage format, we could merge all of the +** doclists within a leaf node directly from the leaf node's storage. +** It may be worthwhile to merge smaller doclists before larger +** doclists, since they can be traversed more quickly - but the +** results may have less overlap, making them more expensive in a +** different way. +*/ + /* Scan pReader for pTerm/nTerm, and merge the term's doclist over ** *out (any doclists with duplicate docids overwrite those in *out). ** Internal function for loadSegmentLeaf(). @@ -5226,39 +5252,116 @@ static int segmentMerge(fulltext_vtab *v, int iLevel){ static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader, const char *pTerm, int nTerm, int isPrefix, DataBuffer *out){ + /* doclist data is accumulated into pBuffers similar to how one does + ** increment in binary arithmetic. If index 0 is empty, the data is + ** stored there. If there is data there, it is merged and the + ** results carried into position 1, with further merge-and-carry + ** until an empty position is found. + */ + DataBuffer *pBuffers = NULL; + int nBuffers = 0, nMaxBuffers = 0, rc; + assert( nTerm>0 ); - /* Process while the prefix matches. */ - while( !leavesReaderAtEnd(pReader) ){ + for(rc=SQLITE_OK; rc==SQLITE_OK && !leavesReaderAtEnd(pReader); + rc=leavesReaderStep(v, pReader)){ /* TODO(shess) Really want leavesReaderTermCmp(), but that name is ** already taken to compare the terms of two LeavesReaders. Think ** on a better name. [Meanwhile, break encapsulation rather than ** use a confusing name.] */ - int rc; int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix); + if( c>0 ) break; /* Past any possible matches. */ if( c==0 ){ const char *pData = leavesReaderData(pReader); - int nData = leavesReaderDataBytes(pReader); - if( out->nData==0 ){ - dataBufferReplace(out, pData, nData); + int iBuffer, nData = leavesReaderDataBytes(pReader); + + /* Find the first empty buffer. */ + for(iBuffer=0; iBuffer0 ){ + assert(pBuffers!=NULL); + memcpy(p, pBuffers, nBuffers*sizeof(*pBuffers)); + sqlite3_free(pBuffers); + } + pBuffers = p; + } + dataBufferInit(&(pBuffers[nBuffers]), 0); + nBuffers++; + } + + /* At this point, must have an empty at iBuffer. */ + assert(iBuffernData+nData); - docListUnion(out->pData, out->nData, pData, nData, &result); - dataBufferDestroy(out); - *out = result; - /* TODO(shess) Rather than destroy out, we could retain it for - ** later reuse. + /* pAcc is the empty buffer the merged data will end up in. */ + DataBuffer *pAcc = &(pBuffers[iBuffer]); + DataBuffer *p = &(pBuffers[0]); + + /* Handle position 0 specially to avoid need to prime pAcc + ** with pData/nData. */ + dataBufferSwap(p, pAcc); + docListAccumulateUnion(pAcc, pData, nData); + + /* Accumulate remaining doclists into pAcc. */ + for(++p; ppData, p->nData); + + /* dataBufferReset() could allow a large doclist to blow up + ** our memory requirements. + */ + if( p->nCapacity<1024 ){ + dataBufferReset(p); + }else{ + dataBufferDestroy(p); + dataBufferInit(p, 0); + } + } } } - if( c>0 ) break; /* Past any possible matches. */ + } - rc = leavesReaderStep(v, pReader); - if( rc!=SQLITE_OK ) return rc; + /* Union all the doclists together into *out. */ + /* TODO(shess) What if *out is big? Sigh. */ + if( rc==SQLITE_OK && nBuffers>0 ){ + int iBuffer; + for(iBuffer=0; iBuffer0 ){ + if( out->nData==0 ){ + dataBufferSwap(out, &(pBuffers[iBuffer])); + }else{ + docListAccumulateUnion(out, pBuffers[iBuffer].pData, + pBuffers[iBuffer].nData); + } + } + } } - return SQLITE_OK; + + while( nBuffers-- ){ + dataBufferDestroy(&(pBuffers[nBuffers])); + } + if( pBuffers!=NULL ) sqlite3_free(pBuffers); + + return rc; } /* Call loadSegmentLeavesInt() with pData/nData as input. */ diff --git a/manifest b/manifest index 795744b128..20e990b6fc 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Changes\sfts2\sto\suse\sonly\ssqlite3_malloc()\sand\snot\ssystem\smalloc.\r\nBackports\s(4554)\sand\s(4555)\sfrom\sfts3.\s(CVS\s5454) -D 2008-07-22T22:57:54 +C Change\sprefix\ssearch\sfrom\sO(N*M)\sto\sO(NlogM).\r\nBackports\s(4599)\sfrom\sfts3.\s(CVS\s5455) +D 2008-07-22T23:08:40 F Makefile.arm-wince-mingw32ce-gcc fcd5e9cd67fe88836360bb4f9ef4cb7f8e2fb5a0 F Makefile.in 77ff156061bb870aa0a8b3d545c670d08070f7e6 F Makefile.linux-gcc d53183f4aa6a9192d249731c90dbdffbd2c68654 @@ -39,7 +39,7 @@ F ext/fts1/simple_tokenizer.c 1844d72f7194c3fd3d7e4173053911bf0661b70d F ext/fts1/tokenizer.h 0c53421b832366d20d720d21ea3e1f6e66a36ef9 F ext/fts2/README.tokenizers 21e3684ea5a095b55d70f6878b4ce6af5932dfb7 F ext/fts2/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d -F ext/fts2/fts2.c f50c7faca742f40b14a2b279d652594c532e6523 +F ext/fts2/fts2.c 7a2e88d110d059c986234c3d7734133d59a709ef F ext/fts2/fts2.h da5f76c65163301d1068a971fd32f4119e3c95fa F ext/fts2/fts2_hash.c 2689e42e1107ea67207f725cf69cf8972d00cf93 F ext/fts2/fts2_hash.h 9a5b1be94664139f93217a0770d7144425cffb3a @@ -609,7 +609,7 @@ F tool/speedtest16.c c8a9c793df96db7e4933f0852abb7a03d48f2e81 F tool/speedtest2.tcl ee2149167303ba8e95af97873c575c3e0fab58ff F tool/speedtest8.c 1dbced29de5f59ba2ebf877edcadf171540374d1 F tool/speedtest8inst1.c 293327bc76823f473684d589a8160bde1f52c14e -P d562515e1cdd05212674516033c64b5f5668b799 -R 2682d383bee51312a260e3583fb091fa +P ecf2dec66cb979cb7d8db3b7ce5c64cab57fe2bb +R 89694177ea5ff7c878e6daa15283840d U shess -Z 7b59d5a322f77dacbe20c5139c2b6d35 +Z 48033bdd7abd5465e3195f281a547d67 diff --git a/manifest.uuid b/manifest.uuid index 5f5c013d3f..8e4895f1e4 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -ecf2dec66cb979cb7d8db3b7ce5c64cab57fe2bb \ No newline at end of file +3f614453d2d7c753a5963b027fe8618b50b4f6b9 \ No newline at end of file