** dataBufferInit - create a buffer with given initial capacity.
** dataBufferReset - forget buffer's data, retaining capacity.
** dataBufferDestroy - free buffer's data.
+** dataBufferSwap - swap contents of two buffers.
** dataBufferExpand - expand capacity without adding data.
** dataBufferAppend - append data.
** dataBufferAppend2 - append two pieces of data at once.
if( pBuffer->pData!=NULL ) sqlite3_free(pBuffer->pData);
SCRAMBLE(pBuffer);
}
+static void dataBufferSwap(DataBuffer *pBuffer1, DataBuffer *pBuffer2){
+ DataBuffer tmp = *pBuffer1;
+ *pBuffer1 = *pBuffer2;
+ *pBuffer2 = tmp;
+}
static void dataBufferExpand(DataBuffer *pBuffer, int nAddCapacity){
assert( nAddCapacity>0 );
/* TODO(shess) Consider expanding more aggressively. Note that the
return rc;
}
+/* Accumulate the union of *acc and pData/nData into *acc. */
+static void docListAccumulateUnion(DataBuffer *acc,
+ const char *pData, int nData) {
+ DataBuffer tmp = *acc;
+ dataBufferInit(acc, tmp.nData+nData);
+ docListUnion(tmp.pData, tmp.nData, pData, nData, acc);
+ dataBufferDestroy(&tmp);
+}
+
+/* TODO(shess) It might be interesting to explore different merge
+** strategies here. For instance, since this is a sorted merge, we
+** could easily merge many doclists in parallel. With some
+** comprehension of the storage format, we could merge all of the
+** doclists within a leaf node directly from the leaf node's storage.
+** It may be worthwhile to merge smaller doclists before larger
+** doclists, since they can be traversed more quickly - but the
+** results may have less overlap, making them more expensive in a
+** different way.
+*/
+
/* Scan pReader for pTerm/nTerm, and merge the term's doclist over
** *out (any doclists with duplicate docids overwrite those in *out).
** Internal function for loadSegmentLeaf().
static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader,
const char *pTerm, int nTerm, int isPrefix,
DataBuffer *out){
+ /* Doclist data is accumulated into pBuffers much like incrementing
+ ** a number in binary arithmetic. If index 0 is empty, the data is
+ ** stored there. If there is data there, it is merged and the
+ ** results carried into position 1, with further merge-and-carry
+ ** until an empty position is found.
+ */
+ DataBuffer *pBuffers = NULL;
+ int nBuffers = 0, nMaxBuffers = 0, rc;
+
assert( nTerm>0 );
- /* Process while the prefix matches. */
- while( !leavesReaderAtEnd(pReader) ){
+ for(rc=SQLITE_OK; rc==SQLITE_OK && !leavesReaderAtEnd(pReader);
+ rc=leavesReaderStep(v, pReader)){
/* TODO(shess) Really want leavesReaderTermCmp(), but that name is
** already taken to compare the terms of two LeavesReaders. Think
** on a better name. [Meanwhile, break encapsulation rather than
** use a confusing name.]
*/
- int rc;
int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix);
+ if( c>0 ) break; /* Past any possible matches. */
if( c==0 ){
const char *pData = leavesReaderData(pReader);
- int nData = leavesReaderDataBytes(pReader);
- if( out->nData==0 ){
- dataBufferReplace(out, pData, nData);
+ int iBuffer, nData = leavesReaderDataBytes(pReader);
+
+ /* Find the first empty buffer. */
+ for(iBuffer=0; iBuffer<nBuffers; ++iBuffer){
+ if( 0==pBuffers[iBuffer].nData ) break;
+ }
+
+ /* Out of buffers, add an empty one. */
+ if( iBuffer==nBuffers ){
+ if( nBuffers==nMaxBuffers ){
+ DataBuffer *p;
+ nMaxBuffers += 20;
+
+ /* Manual realloc so we can handle NULL appropriately. */
+ p = sqlite3_malloc(nMaxBuffers*sizeof(*pBuffers));
+ if( p==NULL ){
+ rc = SQLITE_NOMEM;
+ break;
+ }
+
+ if( nBuffers>0 ){
+ assert(pBuffers!=NULL);
+ memcpy(p, pBuffers, nBuffers*sizeof(*pBuffers));
+ sqlite3_free(pBuffers);
+ }
+ pBuffers = p;
+ }
+ dataBufferInit(&(pBuffers[nBuffers]), 0);
+ nBuffers++;
+ }
+
+ /* At this point, must have an empty at iBuffer. */
+ assert(iBuffer<nBuffers && pBuffers[iBuffer].nData==0);
+
+ /* If empty was first buffer, no need for merge logic. */
+ if( iBuffer==0 ){
+ dataBufferReplace(&(pBuffers[0]), pData, nData);
}else{
- DataBuffer result;
- dataBufferInit(&result, out->nData+nData);
- docListUnion(out->pData, out->nData, pData, nData, &result);
- dataBufferDestroy(out);
- *out = result;
- /* TODO(shess) Rather than destroy out, we could retain it for
- ** later reuse.
+ /* pAcc is the empty buffer the merged data will end up in. */
+ DataBuffer *pAcc = &(pBuffers[iBuffer]);
+ DataBuffer *p = &(pBuffers[0]);
+
+ /* Handle position 0 specially to avoid need to prime pAcc
+ ** with pData/nData.
*/
+ dataBufferSwap(p, pAcc);
+ docListAccumulateUnion(pAcc, pData, nData);
+
+ /* Accumulate remaining doclists into pAcc. */
+ for(++p; p<pAcc; ++p){
+ docListAccumulateUnion(pAcc, p->pData, p->nData);
+
+ /* dataBufferReset() could allow a large doclist to blow up
+ ** our memory requirements.
+ */
+ if( p->nCapacity<1024 ){
+ dataBufferReset(p);
+ }else{
+ dataBufferDestroy(p);
+ dataBufferInit(p, 0);
+ }
+ }
}
}
- if( c>0 ) break; /* Past any possible matches. */
+ }
- rc = leavesReaderStep(v, pReader);
- if( rc!=SQLITE_OK ) return rc;
+ /* Union all the doclists together into *out. */
+ /* TODO(shess) What if *out is big? Sigh. */
+ if( rc==SQLITE_OK && nBuffers>0 ){
+ int iBuffer;
+ for(iBuffer=0; iBuffer<nBuffers; ++iBuffer){
+ if( pBuffers[iBuffer].nData>0 ){
+ if( out->nData==0 ){
+ dataBufferSwap(out, &(pBuffers[iBuffer]));
+ }else{
+ docListAccumulateUnion(out, pBuffers[iBuffer].pData,
+ pBuffers[iBuffer].nData);
+ }
+ }
+ }
}
- return SQLITE_OK;
+
+ while( nBuffers-- ){
+ dataBufferDestroy(&(pBuffers[nBuffers]));
+ }
+ if( pBuffers!=NULL ) sqlite3_free(pBuffers);
+
+ return rc;
}
/* Call loadSegmentLeavesInt() with pData/nData as input. */
-C In\sshared-cache\smode,\smake\ssure\sthe\sbusy\shander\sinvoked\sis\sthe\nbusy\shandler\sassociated\swith\sthe\sdatabase\sconnection\sthat\scaused\nthe\slock\scontention\sin\sthe\sfirst\splace.\s(CVS\s4598)
-D 2007-12-07T18:55:28
+C Change\sprefix\ssearch\sfrom\sO(N*M)\sto\sO(NlogM).\s\sThe\sprevious\scode\nlinearly\smerged\sthe\sdoclists,\sso\sas\sthe\saccumulated\slist\sgot\slarge,\nthings\sgot\sslow\s(the\sM\sterm,\sa\sfucntion\sof\sthe\snumber\sof\sdocuments\sin\nthe\sindex).\s\sThis\schange\sdoes\spairwise\smerges\suntil\sa\ssingle\sdoclist\nremains.\s\sA\stest\ssearch\sof\s't*'\sagainst\sa\sdatabase\sof\sRFC\stext\nimproves\sfrom\s1m16s\sto\s4.75s.\s(CVS\s4599)
+D 2007-12-07T23:47:53
F Makefile.arm-wince-mingw32ce-gcc ac5f7b2cef0cd850d6f755ba6ee4ab961b1fadf7
F Makefile.in 30789bf70614bad659351660d76b8e533f3340e9
F Makefile.linux-gcc d53183f4aa6a9192d249731c90dbdffbd2c68654
F ext/fts2/mkfts2amal.tcl 974d5d438cb3f7c4a652639262f82418c1e4cff0
F ext/fts3/README.tokenizers a97c9a55b3422f6cb04af9de9296fe2447ea4a78
F ext/fts3/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d
-F ext/fts3/fts3.c b95b4b62211335cf74d2485a7c17925a9f8338f8
+F ext/fts3/fts3.c 0992fca534de44c1f72efb080c6cd48726906eab
F ext/fts3/fts3.h 3a10a0af180d502cecc50df77b1b22df142817fe
F ext/fts3/fts3_hash.c 83e7bb4042106b32811681dd2859b4577a7a6b35
F ext/fts3/fts3_hash.h 004b759e1602ff16dfa02fea3ca1c77336ad6798
F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0
F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b
F www/whentouse.tcl fc46eae081251c3c181bd79c5faef8195d7991a5
-P 754298a74e3d889f3767daba058262613d20a601
-R 0ae36f66eb071af9c23c05039a009307
-U drh
-Z 974b15de4ea104f24852fc03b178d3ac
+P c9eb65912f61ce0a6b66fe253652a1827e46b12a
+R 0fafe177127549f06d346814ea671cc8
+U shess
+Z d10cd33d2eea817a0c124609ae700d66