Change prefix search from O(N*M) to O(NlogM).

author shess <shess@noemail.net>

Tue, 22 Jul 2008 23:08:40 +0000 (23:08 +0000)

committer shess <shess@noemail.net>

Tue, 22 Jul 2008 23:08:40 +0000 (23:08 +0000)
author shess <shess@noemail.net>
Tue, 22 Jul 2008 23:08:40 +0000 (23:08 +0000)
committer shess <shess@noemail.net>
Tue, 22 Jul 2008 23:08:40 +0000 (23:08 +0000)
diff --git a/ext/fts2/fts2.c b/ext/fts2/fts2.c

index 2a995b0214124efb1d738bb5c2a766fcee7da58b..3279f655d86cc22cb04d194de9d3a9cdc8a02260 100644 (file)
--- a/ext/fts2/fts2.c
+++ b/ext/fts2/fts2.c
@@ -455,6 +455,7 @@ static int getVarint32(const char *p, int *pi){
  ** dataBufferInit - create a buffer with given initial capacity.
  ** dataBufferReset - forget buffer's data, retaining capacity.
  ** dataBufferDestroy - free buffer's data.
+** dataBufferSwap - swap contents of two buffers.
  ** dataBufferExpand - expand capacity without adding data.
  ** dataBufferAppend - append data.
  ** dataBufferAppend2 - append two pieces of data at once.
@@ -479,6 +480,11 @@ static void dataBufferDestroy(DataBuffer *pBuffer){
    if( pBuffer->pData!=NULL ) sqlite3_free(pBuffer->pData);
    SCRAMBLE(pBuffer);
  }
+static void dataBufferSwap(DataBuffer *pBuffer1, DataBuffer *pBuffer2){
+  DataBuffer tmp = *pBuffer1;  /* Struct copy: the two buffers trade fields (pData included), no data is copied. */
+  *pBuffer1 = *pBuffer2;
+  *pBuffer2 = tmp;
+}
  static void dataBufferExpand(DataBuffer *pBuffer, int nAddCapacity){
    assert( nAddCapacity>0 );
    /* TODO(shess) Consider expanding more aggressively.  Note that the
@@ -5219,6 +5225,26 @@ static int segmentMerge(fulltext_vtab *v, int iLevel){
    return rc;
  }
  
+/* Replace *acc with the union of the doclist in *acc and pData/nData. */
+static void docListAccumulateUnion(DataBuffer *acc,
+                                   const char *pData, int nData) {
+  DataBuffer tmp = *acc;                 /* Previous contents of *acc. */
+  dataBufferInit(acc, tmp.nData+nData);  /* Initial capacity: sum of both input sizes. */
+  docListUnion(tmp.pData, tmp.nData, pData, nData, acc);
+  dataBufferDestroy(&tmp);               /* Free the previous contents. */
+}
+
+/* TODO(shess) It might be interesting to explore different merge
+** strategies here.  For instance, since this is a sorted merge, we
+** could easily merge many doclists in parallel.  With some
+** comprehension of the storage format, we could merge all of the
+** doclists within a leaf node directly from the leaf node's storage.
+** It may be worthwhile to merge smaller doclists before larger
+** doclists, since they can be traversed more quickly - but the
+** results may have less overlap, making them more expensive in a
+** different way.
+*/
+
  /* Scan pReader for pTerm/nTerm, and merge the term's doclist over
  ** *out (any doclists with duplicate docids overwrite those in *out).
  ** Internal function for loadSegmentLeaf().
@@ -5226,39 +5252,116 @@ static int segmentMerge(fulltext_vtab *v, int iLevel){
  static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader,
                                  const char *pTerm, int nTerm, int isPrefix,
                                  DataBuffer *out){
+  /* Doclist data is accumulated into pBuffers the way a binary
+  ** counter is incremented.  If index 0 is empty, the data is
+  ** stored there.  If there is data there, it is merged and the
+  ** results carried into position 1, with further merge-and-carry
+  ** until an empty position is found.
+  */
+  DataBuffer *pBuffers = NULL;
+  int nBuffers = 0, nMaxBuffers = 0, rc;
+
    assert( nTerm>0 );
  
-  /* Process while the prefix matches. */
-  while( !leavesReaderAtEnd(pReader) ){
+  for(rc=SQLITE_OK; rc==SQLITE_OK && !leavesReaderAtEnd(pReader);
+      rc=leavesReaderStep(v, pReader)){
      /* TODO(shess) Really want leavesReaderTermCmp(), but that name is
      ** already taken to compare the terms of two LeavesReaders.  Think
      ** on a better name.  [Meanwhile, break encapsulation rather than
      ** use a confusing name.]
      */
-    int rc;
      int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix);
+    if( c>0 ) break;      /* Past any possible matches. */
      if( c==0 ){
        const char *pData = leavesReaderData(pReader);
-      int nData = leavesReaderDataBytes(pReader);
-      if( out->nData==0 ){
-        dataBufferReplace(out, pData, nData);
+      int iBuffer, nData = leavesReaderDataBytes(pReader);
+
+      /* Find the first empty buffer. */
+      for(iBuffer=0; iBuffer<nBuffers; ++iBuffer){
+        if( 0==pBuffers[iBuffer].nData ) break;
+      }
+
+      /* Every buffer holds data, so append a new empty one. */
+      if( iBuffer==nBuffers ){
+        if( nBuffers==nMaxBuffers ){
+          DataBuffer *p;
+          nMaxBuffers += 20;  /* Grow the array 20 slots at a time. */
+
+          /* Manual realloc so we can handle NULL appropriately. */
+          p = sqlite3_malloc(nMaxBuffers*sizeof(*pBuffers));
+          if( p==NULL ){
+            rc = SQLITE_NOMEM;
+            break;  /* Leaves the outer for loop; the cleanup after it still runs. */
+          }
+
+          if( nBuffers>0 ){
+            assert(pBuffers!=NULL);
+            memcpy(p, pBuffers, nBuffers*sizeof(*pBuffers));
+            sqlite3_free(pBuffers);
+          }
+          pBuffers = p;
+        }
+        dataBufferInit(&(pBuffers[nBuffers]), 0);
+        nBuffers++;
+      }
+
+      /* At this point, there must be an empty buffer at iBuffer. */
+      assert(iBuffer<nBuffers && pBuffers[iBuffer].nData==0);
+
+      /* If the empty buffer was the first one, no merge is needed. */
+      if( iBuffer==0 ){
+        dataBufferReplace(&(pBuffers[0]), pData, nData);
        }else{
-        DataBuffer result;
-        dataBufferInit(&result, out->nData+nData);
-        docListUnion(out->pData, out->nData, pData, nData, &result);
-        dataBufferDestroy(out);
-        *out = result;
-        /* TODO(shess) Rather than destroy out, we could retain it for
-        ** later reuse.
+        /* pAcc is the empty buffer the merged data will end up in. */
+        DataBuffer *pAcc = &(pBuffers[iBuffer]);
+        DataBuffer *p = &(pBuffers[0]);
+
+        /* Handle position 0 specially to avoid need to prime pAcc
+        ** with pData/nData.
          */
+        dataBufferSwap(p, pAcc);
+        docListAccumulateUnion(pAcc, pData, nData);
+
+        /* Accumulate remaining doclists into pAcc. */
+        for(++p; p<pAcc; ++p){
+          docListAccumulateUnion(pAcc, p->pData, p->nData);
+
+          /* dataBufferReset() retains capacity, so only small buffers
+          ** are reset; larger ones are freed to bound memory use.
+          */
+          if( p->nCapacity<1024 ){
+            dataBufferReset(p);
+          }else{
+            dataBufferDestroy(p);
+            dataBufferInit(p, 0);
+          }
+        }
        }
      }
-    if( c>0 ) break;      /* Past any possible matches. */
+  }
  
-    rc = leavesReaderStep(v, pReader);
-    if( rc!=SQLITE_OK ) return rc;
+  /* Union all the non-empty doclists together into *out. */
+  /* TODO(shess) What if *out is big?  Sigh. */
+  if( rc==SQLITE_OK && nBuffers>0 ){
+    int iBuffer;
+    for(iBuffer=0; iBuffer<nBuffers; ++iBuffer){
+      if( pBuffers[iBuffer].nData>0 ){
+        if( out->nData==0 ){
+          dataBufferSwap(out, &(pBuffers[iBuffer]));
+        }else{
+          docListAccumulateUnion(out, pBuffers[iBuffer].pData,
+                                 pBuffers[iBuffer].nData);
+        }
+      }
+    }
    }
-  return SQLITE_OK;
+
+  while( nBuffers-- ){  /* Runs on error paths too, so the buffers are always released. */
+    dataBufferDestroy(&(pBuffers[nBuffers]));
+  }
+  if( pBuffers!=NULL ) sqlite3_free(pBuffers);
+
+  return rc;
  }
  
  /* Call loadSegmentLeavesInt() with pData/nData as input. */
diff --git a/manifest b/manifest

index 795744b12808f0ab11b9120ee72f2b6fdb73bbc0..20e990b6fc0f63104f37f4912e04a7defe2fea51 100644 (file)
--- a/manifest
+++ b/manifest
@@ -1,5 +1,5 @@
-C Changes\sfts2\sto\suse\sonly\ssqlite3_malloc()\sand\snot\ssystem\smalloc.\r\nBackports\s(4554)\sand\s(4555)\sfrom\sfts3.\s(CVS\s5454)
-D 2008-07-22T22:57:54
+C Change\sprefix\ssearch\sfrom\sO(N*M)\sto\sO(NlogM).\r\nBackports\s(4599)\sfrom\sfts3.\s(CVS\s5455)
+D 2008-07-22T23:08:40
  F Makefile.arm-wince-mingw32ce-gcc fcd5e9cd67fe88836360bb4f9ef4cb7f8e2fb5a0
  F Makefile.in 77ff156061bb870aa0a8b3d545c670d08070f7e6
  F Makefile.linux-gcc d53183f4aa6a9192d249731c90dbdffbd2c68654
@@ -39,7 +39,7 @@ F ext/fts1/simple_tokenizer.c 1844d72f7194c3fd3d7e4173053911bf0661b70d
  F ext/fts1/tokenizer.h 0c53421b832366d20d720d21ea3e1f6e66a36ef9
  F ext/fts2/README.tokenizers 21e3684ea5a095b55d70f6878b4ce6af5932dfb7
  F ext/fts2/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d
-F ext/fts2/fts2.c f50c7faca742f40b14a2b279d652594c532e6523
+F ext/fts2/fts2.c 7a2e88d110d059c986234c3d7734133d59a709ef
  F ext/fts2/fts2.h da5f76c65163301d1068a971fd32f4119e3c95fa
  F ext/fts2/fts2_hash.c 2689e42e1107ea67207f725cf69cf8972d00cf93
  F ext/fts2/fts2_hash.h 9a5b1be94664139f93217a0770d7144425cffb3a
@@ -609,7 +609,7 @@ F tool/speedtest16.c c8a9c793df96db7e4933f0852abb7a03d48f2e81
  F tool/speedtest2.tcl ee2149167303ba8e95af97873c575c3e0fab58ff
  F tool/speedtest8.c 1dbced29de5f59ba2ebf877edcadf171540374d1
  F tool/speedtest8inst1.c 293327bc76823f473684d589a8160bde1f52c14e
-P d562515e1cdd05212674516033c64b5f5668b799
-R 2682d383bee51312a260e3583fb091fa
+P ecf2dec66cb979cb7d8db3b7ce5c64cab57fe2bb
+R 89694177ea5ff7c878e6daa15283840d
  U shess
-Z 7b59d5a322f77dacbe20c5139c2b6d35
+Z 48033bdd7abd5465e3195f281a547d67
diff --git a/manifest.uuid b/manifest.uuid

index 5f5c013d3fa3dcd4867c8fdedc19ca74c45da6ec..8e4895f1e430e5c5524171938618517dc295a718 100644 (file)
--- a/manifest.uuid
+++ b/manifest.uuid
@@ -1 +1 @@
-ecf2dec66cb979cb7d8db3b7ce5c64cab57fe2bb
-\ No newline at end of file
+3f614453d2d7c753a5963b027fe8618b50b4f6b9
+\ No newline at end of file
author	shess <shess@noemail.net>
	Tue, 22 Jul 2008 23:08:40 +0000 (23:08 +0000)
committer	shess <shess@noemail.net>
	Tue, 22 Jul 2008 23:08:40 +0000 (23:08 +0000)
ext/fts2/fts2.c		patch \| blob \| blame \| history
manifest		patch \| blob \| blame \| history
manifest.uuid		patch \| blob \| blame \| history