git.ipfire.org Git - thirdparty/sqlite.git/commitdiff
Optimize copying data from fts5 in-memory hash tables to top level segments.
author dan <dan@noemail.net>
Thu, 26 Feb 2015 20:49:09 +0000 (20:49 +0000)
committer dan <dan@noemail.net>
Thu, 26 Feb 2015 20:49:09 +0000 (20:49 +0000)
FossilOrigin-Name: 8e3ca6323a2beab5f04250e24ae15b159d2aa0ac

ext/fts5/fts5_hash.c
ext/fts5/fts5_index.c
manifest
manifest.uuid

index 4b85b2af23eb4b60d2debe97934b58dddc1775a9..eb1f3d1b1a17a55d1026c6fb3b3ae677dcbeda19 100644 (file)
@@ -393,55 +393,6 @@ static int fts5HashEntrySort(
   return SQLITE_OK;
 }
 
-int sqlite3Fts5HashIterate(
-  Fts5Hash *pHash,
-  void *pCtx,
-  int (*xTerm)(void*, const char*, int),
-  int (*xEntry)(void*, i64, const u8*, int),
-  int (*xTermDone)(void*)
-){
-  Fts5HashEntry *pList;
-  int rc;
-
-  rc = fts5HashEntrySort(pHash, 0, 0, &pList);
-  if( rc==SQLITE_OK ){
-    memset(pHash->aSlot, 0, sizeof(Fts5HashEntry*) * pHash->nSlot);
-    while( pList ){
-      Fts5HashEntry *pNext = pList->pScanNext;
-      if( rc==SQLITE_OK ){
-        const int nKey = strlen(pList->zKey);
-        i64 iRowid = 0;
-        u8 *pPtr = (u8*)pList;
-        int iOff = sizeof(Fts5HashEntry) + nKey + 1;
-
-        /* Fill in the final poslist size field */
-        fts5HashAddPoslistSize(pList);
-        
-        /* Issue the new-term callback */
-        rc = xTerm(pCtx, pList->zKey, nKey);
-
-        /* Issue the xEntry callbacks */
-        while( rc==SQLITE_OK && iOff<pList->nData ){
-          i64 iDelta;             /* Rowid delta value */
-          int nPoslist;           /* Size of position list in bytes */
-          int nVarint;
-          iOff += getVarint(&pPtr[iOff], (u64*)&iDelta);
-          iRowid += iDelta;
-          nVarint = fts5GetVarint32(&pPtr[iOff], nPoslist);
-          rc = xEntry(pCtx, iRowid, &pPtr[iOff], nPoslist+nVarint);
-          iOff += nVarint+nPoslist;
-        }
-
-        /* Issue the term-done callback */
-        if( rc==SQLITE_OK ) rc = xTermDone(pCtx);
-      }
-      sqlite3_free(pList);
-      pList = pNext;
-    }
-  }
-  return rc;
-}
-
 /*
 ** Query the hash table for a doclist associated with term pTerm/nTerm.
 */
@@ -478,9 +429,8 @@ void sqlite3Fts5HashScanInit(
 }
 
 void sqlite3Fts5HashScanNext(Fts5Hash *p){
-  if( p->pScan ){
-    p->pScan = p->pScan->pScanNext;
-  }
+  Fts5HashEntry *pScan = p->pScan;
+  if( pScan ) p->pScan = pScan->pScanNext;
 }
 
 int sqlite3Fts5HashScanEof(Fts5Hash *p){
index f24be525d9173162ebdb8fcfec9f2645c680641e..481cd75fa70ba286cf79fe4bb39d0733a8c3dd8c 100644 (file)
 **         poslist: first poslist
 **         zero-or-more {
 **           varint:  rowid delta (always > 0)
-**           poslist: first poslist
+**           poslist: next poslist
 **         }
 **         0x00 byte
 **
@@ -2677,7 +2677,7 @@ static void fts5WriteBtreeNEmpty(Fts5Index *p, Fts5SegWriter *pWriter){
     int bFlag = 0;
     Fts5PageWriter *pPg;
     pPg = &pWriter->aWriter[1];
-    if( pWriter->nEmpty>=FTS5_MIN_DLIDX_SIZE ){
+    if( pWriter->nEmpty>=FTS5_MIN_DLIDX_SIZE && pWriter->cdlidx.n ){
       i64 iKey = FTS5_DOCLIST_IDX_ROWID(
           pWriter->iIdx, pWriter->iSegid, 
           pWriter->aWriter[0].pgno - 1 - pWriter->nEmpty
@@ -3004,12 +3004,15 @@ static void fts5WriteFinish(
 ){
   int i;
   if( p->rc==SQLITE_OK ){
-    *pnLeaf = pWriter->aWriter[0].pgno;
-    if( *pnLeaf==1 && pWriter->aWriter[0].buf.n==0 ){
+    Fts5PageWriter *pLeaf = &pWriter->aWriter[0];
+    if( pLeaf->pgno==1 && pLeaf->buf.n==0 ){
       *pnLeaf = 0;
       *pnHeight = 0;
     }else{
-      fts5WriteFlushLeaf(p, pWriter);
+      if( pLeaf->buf.n>4 ){
+        fts5WriteFlushLeaf(p, pWriter);
+      }
+      *pnLeaf = pLeaf->pgno-1;
       if( pWriter->nWriter==1 && pWriter->nEmpty>=FTS5_MIN_DLIDX_SIZE ){
         fts5WriteBtreeGrow(p, pWriter);
       }
@@ -3381,44 +3384,20 @@ struct Fts5FlushCtx {
   Fts5SegWriter writer; 
 };
 
-static int fts5FlushNewTerm(void *pCtx, const char *zTerm, int nTerm){
-  Fts5FlushCtx *p = (Fts5FlushCtx*)pCtx;
-  int rc = SQLITE_OK;
-  fts5WriteAppendTerm(p->pIdx, &p->writer, nTerm, (const u8*)zTerm);
-  return rc;
-}
-
-static int fts5FlushTermDone(void *pCtx){
-  Fts5FlushCtx *p = (Fts5FlushCtx*)pCtx;
-  int rc = SQLITE_OK;
-  /* Write the doclist terminator */
-  fts5WriteAppendZerobyte(p->pIdx, &p->writer);
-  return rc;
-}
-
-static int fts5FlushNewEntry(
-  void *pCtx, 
-  i64 iRowid, 
-  const u8 *aPoslist, 
-  int nPoslist
-){
-  Fts5FlushCtx *p = (Fts5FlushCtx*)pCtx;
-  Fts5Index *pIdx = p->pIdx;
-
-#ifdef SQLITE_DEBUG
-  /* The poslist-size varint should already be at the start of the 
-  ** aPoslist/nPoslist buffer. This assert verifies that. */
-  int n, i;
-  i = fts5GetVarint32(aPoslist, n);
-  assert( nPoslist==(n+i) );
-#endif
-
-  /* Append the rowid itself */
-  fts5WriteAppendRowid(pIdx, &p->writer, iRowid);
-
-  /* And the poslist data */
-  fts5WriteAppendPoslistData(pIdx, &p->writer, aPoslist, nPoslist);
-  return pIdx->rc;
+/*
+** Buffer aBuf[] contains a list of varints, all small enough to fit
+** in a 32-bit integer. Return the size in bytes of the largest prefix
+** of this list that is nMax bytes or less in size.
+*/
+static int fts5PoslistPrefix(const u8 *aBuf, int nMax){
+  int ret = 0;
+  while( 1 ){
+    u32 dummy;
+    int i = fts5GetVarint32(&aBuf[ret], dummy);
+    if( (ret + i) > nMax ) break;
+    ret += i;
+  }
+  return ret;
 }
 
 /*
@@ -3429,6 +3408,7 @@ static int fts5FlushNewEntry(
 ** already occurred, this function is a no-op.
 */
 static void fts5FlushOneHash(Fts5Index *p, int iHash, int *pnLeaf){
+  Fts5Hash *pHash = p->apHash[iHash];
   Fts5Structure *pStruct;
   int iSegid;
   int pgnoLast = 0;                 /* Last leaf page number in segment */
@@ -3439,19 +3419,127 @@ static void fts5FlushOneHash(Fts5Index *p, int iHash, int *pnLeaf){
   iSegid = fts5AllocateSegid(p, pStruct);
 
   if( iSegid ){
+    const int pgsz = p->pConfig->pgsz;
+
     Fts5StructureSegment *pSeg;   /* New segment within pStruct */
     int nHeight;                  /* Height of new segment b-tree */
-    int rc;
-    Fts5FlushCtx ctx;
+    Fts5Buffer *pBuf;             /* Buffer in which to assemble leaf page */
 
-    fts5WriteInit(p, &ctx.writer, iHash, iSegid);
-    ctx.pIdx = p;
+    Fts5SegWriter writer;
+    fts5WriteInit(p, &writer, iHash, iSegid);
 
-    rc = sqlite3Fts5HashIterate( p->apHash[iHash], (void*)&ctx, 
-        fts5FlushNewTerm, fts5FlushNewEntry, fts5FlushTermDone
-    );
-    if( p->rc==SQLITE_OK ) p->rc = rc;
-    fts5WriteFinish(p, &ctx.writer, &nHeight, &pgnoLast);
+    /* Pre-allocate the buffer used to assemble leaf pages to the target
+    ** page size.  */
+    assert( pgsz>0 );
+    pBuf = &writer.aWriter[0].buf;
+    fts5BufferGrow(&p->rc, pBuf, pgsz + 20);
+
+    /* Begin scanning through hash table entries. */
+    if( p->rc==SQLITE_OK ){
+      memset(pBuf->p, 0, 4);
+      pBuf->n = 4;
+      sqlite3Fts5HashScanInit(pHash, 0, 0);
+    }
+
+    while( 0==sqlite3Fts5HashScanEof(pHash) ){
+      const char *zTerm;
+      int nTerm;
+      const u8 *pDoclist;
+      int nDoclist;
+
+      sqlite3Fts5HashScanEntry(pHash, &zTerm,(const char**)&pDoclist,&nDoclist);
+      nTerm = strlen(zTerm);
+
+      /* Decide if the term fits on the current leaf. If not, flush it
+      ** to disk.  */
+      if( (pBuf->n + nTerm + 2) > pgsz ){
+        fts5WriteFlushLeaf(p, &writer);
+        pBuf = &writer.aWriter[0].buf;
+        if( (nTerm + 32) > pBuf->nSpace ){
+          fts5BufferGrow(&p->rc, pBuf, nTerm + 32 - pBuf->n);
+        }
+      }
+
+      /* Write the term to the leaf. And push it up into the b-tree hierarchy */
+      if( writer.bFirstTermInPage==0 ){
+        pBuf->n += sqlite3PutVarint(&pBuf->p[pBuf->n], 0);
+      }else{
+        fts5PutU16(&pBuf->p[2], pBuf->n);
+        writer.bFirstTermInPage = 0;
+        if( writer.aWriter[0].pgno!=1 ){
+          fts5WriteBtreeTerm(p, &writer, nTerm, (const u8*)zTerm);
+          pBuf = &writer.aWriter[0].buf;
+        }
+      }
+      pBuf->n += sqlite3PutVarint(&pBuf->p[pBuf->n], nTerm);
+      fts5BufferAppendBlob(&p->rc, pBuf, nTerm, (const u8*)zTerm);
+
+      if( pgsz>=(pBuf->n + nDoclist + 1) ){
+        /* The entire doclist will fit on the current leaf. */
+        fts5BufferAppendBlob(&p->rc, pBuf, nDoclist, pDoclist);
+      }else{
+        i64 iRowid = 0;
+        i64 iDelta = 0;
+        int iOff = 0;
+        int bFirstDocid = 0;
+
+        /* The entire doclist will not fit on this leaf. The following 
+        ** loop iterates through the poslists that make up the current 
+        ** doclist.  */
+        while( iOff<nDoclist ){
+          u32 nPos;
+          int nCopy;
+          iOff += getVarint(&pDoclist[iOff], (u64*)&iDelta);
+          nCopy = fts5GetVarint32(&pDoclist[iOff], nPos);
+          nCopy += nPos;
+          iRowid += iDelta;
+          
+          if( bFirstDocid ){
+            fts5PutU16(&pBuf->p[0], pBuf->n);   /* first docid on page */
+            pBuf->n += sqlite3PutVarint(&pBuf->p[pBuf->n], iRowid);
+            bFirstDocid = 0;
+          }else{
+            pBuf->n += sqlite3PutVarint(&pBuf->p[pBuf->n], iDelta);
+          }
+          assert( pBuf->n<=pBuf->nSpace );
+
+          if( (pBuf->n + nCopy) <= pgsz ){
+            /* The entire poslist will fit on the current leaf. So copy
+            ** it in one go. */
+            fts5BufferAppendBlob(&p->rc, pBuf, nCopy, &pDoclist[iOff]);
+          }else{
+            /* The entire poslist will not fit on this leaf. So it needs
+            ** to be broken into sections. The only qualification being
+            ** that each varint must be stored contiguously.  */
+            const u8 *pPoslist = &pDoclist[iOff];
+            int iPos = 0;
+            while( 1 ){
+              int nSpace = pgsz - pBuf->n;
+              int n;
+              if( (nCopy - iPos)<=nSpace ){
+                n = nCopy - iPos;
+              }else{
+                n = fts5PoslistPrefix(&pPoslist[iPos], nSpace);
+              }
+              fts5BufferAppendBlob(&p->rc, pBuf, n, &pPoslist[iPos]);
+              iPos += n;
+              if( iPos>=nCopy ) break;
+              fts5WriteFlushLeaf(p, &writer);
+              pBuf = &writer.aWriter[0].buf;
+            }
+            bFirstDocid = 1;
+          }
+          assert( pBuf->n<=pgsz );
+          iOff += nCopy;
+        }
+      }
+
+      pBuf->p[pBuf->n++] = '\0';
+      assert( pBuf->n<=pBuf->nSpace );
+      sqlite3Fts5HashScanNext(pHash);
+    }
+    sqlite3Fts5HashClear(pHash);
+    fts5WriteFinish(p, &writer, &nHeight, &pgnoLast);
 
     /* Update the Fts5Structure. It is written back to the database by the
     ** fts5StructureRelease() call below.  */
index 94a134ea1d502d1b3c9d746c30d02900df0b55a4..c27cd5f2c0b409048d89f20fc36e4c958f8a793c 100644 (file)
--- a/manifest
+++ b/manifest
@@ -1,5 +1,5 @@
-C Fix\san\sfts5\sbug\sin\slarge\sincremental\smerges.
-D 2015-02-26T14:54:03.688
+C Optimize\scopying\sdata\sfrom\sfts5\sin-memory\shash\stables\sto\stop\slevel\ssegments.
+D 2015-02-26T20:49:09.566
 F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
 F Makefile.in 5407a688f4d77a05c18a8142be8ae5a2829dd610
 F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
@@ -111,8 +111,8 @@ F ext/fts5/fts5_aux.c 549aef152b0fd46020f5595d861b1fd60b3f9b4f
 F ext/fts5/fts5_buffer.c b92ba0eb67532d174934087f93716caf9a2168c7
 F ext/fts5/fts5_config.c e3421a76c2abd33a05ac09df0c97c64952d1e700
 F ext/fts5/fts5_expr.c eee52c9df84eade48eaa3f50c8876f44b552ff9b
-F ext/fts5/fts5_hash.c 9032dd35bf8da6f9d4fc8c955c348dd6d229d8e4
-F ext/fts5/fts5_index.c 97ce5c919be5a70b623f89c66c60bda15408d577
+F ext/fts5/fts5_hash.c 323099a445bf8f608af069e2d8ff4bb93db9904c
+F ext/fts5/fts5_index.c 7a9de0c033a8f702f8e3659a23c2ea31bbbb789b
 F ext/fts5/fts5_storage.c f7c12c9f454b2a525827b3d85fd222789236f548
 F ext/fts5/fts5_tcl.c 1293fac2bb26903fd3d5cdee59c5885ba7e620d5
 F ext/fts5/fts5_tokenize.c 0d108148c26132448487926fe683425002aee369
@@ -1284,7 +1284,7 @@ F tool/vdbe_profile.tcl 67746953071a9f8f2f668b73fe899074e2c6d8c1
 F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4
 F tool/warnings.sh 0abfd78ceb09b7f7c27c688c8e3fe93268a13b32
 F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f
-P 7eb022d7e5fdb180af823c82c47c938e4a7a355f
-R a31df37a1e652372bc458cec184c4145
+P 208e3cb6b6dc8c7d824b64dec2034004c9fcbba5
+R 2ae35fa8ad1fd2b74f86acf33cc74d9f
 U dan
-Z ba5b725699c58a1a7e341156a5c76451
+Z 84061e5477aeb4ed2552f0a9aa275eda
index 32c499775ff6c0de1cb2de40be2baa4ae3b72d35..3538b76cd70bd7d012616eb62c0aca638a2c1a04 100644 (file)
@@ -1 +1 @@
-208e3cb6b6dc8c7d824b64dec2034004c9fcbba5
\ No newline at end of file
+8e3ca6323a2beab5f04250e24ae15b159d2aa0ac
\ No newline at end of file