From: shess Date: Mon, 13 Nov 2006 21:09:24 +0000 (+0000) Subject: Delta-encode docids. This is good for around 22% reduction in index X-Git-Tag: version-3.6.10~2657 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=de163af26eee8a6728bf358b66acb8e9b47647ad;p=thirdparty%2Fsqlite.git Delta-encode docids. This is good for around 22% reduction in index size with DL_POSITIONS. It improves performance about 5%-6%. (CVS 3511) FossilOrigin-Name: 9b6d413d751d962b67cb4e3a208efe61581cb822 --- diff --git a/ext/fts2/fts2.c b/ext/fts2/fts2.c index 4689db65c9..cd31b35294 100644 --- a/ext/fts2/fts2.c +++ b/ext/fts2/fts2.c @@ -605,7 +605,9 @@ static void dlrStep(DLReader *pReader){ /* If there is more data, read the next doclist element. */ if( pReader->nData!=0 ){ - int iDummy, n = getVarint(pReader->pData, &pReader->iDocid); + sqlite_int64 iDocidDelta; + int iDummy, n = getVarint(pReader->pData, &iDocidDelta); + pReader->iDocid += iDocidDelta; if( pReader->iType>=DL_POSITIONS ){ assert( nnData ); while( 1 ){ @@ -649,17 +651,13 @@ static void dlrDestroy(DLReader *pReader){ */ static int docListValidate(DocListType iType, const char *pData, int nData, sqlite_int64 *pLastDocid){ - int has_prevDocid = 0; - sqlite_int64 iPrevDocid; + sqlite_int64 iPrevDocid = 0; assert( pData!=0 ); assert( nData!=0 ); while( nData!=0 ){ - int n; - sqlite_int64 iDocid; - n = getVarint(pData, &iDocid); - assert( !has_prevDocid || iPrevDocidDL_DOCIDS ){ int iDummy; while( 1 ){ @@ -678,7 +676,6 @@ static int docListValidate(DocListType iType, const char *pData, int nData, pData += n; nData -= n; } - assert( has_prevDocid ); if( pLastDocid ) *pLastDocid = iPrevDocid; return 1; } @@ -693,53 +690,72 @@ static int docListValidate(DocListType iType, const char *pData, int nData, ** dlwAppend - append raw doclist data to buffer. ** dlwAdd - construct doclist element and append to buffer. */ -/* TODO(shess) Modify to handle delta-encoding docids. This should be -** fairly simple. The changes to dlwAdd() are obvious. dlwAppend() -** would need to decode the leading docid, rencode as a delta, and -** copy the rest of the data (which would already be delta-encoded). -** Note that this will require a change to pass the trailing docid. -*/ typedef struct DLWriter { DocListType iType; DataBuffer *b; -#ifndef NDEBUG - int has_prevDocid; sqlite_int64 iPrevDocid; -#endif } DLWriter; static void dlwInit(DLWriter *pWriter, DocListType iType, DataBuffer *b){ pWriter->b = b; pWriter->iType = iType; -#ifndef NDEBUG - pWriter->has_prevDocid = 0; pWriter->iPrevDocid = 0; -#endif } static void dlwDestroy(DLWriter *pWriter){ SCRAMBLE(pWriter); } +/* iFirstDocid is the first docid in the doclist in pData. It is +** needed because pData may point within a larger doclist, in which +** case the first item would be delta-encoded. +** +** iLastDocid is the final docid in the doclist in pData. It is +** needed to create the new iPrevDocid for future delta-encoding. The +** code could decode the passed doclist to recreate iLastDocid, but +** the only current user (docListMerge) already has decoded this +** information. +*/ +/* TODO(shess) This has become just a helper for docListMerge. +** Consider a refactor to make this cleaner. +*/ static void dlwAppend(DLWriter *pWriter, - const char *pData, int nData){ + const char *pData, int nData, + sqlite_int64 iFirstDocid, sqlite_int64 iLastDocid){ + sqlite_int64 iDocid = 0; + char c[VARINT_MAX]; + int nFirstOld, nFirstNew; /* Old and new varint len of first docid. */ #ifndef NDEBUG - sqlite_int64 iDocid; - int n; - n = getVarint(pData, &iDocid); - assert( n<=nData ); - assert( !pWriter->has_prevDocid || pWriter->iPrevDocidiType>DL_DOCIDS ); - assert( docListValidate(pWriter->iType, pData, nData, &iDocid) ); - pWriter->has_prevDocid = 1; - pWriter->iPrevDocid = iDocid; + sqlite_int64 iLastDocidDelta; #endif - dataBufferAppend(pWriter->b, pData, nData); + + /* Recode the initial docid as delta from iPrevDocid. */ + nFirstOld = getVarint(pData, &iDocid); + assert( nFirstOldiType==DL_DOCIDS) ); + nFirstNew = putVarint(c, iFirstDocid-pWriter->iPrevDocid); + + /* Verify that the incoming doclist is valid AND that it ends with + ** the expected docid. This is essential because we'll trust this + ** docid in future delta-encoding. + */ + assert( docListValidate(pWriter->iType, pData, nData, &iLastDocidDelta) ); + assert( iLastDocid==iFirstDocid-iDocid+iLastDocidDelta ); + + /* Append recoded initial docid and everything else. Rest of docids + ** should have been delta-encoded from previous initial docid. + */ + if( nFirstOldb, c, nFirstNew, + pData+nFirstOld, nData-nFirstOld); + }else{ + dataBufferAppend(pWriter->b, c, nFirstNew); + } + pWriter->iPrevDocid = iLastDocid; } static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid, const char *pPosList, int nPosList){ char c[VARINT_MAX]; - int n = putVarint(c, iDocid); + int n = putVarint(c, iDocid-pWriter->iPrevDocid); - assert( !pWriter->has_prevDocid || pWriter->iPrevDocidiPrevDocidiType>DL_DOCIDS ); dataBufferAppend(pWriter->b, c, n); @@ -752,10 +768,7 @@ static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid, dataBufferAppend(pWriter->b, c, n); } } -#ifndef NDEBUG - pWriter->has_prevDocid = 1; pWriter->iPrevDocid = iDocid; -#endif } /*******************************************************************/ @@ -1047,6 +1060,7 @@ static void docListMerge(DataBuffer *out, int i, n; const char *pStart = 0; int nStart = 0; + sqlite_int64 iFirstDocid = 0, iLastDocid = 0; assert( nReaders>0 ); if( nReaders==1 ){ @@ -1083,10 +1097,14 @@ static void docListMerge(DataBuffer *out, if( dlrDocData(readers[0].pReader)==pStart+nStart ){ nStart += dlrDocDataBytes(readers[0].pReader); }else{ - if( pStart!=0 ) dlwAppend(&writer, pStart, nStart); + if( pStart!=0 ){ + dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid); + } pStart = dlrDocData(readers[0].pReader); nStart = dlrDocDataBytes(readers[0].pReader); + iFirstDocid = iDocid; } + iLastDocid = iDocid; dlrStep(readers[0].pReader); /* Drop all of the older elements with the same docid. */ @@ -1103,7 +1121,7 @@ static void docListMerge(DataBuffer *out, } /* Copy over any remaining elements. */ - if( nStart>0 ) dlwAppend(&writer, pStart, nStart); + if( nStart>0 ) dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid); dlwDestroy(&writer); } @@ -4155,6 +4173,7 @@ static int leafWriterStep(fulltext_vtab *v, LeafWriter *pWriter, rc = leafWriterFlush(v, pWriter); if( rc!=SQLITE_OK ) return rc; } + assert( leafNodeValidate(pWriter->data.pData, pWriter->data.nData) ); return SQLITE_OK; } diff --git a/manifest b/manifest index 787fdc05bc..40d8dbd654 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Require\sa\sminimum\sfanout\sfor\sinterior\snodes.\s\sThis\sprevents\scases\nwhere\sexcessively\slarge\sterms\skeep\sthe\stree\sfrom\sfinding\sa\ssingle\nroot.\s\sA\sdownside\sis\sthat\sthis\scould\sresult\sin\slarge\sinterior\snodes\sin\nthe\spresence\sof\slarge\sterms,\swhich\smay\sbe\sprone\sto\sfragmentation,\nthough\sif\sthe\snodes\swere\ssmaller\sthat\swould\stranslate\sinto\smore\slevels\nin\sthe\stree,\swhich\swould\salso\shave\sthat\sproblem.\s(CVS\s3510) -D 2006-11-13T21:00:55 +C Delta-encode\sdocids.\s\sThis\sis\sgood\sfor\saround\s22%\sreduction\sin\sindex\nsize\swith\sDL_POSITIONS.\s\sIt\simproves\sperformance\sabout\s5%-6%.\s(CVS\s3511) +D 2006-11-13T21:09:25 F Makefile.in 8e14898d41a53033ecb687d93c9cd5d109fb9ae3 F Makefile.linux-gcc 2d8574d1ba75f129aba2019f0b959db380a90935 F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028 @@ -33,7 +33,7 @@ F ext/fts1/fulltext.h 08525a47852d1d62a0be81d3fc3fe2d23b094efd F ext/fts1/simple_tokenizer.c 1844d72f7194c3fd3d7e4173053911bf0661b70d F ext/fts1/tokenizer.h 0c53421b832366d20d720d21ea3e1f6e66a36ef9 F ext/fts2/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d -F ext/fts2/fts2.c 667a93b3fe079f20870a3042bd4b4c3841925c01 +F ext/fts2/fts2.c 7909381760660b3da9918ff3e618e2c83315234b F ext/fts2/fts2.h bbdab26d34f91974d5b9ade8b7836c140a7c4ce1 F ext/fts2/fts2_hash.c b3f22116d4ef0bc8f2da6e3fdc435c86d0951a9b F ext/fts2/fts2_hash.h e283308156018329f042816eb09334df714e105e @@ -421,7 +421,7 @@ F www/tclsqlite.tcl bb0d1357328a42b1993d78573e587c6dcbc964b9 F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0 F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b F www/whentouse.tcl 97e2b5cd296f7d8057e11f44427dea8a4c2db513 -P 9628a61a6f33b7bec3455086534b76437d2622b4 -R 2b75980ff8122f283fe2f8c11a712490 +P 64b7e3406134ac4891113b9bb432ad97504268bb +R 5cca903a493ab0c4e72312813e09cd62 U shess -Z 5a39d4513967a7196d065949839c93cd +Z 6c02cb52391a3d0abd67d903b02caa78 diff --git a/manifest.uuid b/manifest.uuid index bb7665bdd3..4e80194757 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -64b7e3406134ac4891113b9bb432ad97504268bb \ No newline at end of file +9b6d413d751d962b67cb4e3a208efe61581cb822 \ No newline at end of file