From: Yann Collet
Date: Sun, 12 Sep 2021 07:54:23 +0000 (-0700)
Subject: initialize btultra stats with greedy
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ccde29d407589a627cf574614b605c64c026985a;p=thirdparty%2Fzstd.git

initialize btultra stats with greedy

Improves compression ratio on the first block, at a speed cost of ~5%.
---

diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c
index e68f7352a..a42d89b93 100644
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@@ -737,36 +737,36 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
     case ZSTD_c_windowLog :
         if (value!=0)   /* 0 => use default */
             BOUNDCHECK(ZSTD_c_windowLog, value);
-        CCtxParams->cParams.windowLog = (U32)value;
+        CCtxParams->cParams.windowLog = (unsigned)value;
         return CCtxParams->cParams.windowLog;

     case ZSTD_c_hashLog :
         if (value!=0)   /* 0 => use default */
             BOUNDCHECK(ZSTD_c_hashLog, value);
-        CCtxParams->cParams.hashLog = (U32)value;
+        CCtxParams->cParams.hashLog = (unsigned)value;
         return CCtxParams->cParams.hashLog;

     case ZSTD_c_chainLog :
         if (value!=0)   /* 0 => use default */
             BOUNDCHECK(ZSTD_c_chainLog, value);
-        CCtxParams->cParams.chainLog = (U32)value;
+        CCtxParams->cParams.chainLog = (unsigned)value;
         return CCtxParams->cParams.chainLog;

     case ZSTD_c_searchLog :
         if (value!=0)   /* 0 => use default */
             BOUNDCHECK(ZSTD_c_searchLog, value);
-        CCtxParams->cParams.searchLog = (U32)value;
+        CCtxParams->cParams.searchLog = (unsigned)value;
         return (size_t)value;

     case ZSTD_c_minMatch :
         if (value!=0)   /* 0 => use default */
             BOUNDCHECK(ZSTD_c_minMatch, value);
-        CCtxParams->cParams.minMatch = value;
+        CCtxParams->cParams.minMatch = (unsigned)value;
         return CCtxParams->cParams.minMatch;

     case ZSTD_c_targetLength :
         BOUNDCHECK(ZSTD_c_targetLength, value);
-        CCtxParams->cParams.targetLength = value;
+        CCtxParams->cParams.targetLength = (unsigned)value;
         return CCtxParams->cParams.targetLength;

     case ZSTD_c_strategy :
@@ -779,12 +779,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
         /* Content size written in frame header _when known_ (default:1) */
         DEBUGLOG(4, "set content size flag = %u", (value!=0));
         CCtxParams->fParams.contentSizeFlag = value != 0;
-        return CCtxParams->fParams.contentSizeFlag;
+        return (size_t)CCtxParams->fParams.contentSizeFlag;

     case ZSTD_c_checksumFlag :
         /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */
         CCtxParams->fParams.checksumFlag = value != 0;
-        return CCtxParams->fParams.checksumFlag;
+        return (size_t)CCtxParams->fParams.checksumFlag;

     case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */
         DEBUGLOG(4, "set dictIDFlag = %u", (value!=0));
@@ -793,18 +793,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,

     case ZSTD_c_forceMaxWindow :
         CCtxParams->forceWindow = (value != 0);
-        return CCtxParams->forceWindow;
+        return (size_t)CCtxParams->forceWindow;

     case ZSTD_c_forceAttachDict : {
         const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value;
-        BOUNDCHECK(ZSTD_c_forceAttachDict, pref);
+        BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref);
         CCtxParams->attachDictPref = pref;
         return CCtxParams->attachDictPref;
     }

     case ZSTD_c_literalCompressionMode : {
         const ZSTD_literalCompressionMode_e lcm = (ZSTD_literalCompressionMode_e)value;
-        BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm);
+        BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm);
         CCtxParams->literalCompressionMode = lcm;
         return CCtxParams->literalCompressionMode;
     }
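A note on the zstd_compress.c hunk above: each setter validates an int, stores it into an unsigned cParams field, and echoes the value back through the function's size_t return type; the patch only spells out each conversion ((U32) becomes (unsigned), returns gain explicit (size_t) casts) so that -Wconversion-style builds stay quiet. A minimal standalone sketch of the idiom; the names and bounds below are illustrative, not zstd's real API:

    #include <stddef.h>

    typedef struct { unsigned windowLog; } cparams_t;  /* stand-in for ZSTD_compressionParameters */

    /* Hypothetical setter mirroring the pattern above: bound-check the int,
     * cast explicitly on store (int -> unsigned), cast explicitly on return
     * (unsigned -> size_t). */
    static size_t set_windowLog(cparams_t* cp, int value)
    {
        if (value != 0) {   /* 0 => use default */
            if (value < 10 || value > 31) return (size_t)-1;   /* illustrative bounds check */
        }
        cp->windowLog = (unsigned)value;
        return (size_t)cp->windowLog;
    }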
diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h
index c032d7d18..aae6fa2c8 100644
--- a/lib/compress/zstd_compress_internal.h
+++ b/lib/compress/zstd_compress_internal.h
@@ -657,7 +657,7 @@ static unsigned ZSTD_NbCommonBytes (size_t val)
             return _BitScanForward64( &r, (U64)val ) ? (unsigned)(r >> 3) : 0;
 #       endif
 #   elif defined(__GNUC__) && (__GNUC__ >= 4)
-            return (__builtin_ctzll((U64)val) >> 3);
+            return (unsigned)(__builtin_ctzll((U64)val) >> 3);
 #   else
             static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2,
                                                      0, 3, 1, 3, 1, 4, 2, 7,
@@ -674,7 +674,7 @@ static unsigned ZSTD_NbCommonBytes (size_t val)
             unsigned long r=0;
             return _BitScanForward( &r, (U32)val ) ? (unsigned)(r >> 3) : 0;
 #   elif defined(__GNUC__) && (__GNUC__ >= 3)
-            return (__builtin_ctz((U32)val) >> 3);
+            return (unsigned)(__builtin_ctz((U32)val) >> 3);
 #   else
             static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0,
                                                      3, 2, 2, 1, 3, 2, 0, 1,
@@ -693,7 +693,7 @@ static unsigned ZSTD_NbCommonBytes (size_t val)
             return _BitScanReverse64(&r, (U64)val) ? (unsigned)(r >> 3) : 0;
 #       endif
 #   elif defined(__GNUC__) && (__GNUC__ >= 4)
-            return (__builtin_clzll(val) >> 3);
+            return (unsigned)(__builtin_clzll(val) >> 3);
 #   else
             unsigned r;
             const unsigned n32 = sizeof(size_t)*4;   /* calculate this way due to compiler complaining in 32-bits mode */
@@ -707,7 +707,7 @@ static unsigned ZSTD_NbCommonBytes (size_t val)
             unsigned long r = 0;
             return _BitScanReverse( &r, (unsigned long)val ) ? (unsigned)(r >> 3) : 0;
 #   elif defined(__GNUC__) && (__GNUC__ >= 3)
-            return (__builtin_clz((U32)val) >> 3);
+            return (unsigned)(__builtin_clz((U32)val) >> 3);
 #   else
             unsigned r;
             if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
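For context on the casts above: ZSTD_NbCommonBytes() turns the XOR of two machine words into a count of equal leading bytes; on little-endian targets the first mismatching byte holds the lowest set bit, so ctz(diff) >> 3 is the number of matching bytes, and the new (unsigned) casts only silence the implicit int-to-unsigned conversion from the builtins. A self-contained sketch of the little-endian/GCC case (the function name is illustrative):

    #include <stdint.h>

    /* Count equal leading bytes of a and b on a little-endian machine.
     * XOR leaves the first differing byte as the lowest nonzero byte;
     * __builtin_ctzll locates its lowest set bit, and >>3 converts the
     * bit index to a byte index, the same expression as in the patch. */
    static unsigned nb_common_bytes64(uint64_t a, uint64_t b)
    {
        uint64_t const diff = a ^ b;
        if (diff == 0) return 8;   /* all bytes equal; ctz(0) is undefined */
        return (unsigned)(__builtin_ctzll(diff) >> 3);
    }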
diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c
index bb43c4ed0..532edd5bc 100644
--- a/lib/compress/zstd_opt.c
+++ b/lib/compress/zstd_opt.c
@@ -113,6 +113,21 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget)
     return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor));
 }

+static unsigned const k_baseLLfreqs[MaxLL+1] = {
+    4, 2, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1
+};
+
+static unsigned const k_baseOFCfreqs[MaxOff+1] = {
+    6, 2, 1, 1, 2, 3, 4, 4,
+    4, 3, 2, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1
+};
+
 /* ZSTD_rescaleFreqs() :
  * if first block (detected by optPtr->litLengthSum == 0) : init statistics
  *   take hints from dictionary if there is one
@@ -192,20 +207,13 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
             assert(optPtr->litFreq != NULL);
             if (compressedLiterals) {
-                unsigned lit = MaxLit;
-                HIST_count_simple(optPtr->litFreq, &lit, src, srcSize);   /* use raw first block to init statistics */
+                unsigned maxlit = MaxLit;
+                HIST_count_simple(optPtr->litFreq, &maxlit, src, srcSize);   /* use raw first block to init statistics */
                 optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8);
             }

-            {   unsigned const baseLLfreqs[MaxLL+1] = {
-                    4, 2, 1, 1, 1, 1, 1, 1,
-                    1, 1, 1, 1, 1, 1, 1, 1,
-                    1, 1, 1, 1, 1, 1, 1, 1,
-                    1, 1, 1, 1, 1, 1, 1, 1,
-                    1, 1, 1, 1
-                };
-                ZSTD_memcpy(optPtr->litLengthFreq, baseLLfreqs, sizeof(baseLLfreqs));
-                optPtr->litLengthSum = sum_u32(baseLLfreqs, MaxLL+1);
-            }
+            ZSTD_memcpy(optPtr->litLengthFreq, k_baseLLfreqs, sizeof(k_baseLLfreqs));
+            optPtr->litLengthSum = sum_u32(k_baseLLfreqs, MaxLL+1);

             {   unsigned ml;
                 for (ml=0; ml<=MaxML; ml++)
@@ -213,15 +221,8 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
             }
             optPtr->matchLengthSum = MaxML+1;

-            {   unsigned const baseOFCfreqs[MaxOff+1] = {
-                    6, 2, 1, 1, 2, 3, 4, 4,
-                    4, 3, 2, 1, 1, 1, 1, 1,
-                    1, 1, 1, 1, 1, 1, 1, 1,
-                    1, 1, 1, 1, 1, 1, 1, 1
-                };
-                ZSTD_memcpy(optPtr->offCodeFreq, baseOFCfreqs, sizeof(baseOFCfreqs));
-                optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1);
-            }
-
+            ZSTD_memcpy(optPtr->offCodeFreq, k_baseOFCfreqs, sizeof(k_baseOFCfreqs));
+            optPtr->offCodeSum = sum_u32(k_baseOFCfreqs, MaxOff+1);

         }
@@ -1253,14 +1254,14 @@ size_t ZSTD_compressBlock_btopt(


-
-/* ZSTD_initStats_ultra():
+#include "zstd_lazy.h"
+/* ZSTD_initStats_greedy():
  * make a first compression pass, just to seed stats with more accurate starting values.
  * only works on first block, with no dictionary and no ldm.
  * this function cannot error, hence its contract must be respected.
  */
 static void
-ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
+ZSTD_initStats_greedy(ZSTD_matchState_t* ms,
                      seqStore_t* seqStore,
                      U32 rep[ZSTD_REP_NUM],
                      const void* src, size_t srcSize)
@@ -1268,13 +1269,53 @@ ZSTD_initStats_greedy(ZSTD_matchState_t* ms,
     U32 tmpRep[ZSTD_REP_NUM];  /* updated rep codes will sink here */
     ZSTD_memcpy(tmpRep, rep, sizeof(tmpRep));

-    DEBUGLOG(4, "ZSTD_initStats_ultra (srcSize=%zu)", srcSize);
+    DEBUGLOG(4, "ZSTD_initStats_greedy (srcSize=%zu)", srcSize);
     assert(ms->opt.litLengthSum == 0);    /* first block */
     assert(seqStore->sequences == seqStore->sequencesStart);   /* no ldm */
     assert(ms->window.dictLimit == ms->window.lowLimit);   /* no dictionary */
     assert(ms->window.dictLimit - ms->nextToUpdate <= 1);  /* no prefix (note: intentional overflow, defined as 2-complement) */

-    ZSTD_compressBlock_opt_generic(ms, seqStore, tmpRep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict);   /* generate stats into ms->opt*/
+    ZSTD_compressBlock_greedy(ms, seqStore, tmpRep, src, srcSize);   /* generate stats into seqstore */
+
+    /* transfer stats into ms->opt */
+    /* literals stats */
+    {   unsigned maxlit = MaxLit;
+        assert(seqStore->lit >= seqStore->litStart);
+        HIST_count_simple(ms->opt.litFreq, &maxlit, seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart));
+        ms->opt.litSum = ZSTD_downscaleStats(ms->opt.litFreq, MaxLit, 0);   /* flatten stats, by providing at least 1 to every symbol */
+    }
+
+    /* seqStats */
+    assert(seqStore->sequences >= seqStore->sequencesStart);
+    {   U32 const nbSeq = (U32)(seqStore->sequences - seqStore->sequencesStart);
+        ZSTD_seqToCodes(seqStore);
+
+        {   const BYTE* codePtr = seqStore->ofCode;
+            U32 u;
+            memset(ms->opt.offCodeFreq, 0, sizeof(U32) * (MaxOff+1));
+            ZSTD_STATIC_ASSERT(MaxOff >= 17);
+            for (u=0; u<17; u++) ms->opt.offCodeFreq[u] = 1;   /* flatten stats; some offcode may not be produced by greedy but still be present */
+            for (u=0; u<nbSeq; u++) ms->opt.offCodeFreq[codePtr[u]]++;
+            assert(ms->opt.offCodeFreq[1] == 1);   /* greedy can't find rep1/rep2 */
+            ms->opt.offCodeFreq[1] = (ms->opt.offCodeFreq[0] / 3) + 1;   /* bias correction */
+            ms->opt.offCodeSum = sum_u32(ms->opt.offCodeFreq, 18);
+        }
+
+        {   const BYTE* codePtr = seqStore->mlCode;
+            U32 u;
+            for (u=0; u<=MaxML; u++) ms->opt.matchLengthFreq[u] = 1;   /* flatten stats; some match length not produced by greedy might end up present */
+            for (u=0; u<nbSeq; u++) ms->opt.matchLengthFreq[codePtr[u]]++;
+            assert(ms->opt.matchLengthFreq[0] == 1);   /* greedy can't find mml=3 */
+            ms->opt.matchLengthFreq[0] = ms->opt.matchLengthFreq[1] + 1;   /* bias correction */
+            ms->opt.matchLengthSum = sum_u32(ms->opt.matchLengthFreq, MaxML+1);
+        }
+
+        {   const BYTE* codePtr = seqStore->llCode;
+            U32 u;
+            ZSTD_memcpy(ms->opt.litLengthFreq, k_baseLLfreqs, sizeof(k_baseLLfreqs));
+            for (u=0; u<nbSeq; u++) ms->opt.litLengthFreq[codePtr[u]]++;
+            ms->opt.litLengthSum = sum_u32(ms->opt.litLengthFreq, MaxLL+1);
+    }   }

     /* invalidate first scan from history */
     ZSTD_resetSeqStore(seqStore);
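The stats-transfer blocks in ZSTD_initStats_greedy() above all apply one recipe: pre-fill the frequency table with 1 so that symbols the greedy pass never produced still carry a finite price, add the codes the pass actually emitted, then keep the running total for later normalization. A reduced sketch of that recipe (seed_freqs is a hypothetical helper, not a zstd function; the sum mirrors sum_u32):

    /* Seed a frequency table from a first-pass code stream:
     * flatten with 1 (unseen symbols keep a finite cost), count the
     * observed codes, and return the total used for normalization. */
    static unsigned seed_freqs(unsigned* freq, unsigned maxSymbol,
                               const unsigned char* codes, unsigned nbCodes)
    {
        unsigned u, total = 0;
        for (u = 0; u <= maxSymbol; u++) freq[u] = 1;       /* flatten */
        for (u = 0; u < nbCodes; u++) freq[codes[u]]++;     /* observed codes */
        for (u = 0; u <= maxSymbol; u++) total += freq[u];  /* like sum_u32() */
        return total;
    }

The "bias correction" lines then overwrite entries the greedy parser structurally cannot emit (rep1/rep2 offsets, minimum match length 3) with estimates derived from neighboring entries.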
@@ -1289,10 +1330,70 @@ size_t ZSTD_compressBlock_btultra(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         const void* src, size_t srcSize)
 {
+    U32 const curr = (U32)((const BYTE*)src - ms->window.base);
     DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize);
+
+    /* 2-pass strategy:
+     * this strategy makes a first pass over first block to collect statistics
+     * and seed next round's statistics with it.
+     * After 1st pass, function forgets everything, and starts a new block.
+     * Consequently, this can only work if no data has been previously loaded in tables,
+     * aka, no dictionary, no prefix, no ldm preprocessing.
+     * The compression ratio gain is generally small (~0.5% on first block),
+     * the cost is 2x cpu time on first block. */
+    assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
+    if ( (ms->opt.litLengthSum==0)   /* first block */
+      && (seqStore->sequences == seqStore->sequencesStart)   /* no ldm */
+      && (ms->window.dictLimit == ms->window.lowLimit)   /* no dictionary */
+      && (curr == ms->window.dictLimit)   /* start of frame, nothing already loaded nor skipped */
+      && (srcSize > ZSTD_PREDEF_THRESHOLD)
+      ) {
+        ZSTD_initStats_greedy(ms, seqStore, rep, src, srcSize);
+    }
+
     return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict);
 }
+
+
+/* ZSTD_initStats_ultra():
+ * make a first compression pass, just to seed stats with more accurate starting values.
+ * only works on first block, with no dictionary and no ldm.
+ * this function cannot error, hence its contract must be respected.
+ */
+static void
+ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
+                     seqStore_t* seqStore,
+                     U32 rep[ZSTD_REP_NUM],
+                     const void* src, size_t srcSize)
+{
+    U32 tmpRep[ZSTD_REP_NUM];  /* updated rep codes will sink here */
+    ZSTD_memcpy(tmpRep, rep, sizeof(tmpRep));
+
+    DEBUGLOG(4, "ZSTD_initStats_ultra (srcSize=%zu)", srcSize);
+    assert(ms->opt.litLengthSum == 0);    /* first block */
+    assert(seqStore->sequences == seqStore->sequencesStart);   /* no ldm */
+    assert(ms->window.dictLimit == ms->window.lowLimit);   /* no dictionary */
+    assert(ms->window.dictLimit - ms->nextToUpdate <= 1);  /* no prefix (note: intentional overflow, defined as 2-complement) */
+
+    if (srcSize <= 16 KB) {
+        /* raw btultra, initialized by default starting stats */
+        ZSTD_compressBlock_opt_generic(ms, seqStore, tmpRep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict);   /* generate stats into ms->opt*/
+    } else {
+        /* in this mode, btultra is initialized greedy;
+         * measured better for larger blocks, but not for small ones */
+        ZSTD_compressBlock_btultra(ms, seqStore, tmpRep, src, srcSize);   /* generate stats into ms->opt*/
+    }
+
+    /* invalidate first scan from history */
+    ZSTD_resetSeqStore(seqStore);
+    ms->window.base -= srcSize;
+    ms->window.dictLimit += (U32)srcSize;
+    ms->window.lowLimit = ms->window.dictLimit;
+    ms->nextToUpdate = ms->window.dictLimit;
+
+}
+
 size_t ZSTD_compressBlock_btultra2(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         const void* src, size_t srcSize)
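The control flow added to ZSTD_compressBlock_btultra() is the heart of the patch: run a cheaper parser once over the first block, harvest its statistics, rewind the window so the block looks fresh, then run the expensive parser for real. A schematic sketch of that 2-pass shape; every name here is a hypothetical stand-in, and the real preconditions and window rewind are the ones in the hunk above:

    typedef struct { unsigned litLengthSum; /* ... other stat sums ... */ } stats_t;

    /* Schematic 2-pass block compression: a cheap pass seeds the stats,
     * then the expensive pass compresses using those seeded stats. */
    static size_t two_pass_compress(stats_t* stats,
                                    const void* src, size_t srcSize,
                                    void   (*cheap_seed)(stats_t*, const void*, size_t),
                                    size_t (*expensive)(stats_t*, const void*, size_t))
    {
        int const firstBlock = (stats->litLengthSum == 0);
        if (firstBlock && srcSize > 1024) {  /* threshold illustrative; zstd uses ZSTD_PREDEF_THRESHOLD */
            cheap_seed(stats, src, srcSize); /* pass 1: stats only, output discarded;
                                              * the real code also rewinds ms->window here */
        }
        return expensive(stats, src, srcSize);   /* pass 2: actual compression */
    }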
diff --git a/lib/compress/zstd_opt.h b/lib/compress/zstd_opt.h
index 627255f53..2c68de867 100644
--- a/lib/compress/zstd_opt.h
+++ b/lib/compress/zstd_opt.h
@@ -20,6 +20,8 @@ extern "C" {
 /* used in ZSTD_loadDictionaryContent() */
 void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend);

+/* All parsers @return the size of "last literals" segment */
+
 size_t ZSTD_compressBlock_btopt(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);

diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c
index 9edc77fe8..821f71f95 100644
--- a/lib/dictBuilder/zdict.c
+++ b/lib/dictBuilder/zdict.c
@@ -627,7 +627,6 @@ static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params,
     if (srcSize > blockSizeMax) srcSize = blockSizeMax;   /* protection vs large samples */
     {   size_t const errorCode = ZSTD_compressBegin_usingCDict(esr.zc, esr.dict);
         if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; }
-    }
     cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
     if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (unsigned)srcSize); return; }
@@ -637,7 +636,7 @@ static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params,

     /* literals stats */
     {   const BYTE* bytePtr;
-        for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++)
+        for (bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++)
             countLit[*bytePtr]++;
     }
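The one-line comment added to zstd_opt.h records a contract shared by all block parsers: on success the size_t they return is the length of the trailing input segment not covered by any sequence, which the caller must emit as raw literals. A sketch of how a caller might honor that contract; store_last_literals and the buffer layout are hypothetical, since in zstd the literals live inside the seqStore:

    #include <stddef.h>
    #include <string.h>

    /* Append the last-literals segment to a literal buffer. */
    static void store_last_literals(unsigned char* litBuf, size_t* litLen,
                                    const unsigned char* lit, size_t n)
    {
        memcpy(litBuf + *litLen, lit, n);
        *litLen += n;
    }

    /* After a parser returns lastLLSize, the final lastLLSize bytes of the
     * source are literals that no sequence covers. */
    static void finish_block(unsigned char* litBuf, size_t* litLen,
                             const unsigned char* src, size_t srcSize,
                             size_t lastLLSize)
    {
        store_last_literals(litBuf, litLen, src + srcSize - lastLLSize, lastLLSize);
    }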