From: Yann Collet Date: Fri, 11 May 2018 00:59:12 +0000 (-0700) Subject: opt: init statistics from dictionary X-Git-Tag: v1.3.5~3^2~55^2~4 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=1a26ec6e8d6575e671e2304f1fd43eeb880e015d;p=thirdparty%2Fzstd.git opt: init statistics from dictionary instead of starting from fake "default" statistics. --- diff --git a/lib/common/entropy_common.c b/lib/common/entropy_common.c index b37a082fe..344c32361 100644 --- a/lib/common/entropy_common.c +++ b/lib/common/entropy_common.c @@ -143,6 +143,11 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t } } /* while ((remaining>1) & (charnum<=*maxSVPtr)) */ if (remaining != 1) return ERROR(corruption_detected); if (bitCount > 32) return ERROR(corruption_detected); + /* zeroise the rest */ + { unsigned symbNb = charnum; + for (symbNb=charnum; symbNb <= *maxSVPtr; symbNb++) + normalizedCounter[symbNb] = 0; + } *maxSVPtr = charnum-1; ip += (bitCount+7)>>3; diff --git a/lib/compress/fse_compress.c b/lib/compress/fse_compress.c index 8e170150f..5df92db45 100644 --- a/lib/compress/fse_compress.c +++ b/lib/compress/fse_compress.c @@ -143,7 +143,10 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi for (s=0; s<=maxSymbolValue; s++) { switch (normalizedCounter[s]) { - case 0: break; + case 0: + /* filling nonetheless, for compatibility with FSE_getMaxNbBits() */ + symbolTT[s].deltaNbBits = (tableLog+1) << 16; + break; case -1: case 1: diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index d6e3e6b07..58daf5d0c 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -2396,7 +2396,8 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, if (FSE_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted); if (offcodeLog > OffFSELog) return ERROR(dictionary_corrupted); /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */ - CHECK_E( FSE_buildCTable_wksp(bs->entropy.offcodeCTable, offcodeNCount, offcodeMaxValue, offcodeLog, workspace, HUF_WORKSPACE_SIZE), + /* fill all offset symbols to avoid garbage at end of table */ + CHECK_E( FSE_buildCTable_wksp(bs->entropy.offcodeCTable, offcodeNCount, MaxOff, offcodeLog, workspace, HUF_WORKSPACE_SIZE), dictionary_corrupted); dictPtr += offcodeHeaderSize; } diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c index 7e7a6935f..9233d5d6f 100644 --- a/lib/compress/zstd_opt.c +++ b/lib/compress/zstd_opt.c @@ -35,7 +35,6 @@ static void ZSTD_rescaleFreqs(optState_t* const optPtr, optPtr->priceType = zop_dynamic; if (optPtr->litLengthSum == 0) { /* first block : init */ - unsigned u; if (srcSize <= 1024) /* heuristic */ optPtr->priceType = zop_predef; @@ -47,28 +46,84 @@ static void ZSTD_rescaleFreqs(optState_t* const optPtr, assert(optPtr->priceType == zop_dynamic); } + assert(optPtr->litFreq != NULL); + assert(optPtr->symbolCosts != NULL); + optPtr->litSum = 0; + { unsigned lit; + for (lit=0; lit<=MaxLit; lit++) { + U32 const scaleLog = 12; /* scale to 4K */ + U32 const bitCost = HUF_getNbBits(optPtr->symbolCosts->hufCTable, lit); + assert(bitCost < scaleLog); + optPtr->litFreq[lit] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/; + optPtr->litSum += optPtr->litFreq[lit]; + } } - } + { unsigned ll; + FSE_CState_t llstate; + FSE_initCState(&llstate, optPtr->symbolCosts->litlengthCTable); + optPtr->litLengthSum = 0; + for (ll=0; ll<=MaxLL; ll++) { + U32 const scaleLog = 11; /* scale to 2K */ + U32 const bitCost = FSE_getMaxNbBits(llstate.symbolTT, ll); + assert(bitCost < scaleLog); + optPtr->litLengthFreq[ll] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/; + optPtr->litLengthSum += optPtr->litLengthFreq[ll]; + } } - assert(optPtr->litFreq != NULL); - { unsigned max = MaxLit; - FSE_count(optPtr->litFreq, &max, src, srcSize); /* use raw first block to init statistics */ - } - optPtr->litSum = 0; - for (u=0; u<=MaxLit; u++) { - optPtr->litFreq[u] = 1 + (optPtr->litFreq[u] >> (ZSTD_FREQ_DIV+1)); - optPtr->litSum += optPtr->litFreq[u]; - } + { unsigned ml; + FSE_CState_t mlstate; + FSE_initCState(&mlstate, optPtr->symbolCosts->matchlengthCTable); + optPtr->matchLengthSum = 0; + for (ml=0; ml<=MaxML; ml++) { + U32 const scaleLog = 11; /* scale to 2K */ + U32 const bitCost = FSE_getMaxNbBits(mlstate.symbolTT, ml); + assert(bitCost < scaleLog); + optPtr->matchLengthFreq[ml] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/; + optPtr->matchLengthSum += optPtr->matchLengthFreq[ml]; + } } - for (u=0; u<=MaxLL; u++) - optPtr->litLengthFreq[u] = 1; - optPtr->litLengthSum = MaxLL+1; - for (u=0; u<=MaxML; u++) - optPtr->matchLengthFreq[u] = 1; - optPtr->matchLengthSum = MaxML+1; - for (u=0; u<=MaxOff; u++) - optPtr->offCodeFreq[u] = 1; - optPtr->offCodeSum = (MaxOff+1); + { unsigned of; + FSE_CState_t ofstate; + FSE_initCState(&ofstate, optPtr->symbolCosts->offcodeCTable); + optPtr->offCodeSum = 0; + for (of=0; of<=MaxOff; of++) { + U32 const scaleLog = 11; /* scale to 2K */ + U32 const bitCost = FSE_getMaxNbBits(ofstate.symbolTT, of); + assert(bitCost < scaleLog); + optPtr->offCodeFreq[of] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/; + optPtr->offCodeSum += optPtr->offCodeFreq[of]; + } } + + } else { /* not a dictionary */ + + assert(optPtr->litFreq != NULL); + optPtr->litSum = 0; + { unsigned lit = MaxLit; + FSE_count(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */ + for (lit=0; lit<=MaxLit; lit++) { + optPtr->litFreq[lit] = 1 + (optPtr->litFreq[lit] >> (ZSTD_FREQ_DIV+1)); + optPtr->litSum += optPtr->litFreq[lit]; + } } + + { unsigned ll; + for (ll=0; ll<=MaxLL; ll++) + optPtr->litLengthFreq[ll] = 1; + optPtr->litLengthSum = MaxLL+1; + } + + { unsigned ml; + for (ml=0; ml<=MaxML; ml++) + optPtr->matchLengthFreq[ml] = 1; + optPtr->matchLengthSum = MaxML+1; + } + + { unsigned of; + for (of=0; of<=MaxOff; of++) + optPtr->offCodeFreq[of] = 1; + optPtr->offCodeSum = MaxOff+1; + } + + } } else { /* new block : re-use previous statistics, scaled down */ unsigned u;