From: Yann Collet Date: Tue, 8 May 2018 22:37:06 +0000 (-0700) Subject: pass entropy tables to optimal parser X-Git-Tag: v1.3.5~3^2~55^2~13 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=338f738c242d857acf3828c27cd475df7657178e;p=thirdparty%2Fzstd.git pass entropy tables to optimal parser for proper estimation of symbol's weights when using dictionary compression. Note : using only huffman costs is not good enough, presumably because sequence symbol costs are incorrect. --- diff --git a/lib/common/huf.h b/lib/common/huf.h index b4645b4e5..1f46fda2b 100644 --- a/lib/common/huf.h +++ b/lib/common/huf.h @@ -208,7 +208,7 @@ size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, si typedef enum { HUF_repeat_none, /**< Cannot use the previous table */ HUF_repeat_check, /**< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */ - HUF_repeat_valid /**< Can use the previous table and it is asumed to be valid */ + HUF_repeat_valid /**< Can use the previous table and it is assumed to be valid */ } HUF_repeat; /** HUF_compress4X_repeat() : * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. @@ -227,7 +227,9 @@ size_t HUF_compress4X_repeat(void* dst, size_t dstSize, */ #define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) -size_t HUF_buildCTable_wksp (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize); +size_t HUF_buildCTable_wksp (HUF_CElt* tree, + const U32* count, U32 maxSymbolValue, U32 maxNbBits, + void* workSpace, size_t wkspSize); /*! HUF_readStats() : * Read compact Huffman tree, saved by HUF_writeCTable(). @@ -242,6 +244,11 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, * Loading a CTable saved with HUF_writeCTable() */ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize); +/** HUF_getNbBits() : + * Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX + * Note 1 : is not inlined, as HUF_CElt definition is private + * Note 2 : const void* used, so that it can provide a statically allocated table as argument (which uses type U32) */ +U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue); /* * HUF_decompress() does the following: diff --git a/lib/compress/huf_compress.c b/lib/compress/huf_compress.c index 83230b415..c01a2381e 100644 --- a/lib/compress/huf_compress.c +++ b/lib/compress/huf_compress.c @@ -216,6 +216,13 @@ size_t HUF_readCTable (HUF_CElt* CTable, U32* maxSymbolValuePtr, const void* src return readSize; } +U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue) +{ + const HUF_CElt* table = (const HUF_CElt*)symbolTable; + assert(symbolValue <= HUF_SYMBOLVALUE_MAX); + return table[symbolValue].nbBits; +} + typedef struct nodeElt_s { U32 count; diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 76b471c5e..d3e52c627 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -1997,6 +1997,7 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, return 0; /* don't even attempt compression below a certain srcSize */ } ZSTD_resetSeqStore(&(zc->seqStore)); + ms->opt.symbolCosts = &zc->blockState.prevCBlock->entropy; /* required for optimal parser to read stats from dictionary */ /* limited update after a very long match */ { const BYTE* const base = ms->window.base; diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index bbb66a8ea..eeb7b230a 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -76,6 +76,8 @@ typedef struct { U32 rep[ZSTD_REP_NUM]; } ZSTD_optimal_t; +typedef enum { zop_none=0, zop_predef, zop_static } ZSTD_OptPrice_e; + typedef struct { /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */ U32* litFreq; /* table of literals statistics, of size 256 */ @@ -95,7 +97,8 @@ typedef struct { U32 log2matchLengthSum; /* pow2 to compare log2(mlfreq) to */ U32 log2offCodeSum; /* pow2 to compare log2(offreq) to */ /* end : updated by ZSTD_setLog2Prices */ - U32 predefPrices; /* prices follow a pre-defined cost structure, statistics are irrelevant */ + ZSTD_OptPrice_e priceType; /* prices follow a pre-defined cost structure, statistics are irrelevant */ + const ZSTD_entropyCTables_t* symbolCosts; /* pre-calculated symbol costs, from dictionary */ } optState_t; typedef struct { diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c index 80b9e5e00..ecd676a68 100644 --- a/lib/compress/zstd_opt.c +++ b/lib/compress/zstd_opt.c @@ -32,11 +32,15 @@ static void ZSTD_setLog2Prices(optState_t* optPtr) static void ZSTD_rescaleFreqs(optState_t* const optPtr, const BYTE* const src, size_t const srcSize) { - optPtr->predefPrices = 0; + optPtr->priceType = zop_none; if (optPtr->litLengthSum == 0) { /* first init */ unsigned u; - if (srcSize <= 1024) optPtr->predefPrices = 1; + if (srcSize <= 1024) optPtr->priceType = zop_predef; + assert(optPtr->symbolCosts != NULL); + if (0 && optPtr->symbolCosts->hufCTable_repeatMode == HUF_repeat_valid) { /* huffman table presumed generated by dictionary */ + optPtr->priceType = zop_static; + } assert(optPtr->litFreq!=NULL); for (u=0; u<=MaxLit; u++) @@ -94,7 +98,15 @@ static void ZSTD_rescaleFreqs(optState_t* const optPtr, static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, const optState_t* const optPtr) { - if (optPtr->predefPrices) return (litLength*6); /* 6 bit per literal - no statistic used */ + if (optPtr->priceType == zop_static) { + U32 u, cost; + assert(optPtr->symbolCosts != NULL); + assert(optPtr->symbolCosts->hufCTable_repeatMode == HUF_repeat_valid); + for (u=0, cost=0; u < litLength; u++) + cost += HUF_getNbBits(optPtr->symbolCosts->hufCTable, literals[u]); + return cost; + } + if (optPtr->priceType == zop_predef) return (litLength*6); /* 6 bit per literal - no statistic used */ if (litLength == 0) return 0; /* literals */ @@ -110,7 +122,7 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, * cost of literalLength symbol */ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optPtr) { - if (optPtr->predefPrices) return ZSTD_highbit32((U32)litLength+1); + if (optPtr->priceType == zop_predef) return ZSTD_highbit32((U32)litLength+1); /* literal Length */ { U32 const llCode = ZSTD_LLcode(litLength); @@ -135,7 +147,7 @@ static U32 ZSTD_fullLiteralsCost(const BYTE* const literals, U32 const litLength * to provide a cost which is directly comparable to a match ending at same position */ static int ZSTD_litLengthContribution(U32 const litLength, const optState_t* const optPtr) { - if (optPtr->predefPrices) return ZSTD_highbit32(litLength+1); + if (optPtr->priceType == zop_predef) return ZSTD_highbit32(litLength+1); /* literal Length */ { U32 const llCode = ZSTD_LLcode(litLength); @@ -176,7 +188,7 @@ ZSTD_getMatchPrice(U32 const offset, U32 const matchLength, U32 const mlBase = matchLength - MINMATCH; assert(matchLength >= MINMATCH); - if (optPtr->predefPrices) /* fixed scheme, do not use statistics */ + if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */ return ZSTD_highbit32(mlBase+1) + 16 + offCode; price = offCode + optPtr->log2offCodeSum - ZSTD_highbit32(optPtr->offCodeFreq[offCode]+1);