git.ipfire.org Git - thirdparty/zstd.git/commitdiff
opt: init statistics from dictionary
author Yann Collet <cyan@fb.com>
Fri, 11 May 2018 00:59:12 +0000 (17:59 -0700)
committer Yann Collet <cyan@fb.com>
Fri, 11 May 2018 00:59:12 +0000 (17:59 -0700)
instead of starting from fake "default" statistics.

lib/common/entropy_common.c
lib/compress/fse_compress.c
lib/compress/zstd_compress.c
lib/compress/zstd_opt.c

index b37a082fee2c4f00eec933919bce3b9c6904ba96..344c323615f6da5528bee781bbc8b83754183eec 100644 (file)
@@ -143,6 +143,11 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t
     }   }   /* while ((remaining>1) & (charnum<=*maxSVPtr)) */
     if (remaining != 1) return ERROR(corruption_detected);
     if (bitCount > 32) return ERROR(corruption_detected);
+    /* zeroise the rest */
+    {   unsigned symbNb = charnum;
+        for (symbNb=charnum; symbNb <= *maxSVPtr; symbNb++)
+            normalizedCounter[symbNb] = 0;
+    }
     *maxSVPtr = charnum-1;
 
     ip += (bitCount+7)>>3;
index 8e170150f6c0917dd9ddb88885d58a2ef726290b..5df92db454292382da68c8db23e4d60432a632fd 100644 (file)
@@ -143,7 +143,10 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi
         for (s=0; s<=maxSymbolValue; s++) {
             switch (normalizedCounter[s])
             {
-            case  0: break;
+            case  0:
+                /* filling nonetheless, for compatibility with FSE_getMaxNbBits() */
+                symbolTT[s].deltaNbBits = (tableLog+1) << 16;
+                break;
 
             case -1:
             case  1:
index d6e3e6b075653f656dc71b7d961404d1a690b4a6..58daf5d0cf4e6b3a66659d0eda073808c191d2b3 100644 (file)
@@ -2396,7 +2396,8 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
         if (FSE_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted);
         if (offcodeLog > OffFSELog) return ERROR(dictionary_corrupted);
         /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */
-        CHECK_E( FSE_buildCTable_wksp(bs->entropy.offcodeCTable, offcodeNCount, offcodeMaxValue, offcodeLog, workspace, HUF_WORKSPACE_SIZE),
+        /* fill all offset symbols to avoid garbage at end of table */
+        CHECK_E( FSE_buildCTable_wksp(bs->entropy.offcodeCTable, offcodeNCount, MaxOff, offcodeLog, workspace, HUF_WORKSPACE_SIZE),
                  dictionary_corrupted);
         dictPtr += offcodeHeaderSize;
     }
index 7e7a6935f5519901d24189514d6da71eee5f8183..9233d5d6fabbb53ad8733600ec6d7aa1de2a6ccd 100644 (file)
@@ -35,7 +35,6 @@ static void ZSTD_rescaleFreqs(optState_t* const optPtr,
     optPtr->priceType = zop_dynamic;
 
     if (optPtr->litLengthSum == 0) {  /* first block : init */
-        unsigned u;
         if (srcSize <= 1024)   /* heuristic */
             optPtr->priceType = zop_predef;
 
@@ -47,28 +46,84 @@ static void ZSTD_rescaleFreqs(optState_t* const optPtr,
                 assert(optPtr->priceType == zop_dynamic);
             }
 
+            assert(optPtr->litFreq != NULL);
+            assert(optPtr->symbolCosts != NULL);
+            optPtr->litSum = 0;
+            {   unsigned lit;
+                for (lit=0; lit<=MaxLit; lit++) {
+                    U32 const scaleLog = 12;   /* scale to 4K */
+                    U32 const bitCost = HUF_getNbBits(optPtr->symbolCosts->hufCTable, lit);
+                    assert(bitCost < scaleLog);
+                    optPtr->litFreq[lit] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
+                    optPtr->litSum += optPtr->litFreq[lit];
+            }   }
 
-        }
+            {   unsigned ll;
+                FSE_CState_t llstate;
+                FSE_initCState(&llstate, optPtr->symbolCosts->litlengthCTable);
+                optPtr->litLengthSum = 0;
+                for (ll=0; ll<=MaxLL; ll++) {
+                    U32 const scaleLog = 11;   /* scale to 2K */
+                    U32 const bitCost = FSE_getMaxNbBits(llstate.symbolTT, ll);
+                    assert(bitCost < scaleLog);
+                    optPtr->litLengthFreq[ll] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
+                    optPtr->litLengthSum += optPtr->litLengthFreq[ll];
+            }   }
 
-        assert(optPtr->litFreq != NULL);
-        {   unsigned max = MaxLit;
-            FSE_count(optPtr->litFreq, &max, src, srcSize);   /* use raw first block to init statistics */
-        }
-        optPtr->litSum = 0;
-        for (u=0; u<=MaxLit; u++) {
-            optPtr->litFreq[u] = 1 + (optPtr->litFreq[u] >> (ZSTD_FREQ_DIV+1));
-            optPtr->litSum += optPtr->litFreq[u];
-        }
+            {   unsigned ml;
+                FSE_CState_t mlstate;
+                FSE_initCState(&mlstate, optPtr->symbolCosts->matchlengthCTable);
+                optPtr->matchLengthSum = 0;
+                for (ml=0; ml<=MaxML; ml++) {
+                    U32 const scaleLog = 11;   /* scale to 2K */
+                    U32 const bitCost = FSE_getMaxNbBits(mlstate.symbolTT, ml);
+                    assert(bitCost < scaleLog);
+                    optPtr->matchLengthFreq[ml] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
+                    optPtr->matchLengthSum += optPtr->matchLengthFreq[ml];
+            }   }
 
-        for (u=0; u<=MaxLL; u++)
-            optPtr->litLengthFreq[u] = 1;
-        optPtr->litLengthSum = MaxLL+1;
-        for (u=0; u<=MaxML; u++)
-            optPtr->matchLengthFreq[u] = 1;
-        optPtr->matchLengthSum = MaxML+1;
-        for (u=0; u<=MaxOff; u++)
-            optPtr->offCodeFreq[u] = 1;
-        optPtr->offCodeSum = (MaxOff+1);
+            {   unsigned of;
+                FSE_CState_t ofstate;
+                FSE_initCState(&ofstate, optPtr->symbolCosts->offcodeCTable);
+                optPtr->offCodeSum = 0;
+                for (of=0; of<=MaxOff; of++) {
+                    U32 const scaleLog = 11;   /* scale to 2K */
+                    U32 const bitCost = FSE_getMaxNbBits(ofstate.symbolTT, of);
+                    assert(bitCost < scaleLog);
+                    optPtr->offCodeFreq[of] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
+                    optPtr->offCodeSum += optPtr->offCodeFreq[of];
+            }   }
+
+        } else {  /* not a dictionary */
+
+            assert(optPtr->litFreq != NULL);
+            optPtr->litSum = 0;
+            {   unsigned lit = MaxLit;
+                FSE_count(optPtr->litFreq, &lit, src, srcSize);   /* use raw first block to init statistics */
+                for (lit=0; lit<=MaxLit; lit++) {
+                    optPtr->litFreq[lit] = 1 + (optPtr->litFreq[lit] >> (ZSTD_FREQ_DIV+1));
+                    optPtr->litSum += optPtr->litFreq[lit];
+            }   }
+
+            {   unsigned ll;
+                for (ll=0; ll<=MaxLL; ll++)
+                    optPtr->litLengthFreq[ll] = 1;
+                optPtr->litLengthSum = MaxLL+1;
+            }
+
+            {   unsigned ml;
+                for (ml=0; ml<=MaxML; ml++)
+                    optPtr->matchLengthFreq[ml] = 1;
+                optPtr->matchLengthSum = MaxML+1;
+            }
+
+            {   unsigned of;
+                for (of=0; of<=MaxOff; of++)
+                    optPtr->offCodeFreq[of] = 1;
+                optPtr->offCodeSum = MaxOff+1;
+            }
+
+        }
 
     } else {   /* new block : re-use previous statistics, scaled down */
         unsigned u;