]> git.ipfire.org Git - thirdparty/zstd.git/commitdiff
[lib] Allow compression dictionaries with missing symbols 2202/head
authorNick Terrell <terrelln@fb.com>
Sat, 13 Jun 2020 00:48:39 +0000 (17:48 -0700)
committerNick Terrell <terrelln@fb.com>
Sat, 13 Jun 2020 00:57:19 +0000 (17:57 -0700)
Allow compression to use dictionaries with missing symbols in their
entropy tables. We set the FSE repeat mode to check when there are
missing symbols, and set the FSE repeat mode to valid when all symbols
are present.

Note that when not all symbols are present, the heuristics which favor
dictionary tables for lower compression levels won't activate.

Tested by manually creating a dictionary with missing symbols of every
type, and validing that the compressor rejects it before this change,
and accepts it after this change. Also, I ran the `dictionary_loader`
fuzzer for >1 hour of CPU time without running into cases where
compression succeeds, but decompression fails.

Fixes #2174.

lib/compress/zstd_compress.c
lib/compress/zstd_compress_internal.h
lib/dictBuilder/zdict.c
tests/golden-compression/http [new file with mode: 0644]
tests/golden-dictionaries/http-dict-missing-symbols [new file with mode: 0644]
tests/playTests.sh

index b9475e920f86b3a3c3dc12541ea1da0792b36b8c..a6a464a8d7058accffc20b63f5935bcaea1f7f8d 100644 (file)
@@ -2891,22 +2891,28 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
 
 
 /* Dictionaries that assign zero probability to symbols that show up causes problems
-   when FSE encoding.  Refuse dictionaries that assign zero probability to symbols
-   that we may encounter during compression.
-   NOTE: This behavior is not standard and could be improved in the future. */
-static size_t ZSTD_checkDictNCount(short* normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue) {
+ * when FSE encoding. Mark dictionaries with zero probability symbols as FSE_repeat_check
+ * and only dictionaries with 100% valid symbols can be assumed valid.
+ */
+static FSE_repeat ZSTD_dictNCountRepeat(short* normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue)
+{
     U32 s;
-    RETURN_ERROR_IF(dictMaxSymbolValue < maxSymbolValue, dictionary_corrupted, "dict fse tables don't have all symbols");
+    if (dictMaxSymbolValue < maxSymbolValue) {
+        return FSE_repeat_check;
+    }
     for (s = 0; s <= maxSymbolValue; ++s) {
-        RETURN_ERROR_IF(normalizedCounter[s] == 0, dictionary_corrupted, "dict fse tables don't have all symbols");
+        if (normalizedCounter[s] == 0) {
+            return FSE_repeat_check;
+        }
     }
-    return 0;
+    return FSE_repeat_valid;
 }
 
 size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
-                         short* offcodeNCount, unsigned* offcodeMaxValue,
                          const void* const dict, size_t dictSize)
 {
+    short offcodeNCount[MaxOff+1];
+    unsigned offcodeMaxValue = MaxOff;
     const BYTE* dictPtr = (const BYTE*)dict;    /* skip magic num and dict ID */
     const BYTE* const dictEnd = dictPtr + dictSize;
     dictPtr += 8;
@@ -2928,16 +2934,16 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
     }
 
     {   unsigned offcodeLog;
-        size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr);
+        size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr);
         RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, "");
         RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, "");
-        /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */
         /* fill all offset symbols to avoid garbage at end of table */
         RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp(
                 bs->entropy.fse.offcodeCTable,
                 offcodeNCount, MaxOff, offcodeLog,
                 workspace, HUF_WORKSPACE_SIZE)),
             dictionary_corrupted, "");
+        /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */
         dictPtr += offcodeHeaderSize;
     }
 
@@ -2946,13 +2952,12 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
         size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr);
         RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, "");
         RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, "");
-        /* Every match length code must have non-zero probability */
-        FORWARD_IF_ERROR( ZSTD_checkDictNCount(matchlengthNCount, matchlengthMaxValue, MaxML), "");
         RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp(
                 bs->entropy.fse.matchlengthCTable,
                 matchlengthNCount, matchlengthMaxValue, matchlengthLog,
                 workspace, HUF_WORKSPACE_SIZE)),
             dictionary_corrupted, "");
+        bs->entropy.fse.matchlength_repeatMode = ZSTD_dictNCountRepeat(matchlengthNCount, matchlengthMaxValue, MaxML);
         dictPtr += matchlengthHeaderSize;
     }
 
@@ -2961,13 +2966,12 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
         size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr);
         RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, "");
         RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, "");
-        /* Every literal length code must have non-zero probability */
-        FORWARD_IF_ERROR( ZSTD_checkDictNCount(litlengthNCount, litlengthMaxValue, MaxLL), "");
         RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp(
                 bs->entropy.fse.litlengthCTable,
                 litlengthNCount, litlengthMaxValue, litlengthLog,
                 workspace, HUF_WORKSPACE_SIZE)),
             dictionary_corrupted, "");
+        bs->entropy.fse.litlength_repeatMode = ZSTD_dictNCountRepeat(litlengthNCount, litlengthMaxValue, MaxLL);
         dictPtr += litlengthHeaderSize;
     }
 
@@ -2977,6 +2981,22 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
     bs->rep[2] = MEM_readLE32(dictPtr+8);
     dictPtr += 12;
 
+    {   size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
+        U32 offcodeMax = MaxOff;
+        if (dictContentSize <= ((U32)-1) - 128 KB) {
+            U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */
+            offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */
+        }
+        /* All offset values <= dictContentSize + 128 KB must be representable for a valid table */
+        bs->entropy.fse.offcode_repeatMode = ZSTD_dictNCountRepeat(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff));
+
+        /* All repCodes must be <= dictContentSize and != 0 */
+        {   U32 u;
+            for (u=0; u<3; u++) {
+                RETURN_ERROR_IF(bs->rep[u] == 0, dictionary_corrupted, "");
+                RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, "");
+    }   }   }
+
     return dictPtr - (const BYTE*)dict;
 }
 
@@ -2999,8 +3019,6 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
 {
     const BYTE* dictPtr = (const BYTE*)dict;
     const BYTE* const dictEnd = dictPtr + dictSize;
-    short offcodeNCount[MaxOff+1];
-    unsigned offcodeMaxValue = MaxOff;
     size_t dictID;
     size_t eSize;
 
@@ -3009,32 +3027,16 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
     assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY);
 
     dictID = params->fParams.noDictIDFlag ? 0 :  MEM_readLE32(dictPtr + 4 /* skip magic number */ );
-    eSize = ZSTD_loadCEntropy(bs, workspace, offcodeNCount, &offcodeMaxValue, dict, dictSize);
+    eSize = ZSTD_loadCEntropy(bs, workspace, dict, dictSize);
     FORWARD_IF_ERROR(eSize, "ZSTD_loadCEntropy failed");
     dictPtr += eSize;
 
-    {   size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
-        U32 offcodeMax = MaxOff;
-        if (dictContentSize <= ((U32)-1) - 128 KB) {
-            U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */
-            offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */
-        }
-        /* All offset values <= dictContentSize + 128 KB must be representable */
-        FORWARD_IF_ERROR(ZSTD_checkDictNCount(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff)), "");
-        /* All repCodes must be <= dictContentSize and != 0*/
-        {   U32 u;
-            for (u=0; u<3; u++) {
-                RETURN_ERROR_IF(bs->rep[u] == 0, dictionary_corrupted, "");
-                RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, "");
-        }   }
-
-        bs->entropy.fse.offcode_repeatMode = FSE_repeat_valid;
-        bs->entropy.fse.matchlength_repeatMode = FSE_repeat_valid;
-        bs->entropy.fse.litlength_repeatMode = FSE_repeat_valid;
+    {
+        size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
         FORWARD_IF_ERROR(ZSTD_loadDictionaryContent(
             ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), "");
-        return dictID;
     }
+    return dictID;
 }
 
 /** ZSTD_compress_insertDictionary() :
index db73f6ce21f2ca597b18c6ba825c6496c6e9f2bd..f2be0188c0264947cb66a47dcb7c9665325e44e1 100644 (file)
@@ -1045,7 +1045,6 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max)
  * assumptions : magic number supposed already checked
  *               and dictSize >= 8 */
 size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
-                         short* offcodeNCount, unsigned* offcodeMaxValue,
                          const void* const dict, size_t dictSize);
 
 void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs);
index 6d0b04231bb550c33d747c892390e45d1929d56b..6f13392a07c3f8f169b771420c7475160a88a851 100644 (file)
@@ -105,20 +105,17 @@ size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize)
     size_t headerSize;
     if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted);
 
-    {   unsigned offcodeMaxValue = MaxOff;
-        ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
+    {   ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
         U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE);
-        short* offcodeNCount = (short*)malloc((MaxOff+1)*sizeof(short));
-        if (!bs || !wksp || !offcodeNCount) {
+        if (!bs || !wksp) {
             headerSize = ERROR(memory_allocation);
         } else {
             ZSTD_reset_compressedBlockState(bs);
-            headerSize = ZSTD_loadCEntropy(bs, wksp, offcodeNCount, &offcodeMaxValue, dictBuffer, dictSize);
+            headerSize = ZSTD_loadCEntropy(bs, wksp, dictBuffer, dictSize);
         }
 
         free(bs);
         free(wksp);
-        free(offcodeNCount);
     }
 
     return headerSize;
diff --git a/tests/golden-compression/http b/tests/golden-compression/http
new file mode 100644 (file)
index 0000000..a3908cb
--- /dev/null
@@ -0,0 +1 @@
+<?xml version="1.0" encoding="UTF-8" ?><VAST version="2.0"><Ad id="43C53990160C658B"><Wrapper><AdSystem version="1.0">ads60.vertamedia.com</AdSystem><VASTAdTagURI>http://ads60.vertamedia.com/ops/43C53990160C658B/46991</VASTAdTagURI><Impression><![CDATA[http://ads60.vertamedia.com/i/?x=1&adId=43C53990160C658B&aid=46991&cmpId=13765&fi=24929&advId=13177&pubId=28786&sid=0&width=300&height=250&domain=m.gazetaexpress.com]]></Impression><Creatives><Creative><Linear><TrackingEvents><Tracking event="start">http://ads60.vertamedia.com/ve/43C53990160C658B/53</Tracking><Tracking event="midpoint">http://ads60.vertamedia.com/ve/43C53990160C658B/54</Tracking><Tracking event="firstQuartile">http://ads60.vertamedia.com/ve/43C53990160C658B/55</Tracking><Tracking event="thirdQuartile">http://ads60.vertamedia.com/ve/43C53990160C658B/56</Tracking><Tracking event="complete">http://ads60.vertamedia.com/ve/43C53990160C658B/57</Tracking><Tracking event="skip">http://ads60.vertamedia.com/ve/43C53990160C658B/66</Tracking></TrackingEvents><VideoClicks><ClickTracking>http://ads60.vertamedia.com/ve/43C53990160C658B/71</ClickTracking></VideoClicks></Linear></Creative></Creatives></Wrapper></Ad></VAST>
\ No newline at end of file
diff --git a/tests/golden-dictionaries/http-dict-missing-symbols b/tests/golden-dictionaries/http-dict-missing-symbols
new file mode 100644 (file)
index 0000000..0fa5ca6
Binary files /dev/null and b/tests/golden-dictionaries/http-dict-missing-symbols differ
index 5bca6d1333eece50b913f1673bdb81bceb3f4eab..7a2689cd3ab811f44f1fbfe5d5861d8e9ed1d664 100755 (executable)
@@ -893,8 +893,9 @@ datagen | zstd -c | zstd -t
 
 println "\n===>  golden files tests "
 
-zstd -t -r "$TESTDIR/golden-compression"
+zstd -t -r "$TESTDIR/golden-decompression"
 zstd -c -r "$TESTDIR/golden-compression" | zstd -t
+zstd -D "$TESTDIR/golden-dictionaries/http-dict-missing-symbols" "$TESTDIR/golden-compression/http" -c | zstd -D "$TESTDIR/golden-dictionaries/http-dict-missing-symbols" -t
 
 
 println "\n===>  benchmark mode tests "