[lib] Allow compression dictionaries with missing symbols

author Nick Terrell <terrelln@fb.com>

Sat, 13 Jun 2020 00:48:39 +0000 (17:48 -0700)

committer Nick Terrell <terrelln@fb.com>

Sat, 13 Jun 2020 00:57:19 +0000 (17:57 -0700)
author Nick Terrell <terrelln@fb.com>
Sat, 13 Jun 2020 00:48:39 +0000 (17:48 -0700)
committer Nick Terrell <terrelln@fb.com>
Sat, 13 Jun 2020 00:57:19 +0000 (17:57 -0700)
diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c

index b9475e920f86b3a3c3dc12541ea1da0792b36b8c..a6a464a8d7058accffc20b63f5935bcaea1f7f8d 100644 (file)
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@@ -2891,22 +2891,28 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
  
  
  /* Dictionaries that assign zero probability to symbols that show up causes problems
-   when FSE encoding.  Refuse dictionaries that assign zero probability to symbols
-   that we may encounter during compression.
-   NOTE: This behavior is not standard and could be improved in the future. */
-static size_t ZSTD_checkDictNCount(short* normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue) {
+ * when FSE encoding. Mark dictionaries with zero probability symbols as FSE_repeat_check
+ * and only dictionaries with 100% valid symbols can be assumed valid.
+ */
+static FSE_repeat ZSTD_dictNCountRepeat(short* normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue)
+{
      U32 s;
-    RETURN_ERROR_IF(dictMaxSymbolValue < maxSymbolValue, dictionary_corrupted, "dict fse tables don't have all symbols");
+    if (dictMaxSymbolValue < maxSymbolValue) {
+        return FSE_repeat_check;
+    }
      for (s = 0; s <= maxSymbolValue; ++s) {
-        RETURN_ERROR_IF(normalizedCounter[s] == 0, dictionary_corrupted, "dict fse tables don't have all symbols");
+        if (normalizedCounter[s] == 0) {
+            return FSE_repeat_check;
+        }
      }
-    return 0;
+    return FSE_repeat_valid;
  }
  
  size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
-                         short* offcodeNCount, unsigned* offcodeMaxValue,
                           const void* const dict, size_t dictSize)
  {
+    short offcodeNCount[MaxOff+1];
+    unsigned offcodeMaxValue = MaxOff;
      const BYTE* dictPtr = (const BYTE*)dict;    /* skip magic num and dict ID */
      const BYTE* const dictEnd = dictPtr + dictSize;
      dictPtr += 8;
@@ -2928,16 +2934,16 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
      }
  
      {   unsigned offcodeLog;
-        size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr);
+        size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr);
          RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, "");
          RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, "");
-        /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */
          /* fill all offset symbols to avoid garbage at end of table */
          RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp(
                  bs->entropy.fse.offcodeCTable,
                  offcodeNCount, MaxOff, offcodeLog,
                  workspace, HUF_WORKSPACE_SIZE)),
              dictionary_corrupted, "");
+        /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */
          dictPtr += offcodeHeaderSize;
      }
  
@@ -2946,13 +2952,12 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
          size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr);
          RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, "");
          RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, "");
-        /* Every match length code must have non-zero probability */
-        FORWARD_IF_ERROR( ZSTD_checkDictNCount(matchlengthNCount, matchlengthMaxValue, MaxML), "");
          RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp(
                  bs->entropy.fse.matchlengthCTable,
                  matchlengthNCount, matchlengthMaxValue, matchlengthLog,
                  workspace, HUF_WORKSPACE_SIZE)),
              dictionary_corrupted, "");
+        bs->entropy.fse.matchlength_repeatMode = ZSTD_dictNCountRepeat(matchlengthNCount, matchlengthMaxValue, MaxML);
          dictPtr += matchlengthHeaderSize;
      }
  
@@ -2961,13 +2966,12 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
          size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr);
          RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, "");
          RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, "");
-        /* Every literal length code must have non-zero probability */
-        FORWARD_IF_ERROR( ZSTD_checkDictNCount(litlengthNCount, litlengthMaxValue, MaxLL), "");
          RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp(
                  bs->entropy.fse.litlengthCTable,
                  litlengthNCount, litlengthMaxValue, litlengthLog,
                  workspace, HUF_WORKSPACE_SIZE)),
              dictionary_corrupted, "");
+        bs->entropy.fse.litlength_repeatMode = ZSTD_dictNCountRepeat(litlengthNCount, litlengthMaxValue, MaxLL);
          dictPtr += litlengthHeaderSize;
      }
  
@@ -2977,6 +2981,22 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
      bs->rep[2] = MEM_readLE32(dictPtr+8);
      dictPtr += 12;
  
+    {   size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
+        U32 offcodeMax = MaxOff;
+        if (dictContentSize <= ((U32)-1) - 128 KB) {
+            U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */
+            offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */
+        }
+        /* All offset values <= dictContentSize + 128 KB must be representable for a valid table */
+        bs->entropy.fse.offcode_repeatMode = ZSTD_dictNCountRepeat(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff));
+
+        /* All repCodes must be <= dictContentSize and != 0 */
+        {   U32 u;
+            for (u=0; u<3; u++) {
+                RETURN_ERROR_IF(bs->rep[u] == 0, dictionary_corrupted, "");
+                RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, "");
+    }   }   }
+
      return dictPtr - (const BYTE*)dict;
  }
  
@@ -2999,8 +3019,6 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
  {
      const BYTE* dictPtr = (const BYTE*)dict;
      const BYTE* const dictEnd = dictPtr + dictSize;
-    short offcodeNCount[MaxOff+1];
-    unsigned offcodeMaxValue = MaxOff;
      size_t dictID;
      size_t eSize;
  
@@ -3009,32 +3027,16 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
      assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY);
  
      dictID = params->fParams.noDictIDFlag ? 0 :  MEM_readLE32(dictPtr + 4 /* skip magic number */ );
-    eSize = ZSTD_loadCEntropy(bs, workspace, offcodeNCount, &offcodeMaxValue, dict, dictSize);
+    eSize = ZSTD_loadCEntropy(bs, workspace, dict, dictSize);
      FORWARD_IF_ERROR(eSize, "ZSTD_loadCEntropy failed");
      dictPtr += eSize;
  
-    {   size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
-        U32 offcodeMax = MaxOff;
-        if (dictContentSize <= ((U32)-1) - 128 KB) {
-            U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */
-            offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */
-        }
-        /* All offset values <= dictContentSize + 128 KB must be representable */
-        FORWARD_IF_ERROR(ZSTD_checkDictNCount(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff)), "");
-        /* All repCodes must be <= dictContentSize and != 0*/
-        {   U32 u;
-            for (u=0; u<3; u++) {
-                RETURN_ERROR_IF(bs->rep[u] == 0, dictionary_corrupted, "");
-                RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, "");
-        }   }
-
-        bs->entropy.fse.offcode_repeatMode = FSE_repeat_valid;
-        bs->entropy.fse.matchlength_repeatMode = FSE_repeat_valid;
-        bs->entropy.fse.litlength_repeatMode = FSE_repeat_valid;
+    {
+        size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
          FORWARD_IF_ERROR(ZSTD_loadDictionaryContent(
              ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), "");
-        return dictID;
      }
+    return dictID;
  }
  
  /** ZSTD_compress_insertDictionary() :
diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h

index db73f6ce21f2ca597b18c6ba825c6496c6e9f2bd..f2be0188c0264947cb66a47dcb7c9665325e44e1 100644 (file)
--- a/lib/compress/zstd_compress_internal.h
+++ b/lib/compress/zstd_compress_internal.h
@@ -1045,7 +1045,6 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max)
   * assumptions : magic number supposed already checked
   *               and dictSize >= 8 */
  size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
-                         short* offcodeNCount, unsigned* offcodeMaxValue,
                           const void* const dict, size_t dictSize);
  
  void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs);
diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c

index 6d0b04231bb550c33d747c892390e45d1929d56b..6f13392a07c3f8f169b771420c7475160a88a851 100644 (file)
--- a/lib/dictBuilder/zdict.c
+++ b/lib/dictBuilder/zdict.c
@@ -105,20 +105,17 @@ size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize)
      size_t headerSize;
      if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted);
  
-    {   unsigned offcodeMaxValue = MaxOff;
-        ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
+    {   ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
          U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE);
-        short* offcodeNCount = (short*)malloc((MaxOff+1)*sizeof(short));
-        if (!bs || !wksp || !offcodeNCount) {
+        if (!bs || !wksp) {
              headerSize = ERROR(memory_allocation);
          } else {
              ZSTD_reset_compressedBlockState(bs);
-            headerSize = ZSTD_loadCEntropy(bs, wksp, offcodeNCount, &offcodeMaxValue, dictBuffer, dictSize);
+            headerSize = ZSTD_loadCEntropy(bs, wksp, dictBuffer, dictSize);
          }
  
          free(bs);
          free(wksp);
-        free(offcodeNCount);
      }
  
      return headerSize;
diff --git a/tests/golden-compression/http b/tests/golden-compression/http

new file mode 100644 (file)

index 0000000..a3908cb
--- /dev/null
+++ b/tests/golden-compression/http
@@ -0,0 +1 @@
+<?xml version="1.0" encoding="UTF-8" ?><VAST version="2.0"><Ad id="43C53990160C658B"><Wrapper><AdSystem version="1.0">ads60.vertamedia.com</AdSystem><VASTAdTagURI>http://ads60.vertamedia.com/ops/43C53990160C658B/46991</VASTAdTagURI><Impression><![CDATA[http://ads60.vertamedia.com/i/?x=1&adId=43C53990160C658B&aid=46991&cmpId=13765&fi=24929&advId=13177&pubId=28786&sid=0&width=300&height=250&domain=m.gazetaexpress.com]]></Impression><Creatives><Creative><Linear><TrackingEvents><Tracking event="start">http://ads60.vertamedia.com/ve/43C53990160C658B/53</Tracking><Tracking event="midpoint">http://ads60.vertamedia.com/ve/43C53990160C658B/54</Tracking><Tracking event="firstQuartile">http://ads60.vertamedia.com/ve/43C53990160C658B/55</Tracking><Tracking event="thirdQuartile">http://ads60.vertamedia.com/ve/43C53990160C658B/56</Tracking><Tracking event="complete">http://ads60.vertamedia.com/ve/43C53990160C658B/57</Tracking><Tracking event="skip">http://ads60.vertamedia.com/ve/43C53990160C658B/66</Tracking></TrackingEvents><VideoClicks><ClickTracking>http://ads60.vertamedia.com/ve/43C53990160C658B/71</ClickTracking></VideoClicks></Linear></Creative></Creatives></Wrapper></Ad></VAST>
+\ No newline at end of file
diff --git a/tests/golden-dictionaries/http-dict-missing-symbols b/tests/golden-dictionaries/http-dict-missing-symbols

new file mode 100644 (file)

index 0000000..0fa5ca6

Binary files /dev/null and b/tests/golden-dictionaries/http-dict-missing-symbols differ
diff --git a/tests/playTests.sh b/tests/playTests.sh

index 5bca6d1333eece50b913f1673bdb81bceb3f4eab..7a2689cd3ab811f44f1fbfe5d5861d8e9ed1d664 100755 (executable)
--- a/tests/playTests.sh
+++ b/tests/playTests.sh
@@ -893,8 +893,9 @@ datagen | zstd -c | zstd -t
  
  println "\n===>  golden files tests "
  
-zstd -t -r "$TESTDIR/golden-compression"
+zstd -t -r "$TESTDIR/golden-decompression"
  zstd -c -r "$TESTDIR/golden-compression" | zstd -t
+zstd -D "$TESTDIR/golden-dictionaries/http-dict-missing-symbols" "$TESTDIR/golden-compression/http" -c | zstd -D "$TESTDIR/golden-dictionaries/http-dict-missing-symbols" -t
  
  
  println "\n===>  benchmark mode tests "
author	Nick Terrell <terrelln@fb.com>
	Sat, 13 Jun 2020 00:48:39 +0000 (17:48 -0700)
committer	Nick Terrell <terrelln@fb.com>
	Sat, 13 Jun 2020 00:57:19 +0000 (17:57 -0700)
lib/compress/zstd_compress.c		patch \| blob \| blame \| history
lib/compress/zstd_compress_internal.h		patch \| blob \| blame \| history
lib/dictBuilder/zdict.c		patch \| blob \| blame \| history
tests/golden-compression/http	[new file with mode: 0644]	patch \| blob
tests/golden-dictionaries/http-dict-missing-symbols	[new file with mode: 0644]	patch \| blob
tests/playTests.sh		patch \| blob \| blame \| history