fixed #304

author Yann Collet <cyan@fb.com>

Thu, 11 Jan 2018 19:16:32 +0000 (11:16 -0800)

committer Yann Collet <cyan@fb.com>

Thu, 11 Jan 2018 19:16:32 +0000 (11:16 -0800)
author Yann Collet <cyan@fb.com>
Thu, 11 Jan 2018 19:16:32 +0000 (11:16 -0800)
committer Yann Collet <cyan@fb.com>
Thu, 11 Jan 2018 19:16:32 +0000 (11:16 -0800)
diff --git a/lib/common/huf.h b/lib/common/huf.h

index 522bf9b6c0032aab6c5c8a6a25fb53ad866c1f84..1cead357a6d0339614e607ced050e94e859fea47 100644 (file)
--- a/lib/common/huf.h
+++ b/lib/common/huf.h
@@ -206,10 +206,10 @@ The following API allows targeting specific sub-functions for advanced tasks.
  For example, it's possible to compress several blocks using the same 'CTable',
  or to save and regenerate 'CTable' using external methods.
  */
-/* FSE_count() : find it within "fse.h" */
+/* FSE_count() : exposed within "fse.h" */
  unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
  typedef struct HUF_CElt_s HUF_CElt;   /* incomplete type */
-size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits);
+size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits);   /* @return : maxNbBits; CTable and count can overlap, in which case, CTable will overwrite count content */
  size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog);
  size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
  
diff --git a/lib/compress/huf_compress.c b/lib/compress/huf_compress.c

index 5692d56e003daf1a30f964f1b8b837308c249ae3..cfc5a98bb6b06cd113dcd6202c092d088fcc7035 100644 (file)
--- a/lib/compress/huf_compress.c
+++ b/lib/compress/huf_compress.c
@@ -405,6 +405,7 @@ size_t HUF_buildCTable_wksp (HUF_CElt* tree, const U32* count, U32 maxSymbolValu
  }
  
  /** HUF_buildCTable() :
+ * @return : maxNbBits
   *  Note : count is used before tree is written, so they can safely overlap
   */
  size_t HUF_buildCTable (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits)
diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c

index 3a1649419ef60948fe9834f3e2ec28f6c9ae10d1..2380599c9b0e2be640558a14cf9c85fd793b21f9 100644 (file)
--- a/lib/dictBuilder/zdict.c
+++ b/lib/dictBuilder/zdict.c
@@ -666,6 +666,18 @@ static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val,
      }
  }
  
+/* ZDICT_flatLit() :
+ * overwrite entries [0..255] of `countLit` (which must hold at least 256 entries) with a fixed, mostly flat but still compressible distribution of literals; the resulting counts sum to 512.
+ * necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
+ */
+static void ZDICT_flatLit(U32* countLit)
+{
+    int u;
+    for (u=1; u<256; u++) countLit[u] = 2;   /* baseline count for symbols 1..255; entries 253 and 254 are lowered just below */
+    countLit[0]   = 4;   /* symbol 0 gets the single highest count */
+    countLit[253] = 1;   /* symbols 253 and 254 get the lowest count, so the distribution is not perfectly flat */
+    countLit[254] = 1;
+}
  
  #define OFFCODE_MAX 30  /* only applicable to first block */
  static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,
@@ -730,14 +742,20 @@ static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,
          pos += fileSizes[u];
      }
  
-    /* analyze */
-    errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
-    if (HUF_isError(errorCode)) {
-        eSize = ERROR(GENERIC);
-        DISPLAYLEVEL(1, " HUF_buildCTable error \n");
-        goto _cleanup;
+    /* analyze, build stats, starting with literals */
+    {   size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
+        if (HUF_isError(maxNbBits)) {
+            eSize = ERROR(GENERIC);
+            DISPLAYLEVEL(1, " HUF_buildCTable error \n");
+            goto _cleanup;
+        }
+        if (maxNbBits==8) {  /* not compressible : will fail on HUF_writeCTable() */
+            ZDICT_flatLit(countLit);  /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
+            maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
+            assert(maxNbBits==9);
+        }
+        huffLog = (U32)maxNbBits;
      }
-    huffLog = (U32)errorCode;
  
      /* looking for most common first offsets */
      {   U32 offset;
diff --git a/tests/fuzzer.c b/tests/fuzzer.c

index 7198329a8629caf65009691ddcc20d748c8ea70d..acb670b10f7d2faf29058ca501e8f81b581e5ad3 100644 (file)
--- a/tests/fuzzer.c
+++ b/tests/fuzzer.c
@@ -659,12 +659,13 @@ static int basicUnitTests(U32 seed, double compressibility)
  
      /* Dictionary and dictBuilder tests */
      {   ZSTD_CCtx* const cctx = ZSTD_createCCtx();
-        size_t dictSize = 16 KB;
-        void* dictBuffer = malloc(dictSize);
+        size_t const dictBufferCapacity = 16 KB;
+        void* dictBuffer = malloc(dictBufferCapacity);
          size_t const totalSampleSize = 1 MB;
          size_t const sampleUnitSize = 8 KB;
          U32 const nbSamples = (U32)(totalSampleSize / sampleUnitSize);
          size_t* const samplesSizes = (size_t*) malloc(nbSamples * sizeof(size_t));
+        size_t dictSize;
          U32 dictID;
  
          if (dictBuffer==NULL || samplesSizes==NULL) {
@@ -675,16 +676,17 @@ static int basicUnitTests(U32 seed, double compressibility)
  
          DISPLAYLEVEL(4, "test%3i : dictBuilder on cyclic data : ", testNb++);
          assert(compressedBufferSize >= totalSampleSize);
-        { U32 u; for (u=0; u<totalSampleSize; u++) ((BYTE*)compressedBuffer)[u] = (BYTE)u; }
+        { U32 u; for (u=0; u<totalSampleSize; u++) ((BYTE*)decodedBuffer)[u] = (BYTE)u; }
          { U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; }
-        dictSize = ZDICT_trainFromBuffer(dictBuffer, dictSize,
-                                         compressedBuffer, samplesSizes, nbSamples);
-        if (ZDICT_isError(dictSize)) goto _output_error;
-        DISPLAYLEVEL(4, "OK, created dictionary of size %u \n", (U32)dictSize);
+        {   size_t const sDictSize = ZDICT_trainFromBuffer(dictBuffer, dictBufferCapacity,
+                                         decodedBuffer, samplesSizes, nbSamples);
+            if (ZDICT_isError(sDictSize)) goto _output_error;
+            DISPLAYLEVEL(4, "OK, created dictionary of size %u \n", (U32)sDictSize);
+        }
  
          DISPLAYLEVEL(4, "test%3i : dictBuilder : ", testNb++);
          { U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; }
-        dictSize = ZDICT_trainFromBuffer(dictBuffer, dictSize,
+        dictSize = ZDICT_trainFromBuffer(dictBuffer, dictBufferCapacity,
                                           CNBuffer, samplesSizes, nbSamples);
          if (ZDICT_isError(dictSize)) goto _output_error;
          DISPLAYLEVEL(4, "OK, created dictionary of size %u \n", (U32)dictSize);
author	Yann Collet <cyan@fb.com>
	Thu, 11 Jan 2018 19:16:32 +0000 (11:16 -0800)
committer	Yann Collet <cyan@fb.com>
	Thu, 11 Jan 2018 19:16:32 +0000 (11:16 -0800)
lib/common/huf.h		patch \| blob \| blame \| history
lib/compress/huf_compress.c		patch \| blob \| blame \| history
lib/dictBuilder/zdict.c		patch \| blob \| blame \| history
tests/fuzzer.c		patch \| blob \| blame \| history