Fix superblock mode (#2100)

author Nick Terrell <terrelln@fb.com>

Fri, 1 May 2020 23:11:47 +0000 (16:11 -0700)

committer GitHub <noreply@github.com>

Fri, 1 May 2020 23:11:47 +0000 (16:11 -0700)
author Nick Terrell <terrelln@fb.com>
Fri, 1 May 2020 23:11:47 +0000 (16:11 -0700)
committer GitHub <noreply@github.com>
Fri, 1 May 2020 23:11:47 +0000 (16:11 -0700)
diff --git a/lib/common/huf.h b/lib/common/huf.h

index 0d27ccdba94e590599d8dd01e8377c31a3b74b86..23e184d40313b6db13dc5b30c8123befbc81c30d 100644 (file)
--- a/lib/common/huf.h
+++ b/lib/common/huf.h
@@ -189,6 +189,7 @@ size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSym
  size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog);
  size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
  size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);
+int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);
  
  typedef enum {
     HUF_repeat_none,  /**< Cannot use the previous table */
diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h

index 2103ef8594e41bb435d299db04dfcfc81ad27a49..950b789cf4427c78ad3f97f60c687d66aeaf8078 100644 (file)
--- a/lib/common/zstd_internal.h
+++ b/lib/common/zstd_internal.h
@@ -291,6 +291,31 @@ typedef struct {
      U32   longLengthPos;
  } seqStore_t;
  
+typedef struct {
+    U32 litLength;    /* literal length of one sequence, as filled in by ZSTD_getSequenceLength() */
+    U32 matchLength;  /* match length of one sequence; ZSTD_getSequenceLength() stores it with MINMATCH added */
+} ZSTD_sequenceLength;
+
+/**
+ * Returns the literal and match lengths of `seq`. If `seq` is the sequence at index longLengthPos of `seqStore`,
+ * 0xFFFF is added to litLength (longLengthID == 1) or to matchLength (longLengthID == 2). MINMATCH is added back to matchLength.
+ */
+MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore, seqDef const* seq)
+{
+    ZSTD_sequenceLength seqLen;
+    seqLen.litLength = seq->litLength;
+    seqLen.matchLength = seq->matchLength + MINMATCH;
+    if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) {  /* compare against seq's index from sequencesStart */
+        if (seqStore->longLengthID == 1) {  /* the literal length is the long one */
+            seqLen.litLength += 0xFFFF;
+        }
+        if (seqStore->longLengthID == 2) {  /* the match length is the long one */
+            seqLen.matchLength += 0xFFFF;
+        }
+    }
+    return seqLen;
+}
+
  /**
   * Contains the compressed frame size and an upper-bound for the decompressed frame size.
   * Note: before using `compressedSize`, check for errors using ZSTD_isError().
diff --git a/lib/compress/huf_compress.c b/lib/compress/huf_compress.c

index 5cab31d042f8028b154fe420fbf0ba7842661181..f54123c563e6f7355c7c85b956f6ac2e24447035 100644 (file)
--- a/lib/compress/huf_compress.c
+++ b/lib/compress/huf_compress.c
@@ -417,7 +417,7 @@ size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count,
      return nbBits >> 3;
  }
  
-static int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) {
+int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) {
    int bad = 0;
    int s;
    for (s = 0; s <= (int)maxSymbolValue; ++s) {
diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c

index 552ca9fb44d7f72d14759a569614335a627435aa..d12a1e6f885389a216b2a5c134a49ff19edf5786 100644 (file)
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@@ -1928,21 +1928,6 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
          mlCodeTable[seqStorePtr->longLengthPos] = MaxML;
  }
  
-static int ZSTD_disableLiteralsCompression(const ZSTD_CCtx_params* cctxParams)
-{
-    switch (cctxParams->literalCompressionMode) {
-    case ZSTD_lcm_huffman:
-        return 0;
-    case ZSTD_lcm_uncompressed:
-        return 1;
-    default:
-        assert(0 /* impossible: pre-validated */);
-        /* fall-through */
-    case ZSTD_lcm_auto:
-        return (cctxParams->cParams.strategy == ZSTD_fast) && (cctxParams->cParams.targetLength > 0);
-    }
-}
-
  /* ZSTD_useTargetCBlockSize():
   * Returns if target compressed block size param is being used.
   * If used, compression will do best effort to make a compressed block size to be around targetCBlockSize.
@@ -2387,6 +2372,18 @@ static int ZSTD_isRLE(const BYTE *ip, size_t length) {
      return 1;
  }
  
+/* Returns true if the given block may be RLE.
+ * This is just a heuristic based on the compressibility: it requires nbSeqs < 4 and nbLits < 10.
+ * It may return both false positives and false negatives.
+ */
+static int ZSTD_maybeRLE(seqStore_t const* seqStore)
+{
+    size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart);  /* entries between sequencesStart and sequences */
+    size_t const nbLits = (size_t)(seqStore->lit - seqStore->litStart);  /* distance from litStart to lit */
+
+    return nbSeqs < 4 && nbLits < 10;
+}
+
  static void ZSTD_confirmRepcodesAndEntropyTables(ZSTD_CCtx* zc)
  {
      ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock;
@@ -2463,6 +2460,16 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc,
  {
      DEBUGLOG(6, "Attempting ZSTD_compressSuperBlock()");
      if (bss == ZSTDbss_compress) {
+        if (/* We don't want to emit our first block as a RLE even if it qualifies because
+            * doing so will cause the decoder (cli only) to throw a "should consume all input" error.
+            * This is only an issue for zstd <= v1.4.3
+            */
+            !zc->isFirstBlock &&
+            ZSTD_maybeRLE(&zc->seqStore) &&
+            ZSTD_isRLE((BYTE const*)src, srcSize))
+        {
+            return ZSTD_rleCompressBlock(dst, dstCapacity, *(BYTE const*)src, srcSize, lastBlock);
+        }
          /* Attempt superblock compression.
           *
           * Note that compressed size of ZSTD_compressSuperBlock() is not bound by the
@@ -2481,12 +2488,15 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc,
           *   * cSize >= blockBound(srcSize): We have expanded the block too much so
           *     emit an uncompressed block.
           */
-        size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, lastBlock);
-        if (cSize != ERROR(dstSize_tooSmall)) {
-            FORWARD_IF_ERROR(cSize);
-            if (cSize != 0 && cSize < srcSize + ZSTD_blockHeaderSize) {
-                ZSTD_confirmRepcodesAndEntropyTables(zc);
-                return cSize;
+        {
+            size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock);
+            if (cSize != ERROR(dstSize_tooSmall)) {
+                size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy);
+                FORWARD_IF_ERROR(cSize);
+                if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) {
+                    ZSTD_confirmRepcodesAndEntropyTables(zc);
+                    return cSize;
+                }
              }
          }
      }
diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h

index 893e8def2098fa5375e6b7c25e77d38329876b29..db7b89cebbd87c6ee175c6a2edcd7e33f8ddf27d 100644 (file)
--- a/lib/compress/zstd_compress_internal.h
+++ b/lib/compress/zstd_compress_internal.h
@@ -326,6 +326,31 @@ MEM_STATIC U32 ZSTD_MLcode(U32 mlBase)
      return (mlBase > 127) ? ZSTD_highbit32(mlBase) + ML_deltaCode : ML_Code[mlBase];
  }
  
+typedef struct repcodes_s {
+    U32 rep[3];  /* the three repcode offsets; ZSTD_updateRep() places the most recently used offset in rep[0] */
+} repcodes_t;
+
+MEM_STATIC repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U32 const ll0)  /* returns an updated copy of rep[]; the input array is not modified */
+{
+    repcodes_t newReps;
+    if (offset >= ZSTD_REP_NUM) {  /* full offset */
+        newReps.rep[2] = rep[1];  /* shift the history down by one slot */
+        newReps.rep[1] = rep[0];
+        newReps.rep[0] = offset - ZSTD_REP_MOVE;  /* remove the ZSTD_REP_MOVE bias to store the raw offset */
+    } else {   /* repcode */
+        U32 const repCode = offset + ll0;  /* ll0 is added to the repcode index */
+        if (repCode > 0) {  /* note : if repCode==0, no change */
+            U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];  /* index ZSTD_REP_NUM selects rep[0] - 1 */
+            newReps.rep[2] = (repCode >= 2) ? rep[1] : rep[2];  /* rep[2] only changes when repCode >= 2 */
+            newReps.rep[1] = rep[0];
+            newReps.rep[0] = currentOffset;  /* the offset just used moves to the front */
+        } else {   /* repCode == 0 */
+            memcpy(&newReps, rep, sizeof(newReps));  /* plain copy of the three input offsets */
+        }
+    }
+    return newReps;
+}
+
  /* ZSTD_cParam_withinBounds:
   * @return 1 if value is within cParam bounds,
   * 0 otherwise */
@@ -351,6 +376,16 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi
      return ZSTD_blockHeaderSize + srcSize;
  }
  
+MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock)  /* writes a bt_rle block header followed by the byte src; returns 4, or a dstSize_tooSmall error */
+{
+    BYTE* const op = (BYTE*)dst;
+    U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3);  /* lastBlock in the low bit, block type shifted by 1, srcSize shifted by 3 */
+    RETURN_ERROR_IF(dstCapacity < 4, dstSize_tooSmall, "");  /* 4 = header written by MEM_writeLE24 + 1 payload byte */
+    MEM_writeLE24(op, cBlockHeader);
+    op[3] = src;  /* the single byte stored after the header */
+    return 4;
+}
+
  
  /* ZSTD_minGain() :
   * minimum compression required
@@ -364,6 +399,21 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat)
      return (srcSize >> minlog) + 2;
  }
  
+MEM_STATIC int ZSTD_disableLiteralsCompression(const ZSTD_CCtx_params* cctxParams)  /* returns 1 if literals compression is disabled for these params, 0 otherwise */
+{
+    switch (cctxParams->literalCompressionMode) {
+    case ZSTD_lcm_huffman:  /* explicitly enabled */
+        return 0;
+    case ZSTD_lcm_uncompressed:  /* explicitly disabled */
+        return 1;
+    default:
+        assert(0 /* impossible: pre-validated */);
+        /* fall-through */
+    case ZSTD_lcm_auto:  /* auto: disabled only for strategy ZSTD_fast with targetLength > 0 */
+        return (cctxParams->cParams.strategy == ZSTD_fast) && (cctxParams->cParams.targetLength > 0);
+    }
+}
+
  /*! ZSTD_safecopyLiterals() :
   *  memcpy() function that won't read beyond more than WILDCOPY_OVERLENGTH bytes past ilimit_w.
   *  Only called when the sequence ends past ilimit_w, so it only needs to be optimized for single
diff --git a/lib/compress/zstd_compress_literals.c b/lib/compress/zstd_compress_literals.c

index 8d22bcadffb5324e868fc2a9a02c2502ee93a684..b76800046060bb4c3d59620b0a3021876abe747e 100644 (file)
--- a/lib/compress/zstd_compress_literals.c
+++ b/lib/compress/zstd_compress_literals.c
@@ -36,6 +36,7 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src,
      }
  
      memcpy(ostart + flSize, src, srcSize);
+    DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize));
      return srcSize + flSize;
  }
  
@@ -62,6 +63,7 @@ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void*
      }
  
      ostart[flSize] = *(const BYTE*)src;
+    DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1);
      return flSize+1;
  }
  
@@ -80,8 +82,8 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
      symbolEncodingType_e hType = set_compressed;
      size_t cLitSize;
  
-    DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i)",
-                disableLiteralCompression);
+    DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)",
+                disableLiteralCompression, (U32)srcSize);
  
      /* Prepare nextEntropy assuming reusing the existing table */
      memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
@@ -110,6 +112,7 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
                  (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2);
          if (repeat != HUF_repeat_none) {
              /* reused the existing table */
+            DEBUGLOG(5, "Reusing previous huffman table");
              hType = set_repeat;
          }
      }
@@ -150,5 +153,6 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
      default:  /* not possible : lhSize is {3,4,5} */
          assert(0);
      }
+    DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)srcSize, (U32)(lhSize+cLitSize));
      return lhSize+cLitSize;
  }
diff --git a/lib/compress/zstd_compress_superblock.c b/lib/compress/zstd_compress_superblock.c

index 8c98d18e15186aaa7e54ac3b9a3321099042e0ee..fd475dcc243f044eae5ab0735210ce7484c98655 100644 (file)
--- a/lib/compress/zstd_compress_superblock.c
+++ b/lib/compress/zstd_compress_superblock.c
@@ -16,6 +16,7 @@
  #include "zstd_compress_sequences.h"
  #include "zstd_compress_literals.h"
  #include "zstd_compress_superblock.h"
+#include "zstd_internal.h"  /* ZSTD_getSequenceLength */
  
  /*-*************************************
  *  Superblock entropy buffer structs
@@ -53,15 +54,14 @@ typedef struct {
  
  /** ZSTD_buildSuperBlockEntropy_literal() :
   *  Builds entropy for the super-block literals.
- *  Stores literals block type (raw, rle, compressed) and
+ *  Stores literals block type (raw, rle, compressed, repeat) and
   *  huffman description table to hufMetadata.
- *  Currently, this does not consider the option of reusing huffman table from
- *  previous super-block. I think it would be a good improvement to add that option.
   *  @return : size of huffman description table or error code */
  static size_t ZSTD_buildSuperBlockEntropy_literal(void* const src, size_t srcSize,
                                              const ZSTD_hufCTables_t* prevHuf,
                                                    ZSTD_hufCTables_t* nextHuf,
                                                    ZSTD_hufCTablesMetadata_t* hufMetadata,
+                                                  const int disableLiteralsCompression,
                                                    void* workspace, size_t wkspSize)
  {
      BYTE* const wkspStart = (BYTE*)workspace;
@@ -72,26 +72,49 @@ static size_t ZSTD_buildSuperBlockEntropy_literal(void* const src, size_t srcSiz
      BYTE* const nodeWksp = countWkspStart + countWkspSize;
      const size_t nodeWkspSize = wkspEnd-nodeWksp;
      unsigned maxSymbolValue = 255;
-    unsigned huffLog = 11;
+    unsigned huffLog = HUF_TABLELOG_DEFAULT;
+    HUF_repeat repeat = prevHuf->repeatMode;
  
      DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy_literal (srcSize=%zu)", srcSize);
  
      /* Prepare nextEntropy assuming reusing the existing table */
      memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
  
+    if (disableLiteralsCompression) {
+        DEBUGLOG(5, "set_basic - disabled");
+        hufMetadata->hType = set_basic;
+        return 0;
+    }
+
      /* small ? don't even attempt compression (speed opt) */
  #   define COMPRESS_LITERALS_SIZE_MIN 63
-    {   size_t const minLitSize = COMPRESS_LITERALS_SIZE_MIN;
-        if (srcSize <= minLitSize) { hufMetadata->hType = set_basic; return 0; }
+    {   size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN;
+        if (srcSize <= minLitSize) {
+            DEBUGLOG(5, "set_basic - too small");
+            hufMetadata->hType = set_basic;
+            return 0;
+        }
      }
  
      /* Scan input and build symbol stats */
      {   size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize);
          FORWARD_IF_ERROR(largest);
-        if (largest == srcSize) { hufMetadata->hType = set_rle; return 0; }
-        if (largest <= (srcSize >> 7)+4) { hufMetadata->hType = set_basic; return 0; }
+        if (largest == srcSize) {
+            DEBUGLOG(5, "set_rle");
+            hufMetadata->hType = set_rle;
+            return 0;
+        }
+        if (largest <= (srcSize >> 7)+4) {
+            DEBUGLOG(5, "set_basic - no gain");
+            hufMetadata->hType = set_basic;
+            return 0;
+        }
      }
  
+    /* Validate the previous Huffman table */
+    if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) {
+        repeat = HUF_repeat_none;
+    }
  
      /* Build Huffman Tree */
      memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable));
@@ -101,13 +124,32 @@ static size_t ZSTD_buildSuperBlockEntropy_literal(void* const src, size_t srcSiz
                                                      nodeWksp, nodeWkspSize);
          FORWARD_IF_ERROR(maxBits);
          huffLog = (U32)maxBits;
-        {   size_t cSize = HUF_estimateCompressedSize(
-                              (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue);
-            size_t hSize = HUF_writeCTable(
-                              hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer),
-                              (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog);
-            if (cSize + hSize >= srcSize) { hufMetadata->hType = set_basic; return 0; }
+        {   /* Build and write the CTable */
+            size_t const newCSize = HUF_estimateCompressedSize(
+                    (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue);
+            size_t const hSize = HUF_writeCTable(
+                    hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer),
+                    (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog);
+            /* Check against repeating the previous CTable */
+            if (repeat != HUF_repeat_none) {
+                size_t const oldCSize = HUF_estimateCompressedSize(
+                        (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue);
+                if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) {
+                    DEBUGLOG(5, "set_repeat - smaller");
+                    memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+                    hufMetadata->hType = set_repeat;
+                    return 0;
+                }
+            }
+            if (newCSize + hSize >= srcSize) {
+                DEBUGLOG(5, "set_basic - no gains");
+                memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+                hufMetadata->hType = set_basic;
+                return 0;
+            }
+            DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize);
              hufMetadata->hType = set_compressed;
+            nextHuf->repeatMode = HUF_repeat_check;
              return hSize;
          }
      }
@@ -241,6 +283,7 @@ ZSTD_buildSuperBlockEntropy(seqStore_t* seqStorePtr,
          ZSTD_buildSuperBlockEntropy_literal(seqStorePtr->litStart, litSize,
                                              &prevEntropy->huf, &nextEntropy->huf,
                                              &entropyMetadata->hufMetadata,
+                                            ZSTD_disableLiteralsCompression(cctxParams),
                                              workspace, wkspSize);
      FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize);
      entropyMetadata->fseMetadata.fseTablesSize =
@@ -255,21 +298,19 @@ ZSTD_buildSuperBlockEntropy(seqStore_t* seqStorePtr,
  
  /** ZSTD_compressSubBlock_literal() :
   *  Compresses literals section for a sub-block.
- *  Compressed literal size needs to be less than uncompressed literal size.
- *      ZSTD spec doesn't have this constaint. I will explain why I have this constraint here.
- *      Literals section header size ranges from 1 to 5 bytes,
- *      which is dictated by regenerated size and compressed size.
- *      In order to figure out the memory address to start writing compressed literal,
- *      it is necessary to figure out the literals section header size.
- *      The challenge is that compressed size is only known after compression.
- *      This is a chicken and egg problem.
- *      I am simplifying the problem by assuming that
- *      compressed size will always be less than or equal to regenerated size,
- *      and using regenerated size to calculate literals section header size.
+ *  When we have to write the Huffman table we will sometimes choose a header
+ *  size larger than necessary. This is because we have to pick the header size
+ *  before we know the table size + compressed size, so we have a bound on the
+ *  table size. If we guessed incorrectly, we fall back to uncompressed literals.
+ *
+ *  We write the header when writeEntropy=1 and set entropyWritten=1 when we succeeded
+ *  in writing the header, otherwise it is set to 0.
+ *
   *  hufMetadata->hType has literals block type info.
   *      If it is set_basic, all sub-blocks literals section will be Raw_Literals_Block.
   *      If it is set_rle, all sub-blocks literals section will be RLE_Literals_Block.
   *      If it is set_compressed, first sub-block's literals section will be Compressed_Literals_Block
+ *      If it is set_repeat, first sub-block's literals section will be Treeless_Literals_Block
   *      and the following sub-blocks' literals sections will be Treeless_Literals_Block.
   *  @return : compressed size of literals section of a sub-block
   *            Or 0 if it unable to compress.
@@ -278,28 +319,22 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
                                      const ZSTD_hufCTablesMetadata_t* hufMetadata,
                                      const BYTE* literals, size_t litSize,
                                      void* dst, size_t dstSize,
-                                    const int bmi2, int writeEntropy)
+                                    const int bmi2, int writeEntropy, int* entropyWritten)
  {
-    size_t const lhSize = 3 + (litSize >= 1 KB) + (litSize >= 16 KB);
+    size_t const header = writeEntropy ? 200 : 0;
+    size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header));
      BYTE* const ostart = (BYTE*)dst;
      BYTE* const oend = ostart + dstSize;
      BYTE* op = ostart + lhSize;
-    U32 singleStream = litSize < 256;
-    symbolEncodingType_e hType = writeEntropy ? set_compressed : set_repeat;
+    U32 const singleStream = lhSize == 3;
+    symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat;
      size_t cLitSize = 0;
  
      (void)bmi2; // TODO bmi2...
  
      DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy);
  
-    if (writeEntropy && litSize == 0) {
-      /* Literals section cannot be compressed mode when litSize == 0.
-       * (This seems to be decoder constraint.)
-       * Entropy cannot be written if literals section is not compressed mode.
-       */
-      return 0;
-    }
-
+    *entropyWritten = 0;
      if (litSize == 0 || hufMetadata->hType == set_basic) {
        DEBUGLOG(5, "ZSTD_compressSubBlock_literal using raw literal");
        return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize);
@@ -308,8 +343,10 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
        return ZSTD_compressRleLiteralsBlock(dst, dstSize, literals, litSize);
      }
  
-    if (lhSize == 3) singleStream = 1;
-    if (writeEntropy) {
+    assert(litSize > 0);
+    assert(hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat);
+
+    if (writeEntropy && hufMetadata->hType == set_compressed) {
          memcpy(op, hufMetadata->hufDesBuffer, hufMetadata->hufDesSize);
          op += hufMetadata->hufDesSize;
          cLitSize += hufMetadata->hufDesSize;
@@ -322,11 +359,19 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
          op += cSize;
          cLitSize += cSize;
          if (cSize == 0 || ERR_isError(cSize)) {
-          return 0;
+            DEBUGLOG(5, "Failed to write entropy tables %s", ZSTD_getErrorName(cSize));
+            return 0;
+        }
+        /* If we expand and we aren't writing a header then emit uncompressed */
+        if (!writeEntropy && cLitSize >= litSize) {
+            DEBUGLOG(5, "ZSTD_compressSubBlock_literal using raw literal because uncompressible");
+            return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize);
          }
-        if (cLitSize > litSize) {
-            if (writeEntropy) return 0;
-            else return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize);
+        /* If we are writing headers then allow expansion that doesn't change our header size. */
+        if (lhSize < (size_t)(3 + (cLitSize >= 1 KB) + (cLitSize >= 16 KB))) {
+            assert(cLitSize > litSize);
+            DEBUGLOG(5, "Literals expanded beyond allowed header size");
+            return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize);
          }
          DEBUGLOG(5, "ZSTD_compressSubBlock_literal (cSize=%zu)", cSize);
      }
@@ -353,17 +398,26 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
      default:  /* not possible : lhSize is {3,4,5} */
          assert(0);
      }
+    *entropyWritten = 1;
+    DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart));
      return op-ostart;
  }
  
-static size_t ZSTD_seqDecompressedSize(const seqDef* sequences, size_t nbSeq, size_t litSize) {
+static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) {
      const seqDef* const sstart = sequences;
      const seqDef* const send = sequences + nbSeq;
      const seqDef* sp = sstart;
      size_t matchLengthSum = 0;
+    size_t litLengthSum = 0;
      while (send-sp > 0) {
-      matchLengthSum += sp->matchLength + MINMATCH;
-      sp++;
+        ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp);
+        litLengthSum += seqLen.litLength;
+        matchLengthSum += seqLen.matchLength;
+        sp++;
+    }
+    assert(litLengthSum <= litSize);
+    if (!lastSequence) {
+        assert(litLengthSum == litSize);
      }
      return matchLengthSum + litSize;
  }
@@ -372,8 +426,9 @@ static size_t ZSTD_seqDecompressedSize(const seqDef* sequences, size_t nbSeq, si
   *  Compresses sequences section for a sub-block.
   *  fseMetadata->llType, fseMetadata->ofType, and fseMetadata->mlType have
   *  symbol compression modes for the super-block.
- *  First sub-block will have these in its header. The following sub-blocks
- *  will always have repeat mode.
+ *  The first successfully compressed block will have these in its header.
+ *  We set entropyWritten=1 when we succeed in compressing the sequences.
+ *  The following sub-blocks will always have repeat mode.
   *  @return : compressed size of sequences section of a sub-block
   *            Or 0 if it is unable to compress
   *            Or error code. */
@@ -383,7 +438,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables
                                                const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
                                                const ZSTD_CCtx_params* cctxParams,
                                                void* dst, size_t dstCapacity,
-                                              const int bmi2, int writeEntropy)
+                                              const int bmi2, int writeEntropy, int* entropyWritten)
  {
      const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
      BYTE* const ostart = (BYTE*)dst;
@@ -393,6 +448,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables
  
      DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (nbSeq=%zu, writeEntropy=%d, longOffsets=%d)", nbSeq, writeEntropy, longOffsets);
  
+    *entropyWritten = 0;
      /* Sequences Header */
      RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/,
                      dstSize_tooSmall);
@@ -402,9 +458,6 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables
          op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2;
      else
          op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3;
-    if (writeEntropy && nbSeq == 0) {
-        return 0;
-    }
      if (nbSeq==0) {
          return op - ostart;
      }
@@ -444,6 +497,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables
           * In this exceedingly rare case, we will simply emit an uncompressed
           * block, since it isn't worth optimizing.
           */
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
          if (writeEntropy && fseMetadata->lastCountSize && fseMetadata->lastCountSize + bitstreamSize < 4) {
              /* NCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */
              assert(fseMetadata->lastCountSize + bitstreamSize == 3);
@@ -451,6 +505,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables
                          "emitting an uncompressed block.");
              return 0;
          }
+#endif
          DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (bitstreamSize=%zu)", bitstreamSize);
      }
  
@@ -461,10 +516,15 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables
       * with rle mode and the current block's sequences section is compressed
       * with repeat mode where sequences section body size can be 1 byte.
       */
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
      if (op-seqHead < 4) {
+        DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.4.0 by emitting "
+                    "an uncompressed block when sequences are < 4 bytes");
          return 0;
      }
+#endif
  
+    *entropyWritten = 1;
      return op - ostart;
  }
  
@@ -479,16 +539,19 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy,
                                      const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
                                      const ZSTD_CCtx_params* cctxParams,
                                      void* dst, size_t dstCapacity,
-                                    const int bmi2, int writeEntropy, U32 lastBlock)
+                                    const int bmi2,
+                                    int writeLitEntropy, int writeSeqEntropy,
+                                    int* litEntropyWritten, int* seqEntropyWritten,
+                                    U32 lastBlock)
  {
      BYTE* const ostart = (BYTE*)dst;
      BYTE* const oend = ostart + dstCapacity;
      BYTE* op = ostart + ZSTD_blockHeaderSize;
-    DEBUGLOG(5, "ZSTD_compressSubBlock (litSize=%zu, nbSeq=%zu, writeEntropy=%d, lastBlock=%d)",
-                litSize, nbSeq, writeEntropy, lastBlock);
+    DEBUGLOG(5, "ZSTD_compressSubBlock (litSize=%zu, nbSeq=%zu, writeLitEntropy=%d, writeSeqEntropy=%d, lastBlock=%d)",
+                litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock);
      {   size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable,
                                                          &entropyMetadata->hufMetadata, literals, litSize,
-                                                        op, oend-op, bmi2, writeEntropy);
+                                                        op, oend-op, bmi2, writeLitEntropy, litEntropyWritten);
          FORWARD_IF_ERROR(cLitSize);
          if (cLitSize == 0) return 0;
          op += cLitSize;
@@ -499,7 +562,7 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy,
                                                    llCode, mlCode, ofCode,
                                                    cctxParams,
                                                    op, oend-op,
-                                                  bmi2, writeEntropy);
+                                                  bmi2, writeSeqEntropy, seqEntropyWritten);
          FORWARD_IF_ERROR(cSeqSize);
          if (cSeqSize == 0) return 0;
          op += cSeqSize;
@@ -524,7 +587,7 @@ static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t lit
  
      if (hufMetadata->hType == set_basic) return litSize;
      else if (hufMetadata->hType == set_rle) return 1;
-    else if (hufMetadata->hType == set_compressed) {
+    else if (hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat) {
          size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)literals, litSize, workspace, wkspSize);
          if (ZSTD_isError(largest)) return litSize;
          {   size_t cLitSizeEstimate = HUF_estimateCompressedSize((const HUF_CElt*)huf->CTable, countWksp, maxSymbolValue);
@@ -601,17 +664,28 @@ static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize,
                                          const ZSTD_entropyCTables_t* entropy,
                                          const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
                                          void* workspace, size_t wkspSize,
-                                        int writeEntropy) {
+                                        int writeLitEntropy, int writeSeqEntropy) {
      size_t cSizeEstimate = 0;
      cSizeEstimate += ZSTD_estimateSubBlockSize_literal(literals, litSize,
                                                           &entropy->huf, &entropyMetadata->hufMetadata,
-                                                         workspace, wkspSize, writeEntropy);
+                                                         workspace, wkspSize, writeLitEntropy);
      cSizeEstimate += ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable,
                                                           nbSeq, &entropy->fse, &entropyMetadata->fseMetadata,
-                                                         workspace, wkspSize, writeEntropy);
+                                                         workspace, wkspSize, writeSeqEntropy);
      return cSizeEstimate + ZSTD_blockHeaderSize;
  }
  
+static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata)
+{   /* @return 1 if any of llType / mlType / ofType is set_compressed or set_rle, 0 otherwise. */
+    if (fseMetadata->llType == set_compressed || fseMetadata->llType == set_rle)
+        return 1;
+    if (fseMetadata->mlType == set_compressed || fseMetadata->mlType == set_rle)
+        return 1;
+    if (fseMetadata->ofType == set_compressed || fseMetadata->ofType == set_rle)
+        return 1;
+    return 0;   /* none of the three types is set_compressed or set_rle */
+}
+
  /** ZSTD_compressSubBlock_multi() :
   *  Breaks super-block into multiple sub-blocks and compresses them.
   *  Entropy will be written to the first block.
@@ -620,10 +694,12 @@ static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize,
   *  @return : compressed size of the super block (which is multiple ZSTD blocks)
   *            Or 0 if it failed to compress. */
  static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
-                            const ZSTD_entropyCTables_t* entropy,
+                            const ZSTD_compressedBlockState_t* prevCBlock,
+                            ZSTD_compressedBlockState_t* nextCBlock,
                              const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
                              const ZSTD_CCtx_params* cctxParams,
                                    void* dst, size_t dstCapacity,
+                            const void* src, size_t srcSize,
                              const int bmi2, U32 lastBlock,
                              void* workspace, size_t wkspSize)
  {
@@ -633,6 +709,8 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
      const BYTE* const lstart = seqStorePtr->litStart;
      const BYTE* const lend = seqStorePtr->lit;
      const BYTE* lp = lstart;
+    BYTE const* ip = (BYTE const*)src;
+    BYTE const* const iend = ip + srcSize;
      BYTE* const ostart = (BYTE*)dst;
      BYTE* const oend = ostart + dstCapacity;
      BYTE* op = ostart;
@@ -641,41 +719,57 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
      const BYTE* ofCodePtr = seqStorePtr->ofCode;
      size_t targetCBlockSize = cctxParams->targetCBlockSize;
      size_t litSize, seqCount;
-    int writeEntropy = 1;
-    size_t remaining = ZSTD_seqDecompressedSize(sstart, send-sstart, lend-lstart);
-    size_t cBlockSizeEstimate = 0;
+    int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed;
+    int writeSeqEntropy = 1;
+    int lastSequence = 0;
  
      DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)",
                  (unsigned)(lend-lp), (unsigned)(send-sstart));
  
      litSize = 0;
      seqCount = 0;
-    while (sp + seqCount < send) {
-        const seqDef* const sequence = sp + seqCount;
-        const U32 lastSequence = sequence+1 == send;
-        litSize = (sequence == send) ? (size_t)(lend-lp) : litSize + sequence->litLength;
-        seqCount++;
+    do {
+        size_t cBlockSizeEstimate = 0;
+        if (sstart == send) {
+            lastSequence = 1;
+        } else {
+            const seqDef* const sequence = sp + seqCount;
+            lastSequence = sequence == send - 1;
+            litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength;
+            seqCount++;
+        }
+        if (lastSequence) {
+            assert(lp <= lend);
+            assert(litSize <= (size_t)(lend - lp));
+            litSize = (size_t)(lend - lp);
+        }
          /* I think there is an optimization opportunity here.
           * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful
           * since it recalculates estimate from scratch.
           * For example, it would recount literal distribution and symbol codes everytime.
           */
          cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount,
-                                                       entropy, entropyMetadata,
-                                                       workspace, wkspSize, writeEntropy);
+                                                       &nextCBlock->entropy, entropyMetadata,
+                                                       workspace, wkspSize, writeLitEntropy, writeSeqEntropy);
          if (cBlockSizeEstimate > targetCBlockSize || lastSequence) {
-            const size_t decompressedSize = ZSTD_seqDecompressedSize(sp, seqCount, litSize);
-            const size_t cSize = ZSTD_compressSubBlock(entropy, entropyMetadata,
+            int litEntropyWritten = 0;
+            int seqEntropyWritten = 0;
+            const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence);
+            const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata,
                                                         sp, seqCount,
                                                         lp, litSize,
                                                         llCodePtr, mlCodePtr, ofCodePtr,
                                                         cctxParams,
                                                         op, oend-op,
-                                                       bmi2, writeEntropy, lastBlock && lastSequence);
+                                                       bmi2, writeLitEntropy, writeSeqEntropy,
+                                                       &litEntropyWritten, &seqEntropyWritten,
+                                                       lastBlock && lastSequence);
              FORWARD_IF_ERROR(cSize);
+            DEBUGLOG(5, "cSize = %zu | decompressedSize = %zu", cSize, decompressedSize);
              if (cSize > 0 && cSize < decompressedSize) {
-                assert(remaining >= decompressedSize);
-                remaining -= decompressedSize;
+                DEBUGLOG(5, "Committed the sub-block");
+                assert(ip + decompressedSize <= iend);
+                ip += decompressedSize;
                  sp += seqCount;
                  lp += litSize;
                  op += cSize;
@@ -684,20 +778,51 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
                  ofCodePtr += seqCount;
                  litSize = 0;
                  seqCount = 0;
-                writeEntropy = 0; // Entropy only needs to be written once
+                /* Entropy only needs to be written once */
+                if (litEntropyWritten) {
+                    writeLitEntropy = 0;
+                }
+                if (seqEntropyWritten) {
+                    writeSeqEntropy = 0;
+                }
              }
          }
+    } while (!lastSequence);
+    if (writeLitEntropy) {
+        DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten");
+        memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf));
      }
-    if (remaining) {
-        DEBUGLOG(5, "ZSTD_compressSubBlock_multi failed to compress");
+    if (writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) {
+        /* If we haven't written our entropy tables, then we've violated our contract and
+         * must emit an uncompressed block.
+         */
+        DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten");
          return 0;
      }
+    if (ip < iend) {
+        size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock);
+        DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip));
+        FORWARD_IF_ERROR(cSize);
+        assert(cSize != 0);
+        op += cSize;
+        /* We have to regenerate the repcodes because we've skipped some sequences */
+        if (sp < send) {
+            seqDef const* seq;
+            repcodes_t rep;
+            memcpy(&rep, prevCBlock->rep, sizeof(rep)); 
+            for (seq = sstart; seq < sp; ++seq) {
+                rep = ZSTD_updateRep(rep.rep, seq->offset - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0);
+            }
+            memcpy(nextCBlock->rep, &rep, sizeof(rep));
+        }
+    }
      DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed");
      return op-ostart;
  }
  
  size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc,
                                 void* dst, size_t dstCapacity,
+                               void const* src, size_t srcSize,
                                 unsigned lastBlock) {
      ZSTD_entropyCTablesMetadata_t entropyMetadata;
  
@@ -709,10 +834,12 @@ size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc,
            zc->entropyWorkspace, HUF_WORKSPACE_SIZE /* statically allocated in resetCCtx */));
  
      return ZSTD_compressSubBlock_multi(&zc->seqStore,
-            &zc->blockState.nextCBlock->entropy,
+            zc->blockState.prevCBlock,
+            zc->blockState.nextCBlock,
              &entropyMetadata,
              &zc->appliedParams,
              dst, dstCapacity,
+            src, srcSize,
              zc->bmi2, lastBlock,
              zc->entropyWorkspace, HUF_WORKSPACE_SIZE /* statically allocated in resetCCtx */);
  }
diff --git a/lib/compress/zstd_compress_superblock.h b/lib/compress/zstd_compress_superblock.h

index 3bd6fdcf33e4c1409e52c316d0eebf61ee09a05a..35d207299d83f53b05200e9ff85f821dcb373b5f 100644 (file)
--- a/lib/compress/zstd_compress_superblock.h
+++ b/lib/compress/zstd_compress_superblock.h
@@ -26,6 +26,7 @@
   * The given block will be compressed into multiple sub blocks that are around targetCBlockSize. */
  size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc,
                                 void* dst, size_t dstCapacity,
+                               void const* src, size_t srcSize,
                                 unsigned lastBlock);
  
  #endif /* ZSTD_COMPRESS_ADVANCED_H */
diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c

index a835e9ec28562a5d4d41092914ee896ab8cb8c4c..8d63019654e0a4c86105da625375e5c0090a2f7a 100644 (file)
--- a/lib/compress/zstd_opt.c
+++ b/lib/compress/zstd_opt.c
@@ -765,30 +765,6 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches (
  /*-*******************************
  *  Optimal parser
  *********************************/
-typedef struct repcodes_s {
-    U32 rep[3];
-} repcodes_t;
-
-static repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U32 const ll0)
-{
-    repcodes_t newReps;
-    if (offset >= ZSTD_REP_NUM) {  /* full offset */
-        newReps.rep[2] = rep[1];
-        newReps.rep[1] = rep[0];
-        newReps.rep[0] = offset - ZSTD_REP_MOVE;
-    } else {   /* repcode */
-        U32 const repCode = offset + ll0;
-        if (repCode > 0) {  /* note : if repCode==0, no change */
-            U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
-            newReps.rep[2] = (repCode >= 2) ? rep[1] : rep[2];
-            newReps.rep[1] = rep[0];
-            newReps.rep[0] = currentOffset;
-        } else {   /* repCode == 0 */
-            memcpy(&newReps, rep, sizeof(newReps));
-        }
-    }
-    return newReps;
-}
  
  
  static U32 ZSTD_totalLen(ZSTD_optimal_t sol)
diff --git a/tests/fuzz/simple_round_trip.c b/tests/fuzz/simple_round_trip.c

index e37fa6f6f61ddddf9d2f7a6671fad77655e9dafa..41ea96739fed01ee79c52ef53a39f2f9baf93ad4 100644 (file)
--- a/tests/fuzz/simple_round_trip.c
+++ b/tests/fuzz/simple_round_trip.c
@@ -32,9 +32,12 @@ static size_t roundTripTest(void *result, size_t resultCapacity,
                              FUZZ_dataProducer_t *producer)
  {
      size_t cSize;
+    size_t dSize;
+    int targetCBlockSize = 0;
      if (FUZZ_dataProducer_uint32Range(producer, 0, 1)) {
          FUZZ_setRandomParameters(cctx, srcSize, producer);
          cSize = ZSTD_compress2(cctx, compressed, compressedCapacity, src, srcSize);
+        FUZZ_ZASSERT(ZSTD_CCtx_getParameter(cctx, ZSTD_c_targetCBlockSize, &targetCBlockSize));
      } else {
        int const cLevel = FUZZ_dataProducer_int32Range(producer, kMinClevel, kMaxClevel);
  
@@ -42,14 +45,33 @@ static size_t roundTripTest(void *result, size_t resultCapacity,
              cctx, compressed, compressedCapacity, src, srcSize, cLevel);
      }
      FUZZ_ZASSERT(cSize);
-    return ZSTD_decompressDCtx(dctx, result, resultCapacity, compressed, cSize);
+    dSize = ZSTD_decompressDCtx(dctx, result, resultCapacity, compressed, cSize);
+    FUZZ_ZASSERT(dSize);
+    /* When superblock is enabled make sure we don't expand the block more than expected. */
+    if (targetCBlockSize != 0) {
+        size_t normalCSize;
+        FUZZ_ZASSERT(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetCBlockSize, 0));
+        normalCSize = ZSTD_compress2(cctx, compressed, compressedCapacity, src, srcSize);
+        FUZZ_ZASSERT(normalCSize);
+        {
+            size_t const bytesPerBlock = 3 /* block header */
+                + 5 /* Literal header */
+                + 6 /* Huffman jump table */
+                + 3 /* number of sequences */
+                + 1 /* symbol compression modes */;
+            size_t const expectedExpansion = bytesPerBlock * (1 + (normalCSize / MAX(1, targetCBlockSize)));
+            size_t const allowedExpansion = (srcSize >> 4) + 3 * expectedExpansion + 10;
+            FUZZ_ASSERT(cSize <= normalCSize + allowedExpansion);
+        }
+    }
+    return dSize;
  }
  
  int LLVMFuzzerTestOneInput(const uint8_t *src, size_t size)
  {
      size_t const rBufSize = size;
      void* rBuf = malloc(rBufSize);
-    size_t cBufSize = ZSTD_compressBound(size) * 2;
+    size_t cBufSize = ZSTD_compressBound(size);
      void* cBuf;
  
      /* Give a random portion of src data to the producer, to use for
diff --git a/tests/fuzzer.c b/tests/fuzzer.c

index 416df24d2a1a76c52ec893036c09c0e75011ea7e..700cb5771600a931ce59d2c4314dab09beb45547 100644 (file)
--- a/tests/fuzzer.c
+++ b/tests/fuzzer.c
@@ -708,8 +708,8 @@ static int basicUnitTests(U32 const seed, double compressibility)
            for (read = 0; read < streamCompressThreshold; read += streamCompressDelta) {
              ZSTD_inBuffer in = {src, streamCompressDelta, 0};
              ZSTD_outBuffer out = {dst, dstCapacity, 0};
-            assert(!ZSTD_isError(ZSTD_compressStream2(cctx, &out, &in, ZSTD_e_continue)));
-            assert(!ZSTD_isError(ZSTD_compressStream2(cctx, &out, &in, ZSTD_e_end)));
+            CHECK_Z(ZSTD_compressStream2(cctx, &out, &in, ZSTD_e_continue));
+            CHECK_Z(ZSTD_compressStream2(cctx, &out, &in, ZSTD_e_end));
              src += streamCompressDelta; srcSize -= streamCompressDelta;
              dst += out.pos; dstCapacity -= out.pos;}}
  
@@ -717,7 +717,35 @@ static int basicUnitTests(U32 const seed, double compressibility)
  
          { ZSTD_inBuffer in = {src, srcSize, 0};
            ZSTD_outBuffer out = {dst, dstCapacity, 0};
-          assert(!ZSTD_isError(ZSTD_compressStream2(cctx, &out, &in, ZSTD_e_end)));}
+          CHECK_Z(ZSTD_compressStream2(cctx, &out, &in, ZSTD_e_end));}
+        ZSTD_freeCCtx(cctx);
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
+    DISPLAYLEVEL(3, "test%3d: superblock with no literals : ", testNb++);
+    /* Generate the same data 20 times over */
+    {
+        size_t const avgChunkSize = CNBuffSize / 20;
+        size_t b;
+        for (b = 0; b < CNBuffSize; b += avgChunkSize) {
+            size_t const chunkSize = MIN(CNBuffSize - b, avgChunkSize);
+            RDG_genBuffer((char*)CNBuffer + b, chunkSize, compressibility, 0. /* auto */, seed);
+        }
+    }
+    {
+        ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+        size_t const normalCSize = ZSTD_compress2(cctx, compressedBuffer, compressedBufferSize, CNBuffer, CNBuffSize);
+        size_t const allowedExpansion = (CNBuffSize * 3 / 1000);
+        size_t superCSize;
+        CHECK_Z(normalCSize);
+        ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 19);
+        ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetCBlockSize, 1000);
+        superCSize = ZSTD_compress2(cctx, compressedBuffer, compressedBufferSize, CNBuffer, CNBuffSize);
+        CHECK_Z(superCSize);
+        if (superCSize > normalCSize + allowedExpansion) {
+            DISPLAYLEVEL(1, "Superblock too big: %u > %u + %u \n", (U32)superCSize, (U32)normalCSize, (U32)allowedExpansion);
+            goto _output_error;
+        }
          ZSTD_freeCCtx(cctx);
      }
      DISPLAYLEVEL(3, "OK \n");
author	Nick Terrell <terrelln@fb.com>
	Fri, 1 May 2020 23:11:47 +0000 (16:11 -0700)
committer	GitHub <noreply@github.com>
	Fri, 1 May 2020 23:11:47 +0000 (16:11 -0700)
lib/common/huf.h		patch \| blob \| blame \| history
lib/common/zstd_internal.h		patch \| blob \| blame \| history
lib/compress/huf_compress.c		patch \| blob \| blame \| history
lib/compress/zstd_compress.c		patch \| blob \| blame \| history
lib/compress/zstd_compress_internal.h		patch \| blob \| blame \| history
lib/compress/zstd_compress_literals.c		patch \| blob \| blame \| history
lib/compress/zstd_compress_superblock.c		patch \| blob \| blame \| history
lib/compress/zstd_compress_superblock.h		patch \| blob \| blame \| history
lib/compress/zstd_opt.c		patch \| blob \| blame \| history
tests/fuzz/simple_round_trip.c		patch \| blob \| blame \| history
tests/fuzzer.c		patch \| blob \| blame \| history