]> git.ipfire.org Git - thirdparty/zstd.git/commitdiff
Fix repcode-related OSS-fuzz issues in block splitter (#2560)
authorsen <senhuang96@fb.com>
Wed, 31 Mar 2021 19:14:59 +0000 (15:14 -0400)
committerGitHub <noreply@github.com>
Wed, 31 Mar 2021 19:14:59 +0000 (15:14 -0400)
* Do not emit last partitions of blocks as RLE/uncompressed

* Fix repcode updates within block splitter

* Add a entropytables confirm function, redo ZSTD_confirmRepcodesAndEntropyTables() for better function signature

* Add a repcode updater to block splitter, no longer need to force emit compressed blocks

lib/compress/zstd_compress.c

index 4f547aab256ddf8005b555a89d88e362b08e28c5..27ef756d7fb0a9576c33971cfab2073943a10d8d 100644 (file)
@@ -2483,7 +2483,7 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr,
                              void* dst, size_t dstCapacity,
                              size_t srcSize,
                              void* entropyWorkspace, size_t entropyWkspSize,
-                             int bmi2, U32 const canEmitUncompressed)
+                             int bmi2)
 {
     size_t const cSize = ZSTD_entropyCompressSeqStore_internal(
                             seqStorePtr, prevEntropy, nextEntropy, cctxParams,
@@ -2493,15 +2493,13 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr,
     /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block.
      * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block.
      */
-    if (canEmitUncompressed) {
-        if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity))
-            return 0;  /* block not compressed */
-        FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed");
-
-        /* Check compressibility */
-        {   size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy);
-            if (cSize >= maxCSize) return 0;  /* block not compressed */
-        }
+    if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity))
+        return 0;  /* block not compressed */
+    FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed");
+
+    /* Check compressibility */
+    {   size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy);
+        if (cSize >= maxCSize) return 0;  /* block not compressed */
     }
     DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize);
     return cSize;
@@ -2793,11 +2791,11 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore)
     return nbSeqs < 4 && nbLits < 10;
 }
 
-static void ZSTD_confirmRepcodesAndEntropyTables(ZSTD_CCtx* zc)
+static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs)
 {
-    ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock;
-    zc->blockState.prevCBlock = zc->blockState.nextCBlock;
-    zc->blockState.nextCBlock = tmp;
+    ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock;
+    bs->prevCBlock = bs->nextCBlock;
+    bs->nextCBlock = tmp;
 }
 
 /* Writes the block header */
@@ -3180,6 +3178,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
     resultSeqStore->ofCode += startIdx;
 }
 
+/**
+ * ZSTD_seqStore_updateRepcodes(): Starting from an array of initial repcodes and a seqStore,
+ * construct the final repcodes at the conclusion of compressing the seqStore, stored in dstRep.
+ */
+static void ZSTD_seqStore_updateRepcodes(U32 dstRep[ZSTD_REP_NUM],
+                                         const U32 initialRep[ZSTD_REP_NUM],
+                                         const seqStore_t* const seqStore, U32 const nbSeq) {
+    repcodes_t updatedRepcodes;
+    U32 idx = 0;
+    ZSTD_memcpy(updatedRepcodes.rep, initialRep, sizeof(repcodes_t));
+    for (; idx < nbSeq; ++idx) {
+        seqDef const seq = seqStore->sequencesStart[idx];
+        U32 const ll0 = (seq.litLength == 0);
+        U32 const offCode = seq.offset - 1;
+        assert(seq.offset >= 1); /* seqDef::offset == offCode+1, and ZSTD_updateRep() expects an offCode */
+        updatedRepcodes = ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0);
+    }
+    ZSTD_memcpy(dstRep, updatedRepcodes.rep, sizeof(repcodes_t));
+}
+
 /* ZSTD_compressSeqStore_singleBlock():
  * Compresses a seqStore into a block with a block header, into the buffer dst.
  * 
@@ -3188,7 +3206,7 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
 static size_t ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* seqStore,
                                                  void* dst, size_t dstCapacity,
                                                  const void* src, size_t srcSize,
-                                                 U32 lastBlock, U32 canEmitRLEorNoCompress) {
+                                                 U32 lastBlock, U32 isPartition) {
     const U32 rleMaxLength = 25;
     BYTE* op = (BYTE*)dst;
     const BYTE* ip = (const BYTE*)src;
@@ -3199,13 +3217,12 @@ static size_t ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* seqSt
                 op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderSize,
                 srcSize,
                 zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */,
-                zc->bmi2, canEmitRLEorNoCompress);
+                zc->bmi2);
     FORWARD_IF_ERROR(cSeqsSize, "ZSTD_entropyCompressSeqStore failed!");
 
     if (!zc->isFirstBlock &&
         cSeqsSize < rleMaxLength &&
-        ZSTD_isRLE((BYTE const*)src, srcSize)&&
-        canEmitRLEorNoCompress) {
+        ZSTD_isRLE((BYTE const*)src, srcSize)) {
         /* We don't want to emit our first block as a RLE even if it qualifies because
         * doing so will cause the decoder (cli only) to throw a "should consume all input error."
         * This is only an issue for zstd <= v1.4.3
@@ -3215,24 +3232,30 @@ static size_t ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* seqSt
 
     if (zc->seqCollector.collectSequences) {
         ZSTD_copyBlockSequences(zc);
-        ZSTD_confirmRepcodesAndEntropyTables(zc);
+        ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
         return 0;
     }
 
     if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
         zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
 
-    if (cSeqsSize == 0 && canEmitRLEorNoCompress) {
+    if (cSeqsSize == 0) {
         cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock);
         FORWARD_IF_ERROR(cSize, "Nocompress block failed");
         DEBUGLOG(4, "Writing out nocompress block, size: %zu", cSize);
-    } else if (cSeqsSize == 1 && canEmitRLEorNoCompress) {
+    } else if (cSeqsSize == 1) {
         cSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, srcSize, lastBlock);
         FORWARD_IF_ERROR(cSize, "RLE compress block failed");
         DEBUGLOG(4, "Writing out RLE block, size: %zu", cSize);
     } else {
-        /* Error checking and repcodes update */
-        ZSTD_confirmRepcodesAndEntropyTables(zc);
+        if (isPartition) {
+            /* We manually update repcodes if we are currently compressing a partition. Otherwise,
+             * for non-split blocks, the repcodes are already correct as-is.
+             */
+            ZSTD_seqStore_updateRepcodes(zc->blockState.nextCBlock->rep, zc->blockState.prevCBlock->rep,
+                                        seqStore, (U32)(seqStore->sequences - seqStore->sequencesStart));
+        }
+        ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
         writeBlockHeader(op, cSeqsSize, srcSize, lastBlock);
         cSize = ZSTD_blockHeaderSize + cSeqsSize;
         DEBUGLOG(4, "Writing out compressed block, size: %zu", cSize);
@@ -3310,18 +3333,6 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq)
     return splits.idx;
 }
 
-/* Return 1 if if the first three sequences of seqstore/block use repcodes */
-static U32 ZSTD_seqStore_firstThreeContainRepcodes(const seqStore_t* const seqStore) {  
-    U32 const seqLimit = MIN((U32)(seqStore->sequences - seqStore->sequencesStart), ZSTD_REP_NUM);
-    U32 seqIdx = 0;
-    for (; seqIdx < seqLimit; ++seqIdx) {
-        if (seqStore->sequencesStart[seqIdx].offset <= ZSTD_REP_MOVE) {
-            return 1;
-        }
-    }
-    return 0;
-}
-
 /* ZSTD_compressBlock_splitBlock():
  * Attempts to split a given block into multiple blocks to improve compression ratio.
  * 
@@ -3338,17 +3349,19 @@ static size_t ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, s
     size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq);
     seqStore_t nextSeqStore;
     seqStore_t currSeqStore;
-    U32 canEmitRLEorNoCompress = 1;
-    const size_t dstCapacityInitial = dstCapacity;
 
     DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)",
                 (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit,
                 (unsigned)zc->blockState.matchState.nextToUpdate);
 
     if (numSplits == 0) {
-        size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, op, dstCapacity, ip, blockSize, lastBlock, 1);
+        size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore,
+                                                                    op, dstCapacity,
+                                                                    ip, blockSize,
+                                                                    lastBlock, 0 /* isPartition */);
         FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!");
         DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits");
+        assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize);
         return cSizeSingleBlock;
     }
 
@@ -3356,41 +3369,33 @@ static size_t ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, s
     for (i = 0; i <= numSplits; ++i) {
         size_t srcBytes;
         size_t cSizeChunk;
-        U32 lastBlockActual;
+        U32 const lastPartition = (i == numSplits);
+        U32 lastBlockEntireSrc = 0;
 
         srcBytes = ZSTD_countSeqStoreLiteralsBytes(&currSeqStore) + ZSTD_countSeqStoreMatchBytes(&currSeqStore);
-        lastBlockActual = lastBlock && (i == numSplits);
         srcBytesTotal += srcBytes;
-        if (i == numSplits) {
+        if (lastPartition) {
             /* This is the final partition, need to account for possible last literals */
             srcBytes += blockSize - srcBytesTotal;
+            lastBlockEntireSrc = lastBlock;
         } else {
             ZSTD_deriveSeqStoreChunk(&nextSeqStore, &zc->seqStore, partitions[i], partitions[i+1]);
-            if (ZSTD_seqStore_firstThreeContainRepcodes(&nextSeqStore)) {
-                DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: Next block contains rep in first three seqs!");
-                canEmitRLEorNoCompress = 0;
-            }
         }
 
-        cSizeChunk = ZSTD_compressSeqStore_singleBlock(zc, &currSeqStore, op, dstCapacity, ip, srcBytes, lastBlockActual, canEmitRLEorNoCompress);
+        cSizeChunk = ZSTD_compressSeqStore_singleBlock(zc, &currSeqStore,
+                                                       op, dstCapacity,
+                                                       ip, srcBytes,
+                                                       lastBlockEntireSrc, 1 /* isPartition */);
         DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(&currSeqStore, zc), cSizeChunk);
         FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!");
-        ZSTD_memcpy(zc->blockState.nextCBlock->rep, zc->blockState.prevCBlock->rep, sizeof(U32)*ZSTD_REP_NUM);
 
         ip += srcBytes;
         op += cSizeChunk;
         dstCapacity -= cSizeChunk;
         cSize += cSizeChunk;
         currSeqStore = nextSeqStore;
+        assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize);
     }
-
-    if (cSize > ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize) {
-        /* If too large, recompress the original block to avoid any chance of a single block exceeding ZSTD_BLOCKSIZE_MAX */
-        cSize = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, (BYTE*)dst, dstCapacityInitial, (const BYTE*)src, blockSize, lastBlock, 1);
-        FORWARD_IF_ERROR(cSize, "Compressing single block from splitBlock_internal() fallback failed!");
-        DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: Compressed split block too large, recompressed");
-    }
-    assert(cSize <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize);
     return cSize;
 }
 
@@ -3445,7 +3450,7 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
 
     if (zc->seqCollector.collectSequences) {
         ZSTD_copyBlockSequences(zc);
-        ZSTD_confirmRepcodesAndEntropyTables(zc);
+        ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
         return 0;
     }
 
@@ -3456,7 +3461,7 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
             dst, dstCapacity,
             srcSize,
             zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */,
-            zc->bmi2, 1 /* Can emit uncompressed blocks */);
+            zc->bmi2);
 
     if (zc->seqCollector.collectSequences) {
         ZSTD_copyBlockSequences(zc);
@@ -3479,7 +3484,7 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
 
 out:
     if (!ZSTD_isError(cSize) && cSize > 1) {
-        ZSTD_confirmRepcodesAndEntropyTables(zc);
+        ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
     }
     /* We check that dictionaries have offset codes available for the first
      * block. After the first block, the offcode table might not have large
@@ -3532,7 +3537,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc,
                 size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy);
                 FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed");
                 if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) {
-                    ZSTD_confirmRepcodesAndEntropyTables(zc);
+                    ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
                     return cSize;
                 }
             }
@@ -5665,7 +5670,7 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
                                 op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize,
                                 blockSize,
                                 cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */,
-                                cctx->bmi2, 1 /* Can emit uncompressed blocks */);
+                                cctx->bmi2);
         FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed");
         DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize);
 
@@ -5691,7 +5696,7 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
         } else {
             U32 cBlockHeader;
             /* Error checking and repcodes update */
-            ZSTD_confirmRepcodesAndEntropyTables(cctx);
+            ZSTD_blockState_confirmRepcodesAndEntropyTables(&cctx->blockState);
             if (cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
                 cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;