From: sen
Date: Tue, 6 Apr 2021 21:25:55 +0000 (-0400)
Subject: Maintain two repcode histories for block splitting, replace invalid repcodes (#2569)
X-Git-Tag: v1.5.0^2~62
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f1e8b565c2904c1bc4c5d441f1468f69249e0da6;p=thirdparty%2Fzstd.git

Maintain two repcode histories for block splitting, replace invalid repcodes (#2569)
---

diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c
index 4b7b134df..f09f52b7c 100644
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@@ -3181,23 +3181,40 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
 }
 
 /**
- * ZSTD_seqStore_updateRepcodes(): Starting from an array of initial repcodes and a seqStore,
- * construct the final repcodes at the conclusion of compressing the seqStore, stored in dstRep.
+ * ZSTD_seqStore_resolveOffCodes() reconciles any possible divergences in offset history that may arise
+ * due to emission of RLE/raw blocks that disturb the offset history, and replaces any repcodes within
+ * the seqStore that may be invalid.
+ *
+ * dRepcodes are updated as would be on the decompression side. cRepcodes are updated exactly in
+ * accordance with the seqStore.
  */
-static void ZSTD_seqStore_updateRepcodes(U32 dstRep[ZSTD_REP_NUM],
-                                         const U32 initialRep[ZSTD_REP_NUM],
-                                         const seqStore_t* const seqStore, U32 const nbSeq) {
-    repcodes_t updatedRepcodes;
+static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes,
+                                          seqStore_t* const seqStore, U32 const nbSeq) {
     U32 idx = 0;
-    ZSTD_memcpy(updatedRepcodes.rep, initialRep, sizeof(repcodes_t));
     for (; idx < nbSeq; ++idx) {
-        seqDef const seq = seqStore->sequencesStart[idx];
-        U32 const ll0 = (seq.litLength == 0);
-        U32 const offCode = seq.offset - 1;
-        assert(seq.offset >= 1);  /* seqDef::offset == offCode+1, and ZSTD_updateRep() expects an offCode */
-        updatedRepcodes = ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0);
+        seqDef* const seq = seqStore->sequencesStart + idx;
+        U32 const ll0 = (seq->litLength == 0);
+        U32 offCode = seq->offset - 1;
+        assert(seq->offset > 0);
+        if (offCode <= ZSTD_REP_MOVE && (dRepcodes->rep[offCode] != cRepcodes->rep[offCode])) {
+            /* Adjust simulated decompression repcode history if we come across a mismatch. Replace
+             * the repcode with the offset it actually references, determined by the compression
+             * repcode history.
+             */
+            offCode += ll0;
+            if (offCode == ZSTD_REP_MOVE+1) {
+                /* litlength == 0 and offset_value = 3 implies selection of first repcode - 1 */
+                seq->offset = (cRepcodes->rep[0] + ZSTD_REP_NUM) - 1;
+            } else {
+                seq->offset = cRepcodes->rep[offCode] + ZSTD_REP_NUM;
+            }
+            *dRepcodes = ZSTD_updateRep(dRepcodes->rep, seq->offset - 1, ll0);
+        } else {
+            *dRepcodes = ZSTD_updateRep(dRepcodes->rep, offCode, ll0);
+        }
+        /* Compression repcode history is always updated with values directly from the seqStore */
+        *cRepcodes = ZSTD_updateRep(cRepcodes->rep, offCode, ll0);
     }
-    ZSTD_memcpy(dstRep, updatedRepcodes.rep, sizeof(repcodes_t));
 }
 
 /* ZSTD_compressSeqStore_singleBlock():
@@ -3205,15 +3222,23 @@ static void ZSTD_seqStore_updateRepcodes(U32 dstRep[ZSTD_REP_NUM],
  *
  * Returns the total size of that block (including header) or a ZSTD error code.
  */
-static size_t ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* seqStore,
-                                                void* dst, size_t dstCapacity,
-                                                const void* src, size_t srcSize,
-                                                U32 lastBlock, U32 isPartition) {
+static size_t ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore,
+                                                repcodes_t* const dRep, repcodes_t* const cRep,
+                                                void* dst, size_t dstCapacity,
+                                                const void* src, size_t srcSize,
+                                                U32 lastBlock, U32 isPartition) {
     const U32 rleMaxLength = 25;
     BYTE* op = (BYTE*)dst;
     const BYTE* ip = (const BYTE*)src;
     size_t cSize;
-    size_t cSeqsSize = ZSTD_entropyCompressSeqStore(seqStore,
+    size_t cSeqsSize;
+
+    /* In case of an RLE or raw block, the simulated decompression repcode history must be reset */
+    repcodes_t const dRepOriginal = *dRep;
+    if (isPartition)
+        ZSTD_seqStore_resolveOffCodes(dRep, cRep, seqStore, (U32)(seqStore->sequences - seqStore->sequencesStart));
+
+    cSeqsSize = ZSTD_entropyCompressSeqStore(seqStore,
                 &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy,
                 &zc->appliedParams,
                 op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderSize,
@@ -3245,18 +3270,13 @@ static size_t ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* seqSt
         cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock);
         FORWARD_IF_ERROR(cSize, "Nocompress block failed");
         DEBUGLOG(4, "Writing out nocompress block, size: %zu", cSize);
+        *dRep = dRepOriginal; /* reset simulated decompression repcode history */
     } else if (cSeqsSize == 1) {
         cSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, srcSize, lastBlock);
         FORWARD_IF_ERROR(cSize, "RLE compress block failed");
         DEBUGLOG(4, "Writing out RLE block, size: %zu", cSize);
+        *dRep = dRepOriginal; /* reset simulated decompression repcode history */
     } else {
-        if (isPartition) {
-            /* We manually update repcodes if we are currently compressing a partition. Otherwise,
-             * for non-split blocks, the repcodes are already correct as-is.
-             */
-            ZSTD_seqStore_updateRepcodes(zc->blockState.nextCBlock->rep, zc->blockState.prevCBlock->rep,
-                                         seqStore, (U32)(seqStore->sequences - seqStore->sequencesStart));
-        }
         ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
         writeBlockHeader(op, cSeqsSize, srcSize, lastBlock);
         cSize = ZSTD_blockHeaderSize + cSeqsSize;
@@ -3352,12 +3372,32 @@ static size_t ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, s
     seqStore_t nextSeqStore;
     seqStore_t currSeqStore;
 
-    DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)",
+    /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history
+     * may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two
+     * separate repcode histories that simulate repcode history on compression and decompression side,
+     * and use the histories to determine whether we must replace a particular repcode with its raw offset.
+     *
+     * 1) cRep gets updated for each partition, regardless of whether the block was emitted as uncompressed
+     *    or RLE. This allows us to retrieve the offset value that an invalid repcode references within
+     *    a nocompress/RLE block.
+     * 2) dRep gets updated only for compressed partitions, and when a repcode gets replaced, will use
+     *    the replacement offset value rather than the original repcode to update the repcode history.
+     *    dRep also will be the final repcode history sent to the next block.
+     *
+     * See ZSTD_seqStore_resolveOffCodes() for more details.
+     */
+    repcodes_t dRep;
+    repcodes_t cRep;
+    ZSTD_memcpy(dRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t));
+    ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t));
+
+    DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)",
                 (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit,
                 (unsigned)zc->blockState.matchState.nextToUpdate);
 
     if (numSplits == 0) {
         size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore,
+                                                                    &dRep, &cRep,
                                                                     op, dstCapacity,
                                                                     ip, blockSize,
                                                                     lastBlock, 0 /* isPartition */);
@@ -3385,6 +3425,7 @@ static size_t ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, s
         }
 
         cSizeChunk = ZSTD_compressSeqStore_singleBlock(zc, &currSeqStore,
+                                                       &dRep, &cRep,
                                                        op, dstCapacity,
                                                        ip, srcBytes,
                                                        lastBlockEntireSrc, 1 /* isPartition */);
@@ -3398,6 +3439,10 @@ static size_t ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, s
         currSeqStore = nextSeqStore;
         assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize);
     }
+    /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes
+     * for the next block.
+     */
+    ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t));
     return cSize;
 }
 
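
Illustration (not part of the patch above): a minimal, self-contained C sketch of the two-history idea that ZSTD_seqStore_resolveOffCodes() implements. A compression-side history (cRep) always follows the sequences as produced; a simulated decompression-side history (dRep) is rolled back whenever a partition is emitted raw or as RLE; and any repcode whose slot the two histories resolve differently is rewritten to the raw offset it actually referenced. The names here (toy_reps, toy_seq, toy_update_rep, toy_resolve) are hypothetical stand-ins, the history update rule is simplified, and the litLength == 0 repcode shifting that the real patch handles is deliberately omitted.

    /* toy_repcodes.c - simplified model of maintaining a compression-side and a
     * simulated decompression-side repcode history across block partitions.
     * All names and the update rule are illustrative, not zstd's actual API. */
    #include <stdio.h>

    #define REP_NUM 3   /* number of repcode slots, as in zstd */

    typedef struct { unsigned rep[REP_NUM]; } toy_reps;

    /* A toy sequence carries only an offset code: values < REP_NUM select a
     * repcode slot, values >= REP_NUM encode the raw offset (offCode - REP_NUM). */
    typedef struct { unsigned offCode; } toy_seq;

    /* Push a new offset to the front of the history (simplified update rule;
     * the real ZSTD_updateRep() also special-cases repcodes and litLength==0). */
    static toy_reps toy_update_rep(toy_reps h, unsigned offset)
    {
        toy_reps r;
        r.rep[0] = offset;
        r.rep[1] = h.rep[0];
        r.rep[2] = h.rep[1];
        return r;
    }

    static unsigned toy_offset_of(const toy_reps* h, unsigned offCode)
    {
        return (offCode < REP_NUM) ? h->rep[offCode] : offCode - REP_NUM;
    }

    /* Mirrors the idea of ZSTD_seqStore_resolveOffCodes(): cRep always follows
     * the sequences as produced; dRep follows what the decoder will see. If a
     * repcode selects a slot where the two histories disagree, the decoder would
     * resolve it to the wrong offset, so rewrite it as the raw offset taken from
     * cRep and advance dRep with that raw offset instead. */
    static void toy_resolve(toy_reps* dRep, toy_reps* cRep, toy_seq* seqs, size_t n)
    {
        size_t i;
        for (i = 0; i < n; ++i) {
            unsigned const off = seqs[i].offCode;
            if (off < REP_NUM && dRep->rep[off] != cRep->rep[off]) {
                unsigned const raw = cRep->rep[off];  /* offset it actually references */
                seqs[i].offCode = raw + REP_NUM;      /* replace repcode with raw offset */
                *dRep = toy_update_rep(*dRep, raw);
            } else {
                *dRep = toy_update_rep(*dRep, toy_offset_of(dRep, off));
            }
            /* compression-side history always follows the original sequence */
            *cRep = toy_update_rep(*cRep, toy_offset_of(cRep, off));
        }
    }

    int main(void)
    {
        toy_reps const start = { {1, 4, 8} };
        toy_reps cRep = start, dRep = start;

        /* Partition 1 ends up emitted raw: the encoder produced these sequences
         * (cRep advances) but the decoder never sees them, so the simulated
         * decompression history is reset ("*dRep = dRepOriginal" in the patch). */
        toy_seq part1[] = { { 7 + REP_NUM }, { 0 } };  /* raw offset 7, then rep[0] */
        toy_resolve(&dRep, &cRep, part1, 2);
        dRep = start;

        /* Partition 2: its first sequence uses rep[0], whose contents now differ
         * between cRep and dRep, so it must be rewritten as a raw offset. */
        toy_seq part2[] = { { 0 }, { 5 + REP_NUM } };
        toy_resolve(&dRep, &cRep, part2, 2);

        printf("partition 2, seq 0: offCode=%u (raw offset %u)\n",
               part2[0].offCode, part2[0].offCode - REP_NUM);
        return 0;
    }

Built with something like "cc -std=c99 toy_repcodes.c && ./a.out", the program reports that the first sequence of the second partition now carries offCode 10, i.e. raw offset 7: a repcode the decoder would have resolved against a stale history gets replaced by the offset it actually referenced, which is the divergence the patch reconciles.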