From: Yann Collet
Date: Fri, 23 Feb 2024 22:03:26 +0000 (-0800)
Subject: speed optimized version of targetCBlockSize
X-Git-Tag: v1.5.6^2~61^2~7
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=cc4530924b42c5d138f871c33726d374e2778ad3;p=thirdparty%2Fzstd.git

speed optimized version of targetCBlockSize

Note that the size of individual compressed blocks will vary more wildly with
this modification. But it seems good enough for a first test, and it fixes the
speed regression issue. Further refinements can be attempted later.
---

diff --git a/lib/compress/zstd_compress_superblock.c b/lib/compress/zstd_compress_superblock.c
index dacaf85db..5b22da026 100644
--- a/lib/compress/zstd_compress_superblock.c
+++ b/lib/compress/zstd_compress_superblock.c
@@ -122,7 +122,7 @@ ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
     }
     *entropyWritten = 1;
     DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart));
-    return op-ostart;
+    return (size_t)(op-ostart);
 }
 
 static size_t
@@ -187,7 +187,7 @@ ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
     else
         op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3;
     if (nbSeq==0) {
-        return op - ostart;
+        return (size_t)(op - ostart);
     }
 
     /* seqHead : flags for FSE encoding type */
@@ -209,7 +209,7 @@ ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
     }
 
     {   size_t const bitstreamSize = ZSTD_encodeSequences(
-                                        op, oend - op,
+                                        op, (size_t)(oend - op),
                                         fseTables->matchlengthCTable, mlCode,
                                         fseTables->offcodeCTable, ofCode,
                                         fseTables->litlengthCTable, llCode,
@@ -253,7 +253,7 @@ ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
 #endif
 
     *entropyWritten = 1;
-    return op - ostart;
+    return (size_t)(op - ostart);
 }
 
 /** ZSTD_compressSubBlock() :
@@ -296,11 +296,11 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy,
         op += cSeqSize;
     }
     /* Write block header */
-    {   size_t cSize = (op-ostart)-ZSTD_blockHeaderSize;
+    {   size_t cSize = (size_t)(op-ostart) - ZSTD_blockHeaderSize;
         U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
         MEM_writeLE24(ostart, cBlockHeader24);
     }
-    return op-ostart;
+    return (size_t)(op-ostart);
 }
 
 static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize,
@@ -419,6 +419,16 @@ static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMe
     return 0;
 }
 
+static size_t countLiterals(const seqDef* sp, size_t seqCount)
+{
+    size_t n, total = 0;
+    assert(sp != NULL);
+    for (n=0; n<seqCount; n++) {
+        total += sp[n].litLength;
+    }
+    return total;
+}
+
 /** ZSTD_compressSubBlock_multi() :
  *  Breaks super-block into multiple sub-blocks and compresses them.
  *  Entropy will be written to the first block.
@@ -438,10 +448,12 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
 {
     const seqDef* const sstart = seqStorePtr->sequencesStart;
     const seqDef* const send = seqStorePtr->sequences;
-    const seqDef* sp = sstart;
+    size_t const nbSeqs = (size_t)(send - sstart);
+    size_t nbSeqsPerBlock = nbSeqs;
     const BYTE* const lstart = seqStorePtr->litStart;
     const BYTE* const lend = seqStorePtr->lit;
     const BYTE* lp = lstart;
+    size_t const nbLiterals = (size_t)(lend - lstart);
     BYTE const* ip = (BYTE const*)src;
     BYTE const* const iend = ip + srcSize;
     BYTE* const ostart = (BYTE*)dst;
@@ -451,52 +463,50 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
     const BYTE* mlCodePtr = seqStorePtr->mlCode;
     const BYTE* ofCodePtr = seqStorePtr->ofCode;
     size_t targetCBlockSize = cctxParams->targetCBlockSize;
-    size_t litSize, seqCount;
     int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed;
     int writeSeqEntropy = 1;
-    int lastSequence = 0;
+    size_t nbSubBlocks = 1;
 
     DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)",
                 (unsigned)(lend-lp), (unsigned)(send-sstart));
 
-    litSize = 0;
-    seqCount = 0;
-    do {
-        size_t cBlockSizeEstimate = 0;
-        if (sstart == send) {
-            lastSequence = 1;
-        } else {
-            const seqDef* const sequence = sp + seqCount;
-            lastSequence = sequence == send - 1;
-            litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength;
-            seqCount++;
-        }
-        if (lastSequence) {
-            assert(lp <= lend);
-            assert(litSize <= (size_t)(lend - lp));
-            litSize = (size_t)(lend - lp);
-        }
-        /* I think there is an optimization opportunity here.
-         * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful
-         * since it recalculates estimate from scratch.
-         * For example, it would recount literal distribution and symbol codes every time.
-         */
-        cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount,
-                                                       &nextCBlock->entropy, entropyMetadata,
-                                                       workspace, wkspSize, writeLitEntropy, writeSeqEntropy);
-        if (cBlockSizeEstimate > targetCBlockSize || lastSequence) {
+    /* let's start by a general estimation for the full block */
+    {   size_t const cBlockSizeEstimate =
+            ZSTD_estimateSubBlockSize(lp, nbLiterals,
+                                      ofCodePtr, llCodePtr, mlCodePtr, nbSeqs,
+                                      &nextCBlock->entropy, entropyMetadata,
+                                      workspace, wkspSize,
+                                      writeLitEntropy, writeSeqEntropy);
+        /* quick estimation */
+        nbSubBlocks = (cBlockSizeEstimate + (targetCBlockSize-1)) / targetCBlockSize;
+        assert(nbSubBlocks > 0);
+        nbSeqsPerBlock = nbSeqs / nbSubBlocks;
+        /* Note: this is very approximative. Obviously, some sub-blocks will be larger and others smaller.
+         * But the contract of this feature has always been approximative, so for now we'll leverage it for speed.
+         * It can be refined later, for closer-to-target compressed block size, if it ever matters. */
+    }
+
+    /* write sub-blocks */
+    {   size_t n;
+        for (n=0; n < nbSubBlocks; n++) {
+            const seqDef* sp = sstart + n*nbSeqsPerBlock;
+            int lastSubBlock = (n==nbSubBlocks-1);
+            size_t const nbSeqsLastSubBlock = nbSeqs - (nbSubBlocks-1) * nbSeqsPerBlock;
+            size_t seqCount = lastSubBlock ? nbSeqsLastSubBlock : nbSeqsPerBlock;
+            size_t litSize = lastSubBlock ? (size_t)(lend-lp) : countLiterals(sp, seqCount);
             int litEntropyWritten = 0;
             int seqEntropyWritten = 0;
-            const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence);
+            const size_t decompressedSize =
+                ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSubBlock);
             const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata,
                                                        sp, seqCount,
                                                        lp, litSize,
                                                        llCodePtr, mlCodePtr, ofCodePtr,
                                                        cctxParams,
-                                                       op, oend-op,
+                                                       op, (size_t)(oend-op),
                                                        bmi2, writeLitEntropy, writeSeqEntropy,
                                                        &litEntropyWritten, &seqEntropyWritten,
-                                                       lastBlock && lastSequence);
+                                                       lastBlock && lastSubBlock);
             FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed");
             if (cSize > 0 && cSize < decompressedSize) {
                 DEBUGLOG(5, "Committed the sub-block");
@@ -519,7 +529,8 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
                 }
             }
         }
-    } while (!lastSequence);
+    }
+
     if (writeLitEntropy) {
         DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten");
         ZSTD_memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf));
@@ -531,25 +542,10 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
         DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten");
         return 0;
     }
-    if (ip < iend) {
-        size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock);
-        DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip));
-        FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
-        assert(cSize != 0);
-        op += cSize;
-        /* We have to regenerate the repcodes because we've skipped some sequences */
-        if (sp < send) {
-            seqDef const* seq;
-            repcodes_t rep;
-            ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep));
-            for (seq = sstart; seq < sp; ++seq) {
-                ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0);
-            }
-            ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep));
-        }
-    }
-    DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed");
-    return op-ostart;
+    assert(ip == iend); (void)iend;
+    DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed: %u subBlocks, total compressed size = %u",
+            (unsigned)nbSubBlocks, (unsigned)(op-ostart));
+    return (size_t)(op-ostart);
 }
 
 size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc,
diff --git a/programs/zstd.1.md b/programs/zstd.1.md
index 9a4a5df2c..078455f3e 100644
--- a/programs/zstd.1.md
+++ b/programs/zstd.1.md
@@ -221,7 +221,10 @@ the last one takes effect.
 * `--target-compressed-block-size=#`:
     Attempt to produce compressed blocks of approximately this size.
     This will split larger blocks in order to approach this target.
-    Notably useful to improve latency when the receiver can make use of early data sooner.
+    This feature is notably useful for improving latency, when the receiver can start processing early, incomplete data.
+    This parameter defines a loose target: compressed blocks will approach this size "on average", but individual blocks can still be larger or smaller.
+    Enabling this feature can decrease compression speed by up to ~10% at level 1.
+    Higher compression levels see a smaller relative speed regression, which becomes negligible at the highest settings.
 * `-o FILE`:
     save result into `FILE`.
 * `-f`, `--force`:
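
For illustration, the arithmetic of the new splitting heuristic can be exercised in
isolation: estimate the compressed size of the full block once, derive the number of
sub-blocks by ceiling division against the target, then give each sub-block an equal
share of the sequences, with the last one absorbing the remainder (and all remaining
literals). The standalone sketch below mirrors that arithmetic; the helper name and
the figures are made up for the example, only the formulas come from the patch:

    #include <assert.h>
    #include <stdio.h>

    /* ceil(estimate / target), as in the patch:
     * nbSubBlocks = (cBlockSizeEstimate + (targetCBlockSize-1)) / targetCBlockSize */
    static size_t computeNbSubBlocks(size_t cBlockSizeEstimate, size_t targetCBlockSize)
    {
        assert(targetCBlockSize > 0);
        return (cBlockSizeEstimate + (targetCBlockSize-1)) / targetCBlockSize;
    }

    int main(void)
    {
        size_t const cBlockSizeEstimate = 41000;  /* estimated compressed size of the full block */
        size_t const targetCBlockSize   = 16000;  /* user-requested target */
        size_t const nbSeqs             = 1000;   /* sequences stored for this block */

        size_t const nbSubBlocks        = computeNbSubBlocks(cBlockSizeEstimate, targetCBlockSize);
        size_t const nbSeqsPerBlock     = nbSeqs / nbSubBlocks;
        size_t const nbSeqsLastSubBlock = nbSeqs - (nbSubBlocks-1) * nbSeqsPerBlock;

        printf("%zu sub-blocks: %zu of %zu sequences, last one of %zu sequences\n",
               nbSubBlocks, nbSubBlocks-1, nbSeqsPerBlock, nbSeqsLastSubBlock);
        /* -> 3 sub-blocks: 2 of 333 sequences, last one of 334 sequences */
        return 0;
    }

Splitting by sequence count rather than by re-estimated compressed size is what makes
the new version fast, and also why individual sub-block sizes can drift further from
the target: the split is proportional in sequences, not in estimated compressed bytes.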
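
As a usage example for the documented option (the target value below, 16384 bytes, is
chosen arbitrarily):

    zstd --target-compressed-block-size=16384 -1 file

Compressed blocks of the produced frame will then approach that size on average, though
with this patch individual blocks may overshoot or undershoot the target more than the
previous, slower implementation did.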