From: Arpad Panyik Date: Tue, 8 Jul 2025 17:05:45 +0000 (+0000) Subject: Improve ZSTD_get1BlockSummary X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8e4400463adc7bc7633641d6a485cfef4f28bc31;p=thirdparty%2Fzstd.git Improve ZSTD_get1BlockSummary Add a faster scalar implementation of ZSTD_get1BlockSummary which removes the data dependency of the accumulators in the hot loop to leverage the superscalar potential of recent out-of-order CPUs. The new algorithm leverages SWAR (SIMD Within A Register) methodology to exploit the capabilities of 64-bit architectures. It achieves this by packing two 32-bit data elements into a single 64-bit register, enabling parallel operations on these subcomponents while ensuring that the 32-bit boundaries prevent overflow, thereby optimizing computational efficiency. Corresponding unit tests are included. Relative performance to GCC-13 using: `./fullbench -b19 -l5 enwik5` Neoverse-V2 before after GCC-13: 100.000% 290.527% GCC-14: 100.000% 291.714% GCC-15: 99.914% 291.495% Clang-18: 148.072% 264.524% Clang-19: 148.075% 264.512% Clang-20: 148.062% 264.490% Cortex-A720 before after GCC-13: 100.000% 235.261% GCC-14: 101.064% 234.903% GCC-15: 112.977% 218.547% Clang-18: 127.135% 180.359% Clang-19: 127.149% 180.297% Clang-20: 127.154% 180.260% Co-authored-by: Thomas Daubney --- diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 9b7aaf9f4..16423f6ee 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -7604,29 +7604,104 @@ BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) #else +/* + * The function assumes `litMatchLength` is a packed 64-bit value where one + * 32-bit half holds `litLength` and the other holds `matchLength`. The check + * varies based on the system's endianness: + * - On little-endian systems, it verifies if the entire 64-bit value is at most + * 0xFFFFFFFF, indicating the match length (upper 32 bits) is zero. 
+ * - On big-endian systems, it directly checks if the lower 32 bits are zero. + * + * @returns 1 if the match length is zero, 0 otherwise. + */ +FORCE_INLINE_TEMPLATE int matchLengthHalfIsZero(U64 litMatchLength) +{ + if (MEM_isLittleEndian()) { + return litMatchLength <= 0xFFFFFFFFULL; + } else { + return (U32)litMatchLength == 0; + } +} + BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) { - size_t totalMatchSize = 0; - size_t litSize = 0; - size_t n; + /* Use multiple accumulators for efficient use of wide out-of-order machines. */ + U64 litMatchSize0 = 0; + U64 litMatchSize1 = 0; + U64 litMatchSize2 = 0; + U64 litMatchSize3 = 0; + size_t n = 0; + + ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, litLength) + 4 == offsetof(ZSTD_Sequence, matchLength)); + ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) + 4 == offsetof(ZSTD_Sequence, rep)); assert(seqs); - for (n=0; n<nbSeqs; n++) { - totalMatchSize += seqs[n].matchLength; - litSize += seqs[n].litLength; - if (seqs[n].matchLength == 0) { + if (nbSeqs > 3) { + /* Process the input in 4 independent streams to reach high throughput. */ + do { + /* Load `litLength` and `matchLength` as a packed `U64`. It is safe + * to use 64-bit unsigned arithmetic here because the sum of `litLength` + * and `matchLength` cannot exceed the block size, so the 32-bit + * subparts will never overflow. 
*/ + U64 litMatchLength = MEM_read64(&seqs[n].litLength); + litMatchSize0 += litMatchLength; + if (matchLengthHalfIsZero(litMatchLength)) { + assert(seqs[n].offset == 0); + goto _out; + } + + litMatchLength = MEM_read64(&seqs[n + 1].litLength); + litMatchSize1 += litMatchLength; + if (matchLengthHalfIsZero(litMatchLength)) { + n += 1; + assert(seqs[n].offset == 0); + goto _out; + } + + litMatchLength = MEM_read64(&seqs[n + 2].litLength); + litMatchSize2 += litMatchLength; + if (matchLengthHalfIsZero(litMatchLength)) { + n += 2; + assert(seqs[n].offset == 0); + goto _out; + } + + litMatchLength = MEM_read64(&seqs[n + 3].litLength); + litMatchSize3 += litMatchLength; + if (matchLengthHalfIsZero(litMatchLength)) { + n += 3; + assert(seqs[n].offset == 0); + goto _out; + } + + n += 4; + } while(n < nbSeqs - 3); + } + + for (; n < nbSeqs; n++) { + U64 litMatchLength = MEM_read64(&seqs[n].litLength); + litMatchSize0 += litMatchLength; + if (matchLengthHalfIsZero(litMatchLength)) { assert(seqs[n].offset == 0); - break; + goto _out; } } - if (n==nbSeqs) { - BlockSummary bs; + /* At this point n == nbSeqs, so no end terminator. 
*/ + { BlockSummary bs; bs.nbSequences = ERROR(externalSequences_invalid); return bs; } +_out: + litMatchSize0 += litMatchSize1 + litMatchSize2 + litMatchSize3; { BlockSummary bs; - bs.nbSequences = n+1; - bs.blockSize = litSize + totalMatchSize; - bs.litSize = litSize; + bs.nbSequences = n + 1; + if (MEM_isLittleEndian()) { + bs.litSize = (U32)litMatchSize0; + bs.blockSize = bs.litSize + (litMatchSize0 >> 32); + } else { + bs.litSize = litMatchSize0 >> 32; + bs.blockSize = bs.litSize + (U32)litMatchSize0; + } return bs; } } diff --git a/tests/fuzzer.c b/tests/fuzzer.c index da380aced..8e30d207d 100644 --- a/tests/fuzzer.c +++ b/tests/fuzzer.c @@ -45,6 +45,7 @@ #include "zstd_internal.h" /* ZSTD_WORKSPACETOOLARGE_MAXDURATION, ZSTD_WORKSPACETOOLARGE_FACTOR, KB, MB */ #include "threading.h" /* ZSTD_pthread_create, ZSTD_pthread_join */ #include "compress/hist.h" /* HIST_count_wksp */ +#include "compress/zstd_compress_internal.h" /* ZSTD_get1BlockSummary */ /*-************************************ @@ -769,6 +770,86 @@ static void test_blockSplitter_incompressibleExpansionProtection(unsigned testNb DISPLAYLEVEL(3, "OK \n"); } +static unsigned test_get1BlockSummary(unsigned testNb) +{ + static const ZSTD_Sequence nseqs[] = { + { 10, 2, 4, 1 }, + { 20, 3, 5, 2 }, + { 30, 6, 8, 3 }, + { 40, 7, 9, 4 }, + { 50, 10, 12, 5 }, + { 60, 11, 13, 6 }, + { 0, 14, 0, 7 }, + { 70, 15, 17, 8 }, + { 80, 16, 18, 9 }, + { 90, 19, 21, 1 }, + { 99, 20, 22, 2 }, + }; + static const BlockSummary blocks[] = { + { 7, 104, 53 }, + { 6, 98, 51 }, + { 5, 90, 48 }, + { 4, 76, 42 }, + { 3, 60, 35 }, + { 2, 38, 25 }, + { 1, 14, 14 }, + }; + size_t i; + + DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with empty array : ", testNb++); + { + BlockSummary bs = ZSTD_get1BlockSummary(nseqs, 0); + CHECK_EQ(bs.nbSequences, ERROR(externalSequences_invalid)); + } + DISPLAYLEVEL(3, "OK \n"); + + DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with 1 literal only : ", testNb++); + { + static const 
ZSTD_Sequence seqs[] = { { 0, 5, 0, 0 } }; + BlockSummary bs = ZSTD_get1BlockSummary(seqs, 1); + CHECK_EQ(bs.nbSequences, 1); + CHECK_EQ(bs.litSize, 5); + CHECK_EQ(bs.blockSize, 5); + } + DISPLAYLEVEL(3, "OK \n"); + + DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with no terminator : ", testNb++); + { + static const ZSTD_Sequence seqs[] = { { 10, 2, 4, 0 }, { 20, 3, 5, 0 } }; + BlockSummary bs = ZSTD_get1BlockSummary(seqs, 2); + CHECK_EQ(bs.nbSequences, ERROR(externalSequences_invalid)); + } + DISPLAYLEVEL(3, "OK \n"); + + DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with rep ignored : ", testNb++); + { + static const ZSTD_Sequence seqs[] = { + { 10, 2, 4, 2 }, + { 10, 3, 5, 2 }, + { 0, 7, 0, 3 }, + }; + BlockSummary bs = ZSTD_get1BlockSummary(seqs, 3); + CHECK_EQ(bs.nbSequences, 3); + CHECK_EQ(bs.litSize, 2 + 3 + 7); + CHECK_EQ(bs.blockSize, (4 + 5) + (2 + 3 + 7)); + } + DISPLAYLEVEL(3, "OK \n"); + + assert(COUNTOF(nseqs) > COUNTOF(blocks)); + for (i = 0; i < COUNTOF(blocks); ++i) { + BlockSummary bs; + DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with %u inputs : ", + testNb++, (unsigned)(COUNTOF(nseqs) - i)); + bs = ZSTD_get1BlockSummary(nseqs + i, COUNTOF(nseqs) - i); + CHECK_EQ(bs.nbSequences, blocks[i].nbSequences); + CHECK_EQ(bs.litSize, blocks[i].litSize); + CHECK_EQ(bs.blockSize, blocks[i].blockSize); + DISPLAYLEVEL(3, "OK \n"); + } + + return testNb; +} + /* ============================================================= */ static int basicUnitTests(U32 const seed, double compressibility) @@ -4004,6 +4085,8 @@ static int basicUnitTests(U32 const seed, double compressibility) } DISPLAYLEVEL(3, "OK \n"); + testNb = test_get1BlockSummary(testNb); + DISPLAYLEVEL(3, "test%3i : ZSTD_compressSequencesAndLiterals : ", testNb++); { const size_t srcSize = 497000;