From: Yann Collet
Date: Wed, 8 Jan 2025 03:34:06 +0000 (-0800)
Subject: minor +10% speed improvement for scalar ZSTD_get1BlockSummary()
X-Git-Tag: v1.5.7^2~36^2~12
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b6a4d5a8ba29bc873c95098103f57f987cfacd23;p=thirdparty%2Fzstd.git

minor +10% speed improvement for scalar ZSTD_get1BlockSummary()
---

diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c
index d5e3d1c78..11933fb3c 100644
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@@ -7395,14 +7395,68 @@ size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx,
     return ZSTD_convertBlockSequences_internal(cctx, inSeqs, nbSequences, 0);
 }

+#if 0 && defined(__AVX2__)
+
+/* C90-compatible alignment macro (GCC/Clang). Adjust for other compilers if needed. */
+#if defined(__GNUC__)
+#  define ALIGNED32 __attribute__((aligned(32)))
+#else
+#  define ALIGNED32
+#endif
+
 BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs)
 {
-    size_t blockSize = 0;
+    size_t i;
+    __m256i sumVec; /* accumulates match+lit in 32-bit lanes */
+    __m256i mask; /* shuffling control */
+    ALIGNED32 int tmp[8]; /* temporary buffer for reduction */
+    uint64_t sum;
+    int k;
+
+    sumVec = _mm256_setzero_si256();
+    mask = _mm256_setr_epi32(
+        1,5, /* match(0), match(1) */
+        2,6, /* lit(0), lit(1) */
+        1,5, /* match(0), match(1) */
+        2,6 /* lit(0), lit(1) */
+    );
+
+    /* Process 2 structs (32 bytes) at a time */
+    for (i = 0; i + 2 <= count; i += 2) {
+        /* Load two consecutive MyStructs (8×4 = 32 bytes) */
+        __m256i data = _mm256_loadu_si256((const __m256i*)&arr[i]);
+        /* Shuffle out lanes 1,2,5,6 => match(0), match(1), lit(0), lit(1), repeated */
+        __m256i selected = _mm256_permutevar8x32_epi32(data, mask);
+        /* Accumulate in sumVec */
+        sumVec = _mm256_add_epi32(sumVec, selected);
+    }
+
+    /* Horizontal reduction of sumVec */
+    _mm256_store_si256((__m256i*)tmp, sumVec);
+    sum = 0;
+    for (k = 0; k < 8; k++) {
+        sum += (uint64_t)tmp[k]; /* each lane is match+lit from pairs, repeated twice */
+    }
+
+    /* Handle the leftover (if count is odd) */
+    for (; i < count; i++) {
+        sum += arr[i].matchLength;
+        sum += arr[i].litLength;
+    }
+
+    return sum;
+}
+
+#else
+
+BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs)
+{
+    size_t totalMatchSize = 0;
     size_t litSize = 0;
     size_t n;
     assert(seqs);
     for (n=0; n