From ed0a8b8be173fdd8fc0a05b60c1571d13c14b0a3 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 7 Jan 2025 23:32:05 -0800 Subject: [PATCH] AVX2 version of ZSTD_get1BlockSummary() --- lib/compress/zstd_compress.c | 69 ++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 30 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 11933fb3c..445c5613a 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -7395,56 +7395,65 @@ size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, return ZSTD_convertBlockSequences_internal(cctx, inSeqs, nbSequences, 0); } -#if 0 && defined(__AVX2__) +#if defined(__AVX2__) /* C90-compatible alignment macro (GCC/Clang). Adjust for other compilers if needed. */ #if defined(__GNUC__) # define ALIGNED32 __attribute__((aligned(32))) +#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */ +# define ALIGNED32 alignas(32) #else + /* this compiler will require its own alignment instruction */ # define ALIGNED32 #endif BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) { size_t i; - __m256i sumVec; /* accumulates match+lit in 32-bit lanes */ - __m256i mask; /* shuffling control */ - ALIGNED32 int tmp[8]; /* temporary buffer for reduction */ - uint64_t sum; - int k; - - sumVec = _mm256_setzero_si256(); - mask = _mm256_setr_epi32( - 1,5, /* match(0), match(1) */ - 2,6, /* lit(0), lit(1) */ - 1,5, /* match(0), match(1) */ - 2,6 /* lit(0), lit(1) */ - ); + __m256i const zeroVec = _mm256_setzero_si256(); + __m256i sumVec = zeroVec; /* accumulates match+lit in 32-bit lanes */ + __m256i shuffle32; /* shuffling control */ + ALIGNED32 U32 tmp[8]; /* temporary buffer for reduction */ + size_t mSum = 0, lSum = 0; /* Process 2 structs (32 bytes) at a time */ - for (i = 0; i + 2 <= count; i += 2) { - /* Load two consecutive MyStructs (8×4 = 32 bytes) */ - __m256i data = _mm256_loadu_si256((const __m256i*)&arr[i]); - /* Shuffle out lanes 1,2,5,6 => match(0), match(1), lit(0), lit(1), repeated */ - __m256i selected = _mm256_permutevar8x32_epi32(data, mask); + for (i = 0; i + 2 <= nbSeqs; i += 2) { + /* Load two consecutive ZSTD_Sequence (8×4 = 32 bytes) */ + __m256i data = _mm256_loadu_si256((const __m256i*)&seqs[i]); + /* check end of block signal */ + __m256i cmp = _mm256_cmpeq_epi32(data, zeroVec); + int cmp_res = _mm256_movemask_epi8(cmp); + /* indices for match lengths correspond to bits [8..11], [24..27] + * => combined mask = 0x0F000F00 */ + if (cmp_res & 0x0F000F00) break; /* Accumulate in sumVec */ - sumVec = _mm256_add_epi32(sumVec, selected); + sumVec = _mm256_add_epi32(sumVec, data); } - /* Horizontal reduction of sumVec */ + /* Horizontal reduction */ _mm256_store_si256((__m256i*)tmp, sumVec); - sum = 0; - for (k = 0; k < 8; k++) { - sum += (uint64_t)tmp[k]; /* each lane is match+lit from pairs, repeated twice */ - } + lSum = tmp[1] + tmp[5]; + mSum = tmp[2] + tmp[6]; - /* Handle the leftover (if count is odd) */ - for (; i < count; i++) { - sum += arr[i].matchLength; - sum += arr[i].litLength; + /* Handle the leftover */ + for (; i < nbSeqs; i++) { + lSum += seqs[i].litLength; + mSum += seqs[i].matchLength; + if (seqs[i].matchLength == 0) break; /* end of block */ } - return sum; + if (i==nbSeqs) { + /* reaching end of sequences: end of block signal was not present */ + BlockSummary bs; + bs.nbSequences = ERROR(externalSequences_invalid); + return bs; + } + { BlockSummary bs; + bs.nbSequences = i+1; + bs.blockSize = lSum + mSum; + bs.litSize = lSum; + return bs; + } } #else -- 2.47.2