From: Adam Stylinski
Date: Tue, 23 Dec 2025 23:58:10 +0000 (-0500)
Subject: Small optimization in 256-bit wide chunkset
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=67b3edfd01b42cb4bba5c20fd39fbe2ad00fcf22;p=thirdparty%2Fzlib-ng.git

Small optimization in 256-bit wide chunkset

It turns out VPSHUFB only parses the bottom 4 bits of each byte of the
shuffle control vector when indexing within a 128-bit lane. The permute
vector loaded from the table is therefore already a sufficient shuffle
control on its own, and dropping the upper-lane offset saves us a small
bit of latency.
---

diff --git a/arch/x86/chunkset_avx2.c b/arch/x86/chunkset_avx2.c
index 28deb34ea..7e3ffc2c6 100644
--- a/arch/x86/chunkset_avx2.c
+++ b/arch/x86/chunkset_avx2.c
@@ -64,12 +64,8 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
         /* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after
          * broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate
          * shuffles and combining the halves later */
-        const __m256i permute_xform =
-            _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
         __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
         __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
-        perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
         ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
         ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
     } else {
diff --git a/arch/x86/chunkset_avx512.c b/arch/x86/chunkset_avx512.c
index fc27a45a3..3c46b4bfb 100644
--- a/arch/x86/chunkset_avx512.c
+++ b/arch/x86/chunkset_avx512.c
@@ -111,13 +111,9 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
      * loads to avoid an out of bounds read on the heap */
 
     if (dist < 16) {
-        const __m256i permute_xform =
-            _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
         __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
         halfmask_t load_mask = gen_half_mask(dist);
         __m128i ret_vec0 = _mm_maskz_loadu_epi8(load_mask, buf);
-        perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
         ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
         ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
     } else {
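
For reference, a minimal standalone sketch of the property the change relies on follows (not part of the patch; compile with -mavx2). _mm256_shuffle_epi8 (VPSHUFB) indexes within each 128-bit lane using only the low 4 bits of every control byte (bit 7 requests a zero byte instead), so adding 16 to the upper-lane control bytes, as the removed permute_xform did, selects exactly the same source bytes. The control-vector values below are stand-ins, not the actual permute_table entries.

/* sketch_vpshufb_low4.c -- illustrative only, not part of the patch.
 * Shows that a shuffle control with +16 applied to its upper lane picks
 * the same bytes as the unmodified control, because VPSHUFB ignores
 * bits 4-6 of every control byte. */
#include <immintrin.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    /* Two distinct 128-bit lanes: 0..15 in the low lane, 16..31 in the high lane. */
    __m256i src = _mm256_setr_epi8(
         0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);

    /* A lane-local byte reversal, the same indices repeated in both halves,
     * standing in for one permute_table entry (hypothetical values). */
    __m256i perm_vec = _mm256_setr_epi8(
        15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0,
        15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0);

    /* The offset the patch removes: +16 on every byte of the upper lane. */
    __m256i permute_xform = _mm256_setr_epi8(
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
    __m256i perm_vec_plus16 = _mm256_add_epi8(perm_vec, permute_xform);

    __m256i without_offset = _mm256_shuffle_epi8(src, perm_vec);
    __m256i with_offset    = _mm256_shuffle_epi8(src, perm_vec_plus16);

    unsigned char a[32], b[32];
    _mm256_storeu_si256((__m256i *)a, without_offset);
    _mm256_storeu_si256((__m256i *)b, with_offset);

    /* Prints "identical: yes": upper-lane control bytes 16..31 select the
     * same lane-local source bytes as 0..15 would. */
    printf("identical: %s\n", memcmp(a, b, 32) == 0 ? "yes" : "no");
    return 0;
}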