From: Adam Stylinski
Date: Tue, 23 Dec 2025 23:58:10 +0000 (-0500)
Subject: Small optimization in 256-bit wide chunkset
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=67b3edfd01b42cb4bba5c20fd39fbe2ad00fcf22;p=thirdparty%2Fzlib-ng.git

Small optimization in 256-bit wide chunkset

It turns out VPSHUFB only parses the bottom 4 bits of each byte of the
shuffle control vector when indexing within a 128-bit lane. The permute
vector loaded from the table is therefore already a sufficient shuffle
control on its own, and dropping the upper-lane offset saves us a small
bit of latency.
---

diff --git a/arch/x86/chunkset_avx2.c b/arch/x86/chunkset_avx2.c
index 28deb34ea..7e3ffc2c6 100644
--- a/arch/x86/chunkset_avx2.c
+++ b/arch/x86/chunkset_avx2.c
@@ -64,12 +64,8 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
         /* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after
          * broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate
          * shuffles and combining the halves later */
-        const __m256i permute_xform =
-            _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
         __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
         __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
-        perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
         ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
         ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
     } else {
diff --git a/arch/x86/chunkset_avx512.c b/arch/x86/chunkset_avx512.c
index fc27a45a3..3c46b4bfb 100644
--- a/arch/x86/chunkset_avx512.c
+++ b/arch/x86/chunkset_avx512.c
@@ -111,13 +111,9 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
      * loads to avoid an out of bounds read on the heap */
 
     if (dist < 16) {
-        const __m256i permute_xform =
-            _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
         __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
         halfmask_t load_mask = gen_half_mask(dist);
         __m128i ret_vec0 = _mm_maskz_loadu_epi8(load_mask, buf);
-        perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
         ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
         ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
     } else {
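
For reference, a minimal standalone sketch of the property the change relies on follows (not part of the patch; compile with -mavx2). _mm256_shuffle_epi8 (VPSHUFB) indexes within each 128-bit lane using only the low 4 bits of every control byte (bit 7 requests a zero byte instead), so adding 16 to the upper-lane control bytes, as the removed permute_xform did, selects exactly the same source bytes. The control-vector values below are stand-ins, not the actual permute_table entries.

/* sketch_vpshufb_low4.c -- illustrative only, not part of the patch.
 * Shows that a shuffle control with +16 applied to its upper lane picks
 * the same bytes as the unmodified control, because VPSHUFB ignores
 * bits 4-6 of every control byte. */
#include <immintrin.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    /* Two distinct 128-bit lanes: 0..15 in the low lane, 16..31 in the high lane. */
    __m256i src = _mm256_setr_epi8(
         0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);

    /* A lane-local byte reversal, the same indices repeated in both halves,
     * standing in for one permute_table entry (hypothetical values). */
    __m256i perm_vec = _mm256_setr_epi8(
        15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0,
        15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0);

    /* The offset the patch removes: +16 on every byte of the upper lane. */
    __m256i permute_xform = _mm256_setr_epi8(
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
    __m256i perm_vec_plus16 = _mm256_add_epi8(perm_vec, permute_xform);

    __m256i without_offset = _mm256_shuffle_epi8(src, perm_vec);
    __m256i with_offset    = _mm256_shuffle_epi8(src, perm_vec_plus16);

    unsigned char a[32], b[32];
    _mm256_storeu_si256((__m256i *)a, without_offset);
    _mm256_storeu_si256((__m256i *)b, with_offset);

    /* Prints "identical: yes": upper-lane control bytes 16..31 select the
     * same lane-local source bytes as 0..15 would. */
    printf("identical: %s\n", memcmp(a, b, 32) == 0 ? "yes" : "no");
    return 0;
}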