Small optimization in 256-bit wide chunkset: remove the permute_xform offset add before the shuffle

author Adam Stylinski <kungfujesus06@gmail.com>

Tue, 23 Dec 2025 23:58:10 +0000 (18:58 -0500)

committer Hans Kristian Rosbach <hk-github@circlestorm.org>

Sat, 27 Dec 2025 22:55:09 +0000 (23:55 +0100)
author Adam Stylinski <kungfujesus06@gmail.com>
Tue, 23 Dec 2025 23:58:10 +0000 (18:58 -0500)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Sat, 27 Dec 2025 22:55:09 +0000 (23:55 +0100)
diff --git a/arch/x86/chunkset_avx2.c b/arch/x86/chunkset_avx2.c

index 28deb34eac95f58f9cc18ce874fc64e3ce4a301b..7e3ffc2c63d24c7608ab2ca737c9256e0ffdf981 100644 (file)
--- a/arch/x86/chunkset_avx2.c
+++ b/arch/x86/chunkset_avx2.c
@@ -64,12 +64,8 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
          /* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after
           * broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate
           * shuffles and combining the halves later */
-        const __m256i permute_xform =
-            _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
          __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
          __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
-        perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
          ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
          ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
      }  else {
diff --git a/arch/x86/chunkset_avx512.c b/arch/x86/chunkset_avx512.c

index fc27a45a3e8febc95ed13a1ecd87d29a038537d3..3c46b4bfb6a59ab5402a288ac7d7239a52772b1d 100644 (file)
--- a/arch/x86/chunkset_avx512.c
+++ b/arch/x86/chunkset_avx512.c
@@ -111,13 +111,9 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
       * loads to avoid an out of bounds read on the heap */
  
      if (dist < 16) {
-        const __m256i permute_xform =
-            _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
          __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
          halfmask_t load_mask = gen_half_mask(dist);
          __m128i ret_vec0 = _mm_maskz_loadu_epi8(load_mask, buf);
-        perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
          ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
          ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
      }  else {
author	Adam Stylinski <kungfujesus06@gmail.com>
	Tue, 23 Dec 2025 23:58:10 +0000 (18:58 -0500)
committer	Hans Kristian Rosbach <hk-github@circlestorm.org>
	Sat, 27 Dec 2025 22:55:09 +0000 (23:55 +0100)
arch/x86/chunkset_avx2.c		patch \| blob \| blame \| history
arch/x86/chunkset_avx512.c		patch \| blob \| blame \| history