From: Adam Stylinski <kungfujesus06@gmail.com>
Date: Wed, 11 Sep 2024 22:34:54 +0000 (-0400)
Subject: Simplify avx2 chunkset a bit
X-Git-Tag: 2.2.3~28
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b52e70341700ac5fd68ca8584b87561911cf8a75;p=thirdparty%2Fzlib-ng.git

Simplify avx2 chunkset a bit

Put distance 16 in the distance checking ladder and take care of it there
since it's also a simple case to handle. We kind of went out of our way
to pretend 128 bit vectors didn't exist when using avx2 but this can be
handled in a single instruction. Strangely the intrinsic uses vector
register operands but the instruction itself assumes a memory operand
for the source. This also means we don't have to handle this case in our
"GET_CHUNK_MAG" function.
---

diff --git a/arch/x86/chunkset_avx2.c b/arch/x86/chunkset_avx2.c
index 70620b91..26bd004c 100644
--- a/arch/x86/chunkset_avx2.c
+++ b/arch/x86/chunkset_avx2.c
@@ -14,6 +14,7 @@ typedef __m256i chunk_t;
 #define HAVE_CHUNKMEMSET_2
 #define HAVE_CHUNKMEMSET_4
 #define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNKMEMSET_16
 #define HAVE_CHUNK_MAG
 
 /* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can
@@ -68,6 +69,10 @@ static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
     *chunk = _mm256_set1_epi64x(tmp);
 }
 
+static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
+    *chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)from)); /* 16 bytes at from -> both 128-bit lanes */
+}
+
 static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
     *chunk = _mm256_loadu_si256((__m256i *)s);
 }
@@ -99,10 +104,7 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
         perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
         ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
         ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
-    } else if (dist == 16) {
-        __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
-        return _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
-    } else {
+    }  else {
         __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
         __m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16));
         /* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */
diff --git a/chunkset_tpl.h b/chunkset_tpl.h
index f5cc5c04..64f2bbec 100644
--- a/chunkset_tpl.h
+++ b/chunkset_tpl.h
@@ -130,11 +130,16 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
 #ifdef HAVE_CHUNKMEMSET_8
     if (dist == 8) {
         chunkmemset_8(from, &chunk_load);
-    } else if (dist == sizeof(chunk_t)) {
-        loadchunk(from, &chunk_load);
     } else
 #endif
-    {
+#ifdef HAVE_CHUNKMEMSET_16
+    if (dist == 16) {
+        chunkmemset_16(from, &chunk_load);
+    } else
+#endif
+    if (dist == sizeof(chunk_t)) {
+        loadchunk(from, &chunk_load);
+    } else {
         chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist);
     }