From: Adam Stylinski Date: Wed, 11 Sep 2024 22:34:54 +0000 (-0400) Subject: Simplify avx2 chunkset a bit X-Git-Tag: 2.2.3~28 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b52e70341700ac5fd68ca8584b87561911cf8a75;p=thirdparty%2Fzlib-ng.git Simplify avx2 chunkset a bit Put length 16 in the length checking ladder and take care of it there since it's also a simple case to handle. We kind of went out of our way to pretend 128 bit vectors didn't exist when using avx2 but this can be handled in a single instruction. Strangely the intrinsic uses vector register operands but the instruction itself assumes a memory operand for the source. This also means we don't have to handle this case in our "GET_CHUNK_MAG" function. --- diff --git a/arch/x86/chunkset_avx2.c b/arch/x86/chunkset_avx2.c index 70620b91..26bd004c 100644 --- a/arch/x86/chunkset_avx2.c +++ b/arch/x86/chunkset_avx2.c @@ -14,6 +14,7 @@ typedef __m256i chunk_t; #define HAVE_CHUNKMEMSET_2 #define HAVE_CHUNKMEMSET_4 #define HAVE_CHUNKMEMSET_8 +#define HAVE_CHUNKMEMSET_16 #define HAVE_CHUNK_MAG /* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can @@ -68,6 +69,10 @@ static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { *chunk = _mm256_set1_epi64x(tmp); } +static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) { + *chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)from)); +} + static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { *chunk = _mm256_loadu_si256((__m256i *)s); } @@ -99,10 +104,7 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t perm_vec = _mm256_add_epi8(perm_vec, permute_xform); ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1); ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec); - } else if (dist == 16) { - __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf); - return _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1); - } else { + } else { __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf); __m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16)); /* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */ diff --git a/chunkset_tpl.h b/chunkset_tpl.h index f5cc5c04..64f2bbec 100644 --- a/chunkset_tpl.h +++ b/chunkset_tpl.h @@ -130,11 +130,16 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) { #ifdef HAVE_CHUNKMEMSET_8 if (dist == 8) { chunkmemset_8(from, &chunk_load); - } else if (dist == sizeof(chunk_t)) { - loadchunk(from, &chunk_load); } else #endif - { +#ifdef HAVE_CHUNKMEMSET_16 + if (dist == 16) { + chunkmemset_16(from, &chunk_load); + } else +#endif + if (dist == sizeof(chunk_t)) { + loadchunk(from, &chunk_load); + } else { chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist); }