From: Adam Stylinski
Date: Thu, 2 Dec 2021 22:05:55 +0000 (-0500)
Subject: Made this work on 32 bit compilations
X-Git-Tag: 2.1.0-beta1~484
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=1083c8e7132a010212ec490932e6f1964da58cda;p=thirdparty%2Fzlib-ng.git

Made this work on 32 bit compilations

The movq form that moves from a 128 bit register to a 64 bit GPR is not
available in 32 bit code, since 32 bit mode has no 64 bit GPRs to move
into. A simple workaround seems to be to spill the low 64 bits through
memory and let the compiler pick them up with movl when compiling with
-m32. Also addressing some style nits.
---

diff --git a/arch/x86/adler32_avx.c b/arch/x86/adler32_avx.c
index 954a8a73a..f22730bb1 100644
--- a/arch/x86/adler32_avx.c
+++ b/arch/x86/adler32_avx.c
@@ -9,20 +9,24 @@
 #include "../../zutil.h"
 #include "../../adler32_p.h"
 
-#include 
 #include 
 
 #ifdef X86_AVX2_ADLER32
 
-/* 64 bit horizontal sum, adapted from Agner Fog's
- * vector library. */
-static inline uint64_t hsum(__m256i x)
-{
+/* 64 bit horizontal sum, adapted from Agner Fog's vector library. */
+static inline uint64_t hsum(__m256i x) {
     __m256i sum1 = _mm256_shuffle_epi32(x, 0x0E);
     __m256i sum2 = _mm256_add_epi64(x, sum1);
     __m128i sum3 = _mm256_extracti128_si256(sum2, 1);
+#if defined(__x86_64__) || defined(_M_X64)
     return _mm_cvtsi128_si64(_mm_add_epi64(_mm256_castsi256_si128(sum2), sum3));
+#else
+    __m128i ret_vec = _mm_add_epi64(_mm256_castsi256_si128(sum2), sum3);
+    uint64_t ret_val;
+    _mm_storel_epi64((__m128i*)&ret_val, ret_vec);
+    return ret_val;
+#endif
 }
 
 Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len) {
@@ -44,17 +48,15 @@ Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_
     if (UNLIKELY(len < 16))
         return adler32_len_16(adler, buf, len, sum2);
 
-    /* If we could shift over 128 bit lanes, a broadcast + shift would be better */
-    const __m256i sMask = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, -1);
+    const __m256i vs_mask = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, -1);
     __m256i vs1 = _mm256_set1_epi32(adler);
     __m256i vs2 = _mm256_set1_epi32(sum2);
-    vs1 = _mm256_and_si256(vs1, sMask);
-    vs2 = _mm256_and_si256(vs2, sMask);
+    vs1 = _mm256_and_si256(vs1, vs_mask);
+    vs2 = _mm256_and_si256(vs2, vs_mask);
     const __m256i dot1v = _mm256_set1_epi8(1);
-    const __m256i dot2v = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22,
-                                           21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11,
-                                           10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+    const __m256i dot2v = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
+                                           14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
     const __m256i dot3v = _mm256_set1_epi16(1);
 
     while (len >= 32) {
@@ -67,7 +69,7 @@ Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_
         while (k >= 32) {
             /*
                vs1 = adler + sum(c[i])
-               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+               vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
             */
             __m256i vbuf = _mm256_loadu_si256((__m256i*)buf);
             buf += 32;
@@ -86,66 +88,62 @@ Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_
         /* The compiler is generating the following sequence for this integer modulus
          * when done the scalar way, in GPRs:
-         mov    $0x80078071,%edi // move magic constant into 32 bit register %edi
-         ...
-         vmovd  %xmm1,%esi // move vector lane 0 to 32 bit register %esi
-         mov    %rsi,%rax // zero-extend this value to 64 bit precision in %rax
-         imul   %rdi,%rsi // do a signed multiplication with magic constant and vector element
-         shr    $0x2f,%rsi // shift right by 47
-         imul   $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
-         sub    %esi,%eax // subtract lower 32 bits of original vector value from modified one above
-         ...
-         // repeats for each element with vpextract instructions
-
-         This is tricky with AVX2 for a number of reasons:
-             1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
-             2.) There's ways to extend vectors to 64 bit precision, but no simple way to truncate
-                 back down to 32 bit precision later (there is in AVX512)
-             3.) Full width integer multiplications aren't cheap
-        */
-
-        // At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that
-        // would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on.
-        /*uint32_t ALIGNED_(32) s1_unpack[8];
-        uint32_t ALIGNED_(32) s2_unpack[8];
-
-        _mm256_store_si256((__m256i*)s1_unpack, vs1);
-        _mm256_store_si256((__m256i*)s2_unpack, vs2);
-
-
-        adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
-                (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);*/
-
-        /* Will translate to nops */
-        __m128i s1lo = _mm256_castsi256_si128(vs1);
-        __m128i s2lo = _mm256_castsi256_si128(vs2);
-
-        __m128i s1hi = _mm256_extracti128_si256(vs1, 1);
-        __m128i s2hi = _mm256_extracti128_si256(vs2, 1);
-
-        /* Convert up to 64 bit precision to prevent overflow */
-        __m256i s1lo256 = _mm256_cvtepi32_epi64(s1lo);
-        __m256i s1hi256 = _mm256_cvtepi32_epi64(s1hi);
-        __m256i s2lo256 = _mm256_cvtepi32_epi64(s2lo);
-        __m256i s2hi256 = _mm256_cvtepi32_epi64(s2hi);
-
-        /* sum vectors in existing lanes */
-        __m256i s1Sum = _mm256_add_epi64(s1lo256, s1hi256);
-        __m256i s2Sum = _mm256_add_epi64(s2lo256, s2hi256);
-
-        /* In AVX2-land, this trip through GPRs will probably
-         * be unvoidable, as there's no cheap and easy conversion
-         * from 64 bit integer to 32 bit. This casting to 32 bit
-         * is cheap through GPRs (just register aliasing), and safe,
-         * as our base is significantly smaller than UINT32_MAX */
-        adler = (uint32_t)(hsum(s1Sum) % BASE);
-        sum2 = (uint32_t)(hsum(s2Sum) % BASE);
-
-        vs1 = _mm256_set1_epi32(adler);
-        vs1 = _mm256_and_si256(vs1, sMask);
+
+         adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
+                 (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
+
+         mov    $0x80078071,%edi // move magic constant into 32 bit register %edi
+         ...
+         vmovd  %xmm1,%esi // move vector lane 0 to 32 bit register %esi
+         mov    %rsi,%rax // zero-extend this value to 64 bit precision in %rax
+         imul   %rdi,%rsi // do a signed multiplication with magic constant and vector element
+         shr    $0x2f,%rsi // shift right by 47
+         imul   $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
+         sub    %esi,%eax // subtract lower 32 bits of original vector value from modified one above
+         ...
+         // repeats for each element with vpextract instructions
+
+         This is tricky with AVX2 for a number of reasons:
+             1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
+             2.) There's ways to extend vectors to 64 bit precision, but no simple way to truncate
+                 back down to 32 bit precision later (there is in AVX512)
+             3.) Full width integer multiplications aren't cheap
+
+         We can, however, cast up to 64 bit precision on all 8 integers at once, and do a relatively
+         cheap sequence for horizontal sums. Then, we simply do the integer modulus on the resulting
+         64 bit GPR, on a scalar value
+        */
+
-        vs2 = _mm256_set1_epi32(sum2);
-        vs2 = _mm256_and_si256(vs2, sMask);
+        /* Will translate to nops */
+        __m128i s1lo = _mm256_castsi256_si128(vs1);
+        __m128i s2lo = _mm256_castsi256_si128(vs2);
+
+        /* Requires vextracti128 */
+        __m128i s1hi = _mm256_extracti128_si256(vs1, 1);
+        __m128i s2hi = _mm256_extracti128_si256(vs2, 1);
+
+        /* Convert up to 64 bit precision to prevent overflow */
+        __m256i s1lo256 = _mm256_cvtepi32_epi64(s1lo);
+        __m256i s1hi256 = _mm256_cvtepi32_epi64(s1hi);
+        __m256i s2lo256 = _mm256_cvtepi32_epi64(s2lo);
+        __m256i s2hi256 = _mm256_cvtepi32_epi64(s2hi);
+
+        /* Sum vectors in existing lanes */
+        __m256i s1_sum = _mm256_add_epi64(s1lo256, s1hi256);
+        __m256i s2_sum = _mm256_add_epi64(s2lo256, s2hi256);
+
+        /* In AVX2-land, this trip through GPRs will probably be unvoidable, as there's no cheap and easy
+         * conversion from 64 bit integer to 32 bit. This casting to 32 bit is cheap through GPRs
+         * (just register aliasing), and safe, as our base is significantly smaller than UINT32_MAX */
+        adler = (uint32_t)(hsum(s1_sum) % BASE);
+        sum2 = (uint32_t)(hsum(s2_sum) % BASE);
+
+        vs1 = _mm256_set1_epi32(adler);
+        vs2 = _mm256_set1_epi32(sum2);
+
+        vs1 = _mm256_and_si256(vs1, vs_mask);
+        vs2 = _mm256_and_si256(vs2, vs_mask);
     }
 
     /* Process tail (len < 16). */
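
For readers who want to poke at the 32 bit path outside of zlib-ng, below is a minimal standalone sketch of the approach the patch lands on: pair up the 64 bit lanes, add the two 128 bit halves, and pull out the low 64 bits without the register-to-GPR movq that 32 bit code lacks, then widen the eight 32 bit partial sums and take a single scalar modulus. The hsum64/reduce_mod_base names, the test values and the main() harness are illustrative only and not part of the commit; build with something like gcc -mavx2 (add -m32 to exercise the fallback branch).

/* hsum_sketch.c -- illustrative only, not part of the commit.
 * Mirrors hsum() and the widen-then-reduce step from the patch. */
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

#define BASE 65521U /* Adler-32 modulus, largest prime below 65536 */

/* Sum the four 64 bit lanes of a 256 bit vector. */
static inline uint64_t hsum64(__m256i x) {
    __m256i swapped = _mm256_shuffle_epi32(x, 0x0E);   /* high 64 bits of each 128 bit lane -> low */
    __m256i paired  = _mm256_add_epi64(x, swapped);    /* lane0+lane1 | lane2+lane3 */
    __m128i upper   = _mm256_extracti128_si256(paired, 1);
    __m128i total   = _mm_add_epi64(_mm256_castsi256_si128(paired), upper);
#if defined(__x86_64__) || defined(_M_X64)
    /* 64 bit targets: movq from an xmm register straight into a 64 bit GPR. */
    return (uint64_t)_mm_cvtsi128_si64(total);
#else
    /* 32 bit targets have no 64 bit GPR, so spill the low 64 bits through
     * memory and let the compiler read them back (typically two movl loads). */
    uint64_t ret;
    _mm_storel_epi64((__m128i *)&ret, total);
    return ret;
#endif
}

/* Widen eight 32 bit partial sums to 64 bits so the additions cannot
 * overflow, add the halves, horizontally sum, then one scalar modulus. */
static uint32_t reduce_mod_base(__m256i partial_sums) {
    __m256i lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(partial_sums));
    __m256i hi = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(partial_sums, 1));
    return (uint32_t)(hsum64(_mm256_add_epi64(lo, hi)) % BASE);
}

int main(void) {
    /* Illustrative lane values standing in for the vs1/vs2 accumulators. */
    __m256i vs1 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 100000);
    uint32_t expect = (uint32_t)((1 + 2 + 3 + 4 + 5 + 6 + 7 + 100000ULL) % BASE);
    printf("reduced=%u expected=%u\n", reduce_mod_base(vs1), expect);
    return 0;
}

On a 64 bit build the #if branch compiles to a single vmovq into a GPR; on -m32 the store-and-reload path typically becomes a vmovq store followed by two movl loads, which is the workaround the commit message describes.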
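The corrected loop comment (vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )) together with the 32..1 weights in dot2v can also be read as one 32 byte step of the Adler-32 recurrence. The sketch below is a self-contained illustration of that step, not zlib-ng's actual inner loop: it uses _mm256_sad_epu8 for the plain byte sum (the real code keeps dot1v/dot3v style accumulators in registers across iterations) and spills the horizontal sums through memory for clarity. All function and variable names here are hypothetical.

/* adler32_block32_sketch.c -- illustrative only; one 32 byte step of the
 * recurrence described in the updated comment. Build with: gcc -mavx2 */
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

#define BASE 65521U

static void adler_block32(const uint8_t *buf, uint32_t *adler, uint32_t *sum2) {
    /* Weights 32..1, matching dot2v in the patch. */
    const __m256i dot2v = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
                                           16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
    const __m256i ones16 = _mm256_set1_epi16(1);
    __m256i vbuf = _mm256_loadu_si256((const __m256i *)buf);

    /* sum(c[i]): sum of absolute differences against zero gives four 64 bit partial sums. */
    uint64_t sad[4];
    _mm256_storeu_si256((__m256i *)sad, _mm256_sad_epu8(vbuf, _mm256_setzero_si256()));
    uint32_t csum = (uint32_t)(sad[0] + sad[1] + sad[2] + sad[3]);

    /* sum((32-i+1)*c[i]): u8*s8 pair products to 16 bits, then pairs summed to 32 bits. */
    uint32_t w[8];
    _mm256_storeu_si256((__m256i *)w, _mm256_madd_epi16(_mm256_maddubs_epi16(vbuf, dot2v), ones16));
    uint32_t wsum = w[0] + w[1] + w[2] + w[3] + w[4] + w[5] + w[6] + w[7];

    /* vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] );  vs1 = adler + sum(c[i]) */
    *sum2  = (uint32_t)((*sum2 + 32u * *adler + wsum) % BASE);
    *adler = (uint32_t)((*adler + csum) % BASE);
}

int main(void) {
    uint8_t buf[32];
    for (int i = 0; i < 32; i++)
        buf[i] = (uint8_t)(i * 7 + 3);

    /* Scalar byte-at-a-time reference for the same block. */
    uint32_t ref_adler = 1, ref_sum2 = 0;
    for (int i = 0; i < 32; i++) {
        ref_adler = (ref_adler + buf[i]) % BASE;
        ref_sum2  = (ref_sum2 + ref_adler) % BASE;
    }

    uint32_t adler = 1, sum2 = 0;
    adler_block32(buf, &adler, &sum2);
    printf("vector: %u/%u  scalar: %u/%u\n", adler, sum2, ref_adler, ref_sum2);
    return 0;
}

Running it prints the same adler/sum2 pair from the vector step and from the plain scalar loop, which is a quick way to convince yourself that the 32 and (32-i+1) factors in the updated comment are the right ones.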