From: Adam Stylinski
Date: Fri, 18 Mar 2022 23:18:10 +0000 (-0400)
Subject: Fix an issue with the ubsan for overflow
X-Git-Tag: 2.1.0-beta1~310
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=23c9191c01bf54f9d1ee03e3e82fac0aebcc1f15;p=thirdparty%2Fzlib-ng.git

Fix an issue with the ubsan for overflow

While this didn't _actually_ cause any issues for us, technically the
_mm512_reduce_add_epi32() intrinsic returns a signed integer and performs
the very last summation in scalar GPRs as signed integers. While the ALU
still did the math properly (the negative representation is the same
addition in hardware, just interpreted differently), the sanitizer caught a
window of inputs here that is definitely outside the range of a signed
integer for this operation.

The solution, as silly as it may seem, is to implement our own 32-bit
horizontal sum function that does all of the work in vector registers. This
keeps everything in the vector register domain and converts to a scalar only
at the very end, after the sum has been computed. The compiler's sanitizer
is none the wiser, and the result is still correct.
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 883e7945..2bc748a8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -711,6 +711,7 @@ if(WITH_OPTIM)
         list(APPEND AVX512_SRCS ${ARCHDIR}/adler32_avx512.c)
         add_feature_info(AVX512_ADLER32 1 "Support AVX512-accelerated adler32, using \"${AVX512FLAG}\"")
         list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS})
+        list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_avx512_p.h)
         if(HAVE_MASK_INTRIN)
             add_definitions(-DX86_MASK_INTRIN)
         endif()
diff --git a/arch/x86/adler32_avx512.c b/arch/x86/adler32_avx512.c
index c6327167..05f8068a 100644
--- a/arch/x86/adler32_avx512.c
+++ b/arch/x86/adler32_avx512.c
@@ -10,32 +10,10 @@
 #include "../../adler32_p.h"
 #include "../../cpu_features.h"
 #include "../../fallback_builtins.h"
-
 #include <immintrin.h>
+#include "adler32_avx512_p.h"
 
 #ifdef X86_AVX512_ADLER32
 
-static inline uint32_t partial_hsum(__m512i x) {
-    /* We need a permutation vector to extract every other integer. The
-     * rest are going to be zeros. Marking this const so the compiler stands
-     * a better chance of keeping this resident in a register through entire
-     * loop execution. We certainly have enough zmm registers (32) */
-    const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
-                                               1, 1, 1, 1, 1, 1, 1, 1);
-
-    __m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x);
-
-    /* From here, it's a simple 256 bit wide reduction sum */
-    __m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
-
-    /* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
-     * pretty slow, much slower than the longer instruction sequence below */
-    __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
-                                 _mm256_castsi256_si128(non_zero_avx));
-    __m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1));
-    __m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1));
-    return (uint32_t)_mm_cvtsi128_si32(sum3);
-}
-
 Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len) {
     uint32_t sum2;
@@ -112,7 +90,7 @@ Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, siz
         adler = partial_hsum(vs1) % BASE;
         vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler));
-        sum2 = _mm512_reduce_add_epi32(vs2) % BASE;
+        sum2 = _mm512_reduce_add_epu32(vs2) % BASE;
         vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(sum2));
     }
diff --git a/arch/x86/adler32_avx512_p.h b/arch/x86/adler32_avx512_p.h
new file mode 100644
index 00000000..3751a449
--- /dev/null
+++ b/arch/x86/adler32_avx512_p.h
@@ -0,0 +1,46 @@
+#ifndef AVX512_FUNCS_H
+#define AVX512_FUNCS_H
+
+#include <immintrin.h>
+#include <stdint.h>
+/* Written because *_add_epi32(a) sets off ubsan */
+static inline uint32_t _mm512_reduce_add_epu32(__m512i x) {
+    __m256i a = _mm512_extracti64x4_epi64(x, 1);
+    __m256i b = _mm512_extracti64x4_epi64(x, 0);
+
+    __m256i a_plus_b = _mm256_add_epi32(a, b);
+    __m128i c = _mm256_extracti128_si256(a_plus_b, 1);
+    __m128i d = _mm256_extracti128_si256(a_plus_b, 0);
+    __m128i c_plus_d = _mm_add_epi32(c, d);
+
+    __m128i sum1 = _mm_unpackhi_epi64(c_plus_d, c_plus_d);
+    __m128i sum2 = _mm_add_epi32(sum1, c_plus_d);
+    __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
+    __m128i sum4 = _mm_add_epi32(sum2, sum3);
+
+    return _mm_cvtsi128_si32(sum4);
+}
+
+static inline uint32_t partial_hsum(__m512i x) {
+    /* We need a permutation vector to extract every other integer. The
+     * rest are going to be zeros. Marking this const so the compiler stands
+     * a better chance of keeping this resident in a register through entire
+     * loop execution. We certainly have enough zmm registers (32) */
+    const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
+                                               1, 1, 1, 1, 1, 1, 1, 1);
+
+    __m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x);
+
+    /* From here, it's a simple 256 bit wide reduction sum */
+    __m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
+
+    /* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
+     * pretty slow, much slower than the longer instruction sequence below */
+    __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
+                                 _mm256_castsi256_si128(non_zero_avx));
+    __m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1));
+    __m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1));
+    return (uint32_t)_mm_cvtsi128_si32(sum3);
+}
+
+#endif
diff --git a/arch/x86/adler32_avx512_vnni.c b/arch/x86/adler32_avx512_vnni.c
index ff0a9b58..180f7f41 100644
--- a/arch/x86/adler32_avx512_vnni.c
+++ b/arch/x86/adler32_avx512_vnni.c
@@ -11,33 +11,10 @@
 #include "../../adler32_p.h"
 #include "../../cpu_features.h"
 #include "../../fallback_builtins.h"
-
 #include <immintrin.h>
+#include "adler32_avx512_p.h"
 
 #ifdef X86_AVX512VNNI_ADLER32
 
-
-static inline uint32_t partial_hsum(__m512i x) {
-    /* We need a permutation vector to extract every other integer. The
-     * rest are going to be zeros. Marking this const so the compiler stands
-     * a better chance of keeping this resident in a register through entire
-     * loop execution. We certainly have enough zmm registers (32) */
-    const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
-                                               1, 1, 1, 1, 1, 1, 1, 1);
-
-    __m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x);
-
-    /* From here, it's a simple 256 bit wide reduction sum */
-    __m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
-
-    /* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
-     * pretty slow, much slower than the longer instruction sequence below */
-    __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
-                                 _mm256_castsi256_si128(non_zero_avx));
-    __m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1));
-    __m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1));
-    return (uint32_t)_mm_cvtsi128_si32(sum3);
-}
-
 Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len) {
     uint32_t sum2;
@@ -142,7 +119,7 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf
         adler = partial_hsum(vs1) % BASE;
         vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler));
-        sum2 = _mm512_reduce_add_epi32(vs2) % BASE;
+        sum2 = _mm512_reduce_add_epu32(vs2) % BASE;
         vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(sum2));
     }
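
A quick way to sanity-check the new helper is to compare _mm512_reduce_add_epu32() against a plain scalar
unsigned sum on lane values whose total exceeds INT32_MAX. The program below is an illustrative sketch, not
part of the patch; it assumes arch/x86/adler32_avx512_p.h (added above) is on the include path and that the
translation unit is built with AVX-512 enabled (e.g. -mavx512f) on an AVX-512-capable CPU.

/* Illustrative sketch, not part of the commit: spot-check the unsigned
 * horizontal sum against a scalar reference on inputs whose total exceeds
 * INT32_MAX.  Assumes adler32_avx512_p.h from this patch is on the include
 * path and the code is compiled with AVX-512 support. */
#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>

#include "adler32_avx512_p.h"   /* provides _mm512_reduce_add_epu32() */

int main(void) {
    /* 16 lanes of 0x0FFFFFFF sum to 0xFFFFFFF0, which fits in a uint32_t
     * but is well outside the range of a signed 32-bit intermediate. */
    uint32_t lanes[16];
    uint32_t scalar_sum = 0;
    for (int i = 0; i < 16; i++) {
        lanes[i] = 0x0FFFFFFFu;
        scalar_sum += lanes[i];   /* unsigned add: wraps mod 2^32, no UB */
    }

    __m512i v = _mm512_loadu_si512((const void *)lanes);
    uint32_t vector_sum = _mm512_reduce_add_epu32(v);

    printf("vector: 0x%08x  scalar: 0x%08x\n",
           (unsigned)vector_sum, (unsigned)scalar_sum);
    return vector_sum == scalar_sum ? 0 : 1;
}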