From: Adam Stylinski
Date: Fri, 18 Mar 2022 23:18:10 +0000 (-0400)
Subject: Fix an issue with the ubsan for overflow
X-Git-Tag: 2.1.0-beta1~310
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=23c9191c01bf54f9d1ee03e3e82fac0aebcc1f15;p=thirdparty%2Fzlib-ng.git

Fix an issue with the ubsan for overflow

While this didn't _actually_ cause any issues for us, technically the
_mm512_reduce_add_epi32() intrinsic returns a signed integer and performs
the very last summation in scalar GPRs as signed integers. While the ALU
still did the math properly (the negative representation is the same
addition in hardware, just interpreted differently), the sanitizer caught a
window of inputs here that is definitely outside the range of a signed
integer for this operation.

The solution, as silly as it may seem, is to implement our own 32-bit
horizontal sum function that does all of the work in vector registers. This
keeps everything in the vector register domain and converts to a scalar only
at the very end, after the sum has been computed. The compiler's sanitizer
is none the wiser, and the result is still correct.
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 883e7945..2bc748a8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -711,6 +711,7 @@ if(WITH_OPTIM)
         list(APPEND AVX512_SRCS ${ARCHDIR}/adler32_avx512.c)
         add_feature_info(AVX512_ADLER32 1 "Support AVX512-accelerated adler32, using \"${AVX512FLAG}\"")
         list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS})
+        list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_avx512_p.h)
         if(HAVE_MASK_INTRIN)
             add_definitions(-DX86_MASK_INTRIN)
         endif()
diff --git a/arch/x86/adler32_avx512.c b/arch/x86/adler32_avx512.c
index c6327167..05f8068a 100644
--- a/arch/x86/adler32_avx512.c
+++ b/arch/x86/adler32_avx512.c
@@ -10,32 +10,10 @@
 #include "../../adler32_p.h"
 #include "../../cpu_features.h"
 #include "../../fallback_builtins.h"
-
 #include <immintrin.h>
+#include "adler32_avx512_p.h"
 
 #ifdef X86_AVX512_ADLER32
 
-static inline uint32_t partial_hsum(__m512i x) {
-    /* We need a permutation vector to extract every other integer. The
-     * rest are going to be zeros. Marking this const so the compiler stands
-     * a better chance of keeping this resident in a register through entire
-     * loop execution. We certainly have enough zmm registers (32) */
-    const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
-                                               1, 1, 1, 1, 1, 1, 1, 1);
-
-    __m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x);
-
-    /* From here, it's a simple 256 bit wide reduction sum */
-    __m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
-
-    /* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
-     * pretty slow, much slower than the longer instruction sequence below */
-    __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
-                                 _mm256_castsi256_si128(non_zero_avx));
-    __m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1));
-    __m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1));
-    return (uint32_t)_mm_cvtsi128_si32(sum3);
-}
-
 Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len) {
     uint32_t sum2;
@@ -112,7 +90,7 @@ Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, siz
         adler = partial_hsum(vs1) % BASE;
         vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler));
-        sum2 = _mm512_reduce_add_epi32(vs2) % BASE;
+        sum2 = _mm512_reduce_add_epu32(vs2) % BASE;
         vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(sum2));
     }
diff --git a/arch/x86/adler32_avx512_p.h b/arch/x86/adler32_avx512_p.h
new file mode 100644
index 00000000..3751a449
--- /dev/null
+++ b/arch/x86/adler32_avx512_p.h
@@ -0,0 +1,46 @@
+#ifndef AVX512_FUNCS_H
+#define AVX512_FUNCS_H
+
+#include <immintrin.h>
+#include <stdint.h>
+/* Written because *_add_epi32(a) sets off ubsan */
+static inline uint32_t _mm512_reduce_add_epu32(__m512i x) {
+    __m256i a = _mm512_extracti64x4_epi64(x, 1);
+    __m256i b = _mm512_extracti64x4_epi64(x, 0);
+
+    __m256i a_plus_b = _mm256_add_epi32(a, b);
+    __m128i c = _mm256_extracti128_si256(a_plus_b, 1);
+    __m128i d = _mm256_extracti128_si256(a_plus_b, 0);
+    __m128i c_plus_d = _mm_add_epi32(c, d);
+
+    __m128i sum1 = _mm_unpackhi_epi64(c_plus_d, c_plus_d);
+    __m128i sum2 = _mm_add_epi32(sum1, c_plus_d);
+    __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
+    __m128i sum4 = _mm_add_epi32(sum2, sum3);
+
+    return _mm_cvtsi128_si32(sum4);
+}
+
+static inline uint32_t partial_hsum(__m512i x) {
+    /* We need a permutation vector to extract every other integer. The
+     * rest are going to be zeros. Marking this const so the compiler stands
+     * a better chance of keeping this resident in a register through entire
+     * loop execution. We certainly have enough zmm registers (32) */
+    const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
+                                               1, 1, 1, 1, 1, 1, 1, 1);
+
+    __m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x);
+
+    /* From here, it's a simple 256 bit wide reduction sum */
+    __m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
+
+    /* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
+     * pretty slow, much slower than the longer instruction sequence below */
+    __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
+                                 _mm256_castsi256_si128(non_zero_avx));
+    __m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1));
+    __m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1));
+    return (uint32_t)_mm_cvtsi128_si32(sum3);
+}
+
+#endif
diff --git a/arch/x86/adler32_avx512_vnni.c b/arch/x86/adler32_avx512_vnni.c
index ff0a9b58..180f7f41 100644
--- a/arch/x86/adler32_avx512_vnni.c
+++ b/arch/x86/adler32_avx512_vnni.c
@@ -11,33 +11,10 @@
 #include "../../adler32_p.h"
 #include "../../cpu_features.h"
 #include "../../fallback_builtins.h"
-
 #include <immintrin.h>
+#include "adler32_avx512_p.h"
 
 #ifdef X86_AVX512VNNI_ADLER32
 
-
-static inline uint32_t partial_hsum(__m512i x) {
-    /* We need a permutation vector to extract every other integer. The
-     * rest are going to be zeros. Marking this const so the compiler stands
-     * a better chance of keeping this resident in a register through entire
-     * loop execution. We certainly have enough zmm registers (32) */
-    const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
-                                               1, 1, 1, 1, 1, 1, 1, 1);
-
-    __m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x);
-
-    /* From here, it's a simple 256 bit wide reduction sum */
-    __m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
-
-    /* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
-     * pretty slow, much slower than the longer instruction sequence below */
-    __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
-                                 _mm256_castsi256_si128(non_zero_avx));
-    __m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1));
-    __m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1));
-    return (uint32_t)_mm_cvtsi128_si32(sum3);
-}
-
 Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len) {
     uint32_t sum2;
@@ -142,7 +119,7 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf
         adler = partial_hsum(vs1) % BASE;
         vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler));
-        sum2 = _mm512_reduce_add_epi32(vs2) % BASE;
+        sum2 = _mm512_reduce_add_epu32(vs2) % BASE;
         vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(sum2));
     }
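
A quick way to sanity-check the new helper is to compare _mm512_reduce_add_epu32() against a plain scalar
unsigned sum on lane values whose total exceeds INT32_MAX. The program below is an illustrative sketch, not
part of the patch; it assumes arch/x86/adler32_avx512_p.h (added above) is on the include path and that the
translation unit is built with AVX-512 enabled (e.g. -mavx512f) on an AVX-512-capable CPU.

/* Illustrative sketch, not part of the commit: spot-check the unsigned
 * horizontal sum against a scalar reference on inputs whose total exceeds
 * INT32_MAX.  Assumes adler32_avx512_p.h from this patch is on the include
 * path and the code is compiled with AVX-512 support. */
#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>

#include "adler32_avx512_p.h"   /* provides _mm512_reduce_add_epu32() */

int main(void) {
    /* 16 lanes of 0x0FFFFFFF sum to 0xFFFFFFF0, which fits in a uint32_t
     * but is well outside the range of a signed 32-bit intermediate. */
    uint32_t lanes[16];
    uint32_t scalar_sum = 0;
    for (int i = 0; i < 16; i++) {
        lanes[i] = 0x0FFFFFFFu;
        scalar_sum += lanes[i];   /* unsigned add: wraps mod 2^32, no UB */
    }

    __m512i v = _mm512_loadu_si512((const void *)lanes);
    uint32_t vector_sum = _mm512_reduce_add_epu32(v);

    printf("vector: 0x%08x  scalar: 0x%08x\n",
           (unsigned)vector_sum, (unsigned)scalar_sum);
    return vector_sum == scalar_sum ? 0 : 1;
}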