list(APPEND AVX512_SRCS ${ARCHDIR}/adler32_avx512.c)
add_feature_info(AVX512_ADLER32 1 "Support AVX512-accelerated adler32, using \"${AVX512FLAG}\"")
list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS})
+ list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_avx512_p.h)
if(HAVE_MASK_INTRIN)
add_definitions(-DX86_MASK_INTRIN)
endif()
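HAVE_MASK_INTRIN guards use of the AVX-512 mask-register conversion intrinsics, which older toolchains ship without. A configure-time probe along these lines could drive it; this is a sketch only, the real check in the build system may differ, and the two calls are just representative mask intrinsics (_cvtu32_mask16 and _cvtmask16_u32 are genuine AVX-512F conversions):

/* Hypothetical compile probe: builds with "${AVX512FLAG}" only when the
 * toolchain provides the mask intrinsics, in which case the build can
 * define X86_MASK_INTRIN. */
#include <immintrin.h>

int main(void) {
    __mmask16 m = _cvtu32_mask16(0xffffu);
    return (int)_cvtmask16_u32(m);
}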
#include "../../adler32_p.h"
#include "../../cpu_features.h"
#include "../../fallback_builtins.h"
-
#include <immintrin.h>
+#include "adler32_avx512_p.h"
#ifdef X86_AVX512_ADLER32
-static inline uint32_t partial_hsum(__m512i x) {
- /* We need a permutation vector to extract every other integer. The
- * rest are going to be zeros. Marking this const so the compiler stands
- * a better chance of keeping this resident in a register through entire
- * loop execution. We certainly have enough zmm registers (32) */
- const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
- 1, 1, 1, 1, 1, 1, 1, 1);
-
- __m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x);
-
- /* From here, it's a simple 256 bit wide reduction sum */
- __m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
-
- /* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
- * pretty slow, much slower than the longer instruction sequence below */
- __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
- _mm256_castsi256_si128(non_zero_avx));
- __m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1));
- __m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1));
- return (uint32_t)_mm_cvtsi128_si32(sum3);
-}
-
Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len) {
uint32_t sum2;
adler = partial_hsum(vs1) % BASE;
vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler));
- sum2 = _mm512_reduce_add_epi32(vs2) % BASE;
+ sum2 = _mm512_reduce_add_epu32(vs2) % BASE;
vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(sum2));
}
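For orientation, vs1 and vs2 vectorize the two Adler-32 accumulators, and the fold above re-seeds them after each mod-BASE reduction. A scalar sketch of the recurrence the vector loop implements (BASE is Adler-32's modulus, 65521):

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of what the vector loop accumulates: s1 is the running
 * byte sum, s2 the running sum of s1, both folded mod BASE (65521). */
static uint32_t adler32_scalar_sketch(uint32_t adler, const unsigned char *buf, size_t len) {
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = (adler >> 16) & 0xffff;
    for (size_t i = 0; i < len; i++) {
        s1 = (s1 + buf[i]) % 65521;
        s2 = (s2 + s1) % 65521;
    }
    return (s2 << 16) | s1;
}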
--- /dev/null
+++ b/arch/x86/adler32_avx512_p.h
+#ifndef AVX512_FUNCS_H
+#define AVX512_FUNCS_H
+
+#include <immintrin.h>
+#include <stdint.h>
+/* Written because the stock _mm512_reduce_add_epi32() is expanded into signed additions, which can overflow and set off UBSan */
+static inline uint32_t _mm512_reduce_add_epu32(__m512i x) {
+ __m256i a = _mm512_extracti64x4_epi64(x, 1);
+ __m256i b = _mm512_extracti64x4_epi64(x, 0);
+
+ __m256i a_plus_b = _mm256_add_epi32(a, b);
+ __m128i c = _mm256_extracti128_si256(a_plus_b, 1);
+ __m128i d = _mm256_extracti128_si256(a_plus_b, 0);
+ __m128i c_plus_d = _mm_add_epi32(c, d);
+
+ __m128i sum1 = _mm_unpackhi_epi64(c_plus_d, c_plus_d);
+ __m128i sum2 = _mm_add_epi32(sum1, c_plus_d);
+ __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
+ __m128i sum4 = _mm_add_epi32(sum2, sum3);
+
+ return (uint32_t)_mm_cvtsi128_si32(sum4);
+}
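For reference (not part of the patch), the scalar picture of this reduction: unsigned additions wrap modulo 2^32, which is well defined, whereas the signed additions in the stock epi32 reduction can overflow into undefined behavior:

#include <stdint.h>

/* Sums 16 uint32 lanes the way _mm512_reduce_add_epu32() does;
 * unsigned wraparound is defined, so UBSan stays quiet. */
static inline uint32_t reduce_add_u32_scalar(const uint32_t lanes[16]) {
    uint32_t sum = 0;
    for (int i = 0; i < 16; i++)
        sum += lanes[i];
    return sum;
}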
+
+static inline uint32_t partial_hsum(__m512i x) {
+ /* We need a permutation vector to extract every other integer. The
+ * rest are going to be zeros. Marking this const so the compiler stands
+ * a better chance of keeping this resident in a register through the entire
+ * loop execution. We certainly have enough zmm registers (32) */
+ const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
+ 1, 1, 1, 1, 1, 1, 1, 1);
+
+ __m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x);
+
+ /* From here, it's a simple 256-bit-wide reduction sum */
+ __m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
+
+ /* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
+ * pretty slow, much slower than the longer instruction sequence below */
+ __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
+ _mm256_castsi256_si128(non_zero_avx));
+ __m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1));
+ __m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1));
+ return (uint32_t)_mm_cvtsi128_si32(sum3);
+}
+
+#endif
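In scalar terms, partial_hsum() computes the sum of the even lanes only (per its own comment, the odd lanes hold zeros); a reference sketch:

#include <stdint.h>

/* Scalar equivalent of partial_hsum(): only the even lanes carry data,
 * so the permutation packs them into the low 256 bits and the rest of
 * the reduction is a plain horizontal add. */
static inline uint32_t partial_hsum_scalar(const uint32_t lanes[16]) {
    uint32_t sum = 0;
    for (int i = 0; i < 16; i += 2)
        sum += lanes[i];
    return sum;
}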
#include "../../adler32_p.h"
#include "../../cpu_features.h"
#include "../../fallback_builtins.h"
-
#include <immintrin.h>
+#include "adler32_avx512_p.h"
#ifdef X86_AVX512VNNI_ADLER32
-
-static inline uint32_t partial_hsum(__m512i x) {
- /* We need a permutation vector to extract every other integer. The
- * rest are going to be zeros. Marking this const so the compiler stands
- * a better chance of keeping this resident in a register through entire
- * loop execution. We certainly have enough zmm registers (32) */
- const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
- 1, 1, 1, 1, 1, 1, 1, 1);
-
- __m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x);
-
- /* From here, it's a simple 256 bit wide reduction sum */
- __m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
-
- /* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
- * pretty slow, much slower than the longer instruction sequence below */
- __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
- _mm256_castsi256_si128(non_zero_avx));
- __m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1));
- __m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1));
- return (uint32_t)_mm_cvtsi128_si32(sum3);
-}
-
Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len) {
uint32_t sum2;
adler = partial_hsum(vs1) % BASE;
vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler));
- sum2 = _mm512_reduce_add_epi32(vs2) % BASE;
+ sum2 = _mm512_reduce_add_epu32(vs2) % BASE;
vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(sum2));
}
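A minimal cross-check sketch for both entry points, assuming zlib-ng's generic adler32_c as the scalar reference and a CPU that supports the relevant extensions (names are the internal ones; this is not part of the patch):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* zlib-ng internal entry points, repeated here so the sketch is self-contained */
uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len);
uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len);
uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len);

/* Hypothetical self-test: all three implementations must agree;
 * assumes the CPU actually supports AVX512 / AVX512VNNI. */
static void check_adler32(const unsigned char *buf, size_t len) {
    uint32_t ref = adler32_c(1, buf, len);
    assert(adler32_avx512(1, buf, len) == ref);
    assert(adler32_avx512_vnni(1, buf, len) == ref);
}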