option(WITH_CRC32_VX "Build with vectorized CRC32 on IBM Z" ON)
elseif(BASEARCH_X86_FOUND)
option(WITH_AVX2 "Build with AVX2" ON)
+ option(WITH_AVX512 "Build with AVX512" ON)
+ option(WITH_AVX512VNNI "Build with AVX512 VNNI extensions" ON)
option(WITH_SSE2 "Build with SSE2" ON)
option(WITH_SSSE3 "Build with SSSE3" ON)
option(WITH_SSE4 "Build with SSE4" ON)
set(WITH_AVX2 OFF)
endif()
endif()
+ if(WITH_AVX512)
+ check_avx512_intrinsics()
+ if(HAVE_AVX512_INTRIN)
+ add_definitions(-DX86_AVX512 -DX86_AVX512_ADLER32)
+ list(APPEND AVX512_SRCS ${ARCHDIR}/adler32_avx512.c)
+ add_feature_info(AVX512_ADLER32 1 "Support AVX512-accelerated adler32, using \"${AVX512FLAG}\"")
+ list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS})
+ if(HAVE_MASK_INTRIN)
+ add_definitions(-DX86_MASK_INTRIN)
+ endif()
+ set_property(SOURCE ${AVX512_SRCS} PROPERTY COMPILE_FLAGS "${AVX512FLAG} ${NOLTOFLAG}")
+ else()
+ set(WITH_AVX512 OFF)
+ endif()
+ endif()
+ if(WITH_AVX512VNNI)
+ check_avx512vnni_intrinsics()
+ if(HAVE_AVX512VNNI_INTRIN)
+ add_definitions(-DX86_AVX512VNNI -DX86_AVX512VNNI_ADLER32)
+ add_feature_info(AVX512VNNI_ADLER32 1 "Support AVX512VNNI-accelerated adler32, using \"${AVX512VNNIFLAG}\"")
+ list(APPEND AVX512VNNI_SRCS ${ARCHDIR}/adler32_avx512_vnni.c)
+ list(APPEND ZLIB_ARCH_SRCS ${AVX512VNNI_SRCS})
+ set_property(SOURCE ${AVX512VNNI_SRCS} PROPERTY COMPILE_FLAGS "${AVX512VNNIFLAG} ${NOLTOFLAG}")
+ else()
+ set(WITH_AVX512VNNI OFF)
+ endif()
+ endif()
if(WITH_SSE4)
check_sse4_intrinsics()
if(HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN)
add_feature_info(WITH_CRC32_VX WITH_CRC32_VX "Build with vectorized CRC32 on IBM Z")
elseif(BASEARCH_X86_FOUND)
add_feature_info(WITH_AVX2 WITH_AVX2 "Build with AVX2")
+ add_feature_info(WITH_AVX512 WITH_AVX512 "Build with AVX512")
+ add_feature_info(WITH_AVX512VNNI WITH_AVX512VNNI "Build with AVX512 VNNI")
add_feature_info(WITH_SSE2 WITH_SSE2 "Build with SSE2")
add_feature_info(WITH_SSSE3 WITH_SSSE3 "Build with SSSE3")
add_feature_info(WITH_SSE4 WITH_SSE4 "Build with SSE4")
* Modern C11 syntax and a clean code layout
* Deflate medium and quick algorithms based on Intel's zlib fork
* Support for CPU intrinsics when available
- * Adler32 implementation using SSSE3, AVX2, Neon, VMX & VSX
+ * Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX
* CRC32-B implementation using PCLMULQDQ & ACLE
* Hash table implementation using CRC32-C intrinsics on x86 and ARM
* Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX
| UNALIGNED_OK | | Allow unaligned reads | ON (x86, arm) |
| | --force-sse2 | Skip runtime check for SSE2 instructions (Always on for x86_64) | OFF (x86) |
| WITH_AVX2 | | Build with AVX2 intrinsics | ON |
+| WITH_AVX512 | | Build with AVX512 intrinsics | ON |
+| WITH_AVX512VNNI | | Build with AVX512VNNI intrinsics | ON |
| WITH_SSE2 | | Build with SSE2 intrinsics | ON |
| WITH_SSE4 | | Build with SSE4 intrinsics | ON |
| WITH_PCLMULQDQ | | Build with PCLMULQDQ intrinsics | ON |
INCLUDES=
SUFFIX=
+AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw
+AVX512VNNIFLAG=-mavx512vnni
AVX2FLAG=-mavx2
SSE2FLAG=-msse2
SSSE3FLAG=-mssse3
all: \
x86.o x86.lo \
- adler32_avx.o adler32.lo \
+ adler32_avx.o adler32_avx.lo \
+ adler32_avx512.o adler32_avx512.lo \
+ adler32_avx512_vnni.o adler32_avx512_vnni.lo \
adler32_ssse3.o adler32_ssse3.lo \
chunkset_avx.o chunkset_avx.lo \
chunkset_sse.o chunkset_sse.lo \
adler32_avx.lo: $(SRCDIR)/adler32_avx.c
$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx.c
+adler32_avx512.o: $(SRCDIR)/adler32_avx512.c
+ $(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
+
+adler32_avx512.lo: $(SRCDIR)/adler32_avx512.c
+ $(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
+
+adler32_avx512_vnni.o: $(SRCDIR)/adler32_avx512_vnni.c
+ $(CC) $(CFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
+
+adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c
+ $(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
+
adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
--- /dev/null
+/* adler32_avx512.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../zutil.h"
+
+#include "../../adler32_p.h"
+
+#include <immintrin.h>
+
+#ifdef X86_AVX512_ADLER32
+
+Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len) {
+ uint32_t sum2;
+
+ /* split Adler-32 into component sums */
+ sum2 = (adler >> 16) & 0xffff;
+ adler &= 0xffff;
+
+ /* in case user likes doing a byte at a time, keep it fast */
+ if (UNLIKELY(len == 1))
+ return adler32_len_1(adler, buf, sum2);
+
+ /* initial Adler-32 value (deferred check for len == 1 speed) */
+ if (UNLIKELY(buf == NULL))
+ return 1L;
+
+ /* in case short lengths are provided, keep it somewhat fast */
+ if (UNLIKELY(len < 16))
+ return adler32_len_16(adler, buf, len, sum2);
+
+ const __mmask16 vs_mask = 1U << 15;
+ __m512i vs1 = _mm512_maskz_set1_epi32(vs_mask, adler);
+ __m512i vs2 = _mm512_maskz_set1_epi32(vs_mask, sum2);
+
+ const __m512i dot1v = _mm512_set1_epi8(1);
+ const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+ 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63, 64);
+ const __m512i dot3v = _mm512_set1_epi16(1);
+
+ while (len >= 64) {
+ __m512i vs1_0 = vs1;
+
+ int k = (len < NMAX ? (int)len : NMAX);
+ k -= k % 64;
+ len -= k;
+
+ while (k >= 64) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
+ */
+ __m512i vbuf = _mm512_loadu_si512(buf);
+ buf += 64;
+ k -= 64;
+
+ __m512i v_short_sum1 = _mm512_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 32 shorts.
+ __m512i vsum1 = _mm512_madd_epi16(v_short_sum1, dot3v); // sum pairs of shorts into 16 int32_t
+ __m512i v_short_sum2 = _mm512_maddubs_epi16(vbuf, dot2v);
+ vs1 = _mm512_add_epi32(vsum1, vs1);
+ __m512i vsum2 = _mm512_madd_epi16(v_short_sum2, dot3v);
+ vs1_0 = _mm512_slli_epi32(vs1_0, 6);
+ vsum2 = _mm512_add_epi32(vsum2, vs2);
+ vs2 = _mm512_add_epi32(vsum2, vs1_0);
+ vs1_0 = vs1;
+ }
+
+ adler = _mm512_reduce_add_epi32(vs1) % BASE;
+ vs1 = _mm512_maskz_set1_epi32(vs_mask, adler);
+ sum2 = _mm512_reduce_add_epi32(vs2) % BASE;
+ vs2 = _mm512_maskz_set1_epi32(vs_mask, sum2);
+ }
+
+ /* Process tail (len < 64). */
+ return adler32_len_16(adler, buf, len, sum2);
+}
+
+#endif
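For reference, the recurrence that the 64-byte SIMD blocks above implement can be stated in a few lines of scalar C. The sketch below is illustrative only and not part of the patch (the name adler32_scalar_ref is made up); it reduces modulo the base on every byte for simplicity instead of deferring the reduction the way the vector code does.

/* Illustrative scalar sketch: for each byte c, s1 += c and s2 += s1 (mod 65521).
 * Over a 64-byte block this is equivalent to s2 += 64*s1_old + sum((64-i)*c[i]),
 * which is exactly what the maddubs/madd weights 64..1 compute above. */
#include <stddef.h>
#include <stdint.h>

#define ADLER_BASE 65521u  /* largest prime smaller than 2^16 */

static uint32_t adler32_scalar_ref(uint32_t adler, const unsigned char *buf, size_t len) {
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = (adler >> 16) & 0xffff;
    while (len--) {
        s1 = (s1 + *buf++) % ADLER_BASE;  /* running byte sum */
        s2 = (s2 + s1) % ADLER_BASE;      /* sum of the running sums */
    }
    return (s2 << 16) | s1;
}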
--- /dev/null
+/* adler32_avx512_vnni.c -- compute the Adler-32 checksum of a data stream
+ * Based on Brian Bockelman's AVX2 version
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../zutil.h"
+
+#include "../../adler32_p.h"
+
+#include <immintrin.h>
+
+#ifdef X86_AVX512VNNI_ADLER32
+
+static inline uint32_t partial_hsum(__m512i x)
+{
+ /* We need a permutation vector to extract every other integer: after
+ * _mm512_sad_epu8, the sums land in the even 32-bit lanes and the odd
+ * lanes are zero. Marking this const so the compiler stands a better
+ * chance of keeping it resident in a register through the entire loop
+ * execution. We certainly have enough zmm registers (32) */
+ const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
+ 1, 1, 1, 1, 1, 1, 1, 1);
+ __m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x);
+
+ /* From here, it's a simple 256 bit wide reduction sum */
+ __m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
+
+ /* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
+ * pretty slow, much slower than the longer instruction sequence below */
+ __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
+ _mm256_castsi256_si128(non_zero_avx));
+ __m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1));
+ __m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1));
+ return (uint32_t)_mm_cvtsi128_si32(sum3);
+}
+
+Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len) {
+ uint32_t sum2;
+
+ /* split Adler-32 into component sums */
+ sum2 = (adler >> 16) & 0xffff;
+ adler &= 0xffff;
+
+ /* in case user likes doing a byte at a time, keep it fast */
+ if (UNLIKELY(len == 1))
+ return adler32_len_1(adler, buf, sum2);
+
+ /* initial Adler-32 value (deferred check for len == 1 speed) */
+ if (UNLIKELY(buf == NULL))
+ return 1L;
+
+ /* in case short lengths are provided, keep it somewhat fast */
+ if (UNLIKELY(len < 16))
+ return adler32_len_16(adler, buf, len, sum2);
+
+ const __mmask16 vs_mask = 1U << 15;
+ /* We want to place initial adler sum at vector position 0, as it is one of the lanes that line up
+ * with the sum of absolute differences' reduction sum. If we do this, we can get away with a partial,
+ * less expensive horizontal sum for the vs1 component at the end. It also happens to be marginally better
+ * (by a single cycle) to do this with the ancient vmovd instruction, and simply allow the register to be
+ * aliased up to a 512 bit wide zmm */
+ __m512i vs1 = _mm512_castsi128_si512(_mm_cvtsi32_si128(adler));
+ __m512i vs2 = _mm512_maskz_set1_epi32(vs_mask, sum2);
+
+ const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+ 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63, 64);
+
+ const __m512i zero = _mm512_setzero_si512();
+
+ while (len >= 64) {
+ int k = (len < NMAX ? (int)len : NMAX);
+ k -= k % 64;
+ len -= k;
+ __m512i vs1_0 = vs1;
+ __m512i vs3 = _mm512_setzero_si512();
+
+ /* Manually unrolled this loop by 2 for a decent amount of ILP */
+ while (k >= 128) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
+ */
+ __m512i vbuf0 = _mm512_loadu_si512(buf);
+ __m512i vbuf1 = _mm512_loadu_si512(buf+64);
+ buf += 128;
+ k -= 128;
+
+ __m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero);
+ vs1 = _mm512_add_epi32(vs1, vs1_sad);
+ vs3 = _mm512_add_epi32(vs3, vs1_0);
+ /* multiply-add, resulting in 16 ints. This fuses the separate multiply and sum
+ * stages of prior versions, since the VNNI dot-product instruction does both */
+ vs2 = _mm512_dpbusd_epi32(vs2, vbuf0, dot2v);
+ vs1_0 = vs1;
+
+ vs1_sad = _mm512_sad_epu8(vbuf1, zero);
+ vs1 = _mm512_add_epi32(vs1, vs1_sad);
+ vs3 = _mm512_add_epi32(vs3, vs1_0);
+ vs2 = _mm512_dpbusd_epi32(vs2, vbuf1, dot2v);
+ vs1_0 = vs1;
+ }
+
+ /* Remainder peeling */
+ while (k >= 64) {
+ __m512i vbuf = _mm512_loadu_si512(buf);
+ buf += 64;
+ k -= 64;
+
+ __m512i vs1_sad = _mm512_sad_epu8(vbuf, zero);
+ vs1 = _mm512_add_epi32(vs1, vs1_sad);
+ vs3 = _mm512_add_epi32(vs3, vs1_0);
+ vs2 = _mm512_dpbusd_epi32(vs2, vbuf, dot2v);
+ vs1_0 = vs1;
+ }
+
+ vs3 = _mm512_slli_epi32(vs3, 6);
+ vs2 = _mm512_add_epi32(vs2, vs3);
+
+ adler = partial_hsum(vs1) % BASE;
+ vs1 = _mm512_castsi128_si512(_mm_cvtsi32_si128(adler));
+ sum2 = _mm512_reduce_add_epi32(vs2) % BASE;
+ vs2 = _mm512_maskz_set1_epi32(vs_mask, sum2);
+ }
+
+ /* Process tail (len < 64). */
+ return adler32_len_16(adler, buf, len, sum2);
+}
+
+#endif
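The unrolled loop above defers the "64 × previous s1" contribution by accumulating snapshots of vs1 into vs3 and applying a single shift-left-by-6 per NMAX block. As a sanity sketch (illustrative only, not part of the patch), the same bookkeeping can be checked in scalar C for two 64-byte blocks:

/* Illustrative self-check of the block recurrence the unrolled loop relies on:
 * accumulating the pre-block s1 snapshots separately and scaling by 64 at the
 * end (the _mm512_slli_epi32(vs3, 6) step) matches the byte-at-a-time update. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    unsigned char buf[128];
    for (int i = 0; i < 128; i++)
        buf[i] = (unsigned char)(i * 7 + 3);    /* arbitrary test data */

    /* Byte-at-a-time reference (no modulo needed for only 128 bytes). */
    uint32_t s1 = 1, s2 = 0;
    for (int i = 0; i < 128; i++) {
        s1 += buf[i];
        s2 += s1;
    }

    /* Block form: per 64-byte block, s2 gains 64*s1_old plus the weighted
     * byte sum with weights 64..1; s1 gains the plain byte sum. */
    uint32_t b1 = 1, b2 = 0, vs3 = 0;
    for (int block = 0; block < 2; block++) {
        const unsigned char *p = buf + block * 64;
        vs3 += b1;                              /* snapshot of s1 before the block */
        uint32_t byte_sum = 0, weighted = 0;
        for (int i = 0; i < 64; i++) {
            byte_sum += p[i];
            weighted += (uint32_t)(64 - i) * p[i];
        }
        b1 += byte_sum;
        b2 += weighted;
    }
    b2 += vs3 << 6;                             /* the deferred 64*s1_old terms */

    assert(b1 == s1 && b2 == s2);
    printf("block recurrence matches: s1=%u s2=%u\n", (unsigned)s1, (unsigned)s2);
    return 0;
}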
# include <cpuid.h>
#endif
+#include <string.h>
+
Z_INTERNAL int x86_cpu_has_avx2;
+Z_INTERNAL int x86_cpu_has_avx512;
+Z_INTERNAL int x86_cpu_has_avx512vnni;
Z_INTERNAL int x86_cpu_has_sse2;
Z_INTERNAL int x86_cpu_has_ssse3;
Z_INTERNAL int x86_cpu_has_sse42;
Z_INTERNAL int x86_cpu_has_pclmulqdq;
Z_INTERNAL int x86_cpu_has_tzcnt;
+Z_INTERNAL int x86_cpu_well_suited_avx512;
static void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#ifdef _MSC_VER
void Z_INTERNAL x86_check_features(void) {
unsigned eax, ebx, ecx, edx;
unsigned maxbasic;
+ unsigned family, model, extended_model;
+ int intel_cpu;
+ char cpu_identity[13];
cpuid(0, &maxbasic, &ebx, &ecx, &edx);
+ /* Assemble the NUL-terminated CPU vendor identification string */
+ memset(cpu_identity, 0, 13);
+ memcpy(cpu_identity, (char*)&ebx, sizeof(int));
+ memcpy(cpu_identity + 4, (char*)&edx, sizeof(int));
+ memcpy(cpu_identity + 8, (char*)&ecx, sizeof(int));
+
+ intel_cpu = strncmp(cpu_identity, "GenuineIntel", 12) == 0;
+
cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);
x86_cpu_has_sse2 = edx & 0x4000000;
x86_cpu_has_ssse3 = ecx & 0x200;
x86_cpu_has_sse42 = ecx & 0x100000;
x86_cpu_has_pclmulqdq = ecx & 0x2;
+ x86_cpu_well_suited_avx512 = 0;
+
+ model = (eax & 0xf0) >> 4;
+ family = (eax & 0xf00) >> 8;
+ extended_model = (eax & 0xf0000) >> 16;
if (maxbasic >= 7) {
cpuidex(7, 0, &eax, &ebx, &ecx, &edx);
x86_cpu_has_tzcnt = ebx & 0x8;
// check AVX2 bit
x86_cpu_has_avx2 = ebx & 0x20;
+ x86_cpu_has_avx512 = ebx & 0x00010000;
+ x86_cpu_has_avx512vnni = ecx & 0x800;
} else {
x86_cpu_has_tzcnt = 0;
x86_cpu_has_avx2 = 0;
}
+
+ if (intel_cpu) {
+ /* All of the Knights Landing and Knights Ferry parts _likely_ benefit
+ * from the AVX512 adler checksum implementation */
+ if (family == 0xb) {
+ x86_cpu_well_suited_avx512 = 1;
+ } else if (family == 0x6) {
+ if (model == 0x5 && extended_model == 0x5) {
+ /* Experimentally, on Skylake-X and Cascade Lake-X, it has been
+ * unwaveringly faster to use AVX512 and AVX512 VNNI */
+ x86_cpu_well_suited_avx512 = 1;
+ } else if (model == 0xa && extended_model == 0x6) {
+ /* Icelake server */
+ x86_cpu_well_suited_avx512 = 1;
+ } else if (model == 0xf && extended_model == 0x8) {
+ /* Sapphire Rapids */
+ x86_cpu_well_suited_avx512 = 1;
+ }
+
+ /* Still need to check whether Rocket Lake and/or Alder Lake
+ * benefit from the AVX512VNNI-accelerated adler32 implementation.
+ * For now this working list is probably safe */
+ }
+ }
+
}
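The model checks above compare the raw model nibble and the extended-model field from CPUID leaf 1 (EAX) separately. For readers more used to the combined "display model" numbers (0x55 for Skylake-X/Cascade Lake-X, 0x6A for Ice Lake server, 0x8F for Sapphire Rapids), the sketch below shows how the two fields relate; it is illustrative only and not part of the patch, and the example EAX encodings are assumptions with the stepping zeroed rather than values read from real hardware.

/* Illustrative only: how the split model fields map to the combined
 * "display model" numbers used in Intel documentation. */
#include <stdio.h>

int main(void) {
    /* Hypothetical leaf-1 EAX values for the parts the detection code targets. */
    unsigned examples[] = { 0x00050650, 0x000606A0, 0x000806F0 };

    for (unsigned i = 0; i < sizeof(examples) / sizeof(examples[0]); i++) {
        unsigned eax = examples[i];
        unsigned model          = (eax & 0xf0) >> 4;       /* bits 7:4   */
        unsigned family         = (eax & 0xf00) >> 8;      /* bits 11:8  */
        unsigned extended_model = (eax & 0xf0000) >> 16;   /* bits 19:16 */
        /* For family 6 (and 15), the display model is the extended-model
         * nibble prepended to the model nibble. */
        unsigned display_model = (extended_model << 4) | model;
        printf("family 0x%x, model 0x%x, ext 0x%x -> display model 0x%02x\n",
               family, model, extended_model, display_model);
    }
    return 0;
}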
#define CPU_H_
extern int x86_cpu_has_avx2;
+extern int x86_cpu_has_avx512;
+extern int x86_cpu_has_avx512vnni;
extern int x86_cpu_has_sse2;
extern int x86_cpu_has_ssse3;
extern int x86_cpu_has_sse42;
extern int x86_cpu_has_pclmulqdq;
extern int x86_cpu_has_tzcnt;
+extern int x86_cpu_well_suited_avx512;
void Z_INTERNAL x86_check_features(void);
set(CMAKE_REQUIRED_FLAGS)
endmacro()
+macro(check_avx512_intrinsics)
+ if(CMAKE_C_COMPILER_ID MATCHES "Intel")
+ if(CMAKE_HOST_UNIX OR APPLE)
+ set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
+ else()
+ set(AVX512FLAG "/arch:AVX512")
+ endif()
+ elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+ if(NOT NATIVEFLAG)
+ # For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
+ # instruction scheduling unless you specify a reasonable -mtune= target
+ set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mtune=cascadelake")
+ endif()
+ elseif(MSVC)
+ set(AVX512FLAG "/arch:AVX512")
+ endif()
+ # Check whether compiler supports AVX512 intrinsics
+ set(CMAKE_REQUIRED_FLAGS "${AVX512FLAG}")
+ check_c_source_compile_or_run(
+ "#include <immintrin.h>
+ int main(void) {
+ __m512i x = _mm512_set1_epi8(2);
+ const __m512i y = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+ 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63, 64);
+ x = _mm512_sub_epi8(x, y);
+ (void)x;
+ return 0;
+ }"
+ HAVE_AVX512_INTRIN
+ )
+
+ # Evidently both GCC and Clang were late to implement the k-mask intrinsics
+ check_c_source_compile_or_run(
+ "#include <immintrin.h>
+ int main(void) {
+ __mmask16 a = 0xFF;
+ a = _knot_mask16(a);
+ (void)a;
+ return 0;
+ }"
+ HAVE_MASK_INTRIN
+ )
+ set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_avx512vnni_intrinsics)
+ if(CMAKE_C_COMPILER_ID MATCHES "Intel")
+ if(CMAKE_HOST_UNIX OR APPLE)
+ set(AVX512VNNIFLAG "-mavx512f -mavx512bw -mavx512dq -mavx512vl -mavx512vnni")
+ else()
+ set(AVX512VNNIFLAG "/arch:AVX512")
+ endif()
+ elseif(MSVC)
+ set(AVX512VNNIFLAG "/arch:AVX512")
+ elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+ if(NOT NATIVEFLAG)
+ set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mtune=cascadelake")
+ endif()
+ endif()
+
+ # Check whether compiler supports AVX512vnni intrinsics
+ set(CMAKE_REQUIRED_FLAGS "${AVX512VNNIFLAG}")
+ check_c_source_compile_or_run(
+ "#include <immintrin.h>
+ int main(void) {
+ __m512i x = _mm512_set1_epi8(2);
+ const __m512i y = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+ 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63, 64);
+ __m512i z = _mm512_setzero_epi32();
+ z = _mm512_dpbusd_epi32(z, x, y);
+ (void)z;
+ return 0;
+ }"
+ HAVE_AVX512VNNI_INTRIN
+ )
+ set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
macro(check_avx2_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
floatabi=
native=0
forcesse2=0
+# For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
+# instruction scheduling unless you specify a reasonable -mtune= target
+avx512flag="-mavx512f -mavx512dq -mavx512bw -mavx512vl -mtune=cascadelake"
+avx512vnniflag="-mavx512vnni ${avx512flag}"
avx2flag="-mavx2"
sse2flag="-msse2"
ssse3flag="-mssse3"
esac
if test $native -eq 1; then
+ avx512flag=""
+ avx512vnniflag=""
avx2flag=""
sse2flag=""
ssse3flag=""
fi
}
+check_avx512_intrinsics() {
+ # Check whether compiler supports AVX512 intrinsics
+ cat > $test.c << EOF
+#include <immintrin.h>
+int main(void) {
+ __m512i x = _mm512_set1_epi8(2);
+ const __m512i y = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+ 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63, 64);
+ x = _mm512_sub_epi8(x, y);
+ (void)x;
+ return 0;
+}
+EOF
+ if try ${CC} ${CFLAGS} ${avx512flag} $test.c; then
+ echo "Checking for AVX512 intrinsics ... Yes." | tee -a configure.log
+ HAVE_AVX512_INTRIN=1
+ else
+ echo "Checking for AVX512 intrinsics ... No." | tee -a configure.log
+ HAVE_AVX512_INTRIN=0
+ fi
+}
+
+check_avx512vnni_intrinsics() {
+ # Check whether compiler supports AVX512-VNNI intrinsics
+ cat > $test.c << EOF
+#include <immintrin.h>
+int main(void) {
+ __m512i x = _mm512_set1_epi8(2);
+ const __m512i y = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+ 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63, 64);
+ __m512i z = _mm512_setzero_epi32();
+ z = _mm512_dpbusd_epi32(z, x, y);
+ (void)z;
+ return 0;
+}
+EOF
+ if try ${CC} ${CFLAGS} ${avx512vnniflag} $test.c; then
+ echo "Checking for AVX512VNNI intrinsics ... Yes." | tee -a configure.log
+ HAVE_AVX512VNNI_INTRIN=1
+ else
+ echo "Checking for AVX512VNNI intrinsics ... No." | tee -a configure.log
+ HAVE_AVX512VNNI_INTRIN=0
+ fi
+}
+
+check_mask_intrinsics() {
+ # Check whether compiler supports AVX512 k-mask intrinsics
+ cat > $test.c << EOF
+#include <immintrin.h>
+int main(void) {
+ __mmask16 a = 0xFF;
+ a = _knot_mask16(a);
+ (void)a;
+ return 0;
+}
+EOF
+ if try ${CC} ${CFLAGS} ${avx512flag} $test.c; then
+ echo "Checking for k-mask intrinsics ... Yes." | tee -a configure.log
+ HAVE_MASK_INTRIN=1
+ else
+ echo "Checking for k-mask intrinsics ... No." | tee -a configure.log
+ HAVE_MASK_INTRIN=0
+ fi
+}
+
check_neon_intrinsics() {
# Check whether -mfpu=neon is available on ARM processors.
cat > $test.c << EOF
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_hash_avx.lo chunkset_avx.lo compare258_avx.lo adler32_avx.lo"
fi
+ check_avx512_intrinsics
+
+ if test ${HAVE_AVX512_INTRIN} -eq 1; then
+ CFLAGS="${CFLAGS} -DX86_AVX512 -DX86_AVX512_ADLER32"
+ SFLAGS="${SFLAGS} -DX86_AVX512 -DX86_AVX512_ADLER32"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_avx512.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512.lo"
+
+ check_mask_intrinsics
+
+ if test ${HAVE_MASK_INTRIN} -eq 1; then
+ CFLAGS="${CFLAGS} -DX86_MASK_INTRIN"
+ SFLAGS="${SFLAGS} -DX86_MASK_INTRIN"
+ fi
+ fi
+
+ check_avx512vnni_intrinsics
+
+ if test ${HAVE_AVX512VNNI_INTRIN} -eq 1; then
+ CFLAGS="${CFLAGS} -DX86_AVX512VNNI -DX86_AVX512VNNI_ADLER32"
+ SFLAGS="${SFLAGS} -DX86_AVX512VNNI -DX86_AVX512VNNI_ADLER32"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_avx512_vnni.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512_vnni.lo"
+ fi
+
check_sse4_intrinsics
if test ${HAVE_SSE42CRC_INTRIN} -eq 1 || test ${HAVE_SSE42CRC_INLINE_ASM} -eq 1; then
/^SRCTOP *=/s#=.*#=$SRCDIR#
/^BUILDDIR *=/s#=.*#=$BUILDDIR#
/^AVX2FLAG *=/s#=.*#=$avx2flag#
+/^AVX512FLAG *=/s#=.*#=$avx512flag#
+/^AVX512VNNIFLAG *=/s#=.*#=$avx512vnniflag#
/^SSE2FLAG *=/s#=.*#=$sse2flag#
/^SSSE3FLAG *=/s#=.*#=$ssse3flag#
/^SSE4FLAG *=/s#=.*#=$sse4flag#
#ifdef X86_AVX2_ADLER32
extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
#endif
+#ifdef X86_AVX512_ADLER32
+extern uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
+#ifdef X86_AVX512VNNI_ADLER32
+extern uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
#ifdef POWER8_VSX_ADLER32
extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len);
#endif
if (x86_cpu_has_avx2)
functable.adler32 = &adler32_avx2;
#endif
+#ifdef X86_AVX512_ADLER32
+ if (x86_cpu_has_avx512 && x86_cpu_well_suited_avx512)
+ functable.adler32 = &adler32_avx512;
+#endif
+#ifdef X86_AVX512VNNI_ADLER32
+ if (x86_cpu_has_avx512vnni && x86_cpu_well_suited_avx512) {
+ functable.adler32 = &adler32_avx512_vnni;
+ }
+#endif
#ifdef PPC_VMX_ADLER32
if (power_cpu_has_altivec)
functable.adler32 = &adler32_vmx;