From: Adam Stylinski Date: Fri, 7 Jan 2022 20:51:09 +0000 (-0500) Subject: Have functioning avx512{,_vnni} adler32 X-Git-Tag: 2.1.0-beta1~460 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=46031f5cdea1d48577490ce60bb921e78b76277e;p=thirdparty%2Fzlib-ng.git Have functioning avx512{,_vnni} adler32 The new adler32 checksum uses the VNNI instructions with appreciable gains when possible. Otherwise, a pure avx512f variant exists which still gives appreciable gains. --- diff --git a/CMakeLists.txt b/CMakeLists.txt index fc0f44935..526065f32 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,6 +106,8 @@ elseif(BASEARCH_S360_FOUND) option(WITH_CRC32_VX "Build with vectorized CRC32 on IBM Z" ON) elseif(BASEARCH_X86_FOUND) option(WITH_AVX2 "Build with AVX2" ON) + option(WITH_AVX512 "Build with AVX512" ON) + option(WITH_AVX512VNNI "Build with AVX512 VNNI extensions" ON) option(WITH_SSE2 "Build with SSE2" ON) option(WITH_SSSE3 "Build with SSSE3" ON) option(WITH_SSE4 "Build with SSE4" ON) @@ -724,6 +726,33 @@ if(WITH_OPTIM) set(WITH_AVX2 OFF) endif() endif() + if(WITH_AVX512) + check_avx512_intrinsics() + if(HAVE_AVX512_INTRIN) + add_definitions(-DX86_AVX512 -DX86_AVX512_ADLER32) + list(APPEND AVX512_SRCS ${ARCHDIR}/adler32_avx512.c) + add_feature_info(AVX512_ADLER32 1 "Support AVX512-accelerated adler32, using \"${AVX512FLAG}\"") + list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS}) + if(HAVE_MASK_INTRIN) + add_definitions(-DX86_MASK_INTRIN) + endif() + set_property(SOURCE ${AVX512_SRCS} PROPERTY COMPILE_FLAGS "${AVX512FLAG} ${NOLTOFLAG}") + else() + set(WITH_AVX512 OFF) + endif() + endif() + if(WITH_AVX512VNNI) + check_avx512vnni_intrinsics() + if(HAVE_AVX512VNNI_INTRIN) + add_definitions(-DX86_AVX512VNNI -DX86_AVX512VNNI_ADLER32) + add_feature_info(AVX512VNNI_ADLER32 1 "Support AVX512VNNI adler32, using \"${AVX512VNNIFLAG}\"") + list(APPEND AVX512VNNI_SRCS ${ARCHDIR}/adler32_avx512_vnni.c) + list(APPEND ZLIB_ARCH_SRCS ${AVX512VNNI_SRCS}) + set_property(SOURCE ${AVX512VNNI_SRCS} PROPERTY COMPILE_FLAGS "${AVX512VNNIFLAG} ${NOLTOFLAG}") + else() + set(WITH_AVX512VNNI OFF) + endif() + endif() if(WITH_SSE4) check_sse4_intrinsics() if(HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN) @@ -1408,6 +1437,8 @@ elseif(BASEARCH_S360_FOUND) add_feature_info(WITH_CRC32_VX WITH_CRC32_VX "Build with vectorized CRC32 on IBM Z") elseif(BASEARCH_X86_FOUND) add_feature_info(WITH_AVX2 WITH_AVX2 "Build with AVX2") + add_feature_info(WITH_AVX512 WITH_AVX512 "Build with AVX512") + add_feature_info(WITH_AVX512VNNI WITH_AVX512VNNI "Build with AVX512 VNNI") add_feature_info(WITH_SSE2 WITH_SSE2 "Build with SSE2") add_feature_info(WITH_SSSE3 WITH_SSSE3 "Build with SSSE3") add_feature_info(WITH_SSE4 WITH_SSE4 "Build with SSE4") diff --git a/README.md b/README.md index 541ffb94f..0da19e407 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ Features * Modern C11 syntax and a clean code layout * Deflate medium and quick algorithms based on Intels zlib fork * Support for CPU intrinsics when available - * Adler32 implementation using SSSE3, AVX2, Neon, VMX & VSX + * Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX * CRC32-B implementation using PCLMULQDQ & ACLE * Hash table implementation using CRC32-C intrinsics on x86 and ARM * Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX @@ -197,6 +197,8 @@ Advanced Build Options | UNALIGNED_OK | | Allow unaligned reads | ON (x86, arm) | | | --force-sse2 | Skip runtime check for SSE2 instructions (Always on for x86_64) | OFF 
(x86) | | WITH_AVX2 | | Build with AVX2 intrinsics | ON | +| WITH_AVX512 | | Build with AVX512 intrinsics | ON | +| WITH_AVX512VNNI | | Build with AVX512VNNI intrinsics | ON | | WITH_SSE2 | | Build with SSE2 intrinsics | ON | | WITH_SSE4 | | Build with SSE4 intrinsics | ON | | WITH_PCLMULQDQ | | Build with PCLMULQDQ intrinsics | ON | diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in index c5e588e70..3e92738ee 100644 --- a/arch/x86/Makefile.in +++ b/arch/x86/Makefile.in @@ -8,6 +8,8 @@ SFLAGS= INCLUDES= SUFFIX= +AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw +AVX512VNNIFLAG=-mavx512vnni AVX2FLAG=-mavx2 SSE2FLAG=-msse2 SSSE3FLAG=-mssse3 @@ -21,7 +23,9 @@ TOPDIR=$(SRCTOP) all: \ x86.o x86.lo \ - adler32_avx.o adler32.lo \ + adler32_avx.o adler32_avx.lo \ + adler32_avx512.o adler32_avx512.lo \ + adler32_avx512_vnni.o adler32_avx512_vnni.lo \ adler32_ssse3.o adler32_ssse3.lo \ chunkset_avx.o chunkset_avx.lo \ chunkset_sse.o chunkset_sse.lo \ @@ -92,6 +96,18 @@ adler32_avx.o: $(SRCDIR)/adler32_avx.c adler32_avx.lo: $(SRCDIR)/adler32_avx.c $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx.c +adler32_avx512.o: $(SRCDIR)/adler32_avx512.c + $(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c + +adler32_avx512.lo: $(SRCDIR)/adler32_avx512.c + $(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c + +adler32_avx512_vnni.o: $(SRCDIR)/adler32_avx512_vnni.c + $(CC) $(CFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c + +adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c + $(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c + adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c $(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c diff --git a/arch/x86/adler32_avx512.c b/arch/x86/adler32_avx512.c new file mode 100644 index 000000000..f73ceccef --- /dev/null +++ b/arch/x86/adler32_avx512.c @@ -0,0 +1,85 @@ +/* adler32_avx512.c -- compute the Adler-32 checksum of a data stream + * Copyright (C) 1995-2011 Mark Adler + * Authors: + * Adam Stylinski + * Brian Bockelman + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "../../zbuild.h" +#include "../../zutil.h" + +#include "../../adler32_p.h" + +#include + +#ifdef X86_AVX512_ADLER32 + +Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len) { + uint32_t sum2; + + /* split Adler-32 into component sums */ + sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + + /* in case user likes doing a byte at a time, keep it fast */ + if (UNLIKELY(len == 1)) + return adler32_len_1(adler, buf, sum2); + + /* initial Adler-32 value (deferred check for len == 1 speed) */ + if (UNLIKELY(buf == NULL)) + return 1L; + + /* in case short lengths are provided, keep it somewhat fast */ + if (UNLIKELY(len < 16)) + return adler32_len_16(adler, buf, len, sum2); + + const __mmask16 vs_mask = 1U << 15; + __m512i vs1 = _mm512_maskz_set1_epi32(vs_mask, adler); + __m512i vs2 = _mm512_maskz_set1_epi32(vs_mask, sum2); + + const __m512i dot1v = _mm512_set1_epi8(1); + const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64); + const __m512i dot3v = 
_mm512_set1_epi16(1); + + while (len >= 64) { + __m512i vs1_0 = vs1; + + int k = (len < NMAX ? (int)len : NMAX); + k -= k % 64; + len -= k; + + while (k >= 64) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] ) + */ + __m512i vbuf = _mm512_loadu_si512(buf); + buf += 64; + k -= 64; + + __m512i v_short_sum1 = _mm512_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 16 shorts. + __m512i vsum1 = _mm512_madd_epi16(v_short_sum1, dot3v); // sum 16 shorts to 8 int32_t; + __m512i v_short_sum2 = _mm512_maddubs_epi16(vbuf, dot2v); + vs1 = _mm512_add_epi32(vsum1, vs1); + __m512i vsum2 = _mm512_madd_epi16(v_short_sum2, dot3v); + vs1_0 = _mm512_slli_epi32(vs1_0, 6); + vsum2 = _mm512_add_epi32(vsum2, vs2); + vs2 = _mm512_add_epi32(vsum2, vs1_0); + vs1_0 = vs1; + } + + adler = _mm512_reduce_add_epi32(vs1) % BASE; + vs1 = _mm512_maskz_set1_epi32(vs_mask, adler); + sum2 = _mm512_reduce_add_epi32(vs2) % BASE; + vs2 = _mm512_maskz_set1_epi32(vs_mask, sum2); + } + + /* Process tail (len < 16). */ + return adler32_len_16(adler, buf, len, sum2); +} + +#endif diff --git a/arch/x86/adler32_avx512_vnni.c b/arch/x86/adler32_avx512_vnni.c new file mode 100644 index 000000000..f4e2c33ed --- /dev/null +++ b/arch/x86/adler32_avx512_vnni.c @@ -0,0 +1,135 @@ +/* adler32_avx512_vnni.c -- compute the Adler-32 checksum of a data stream + * Based on Brian Bockelman's AVX2 version + * Copyright (C) 1995-2011 Mark Adler + * Authors: + * Adam Stylinski + * Brian Bockelman + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "../../zbuild.h" +#include "../../zutil.h" + +#include "../../adler32_p.h" + +#include + +#ifdef X86_AVX512VNNI_ADLER32 + +static inline uint32_t partial_hsum(__m512i x) +{ + /* We need a permutation vector to extract every other integer. The + * rest are going to be zeros. Marking this const so the compiler stands + * a better chance of keeping this resident in a register through entire + * loop execution. We certainly have enough zmm registers (32) */ + const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, + 1, 1, 1, 1, 1, 1, 1, 1); + __m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x); + + /* From here, it's a simple 256 bit wide reduction sum */ + __m256i non_zero_avx = _mm512_castsi512_si256(non_zero); + + /* See Agner Fog's vectorclass for a decent reference. 
Essentially, phadd is + * pretty slow, much slower than the longer instruction sequence below */ + __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1), + _mm256_castsi256_si128(non_zero_avx)); + __m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1)); + __m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1)); + return (uint32_t)_mm_cvtsi128_si32(sum3); +} + +Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len) { + uint32_t sum2; + + /* split Adler-32 into component sums */ + sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + + /* in case user likes doing a byte at a time, keep it fast */ + if (UNLIKELY(len == 1)) + return adler32_len_1(adler, buf, sum2); + + /* initial Adler-32 value (deferred check for len == 1 speed) */ + if (UNLIKELY(buf == NULL)) + return 1L; + + /* in case short lengths are provided, keep it somewhat fast */ + if (UNLIKELY(len < 16)) + return adler32_len_16(adler, buf, len, sum2); + + const __mmask16 vs_mask = 1U << 15; + /* We want to place initial adler sum at vector position 0, as it is one of the lanes that line up + * with the sum of absolute differences' reduction sum. If we do this, we can get away with a partial, + * less expensive horizontal sum for the vs1 component at the end. It also happens to be marginally better + * (by a single cycle) to do this with the ancient vmovd insruction, and simply allow the register to be + * aliased up to a 512 bit wide zmm */ + __m512i vs1 = _mm512_castsi128_si512(_mm_cvtsi32_si128(adler)); + __m512i vs2 = _mm512_maskz_set1_epi32(vs_mask, sum2); + + const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64); + + const __m512i zero = _mm512_setzero_si512(); + + while (len >= 64) { + int k = (len < NMAX ? (int)len : NMAX); + k -= k % 64; + len -= k; + __m512i vs1_0 = vs1; + __m512i vs3 = _mm512_setzero_si512(); + + /* Manually unrolled this loop by 2 for an decent amount of ILP */ + while (k >= 128) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] ) + */ + __m512i vbuf0 = _mm512_loadu_si512(buf); + __m512i vbuf1 = _mm512_loadu_si512(buf+64); + buf += 128; + k -= 128; + + __m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero); + vs1 = _mm512_add_epi32(vs1, vs1_sad); + vs3 = _mm512_add_epi32(vs3, vs1_0); + /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp + * instructions to eliminate them */ + vs2 = _mm512_dpbusd_epi32(vs2, vbuf0, dot2v); + vs1_0 = vs1; + + vs1_sad = _mm512_sad_epu8(vbuf1, zero); + vs1 = _mm512_add_epi32(vs1, vs1_sad); + vs3 = _mm512_add_epi32(vs3, vs1_0); + vs2 = _mm512_dpbusd_epi32(vs2, vbuf1, dot2v); + vs1_0 = vs1; + } + + /* Remainder peeling */ + while (k >= 64) { + __m512i vbuf = _mm512_loadu_si512(buf); + buf += 64; + k -= 64; + + __m512i vs1_sad = _mm512_sad_epu8(vbuf, zero); + vs1 = _mm512_add_epi32(vs1, vs1_sad); + vs3 = _mm512_add_epi32(vs3, vs1_0); + vs2 = _mm512_dpbusd_epi32(vs2, vbuf, dot2v); + vs1_0 = vs1; + } + + vs3 = _mm512_slli_epi32(vs3, 6); + vs2 = _mm512_add_epi32(vs2, vs3); + + adler = partial_hsum(vs1) % BASE; + vs1 = _mm512_castsi128_si512(_mm_cvtsi32_si128(adler)); + sum2 = _mm512_reduce_add_epi32(vs2) % BASE; + vs2 = _mm512_maskz_set1_epi32(vs_mask, sum2); + } + + /* Process tail (len < 16). 
*/ + return adler32_len_16(adler, buf, len, sum2); +} + +#endif diff --git a/arch/x86/x86.c b/arch/x86/x86.c index e782cb8ee..f02e1a349 100644 --- a/arch/x86/x86.c +++ b/arch/x86/x86.c @@ -17,12 +17,17 @@ # include #endif +#include + Z_INTERNAL int x86_cpu_has_avx2; +Z_INTERNAL int x86_cpu_has_avx512; +Z_INTERNAL int x86_cpu_has_avx512vnni; Z_INTERNAL int x86_cpu_has_sse2; Z_INTERNAL int x86_cpu_has_ssse3; Z_INTERNAL int x86_cpu_has_sse42; Z_INTERNAL int x86_cpu_has_pclmulqdq; Z_INTERNAL int x86_cpu_has_tzcnt; +Z_INTERNAL int x86_cpu_well_suited_avx512; static void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) { #ifdef _MSC_VER @@ -55,15 +60,31 @@ static void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigne void Z_INTERNAL x86_check_features(void) { unsigned eax, ebx, ecx, edx; unsigned maxbasic; + unsigned family, model, extended_model; + int intel_cpu; + char cpu_identity[13]; cpuid(0, &maxbasic, &ebx, &ecx, &edx); + /* NULL terminate the string */ + memset(cpu_identity, 0, 13); + memcpy(cpu_identity, (char*)&ebx, sizeof(int)); + memcpy(cpu_identity + 4, (char*)&edx, sizeof(int)); + memcpy(cpu_identity + 8, (char*)&ecx, sizeof(int)); + + intel_cpu = strncmp(cpu_identity, "GenuineIntel", 12) == 0; + cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx); x86_cpu_has_sse2 = edx & 0x4000000; x86_cpu_has_ssse3 = ecx & 0x200; x86_cpu_has_sse42 = ecx & 0x100000; x86_cpu_has_pclmulqdq = ecx & 0x2; + x86_cpu_well_suited_avx512 = 0; + + model = (eax & 0xf0) >> 4; + family = (eax & 0xf00) >> 8; + extended_model = (eax & 0xf0000) >> 16; if (maxbasic >= 7) { cpuidex(7, 0, &eax, &ebx, &ecx, &edx); @@ -73,8 +94,36 @@ void Z_INTERNAL x86_check_features(void) { x86_cpu_has_tzcnt = ebx & 0x8; // check AVX2 bit x86_cpu_has_avx2 = ebx & 0x20; + x86_cpu_has_avx512 = ebx & 0x00010000; + x86_cpu_has_avx512vnni = ecx & 0x800; } else { x86_cpu_has_tzcnt = 0; x86_cpu_has_avx2 = 0; } + + + if (intel_cpu) { + /* All of the Knights Landing and Knights Ferry _likely_ benefit + * from the AVX512 adler checksum implementation */ + if (family == 0xb) { + x86_cpu_well_suited_avx512 = 1; + } else if (family == 0x6) { + if (model == 0x5 && extended_model == 0x5) { + /* Experimentally, on skylake-x and cascadelake-x, it has been + * unwaiveringly faster to use avx512 and avx512 vnni */ + x86_cpu_well_suited_avx512 = 1; + } else if (model == 0xa && extended_model == 0x6) { + /* Icelake server */ + x86_cpu_well_suited_avx512 = 1; + } else if (model == 0xf && extended_model == 0x8) { + /* Saphire rapids */ + x86_cpu_well_suited_avx512 = 1; + } + + /* Still need to check whether Rocket Lake and/or AlderLake + * benefit from the AVX512VNNI accelerated adler32 implementations. 
+ * For now this working list is probably safe */ + } + } + } diff --git a/arch/x86/x86.h b/arch/x86/x86.h index 8471e155c..4274ed09f 100644 --- a/arch/x86/x86.h +++ b/arch/x86/x86.h @@ -7,11 +7,14 @@ #define CPU_H_ extern int x86_cpu_has_avx2; +extern int x86_cpu_has_avx512; +extern int x86_cpu_has_avx512vnni; extern int x86_cpu_has_sse2; extern int x86_cpu_has_ssse3; extern int x86_cpu_has_sse42; extern int x86_cpu_has_pclmulqdq; extern int x86_cpu_has_tzcnt; +extern int x86_cpu_well_suited_avx512; void Z_INTERNAL x86_check_features(void); diff --git a/cmake/detect-intrinsics.cmake b/cmake/detect-intrinsics.cmake index e03daaa13..9f7a97190 100644 --- a/cmake/detect-intrinsics.cmake +++ b/cmake/detect-intrinsics.cmake @@ -13,6 +13,88 @@ macro(check_acle_intrinsics) set(CMAKE_REQUIRED_FLAGS) endmacro() +macro(check_avx512_intrinsics) + if(CMAKE_C_COMPILER_ID MATCHES "Intel") + if(CMAKE_HOST_UNIX OR APPLE) + set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl") + else() + set(AVX512FLAG "/arch:AVX512") + endif() + elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") + if(NOT NATIVEFLAG) + # For CPUs that can benefit from AVX512, it seems GCC generates suboptimal + # instruction scheduling unless you specify a reasonable -mtune= target + set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mtune=cascadelake") + endif() + elseif(MSVC) + set(AVX512FLAG "/ARCH:AVX512") + endif() + # Check whether compiler supports AVX512 intrinsics + set(CMAKE_REQUIRED_FLAGS "${AVX512FLAG}") + check_c_source_compile_or_run( + "#include + int main(void) { + __m512i x = _mm512_set1_epi8(2); + const __m512i y = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64); + x = _mm512_sub_epi8(x, y); + (void)x; + return 0; + }" + HAVE_AVX512_INTRIN + ) + + # Evidently both GCC and clang were late to implementing these + check_c_source_compile_or_run( + "#include + int main(void) { + __mmask16 a = 0xFF; + a = _knot_mask16(a); + (void)a; + return 0; + }" + HAVE_MASK_INTRIN + ) + set(CMAKE_REQUIRED_FLAGS) +endmacro() + +macro(check_avx512vnni_intrinsics) + if(CMAKE_C_COMPILER_ID MATCHES "Intel") + if(CMAKE_HOST_UNIX OR APPLE) + set(AVX512VNNIFLAG "-mavx512f -mavx512bw -mavx512dq -mavx512vl -mavx512vnni") + else() + set(AVX512FLAG "/ARCH:AVX512") + endif() + elseif(MSVC) + set(AVX512FLAG "/ARCH:AVX512") + elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") + if(NOT NATIVEFLAG) + set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mtune=cascadelake") + endif() + endif() + + # Check whether compiler supports AVX512vnni intrinsics + set(CMAKE_REQUIRED_FLAGS "${AVX512VNNIFLAG}") + check_c_source_compile_or_run( + "#include + int main(void) { + __m512i x = _mm512_set1_epi8(2); + const __m512i y = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64); + __m512i z = _mm512_setzero_epi32(); + z = _mm512_dpbusd_epi32(z, x, y); + (void)z; + return 0; + }" + HAVE_AVX512VNNI_INTRIN + ) + set(CMAKE_REQUIRED_FLAGS) +endmacro() + macro(check_avx2_intrinsics) if(CMAKE_C_COMPILER_ID MATCHES "Intel") if(CMAKE_HOST_UNIX OR 
APPLE) diff --git a/configure b/configure index 3f145a6d4..0d79bca50 100755 --- a/configure +++ b/configure @@ -102,6 +102,10 @@ with_fuzzers=0 floatabi= native=0 forcesse2=0 +# For CPUs that can benefit from AVX512, it seems GCC generates suboptimal +# instruction scheduling unless you specify a reasonable -mtune= target +avx512flag="-mavx512f -mavx512dq -mavx512bw -mavx512vl -mtune=cascadelake" +avx512vnniflag="-mavx512vnni ${avx512flag}" avx2flag="-mavx2" sse2flag="-msse2" ssse3flag="-mssse3" @@ -250,6 +254,8 @@ case $($cc -v 2>&1) in esac if test $native -eq 1; then + avx512flag="" + avx512vnniflag="" avx2flag="" sse2flag="" ssse3flag="" @@ -1050,6 +1056,75 @@ EOF fi } +check_avx512_intrinsics() { + # Check whether compiler supports AVX512 intrinsics + cat > $test.c << EOF +#include +int main(void) { + __m512i x = _mm512_set1_epi8(2); + const __m512i y = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64); + x = _mm512_sub_epi8(x, y); + (void)x; + return 0; +} +EOF + if try ${CC} ${CFLAGS} ${avx512flag} $test.c; then + echo "Checking for AVX512 intrinsics ... Yes." | tee -a configure.log + HAVE_AVX512_INTRIN=1 + else + echo "Checking for AVX512 intrinsics ... No." | tee -a configure.log + HAVE_AVX512_INTRIN=0 + fi +} + +check_avx512vnni_intrinsics() { + # Check whether compiler supports AVX512-VNNI intrinsics + cat > $test.c << EOF +#include +int main(void) { + __m512i x = _mm512_set1_epi8(2); + const __m512i y = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64); + __m512i z = _mm512_setzero_epi32(); + z = _mm512_dpbusd_epi32(z, x, y); + (void)z; + return 0; +} +EOF + if try ${CC} ${CFLAGS} ${avx512vnniflag} $test.c; then + echo "Checking for AVX512VNNI intrinsics ... Yes." | tee -a configure.log + HAVE_AVX512VNNI_INTRIN=1 + else + echo "Checking for AVX512VNNI intrinsics ... No." | tee -a configure.log + HAVE_AVX512VNNI_INTRIN=0 + fi +} + +check_mask_intrinsics() { + # Check whether compiler supports AVX512 k-mask intrinsics + cat > $test.c << EOF +#include +int main(void) { + __mmask16 a = 0xFF; + a = _knot_mask16(a); + (void)a; + return 0; +} +EOF + if try ${CC} ${CFLAGS} ${avx512flag} $test.c; then + echo "Checking for k-mask intrinsics ... Yes." | tee -a configure.log + HAVE_MASK_INTRIN=1 + else + echo "Checking for k-mask intrinsics ... No." | tee -a configure.log + HAVE_MASK_INTRIN=0 + fi +} + check_neon_intrinsics() { # Check whether -mfpu=neon is available on ARM processors. 
cat > $test.c << EOF @@ -1318,6 +1393,31 @@ case "${ARCH}" in ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_hash_avx.lo chunkset_avx.lo compare258_avx.lo adler32_avx.lo" fi + check_avx512_intrinsics + + if test ${HAVE_AVX512_INTRIN} -eq 1; then + CFLAGS="${CFLAGS} -DX86_AVX512 -DX86_AVX512_ADLER32" + SFLAGS="${SFLAGS} -DX86_AVX512 -DX86_AVX512_ADLER32" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_avx512.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512.lo" + + check_mask_intrinsics + + if test ${HAVE_MASK_INTRIN} -eq 1; then + CFLAGS="${CFLAGS} -DX86_MASK_INTRIN" + SFLAGS="${SFLAGS} -DX86_MASK_INTRIN" + fi + fi + + check_avx512vnni_intrinsics + + if test ${HAVE_AVX512VNNI_INTRIN} -eq 1; then + CFLAGS="${CFLAGS} -DX86_AVX512VNNI -DX86_AVX512VNNI_ADLER32" + SFLAGS="${SFLAGS} -DX86_AVX512VNNI -DX86_AVX512VNNI_ADLER32" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_avx512_vnni.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512_vnni.lo" + fi + check_sse4_intrinsics if test ${HAVE_SSE42CRC_INTRIN} -eq 1 || test ${HAVE_SSE42CRC_INLINE_ASM} -eq 1; then @@ -1908,6 +2008,8 @@ sed < $SRCDIR/$ARCHDIR/Makefile.in " /^SRCTOP *=/s#=.*#=$SRCDIR# /^BUILDDIR *=/s#=.*#=$BUILDDIR# /^AVX2FLAG *=/s#=.*#=$avx2flag# +/^AVX512FLAG *=/s#=.*#=$avx512flag# +/^AVX512VNNIFLAG *=/s#=.*#=$avx512vnniflag# /^SSE2FLAG *=/s#=.*#=$sse2flag# /^SSSE3FLAG *=/s#=.*#=$ssse3flag# /^SSE4FLAG *=/s#=.*#=$sse4flag# diff --git a/functable.c b/functable.c index d8b561b5a..da21a808b 100644 --- a/functable.c +++ b/functable.c @@ -69,6 +69,12 @@ extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t l #ifdef X86_AVX2_ADLER32 extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len); #endif +#ifdef X86_AVX512_ADLER32 +extern uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len); +#endif +#ifdef X86_AVX512VNNI_ADLER32 +extern uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len); +#endif #ifdef POWER8_VSX_ADLER32 extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len); #endif @@ -303,6 +309,15 @@ Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_ if (x86_cpu_has_avx2) functable.adler32 = &adler32_avx2; #endif +#ifdef X86_AVX512_ADLER32 + if (x86_cpu_has_avx512 && x86_cpu_well_suited_avx512) + functable.adler32 = &adler32_avx512; +#endif +#ifdef X86_AVX512VNNI_ADLER32 + if (x86_cpu_has_avx512vnni && x86_cpu_well_suited_avx512) { + functable.adler32 = &adler32_avx512_vnni; + } +#endif #ifdef PPC_VMX_ADLER32 if (power_cpu_has_altivec) functable.adler32 = &adler32_vmx;
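
For reference, the arithmetic both new adler32 files vectorize is the blocked Adler-32 recurrence spelled out in their loop comments: for each 64-byte chunk, vs1 grows by the plain byte sum, while vs2 grows by 64*vs1 plus a weighted byte sum with weights 64 down to 1. The scalar sketch below is not part of the commit; it only restates that recurrence in plain C (BASE and NMAX are the conventional zlib constants that the real code pulls in from adler32_p.h).

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BASE 65521u   /* largest prime smaller than 65536 */
#define NMAX 5552     /* max bytes accumulated before s2 can overflow 32 bits */

static uint32_t adler32_blocked_ref(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = (adler >> 16) & 0xffff;

    while (len >= 64) {
        size_t k = (len < NMAX) ? len : NMAX;
        k -= k % 64;                 /* whole 64-byte chunks per modulo round */
        len -= k;

        while (k >= 64) {
            uint32_t byte_sum = 0, weighted_sum = 0;
            /* vs1 = adler + sum(c[i])
             * vs2 = sum2 + 64*vs1 + sum((64 - i + 1) * c[i]), i = 1..64 */
            for (int i = 0; i < 64; i++) {
                byte_sum     += buf[i];
                weighted_sum += (uint32_t)(64 - i) * buf[i];  /* weights 64..1 */
            }
            s2 += 64 * s1 + weighted_sum;   /* uses s1 from before this chunk */
            s1 += byte_sum;
            buf += 64;
            k -= 64;
        }
        s1 %= BASE;
        s2 %= BASE;
    }

    while (len--) {                  /* tail, fewer than 64 bytes */
        s1 += *buf++;
        s2 += s1;
    }
    s1 %= BASE;
    s2 %= BASE;
    return (s2 << 16) | s1;
}

int main(void) {
    const char *msg = "Wikipedia";
    printf("%#010x\n", (unsigned)adler32_blocked_ref(1, (const uint8_t *)msg, strlen(msg)));
    /* expected: 0x11e60398 */
    return 0;
}

The vectorized loops keep the multiply by 64 out of the hot path by accumulating the previous chunk's vs1 (vs1_0 in adler32_avx512.c, vs3 in adler32_avx512_vnni.c) and applying the factor as a left shift by 6, per iteration in the former and once per outer block in the latter.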
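The partial_hsum() shortcut in adler32_avx512_vnni.c relies on how _mm512_sad_epu8 lays out its results: each 64-bit lane receives the sum of its 8 bytes (their absolute differences against zero), so when the register is viewed as 16 32-bit lanes only the even lanes can ever be non-zero, and lane 0, where the initial adler value is seeded via the vmovd cast, is one of them. The small model below is illustrative only and simply mimics that lane layout in plain C; it is not code from the commit.

#include <stdint.h>
#include <stdio.h>

/* Mimic the 32-bit lane layout of _mm512_sad_epu8(v, zero): each group of
 * 8 bytes contributes one small sum that lands in an even 32-bit lane,
 * leaving every odd lane zero. */
static void sad_lane_model(const uint8_t bytes[64], uint32_t lanes[16]) {
    for (int lane = 0; lane < 16; lane++)
        lanes[lane] = 0;
    for (int group = 0; group < 8; group++) {
        uint32_t sum = 0;
        for (int b = 0; b < 8; b++)
            sum += bytes[group * 8 + b];   /* |x - 0| == x for unsigned bytes */
        lanes[group * 2] = sum;            /* even lane gets the sum, odd lane stays 0 */
    }
}

int main(void) {
    uint8_t buf[64];
    uint32_t lanes[16];
    for (int i = 0; i < 64; i++)
        buf[i] = (uint8_t)(i + 1);
    sad_lane_model(buf, lanes);
    for (int i = 0; i < 16; i++)
        printf("lane %2d: %u\n", i, lanes[i]);
    return 0;
}

Because vs1 only ever carries data in those eight even lanes, the permute-and-reduce in partial_hsum() is sufficient, whereas vs2 (updated by _mm512_dpbusd_epi32 across all lanes) still needs the full _mm512_reduce_add_epi32.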
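The family/model gate added to x86_check_features() is easier to follow once the CPUID leaf 1 fields are recombined: for family 6 parts, the model number listed in Intel documentation is the extended-model nibble concatenated with the model nibble, so model 0x5 with extended model 0x5 corresponds to the 0x55 Skylake-X / Cascade Lake-X parts, 0xa/0x6 to 0x6A (Ice Lake server), and 0xf/0x8 to 0x8F (Sapphire Rapids). The decoder below is only an illustration of that bit layout, using a commonly reported Skylake-SP signature as sample input; it is not taken from the patch.

#include <stdint.h>
#include <stdio.h>

/* Pull apart CPUID leaf 1 EAX the same way x86_check_features() does,
 * then combine the nibbles into the documented "display" model number. */
static void decode_leaf1_eax(uint32_t eax) {
    uint32_t model          = (eax & 0xf0)    >> 4;
    uint32_t family         = (eax & 0xf00)   >> 8;
    uint32_t extended_model = (eax & 0xf0000) >> 16;
    uint32_t display_model  = (extended_model << 4) | model;  /* valid for family 6 */

    printf("family=0x%x model=0x%x extended_model=0x%x -> display model 0x%02x\n",
           family, model, extended_model, display_model);
}

int main(void) {
    decode_leaf1_eax(0x00050654);  /* sample Skylake-SP signature: family 6, display model 0x55 */
    return 0;
}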