From: Adam Stylinski Date: Fri, 7 Jan 2022 20:51:09 +0000 (-0500) Subject: Have functioning avx512{,_vnni} adler32 X-Git-Tag: 2.1.0-beta1~460 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=46031f5cdea1d48577490ce60bb921e78b76277e;p=thirdparty%2Fzlib-ng.git Have functioning avx512{,_vnni} adler32 The new adler32 checksum uses the VNNI instructions with appreciable gains when possible. Otherwise, a pure avx512f variant exists which still gives appreciable gains. --- diff --git a/CMakeLists.txt b/CMakeLists.txt index fc0f44935..526065f32 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,6 +106,8 @@ elseif(BASEARCH_S360_FOUND) option(WITH_CRC32_VX "Build with vectorized CRC32 on IBM Z" ON) elseif(BASEARCH_X86_FOUND) option(WITH_AVX2 "Build with AVX2" ON) + option(WITH_AVX512 "Build with AVX512" ON) + option(WITH_AVX512VNNI "Build with AVX512 VNNI extensions" ON) option(WITH_SSE2 "Build with SSE2" ON) option(WITH_SSSE3 "Build with SSSE3" ON) option(WITH_SSE4 "Build with SSE4" ON) @@ -724,6 +726,33 @@ if(WITH_OPTIM) set(WITH_AVX2 OFF) endif() endif() + if(WITH_AVX512) + check_avx512_intrinsics() + if(HAVE_AVX512_INTRIN) + add_definitions(-DX86_AVX512 -DX86_AVX512_ADLER32) + list(APPEND AVX512_SRCS ${ARCHDIR}/adler32_avx512.c) + add_feature_info(AVX512_ADLER32 1 "Support AVX512-accelerated adler32, using \"${AVX512FLAG}\"") + list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS}) + if(HAVE_MASK_INTRIN) + add_definitions(-DX86_MASK_INTRIN) + endif() + set_property(SOURCE ${AVX512_SRCS} PROPERTY COMPILE_FLAGS "${AVX512FLAG} ${NOLTOFLAG}") + else() + set(WITH_AVX512 OFF) + endif() + endif() + if(WITH_AVX512VNNI) + check_avx512vnni_intrinsics() + if(HAVE_AVX512VNNI_INTRIN) + add_definitions(-DX86_AVX512VNNI -DX86_AVX512VNNI_ADLER32) + add_feature_info(AVX512VNNI_ADLER32 1 "Support AVX512VNNI adler32, using \"${AVX512VNNIFLAG}\"") + list(APPEND AVX512VNNI_SRCS ${ARCHDIR}/adler32_avx512_vnni.c) + list(APPEND ZLIB_ARCH_SRCS ${AVX512VNNI_SRCS}) + set_property(SOURCE ${AVX512VNNI_SRCS} PROPERTY COMPILE_FLAGS "${AVX512VNNIFLAG} ${NOLTOFLAG}") + else() + set(WITH_AVX512VNNI OFF) + endif() + endif() if(WITH_SSE4) check_sse4_intrinsics() if(HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN) @@ -1408,6 +1437,8 @@ elseif(BASEARCH_S360_FOUND) add_feature_info(WITH_CRC32_VX WITH_CRC32_VX "Build with vectorized CRC32 on IBM Z") elseif(BASEARCH_X86_FOUND) add_feature_info(WITH_AVX2 WITH_AVX2 "Build with AVX2") + add_feature_info(WITH_AVX512 WITH_AVX512 "Build with AVX512") + add_feature_info(WITH_AVX512VNNI WITH_AVX512VNNI "Build with AVX512 VNNI") add_feature_info(WITH_SSE2 WITH_SSE2 "Build with SSE2") add_feature_info(WITH_SSSE3 WITH_SSSE3 "Build with SSSE3") add_feature_info(WITH_SSE4 WITH_SSE4 "Build with SSE4") diff --git a/README.md b/README.md index 541ffb94f..0da19e407 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ Features * Modern C11 syntax and a clean code layout * Deflate medium and quick algorithms based on Intels zlib fork * Support for CPU intrinsics when available - * Adler32 implementation using SSSE3, AVX2, Neon, VMX & VSX + * Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX * CRC32-B implementation using PCLMULQDQ & ACLE * Hash table implementation using CRC32-C intrinsics on x86 and ARM * Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX @@ -197,6 +197,8 @@ Advanced Build Options | UNALIGNED_OK | | Allow unaligned reads | ON (x86, arm) | | | --force-sse2 | Skip runtime check for SSE2 instructions (Always on for x86_64) | OFF 
(x86) | | WITH_AVX2 | | Build with AVX2 intrinsics | ON | +| WITH_AVX512 | | Build with AVX512 intrinsics | ON | +| WITH_AVX512VNNI | | Build with AVX512VNNI intrinsics | ON | | WITH_SSE2 | | Build with SSE2 intrinsics | ON | | WITH_SSE4 | | Build with SSE4 intrinsics | ON | | WITH_PCLMULQDQ | | Build with PCLMULQDQ intrinsics | ON | diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in index c5e588e70..3e92738ee 100644 --- a/arch/x86/Makefile.in +++ b/arch/x86/Makefile.in @@ -8,6 +8,8 @@ SFLAGS= INCLUDES= SUFFIX= +AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw +AVX512VNNIFLAG=-mavx512vnni AVX2FLAG=-mavx2 SSE2FLAG=-msse2 SSSE3FLAG=-mssse3 @@ -21,7 +23,9 @@ TOPDIR=$(SRCTOP) all: \ x86.o x86.lo \ - adler32_avx.o adler32.lo \ + adler32_avx.o adler32_avx.lo \ + adler32_avx512.o adler32_avx512.lo \ + adler32_avx512_vnni.o adler32_avx512_vnni.lo \ adler32_ssse3.o adler32_ssse3.lo \ chunkset_avx.o chunkset_avx.lo \ chunkset_sse.o chunkset_sse.lo \ @@ -92,6 +96,18 @@ adler32_avx.o: $(SRCDIR)/adler32_avx.c adler32_avx.lo: $(SRCDIR)/adler32_avx.c $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx.c +adler32_avx512.o: $(SRCDIR)/adler32_avx512.c + $(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c + +adler32_avx512.lo: $(SRCDIR)/adler32_avx512.c + $(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c + +adler32_avx512_vnni.o: $(SRCDIR)/adler32_avx512_vnni.c + $(CC) $(CFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c + +adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c + $(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c + adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c $(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c diff --git a/arch/x86/adler32_avx512.c b/arch/x86/adler32_avx512.c new file mode 100644 index 000000000..f73ceccef --- /dev/null +++ b/arch/x86/adler32_avx512.c @@ -0,0 +1,85 @@ +/* adler32_avx512.c -- compute the Adler-32 checksum of a data stream + * Copyright (C) 1995-2011 Mark Adler + * Authors: + * Adam Stylinski + * Brian Bockelman + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "../../zbuild.h" +#include "../../zutil.h" + +#include "../../adler32_p.h" + +#include + +#ifdef X86_AVX512_ADLER32 + +Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len) { + uint32_t sum2; + + /* split Adler-32 into component sums */ + sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + + /* in case user likes doing a byte at a time, keep it fast */ + if (UNLIKELY(len == 1)) + return adler32_len_1(adler, buf, sum2); + + /* initial Adler-32 value (deferred check for len == 1 speed) */ + if (UNLIKELY(buf == NULL)) + return 1L; + + /* in case short lengths are provided, keep it somewhat fast */ + if (UNLIKELY(len < 16)) + return adler32_len_16(adler, buf, len, sum2); + + const __mmask16 vs_mask = 1U << 15; + __m512i vs1 = _mm512_maskz_set1_epi32(vs_mask, adler); + __m512i vs2 = _mm512_maskz_set1_epi32(vs_mask, sum2); + + const __m512i dot1v = _mm512_set1_epi8(1); + const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64); + const __m512i dot3v = 
_mm512_set1_epi16(1); + + while (len >= 64) { + __m512i vs1_0 = vs1; + + int k = (len < NMAX ? (int)len : NMAX); + k -= k % 64; + len -= k; + + while (k >= 64) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] ) + */ + __m512i vbuf = _mm512_loadu_si512(buf); + buf += 64; + k -= 64; + + __m512i v_short_sum1 = _mm512_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 16 shorts. + __m512i vsum1 = _mm512_madd_epi16(v_short_sum1, dot3v); // sum 16 shorts to 8 int32_t; + __m512i v_short_sum2 = _mm512_maddubs_epi16(vbuf, dot2v); + vs1 = _mm512_add_epi32(vsum1, vs1); + __m512i vsum2 = _mm512_madd_epi16(v_short_sum2, dot3v); + vs1_0 = _mm512_slli_epi32(vs1_0, 6); + vsum2 = _mm512_add_epi32(vsum2, vs2); + vs2 = _mm512_add_epi32(vsum2, vs1_0); + vs1_0 = vs1; + } + + adler = _mm512_reduce_add_epi32(vs1) % BASE; + vs1 = _mm512_maskz_set1_epi32(vs_mask, adler); + sum2 = _mm512_reduce_add_epi32(vs2) % BASE; + vs2 = _mm512_maskz_set1_epi32(vs_mask, sum2); + } + + /* Process tail (len < 16). */ + return adler32_len_16(adler, buf, len, sum2); +} + +#endif diff --git a/arch/x86/adler32_avx512_vnni.c b/arch/x86/adler32_avx512_vnni.c new file mode 100644 index 000000000..f4e2c33ed --- /dev/null +++ b/arch/x86/adler32_avx512_vnni.c @@ -0,0 +1,135 @@ +/* adler32_avx512_vnni.c -- compute the Adler-32 checksum of a data stream + * Based on Brian Bockelman's AVX2 version + * Copyright (C) 1995-2011 Mark Adler + * Authors: + * Adam Stylinski + * Brian Bockelman + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "../../zbuild.h" +#include "../../zutil.h" + +#include "../../adler32_p.h" + +#include + +#ifdef X86_AVX512VNNI_ADLER32 + +static inline uint32_t partial_hsum(__m512i x) +{ + /* We need a permutation vector to extract every other integer. The + * rest are going to be zeros. Marking this const so the compiler stands + * a better chance of keeping this resident in a register through entire + * loop execution. We certainly have enough zmm registers (32) */ + const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, + 1, 1, 1, 1, 1, 1, 1, 1); + __m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x); + + /* From here, it's a simple 256 bit wide reduction sum */ + __m256i non_zero_avx = _mm512_castsi512_si256(non_zero); + + /* See Agner Fog's vectorclass for a decent reference. 
Essentially, phadd is + * pretty slow, much slower than the longer instruction sequence below */ + __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1), + _mm256_castsi256_si128(non_zero_avx)); + __m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1)); + __m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1)); + return (uint32_t)_mm_cvtsi128_si32(sum3); +} + +Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len) { + uint32_t sum2; + + /* split Adler-32 into component sums */ + sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + + /* in case user likes doing a byte at a time, keep it fast */ + if (UNLIKELY(len == 1)) + return adler32_len_1(adler, buf, sum2); + + /* initial Adler-32 value (deferred check for len == 1 speed) */ + if (UNLIKELY(buf == NULL)) + return 1L; + + /* in case short lengths are provided, keep it somewhat fast */ + if (UNLIKELY(len < 16)) + return adler32_len_16(adler, buf, len, sum2); + + const __mmask16 vs_mask = 1U << 15; + /* We want to place initial adler sum at vector position 0, as it is one of the lanes that line up + * with the sum of absolute differences' reduction sum. If we do this, we can get away with a partial, + * less expensive horizontal sum for the vs1 component at the end. It also happens to be marginally better + * (by a single cycle) to do this with the ancient vmovd insruction, and simply allow the register to be + * aliased up to a 512 bit wide zmm */ + __m512i vs1 = _mm512_castsi128_si512(_mm_cvtsi32_si128(adler)); + __m512i vs2 = _mm512_maskz_set1_epi32(vs_mask, sum2); + + const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64); + + const __m512i zero = _mm512_setzero_si512(); + + while (len >= 64) { + int k = (len < NMAX ? (int)len : NMAX); + k -= k % 64; + len -= k; + __m512i vs1_0 = vs1; + __m512i vs3 = _mm512_setzero_si512(); + + /* Manually unrolled this loop by 2 for an decent amount of ILP */ + while (k >= 128) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] ) + */ + __m512i vbuf0 = _mm512_loadu_si512(buf); + __m512i vbuf1 = _mm512_loadu_si512(buf+64); + buf += 128; + k -= 128; + + __m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero); + vs1 = _mm512_add_epi32(vs1, vs1_sad); + vs3 = _mm512_add_epi32(vs3, vs1_0); + /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp + * instructions to eliminate them */ + vs2 = _mm512_dpbusd_epi32(vs2, vbuf0, dot2v); + vs1_0 = vs1; + + vs1_sad = _mm512_sad_epu8(vbuf1, zero); + vs1 = _mm512_add_epi32(vs1, vs1_sad); + vs3 = _mm512_add_epi32(vs3, vs1_0); + vs2 = _mm512_dpbusd_epi32(vs2, vbuf1, dot2v); + vs1_0 = vs1; + } + + /* Remainder peeling */ + while (k >= 64) { + __m512i vbuf = _mm512_loadu_si512(buf); + buf += 64; + k -= 64; + + __m512i vs1_sad = _mm512_sad_epu8(vbuf, zero); + vs1 = _mm512_add_epi32(vs1, vs1_sad); + vs3 = _mm512_add_epi32(vs3, vs1_0); + vs2 = _mm512_dpbusd_epi32(vs2, vbuf, dot2v); + vs1_0 = vs1; + } + + vs3 = _mm512_slli_epi32(vs3, 6); + vs2 = _mm512_add_epi32(vs2, vs3); + + adler = partial_hsum(vs1) % BASE; + vs1 = _mm512_castsi128_si512(_mm_cvtsi32_si128(adler)); + sum2 = _mm512_reduce_add_epi32(vs2) % BASE; + vs2 = _mm512_maskz_set1_epi32(vs_mask, sum2); + } + + /* Process tail (len < 16). 
*/ + return adler32_len_16(adler, buf, len, sum2); +} + +#endif diff --git a/arch/x86/x86.c b/arch/x86/x86.c index e782cb8ee..f02e1a349 100644 --- a/arch/x86/x86.c +++ b/arch/x86/x86.c @@ -17,12 +17,17 @@ # include #endif +#include + Z_INTERNAL int x86_cpu_has_avx2; +Z_INTERNAL int x86_cpu_has_avx512; +Z_INTERNAL int x86_cpu_has_avx512vnni; Z_INTERNAL int x86_cpu_has_sse2; Z_INTERNAL int x86_cpu_has_ssse3; Z_INTERNAL int x86_cpu_has_sse42; Z_INTERNAL int x86_cpu_has_pclmulqdq; Z_INTERNAL int x86_cpu_has_tzcnt; +Z_INTERNAL int x86_cpu_well_suited_avx512; static void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) { #ifdef _MSC_VER @@ -55,15 +60,31 @@ static void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigne void Z_INTERNAL x86_check_features(void) { unsigned eax, ebx, ecx, edx; unsigned maxbasic; + unsigned family, model, extended_model; + int intel_cpu; + char cpu_identity[13]; cpuid(0, &maxbasic, &ebx, &ecx, &edx); + /* NULL terminate the string */ + memset(cpu_identity, 0, 13); + memcpy(cpu_identity, (char*)&ebx, sizeof(int)); + memcpy(cpu_identity + 4, (char*)&edx, sizeof(int)); + memcpy(cpu_identity + 8, (char*)&ecx, sizeof(int)); + + intel_cpu = strncmp(cpu_identity, "GenuineIntel", 12) == 0; + cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx); x86_cpu_has_sse2 = edx & 0x4000000; x86_cpu_has_ssse3 = ecx & 0x200; x86_cpu_has_sse42 = ecx & 0x100000; x86_cpu_has_pclmulqdq = ecx & 0x2; + x86_cpu_well_suited_avx512 = 0; + + model = (eax & 0xf0) >> 4; + family = (eax & 0xf00) >> 8; + extended_model = (eax & 0xf0000) >> 16; if (maxbasic >= 7) { cpuidex(7, 0, &eax, &ebx, &ecx, &edx); @@ -73,8 +94,36 @@ void Z_INTERNAL x86_check_features(void) { x86_cpu_has_tzcnt = ebx & 0x8; // check AVX2 bit x86_cpu_has_avx2 = ebx & 0x20; + x86_cpu_has_avx512 = ebx & 0x00010000; + x86_cpu_has_avx512vnni = ecx & 0x800; } else { x86_cpu_has_tzcnt = 0; x86_cpu_has_avx2 = 0; } + + + if (intel_cpu) { + /* All of the Knights Landing and Knights Ferry _likely_ benefit + * from the AVX512 adler checksum implementation */ + if (family == 0xb) { + x86_cpu_well_suited_avx512 = 1; + } else if (family == 0x6) { + if (model == 0x5 && extended_model == 0x5) { + /* Experimentally, on skylake-x and cascadelake-x, it has been + * unwaiveringly faster to use avx512 and avx512 vnni */ + x86_cpu_well_suited_avx512 = 1; + } else if (model == 0xa && extended_model == 0x6) { + /* Icelake server */ + x86_cpu_well_suited_avx512 = 1; + } else if (model == 0xf && extended_model == 0x8) { + /* Saphire rapids */ + x86_cpu_well_suited_avx512 = 1; + } + + /* Still need to check whether Rocket Lake and/or AlderLake + * benefit from the AVX512VNNI accelerated adler32 implementations. 
+ * For now this working list is probably safe */ + } + } + } diff --git a/arch/x86/x86.h b/arch/x86/x86.h index 8471e155c..4274ed09f 100644 --- a/arch/x86/x86.h +++ b/arch/x86/x86.h @@ -7,11 +7,14 @@ #define CPU_H_ extern int x86_cpu_has_avx2; +extern int x86_cpu_has_avx512; +extern int x86_cpu_has_avx512vnni; extern int x86_cpu_has_sse2; extern int x86_cpu_has_ssse3; extern int x86_cpu_has_sse42; extern int x86_cpu_has_pclmulqdq; extern int x86_cpu_has_tzcnt; +extern int x86_cpu_well_suited_avx512; void Z_INTERNAL x86_check_features(void); diff --git a/cmake/detect-intrinsics.cmake b/cmake/detect-intrinsics.cmake index e03daaa13..9f7a97190 100644 --- a/cmake/detect-intrinsics.cmake +++ b/cmake/detect-intrinsics.cmake @@ -13,6 +13,88 @@ macro(check_acle_intrinsics) set(CMAKE_REQUIRED_FLAGS) endmacro() +macro(check_avx512_intrinsics) + if(CMAKE_C_COMPILER_ID MATCHES "Intel") + if(CMAKE_HOST_UNIX OR APPLE) + set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl") + else() + set(AVX512FLAG "/arch:AVX512") + endif() + elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") + if(NOT NATIVEFLAG) + # For CPUs that can benefit from AVX512, it seems GCC generates suboptimal + # instruction scheduling unless you specify a reasonable -mtune= target + set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mtune=cascadelake") + endif() + elseif(MSVC) + set(AVX512FLAG "/ARCH:AVX512") + endif() + # Check whether compiler supports AVX512 intrinsics + set(CMAKE_REQUIRED_FLAGS "${AVX512FLAG}") + check_c_source_compile_or_run( + "#include + int main(void) { + __m512i x = _mm512_set1_epi8(2); + const __m512i y = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64); + x = _mm512_sub_epi8(x, y); + (void)x; + return 0; + }" + HAVE_AVX512_INTRIN + ) + + # Evidently both GCC and clang were late to implementing these + check_c_source_compile_or_run( + "#include + int main(void) { + __mmask16 a = 0xFF; + a = _knot_mask16(a); + (void)a; + return 0; + }" + HAVE_MASK_INTRIN + ) + set(CMAKE_REQUIRED_FLAGS) +endmacro() + +macro(check_avx512vnni_intrinsics) + if(CMAKE_C_COMPILER_ID MATCHES "Intel") + if(CMAKE_HOST_UNIX OR APPLE) + set(AVX512VNNIFLAG "-mavx512f -mavx512bw -mavx512dq -mavx512vl -mavx512vnni") + else() + set(AVX512FLAG "/ARCH:AVX512") + endif() + elseif(MSVC) + set(AVX512FLAG "/ARCH:AVX512") + elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") + if(NOT NATIVEFLAG) + set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mtune=cascadelake") + endif() + endif() + + # Check whether compiler supports AVX512vnni intrinsics + set(CMAKE_REQUIRED_FLAGS "${AVX512VNNIFLAG}") + check_c_source_compile_or_run( + "#include + int main(void) { + __m512i x = _mm512_set1_epi8(2); + const __m512i y = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64); + __m512i z = _mm512_setzero_epi32(); + z = _mm512_dpbusd_epi32(z, x, y); + (void)z; + return 0; + }" + HAVE_AVX512VNNI_INTRIN + ) + set(CMAKE_REQUIRED_FLAGS) +endmacro() + macro(check_avx2_intrinsics) if(CMAKE_C_COMPILER_ID MATCHES "Intel") if(CMAKE_HOST_UNIX OR 
APPLE) diff --git a/configure b/configure index 3f145a6d4..0d79bca50 100755 --- a/configure +++ b/configure @@ -102,6 +102,10 @@ with_fuzzers=0 floatabi= native=0 forcesse2=0 +# For CPUs that can benefit from AVX512, it seems GCC generates suboptimal +# instruction scheduling unless you specify a reasonable -mtune= target +avx512flag="-mavx512f -mavx512dq -mavx512bw -mavx512vl -mtune=cascadelake" +avx512vnniflag="-mavx512vnni ${avx512flag}" avx2flag="-mavx2" sse2flag="-msse2" ssse3flag="-mssse3" @@ -250,6 +254,8 @@ case $($cc -v 2>&1) in esac if test $native -eq 1; then + avx512flag="" + avx512vnniflag="" avx2flag="" sse2flag="" ssse3flag="" @@ -1050,6 +1056,75 @@ EOF fi } +check_avx512_intrinsics() { + # Check whether compiler supports AVX512 intrinsics + cat > $test.c << EOF +#include +int main(void) { + __m512i x = _mm512_set1_epi8(2); + const __m512i y = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64); + x = _mm512_sub_epi8(x, y); + (void)x; + return 0; +} +EOF + if try ${CC} ${CFLAGS} ${avx512flag} $test.c; then + echo "Checking for AVX512 intrinsics ... Yes." | tee -a configure.log + HAVE_AVX512_INTRIN=1 + else + echo "Checking for AVX512 intrinsics ... No." | tee -a configure.log + HAVE_AVX512_INTRIN=0 + fi +} + +check_avx512vnni_intrinsics() { + # Check whether compiler supports AVX512-VNNI intrinsics + cat > $test.c << EOF +#include +int main(void) { + __m512i x = _mm512_set1_epi8(2); + const __m512i y = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64); + __m512i z = _mm512_setzero_epi32(); + z = _mm512_dpbusd_epi32(z, x, y); + (void)z; + return 0; +} +EOF + if try ${CC} ${CFLAGS} ${avx512vnniflag} $test.c; then + echo "Checking for AVX512VNNI intrinsics ... Yes." | tee -a configure.log + HAVE_AVX512VNNI_INTRIN=1 + else + echo "Checking for AVX512VNNI intrinsics ... No." | tee -a configure.log + HAVE_AVX512VNNI_INTRIN=0 + fi +} + +check_mask_intrinsics() { + # Check whether compiler supports AVX512 k-mask intrinsics + cat > $test.c << EOF +#include +int main(void) { + __mmask16 a = 0xFF; + a = _knot_mask16(a); + (void)a; + return 0; +} +EOF + if try ${CC} ${CFLAGS} ${avx512flag} $test.c; then + echo "Checking for k-mask intrinsics ... Yes." | tee -a configure.log + HAVE_MASK_INTRIN=1 + else + echo "Checking for k-mask intrinsics ... No." | tee -a configure.log + HAVE_MASK_INTRIN=0 + fi +} + check_neon_intrinsics() { # Check whether -mfpu=neon is available on ARM processors. 
cat > $test.c << EOF @@ -1318,6 +1393,31 @@ case "${ARCH}" in ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_hash_avx.lo chunkset_avx.lo compare258_avx.lo adler32_avx.lo" fi + check_avx512_intrinsics + + if test ${HAVE_AVX512_INTRIN} -eq 1; then + CFLAGS="${CFLAGS} -DX86_AVX512 -DX86_AVX512_ADLER32" + SFLAGS="${SFLAGS} -DX86_AVX512 -DX86_AVX512_ADLER32" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_avx512.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512.lo" + + check_mask_intrinsics + + if test ${HAVE_MASK_INTRIN} -eq 1; then + CFLAGS="${CFLAGS} -DX86_MASK_INTRIN" + SFLAGS="${SFLAGS} -DX86_MASK_INTRIN" + fi + fi + + check_avx512vnni_intrinsics + + if test ${HAVE_AVX512VNNI_INTRIN} -eq 1; then + CFLAGS="${CFLAGS} -DX86_AVX512VNNI -DX86_AVX512VNNI_ADLER32" + SFLAGS="${SFLAGS} -DX86_AVX512VNNI -DX86_AVX512VNNI_ADLER32" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_avx512_vnni.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512_vnni.lo" + fi + check_sse4_intrinsics if test ${HAVE_SSE42CRC_INTRIN} -eq 1 || test ${HAVE_SSE42CRC_INLINE_ASM} -eq 1; then @@ -1908,6 +2008,8 @@ sed < $SRCDIR/$ARCHDIR/Makefile.in " /^SRCTOP *=/s#=.*#=$SRCDIR# /^BUILDDIR *=/s#=.*#=$BUILDDIR# /^AVX2FLAG *=/s#=.*#=$avx2flag# +/^AVX512FLAG *=/s#=.*#=$avx512flag# +/^AVX512VNNIFLAG *=/s#=.*#=$avx512vnniflag# /^SSE2FLAG *=/s#=.*#=$sse2flag# /^SSSE3FLAG *=/s#=.*#=$ssse3flag# /^SSE4FLAG *=/s#=.*#=$sse4flag# diff --git a/functable.c b/functable.c index d8b561b5a..da21a808b 100644 --- a/functable.c +++ b/functable.c @@ -69,6 +69,12 @@ extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t l #ifdef X86_AVX2_ADLER32 extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len); #endif +#ifdef X86_AVX512_ADLER32 +extern uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len); +#endif +#ifdef X86_AVX512VNNI_ADLER32 +extern uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len); +#endif #ifdef POWER8_VSX_ADLER32 extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len); #endif @@ -303,6 +309,15 @@ Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_ if (x86_cpu_has_avx2) functable.adler32 = &adler32_avx2; #endif +#ifdef X86_AVX512_ADLER32 + if (x86_cpu_has_avx512 && x86_cpu_well_suited_avx512) + functable.adler32 = &adler32_avx512; +#endif +#ifdef X86_AVX512VNNI_ADLER32 + if (x86_cpu_has_avx512vnni && x86_cpu_well_suited_avx512) { + functable.adler32 = &adler32_avx512_vnni; + } +#endif #ifdef PPC_VMX_ADLER32 if (power_cpu_has_altivec) functable.adler32 = &adler32_vmx;
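
For reference, the arithmetic both new adler32 files vectorize is the blocked Adler-32 recurrence spelled out in their loop comments: for each 64-byte chunk, vs1 grows by the plain byte sum, while vs2 grows by 64*vs1 plus a weighted byte sum with weights 64 down to 1. The scalar sketch below is not part of the commit; it only restates that recurrence in plain C (BASE and NMAX are the conventional zlib constants that the real code pulls in from adler32_p.h).

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BASE 65521u   /* largest prime smaller than 65536 */
#define NMAX 5552     /* max bytes accumulated before s2 can overflow 32 bits */

static uint32_t adler32_blocked_ref(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = (adler >> 16) & 0xffff;

    while (len >= 64) {
        size_t k = (len < NMAX) ? len : NMAX;
        k -= k % 64;                 /* whole 64-byte chunks per modulo round */
        len -= k;

        while (k >= 64) {
            uint32_t byte_sum = 0, weighted_sum = 0;
            /* vs1 = adler + sum(c[i])
             * vs2 = sum2 + 64*vs1 + sum((64 - i + 1) * c[i]), i = 1..64 */
            for (int i = 0; i < 64; i++) {
                byte_sum     += buf[i];
                weighted_sum += (uint32_t)(64 - i) * buf[i];  /* weights 64..1 */
            }
            s2 += 64 * s1 + weighted_sum;   /* uses s1 from before this chunk */
            s1 += byte_sum;
            buf += 64;
            k -= 64;
        }
        s1 %= BASE;
        s2 %= BASE;
    }

    while (len--) {                  /* tail, fewer than 64 bytes */
        s1 += *buf++;
        s2 += s1;
    }
    s1 %= BASE;
    s2 %= BASE;
    return (s2 << 16) | s1;
}

int main(void) {
    const char *msg = "Wikipedia";
    printf("%#010x\n", (unsigned)adler32_blocked_ref(1, (const uint8_t *)msg, strlen(msg)));
    /* expected: 0x11e60398 */
    return 0;
}

The vectorized loops keep the multiply by 64 out of the hot path by accumulating the previous chunk's vs1 (vs1_0 in adler32_avx512.c, vs3 in adler32_avx512_vnni.c) and applying the factor as a left shift by 6, per iteration in the former and once per outer block in the latter.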
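The partial_hsum() shortcut in adler32_avx512_vnni.c relies on how _mm512_sad_epu8 lays out its results: each 64-bit lane receives the sum of its 8 bytes (their absolute differences against zero), so when the register is viewed as 16 32-bit lanes only the even lanes can ever be non-zero, and lane 0, where the initial adler value is seeded via the vmovd cast, is one of them. The small model below is illustrative only and simply mimics that lane layout in plain C; it is not code from the commit.

#include <stdint.h>
#include <stdio.h>

/* Mimic the 32-bit lane layout of _mm512_sad_epu8(v, zero): each group of
 * 8 bytes contributes one small sum that lands in an even 32-bit lane,
 * leaving every odd lane zero. */
static void sad_lane_model(const uint8_t bytes[64], uint32_t lanes[16]) {
    for (int lane = 0; lane < 16; lane++)
        lanes[lane] = 0;
    for (int group = 0; group < 8; group++) {
        uint32_t sum = 0;
        for (int b = 0; b < 8; b++)
            sum += bytes[group * 8 + b];   /* |x - 0| == x for unsigned bytes */
        lanes[group * 2] = sum;            /* even lane gets the sum, odd lane stays 0 */
    }
}

int main(void) {
    uint8_t buf[64];
    uint32_t lanes[16];
    for (int i = 0; i < 64; i++)
        buf[i] = (uint8_t)(i + 1);
    sad_lane_model(buf, lanes);
    for (int i = 0; i < 16; i++)
        printf("lane %2d: %u\n", i, lanes[i]);
    return 0;
}

Because vs1 only ever carries data in those eight even lanes, the permute-and-reduce in partial_hsum() is sufficient, whereas vs2 (updated by _mm512_dpbusd_epi32 across all lanes) still needs the full _mm512_reduce_add_epi32.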
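The family/model gate added to x86_check_features() is easier to follow once the CPUID leaf 1 fields are recombined: for family 6 parts, the model number listed in Intel documentation is the extended-model nibble concatenated with the model nibble, so model 0x5 with extended model 0x5 corresponds to the 0x55 Skylake-X / Cascade Lake-X parts, 0xa/0x6 to 0x6A (Ice Lake server), and 0xf/0x8 to 0x8F (Sapphire Rapids). The decoder below is only an illustration of that bit layout, using a commonly reported Skylake-SP signature as sample input; it is not taken from the patch.

#include <stdint.h>
#include <stdio.h>

/* Pull apart CPUID leaf 1 EAX the same way x86_check_features() does,
 * then combine the nibbles into the documented "display" model number. */
static void decode_leaf1_eax(uint32_t eax) {
    uint32_t model          = (eax & 0xf0)    >> 4;
    uint32_t family         = (eax & 0xf00)   >> 8;
    uint32_t extended_model = (eax & 0xf0000) >> 16;
    uint32_t display_model  = (extended_model << 4) | model;  /* valid for family 6 */

    printf("family=0x%x model=0x%x extended_model=0x%x -> display model 0x%02x\n",
           family, model, extended_model, display_model);
}

int main(void) {
    decode_leaf1_eax(0x00050654);  /* sample Skylake-SP signature: family 6, display model 0x55 */
    return 0;
}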