From: Adam Stylinski
Date: Fri, 8 Apr 2022 17:24:21 +0000 (-0400)
Subject: Adding avx512_vnni inline + copy elision
X-Git-Tag: 2.1.0-beta1~240
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=d79984b5bcaccab15e6cd13d7d1edea32ac36977;p=thirdparty%2Fzlib-ng.git

Adding avx512_vnni inline + copy elision

An interesting revelation while benchmarking all of this is that our chunkmemset_avx seems to be slower in a lot of use cases than chunkmemset_sse. That will be an interesting function to attempt to optimize. Right now, though, we're basically beating Google for all PNG decode and encode benchmarks. There are some combinations of flags that basically have us trading blows, but we're as much as 14% faster than Chromium's zlib patches.

While we're here, add a more direct benchmark of the folded copy method versus the explicit copy + checksum.

---
diff --git a/adler32_fold.c b/adler32_fold.c index 688f84853..20fec2bd3 100644 --- a/adler32_fold.c +++ b/adler32_fold.c @@ -1,4 +1,4 @@ -/* crc32_fold.c -- adler32 folding interface +/* adler32_fold.c -- adler32 folding interface * Copyright (C) 2022 Adam Stylinski * For conditions of distribution and use, see copyright notice in zlib.h */ diff --git a/adler32_fold.h b/adler32_fold.h index ea456adc3..20aa1c740 100644 --- a/adler32_fold.h +++ b/adler32_fold.h @@ -6,8 +6,6 @@ #ifndef ADLER32_FOLD_H_ #define ADLER32_FOLD_H_ -#include - Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); #endif diff --git a/adler32_p.h b/adler32_p.h index 5a14172f7..1d2e77f49 100644 --- a/adler32_p.h +++ b/adler32_p.h @@ -26,10 +26,10 @@ static inline uint32_t adler32_len_1(uint32_t adler, const unsigned char *buf, u return adler | (sum2 << 16); } -static inline uint32_t adler32_copy_len_16(uint32_t adler, const unsigned char *buf, uint8_t *dst, size_t len, uint32_t sum2) { - while (len--) { - *dst = *buf++; - adler += *dst++; +static inline uint32_t adler32_len_16(uint32_t adler, const unsigned char *buf, size_t len, uint32_t sum2) { + while (len) { + --len; + adler += *buf++; sum2 += adler; } adler %= BASE; @@ -38,10 +38,10 @@ static inline uint32_t adler32_copy_len_16(uint32_t adler, const unsigned char * return adler | (sum2 << 16); } -static inline uint32_t adler32_len_16(uint32_t adler, const unsigned char *buf, size_t len, uint32_t sum2) { - while (len) { - --len; - adler += *buf++; +static inline uint32_t adler32_copy_len_16(uint32_t adler, const unsigned char *buf, uint8_t *dst, size_t len, uint32_t sum2) { + while (len--) { + *dst = *buf++; + adler += *dst++; sum2 += adler; } adler %= BASE; diff --git a/arch/x86/adler32_avx2.c b/arch/x86/adler32_avx2.c index fcca34ec5..dcd1166f3 100644 --- a/arch/x86/adler32_avx2.c +++ b/arch/x86/adler32_avx2.c @@ -5,29 +5,13 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#include "../../zbuild.h" -#include "../../adler32_p.h" -#include "../../fallback_builtins.h" -#include "adler32_avx2_p.h" -#include "../../adler32_fold.h" -#include - #include #ifdef X86_AVX2_ADLER32 #include "adler32_avx2_tpl.h" -#undef ADLER32_AVX2_TPL_H_ + #define COPY #include "adler32_avx2_tpl.h" -#undef COPY - -/* -Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len) { - if (buf == NULL) return 1L; - if (len == 0) return adler; - return adler32_fold_avx2(adler, buf, len); -} -*/ #endif diff --git a/arch/x86/adler32_avx2_p.h b/arch/x86/adler32_avx2_p.h index 1c80bde05..f7079bf3e 100644 ---
a/arch/x86/adler32_avx2_p.h +++ b/arch/x86/adler32_avx2_p.h @@ -6,10 +6,10 @@ #ifndef ADLER32_AVX2_P_H_ #define ADLER32_AVX2_P_H_ -#ifdef X86_AVX2_ADLER32 +#if defined(X86_AVX2_ADLER32) || defined(X86_AVX512VNNI_ADLER32) /* 32 bit horizontal sum, adapted from Agner Fog's vector library. */ -static inline uint32_t hsum(__m256i x) { +static inline uint32_t hsum256(__m256i x) { __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(x, 1), _mm256_castsi256_si128(x)); __m128i sum2 = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1)); @@ -17,7 +17,7 @@ static inline uint32_t hsum(__m256i x) { return (uint32_t)_mm_cvtsi128_si32(sum3); } -static inline uint32_t partial_hsum(__m256i x) { +static inline uint32_t partial_hsum256(__m256i x) { /* We need a permutation vector to extract every other integer. The * rest are going to be zeros */ const __m256i perm_vec = _mm256_setr_epi32(0, 2, 4, 6, 1, 1, 1, 1); diff --git a/arch/x86/adler32_avx2_tpl.h b/arch/x86/adler32_avx2_tpl.h index 7df51d573..59cacfa48 100644 --- a/arch/x86/adler32_avx2_tpl.h +++ b/arch/x86/adler32_avx2_tpl.h @@ -3,9 +3,6 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#ifndef ADLER32_AVX2_TPL_H_ -#define ADLER32_AVX2_TPL_H_ - #include "../../zbuild.h" #include #include "../../adler32_fold.h" @@ -38,9 +35,9 @@ Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) rem_peel: if (len < 16) { #ifdef COPY - return adler32_copy_len_16(adler0, src, dst, len, adler1); + return adler32_copy_len_16(adler0, src, dst, len, adler1); #else - return adler32_len_16(adler0, src, len, adler1); + return adler32_len_16(adler0, src, len, adler1); #endif } else if (len < 32) { #ifdef COPY @@ -129,8 +126,8 @@ rem_peel: * conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant). * This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly * what the compiler is doing to avoid integer divisions. 
*/ - adler0 = partial_hsum(vs1) % BASE; - adler1 = hsum(vs2) % BASE; + adler0 = partial_hsum256(vs1) % BASE; + adler1 = hsum256(vs2) % BASE; } adler = adler0 | (adler1 << 16); @@ -141,5 +138,3 @@ rem_peel: return adler; } - -#endif diff --git a/arch/x86/adler32_avx512.c b/arch/x86/adler32_avx512.c index e26b9cc52..c0bf0721f 100644 --- a/arch/x86/adler32_avx512.c +++ b/arch/x86/adler32_avx512.c @@ -6,20 +6,11 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#include "../../zbuild.h" -#include "../../adler32_p.h" -#include "../../cpu_features.h" -#include "../../fallback_builtins.h" -#include -#include "adler32_avx512_p.h" -#include "../../adler32_fold.h" - #ifdef X86_AVX512_ADLER32 #include "adler32_avx512_tpl.h" -#undef ADLER32_AVX512_TPL_H_ + #define COPY #include "adler32_avx512_tpl.h" -#undef COPY #endif diff --git a/arch/x86/adler32_avx512_tpl.h b/arch/x86/adler32_avx512_tpl.h index df5dd3810..d324ce985 100644 --- a/arch/x86/adler32_avx512_tpl.h +++ b/arch/x86/adler32_avx512_tpl.h @@ -3,16 +3,13 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#ifndef ADLER32_AVX512_TPL_H_ -#define ADLER32_AVX512_TPL_H_ - #include "../../zbuild.h" #include "../../adler32_p.h" +#include "../../adler32_fold.h" #include "../../cpu_features.h" #include "../../fallback_builtins.h" #include #include "adler32_avx512_p.h" -#include "../../adler32_fold.h" #ifdef X86_AVX512_ADLER32 @@ -22,13 +19,13 @@ Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) { #endif + if (src == NULL) return 1L; + if (len == 0) return adler; + uint32_t adler0, adler1; adler1 = (adler >> 16) & 0xffff; adler0 = adler & 0xffff; - if (src == NULL) return 1L; - if (len == 0) return adler; - rem_peel: if (len < 64) { /* This handles the remaining copies, just call normal adler checksum after this */ @@ -107,4 +104,3 @@ rem_peel: } #endif -#endif diff --git a/arch/x86/adler32_avx512_vnni.c b/arch/x86/adler32_avx512_vnni.c index 253eed9c6..330bfe38e 100644 --- a/arch/x86/adler32_avx512_vnni.c +++ b/arch/x86/adler32_avx512_vnni.c @@ -7,66 +7,54 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ +#ifdef X86_AVX512VNNI_ADLER32 + #include "../../zbuild.h" #include "../../adler32_p.h" #include "../../cpu_features.h" #include "../../fallback_builtins.h" #include +#include "../../adler32_fold.h" #include "adler32_avx512_p.h" +#include "adler32_avx2_p.h" -#ifdef X86_AVX512VNNI_ADLER32 -Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len) { - uint32_t sum2; +Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) { + if (src == NULL) return 1L; + if (len == 0) return adler; + + uint32_t adler0, adler1; + adler1 = (adler >> 16) & 0xffff; + adler0 = adler & 0xffff; - /* For impossibly tiny sizes, use the smaller width versions. 
We still need - * to check for compile time support for these but they are likely there */ -#ifdef X86_SSE41_ADLER32 +rem_peel: if (len < 32) - return adler32_sse41(adler, buf, len); +#if defined(X86_SSSE3_ADLER32) + return adler32_ssse3(adler, src, len); +#else + return adler32_len_16(adler0, src, len, adler1); #endif -#ifdef X86_AVX2_ADLER32 if (len < 64) - return adler32_avx2(adler, buf, len); -#endif - - /* split Adler-32 into component sums */ - sum2 = (adler >> 16) & 0xffff; - adler &= 0xffff; - - /* Only capture these corner cases if we didn't compile with SSE41 and AVX2 support - * This should make for shorter compiled code */ -#if !defined(X86_AVX2_ADLER32) && !defined(X86_SSE41_ADLER32) - /* in case user likes doing a byte at a time, keep it fast */ - if (UNLIKELY(len == 1)) - return adler32_len_1(adler, buf, sum2); - - /* initial Adler-32 value (deferred check for len == 1 speed) */ - if (UNLIKELY(buf == NULL)) - return 1L; - - /* in case short lengths are provided, keep it somewhat fast */ - if (UNLIKELY(len < 16)) - return adler32_len_16(adler, buf, len, sum2); +#ifdef X86_AVX2_ADLER32 + return adler32_avx2(adler, src, len); +#elif defined(X86_SSSE3_ADLER32) + return adler32_ssse3(adler, src, len); +#else + return adler32_len_16(adler0, src, len, adler1); #endif - /* We want to place initial adler sum at vector position 0, as it is one of the lanes that line up - * with the sum of absolute differences' reduction sum. If we do this, we can get away with a partial, - * less expensive horizontal sum for the vs1 component at the end. It also happens to be marginally better - * (by a single cycle) to do this with the ancient vmovd insruction, and simply allow the register to be - * aliased up to a 512 bit wide zmm */ - __m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler)); - __m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(sum2)); - const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); const __m512i zero = _mm512_setzero_si512(); + __m512i vs1, vs2; while (len >= 64) { - int k = (len < NMAX ?
(int)len : NMAX); + vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0)); + vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1)); + size_t k = MIN(len, NMAX); k -= k % 64; len -= k; __m512i vs1_0 = vs1; @@ -77,8 +65,9 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf /* Remainder peeling */ if (k % 128) { - vbuf1 = _mm512_loadu_si512(buf); - buf += 64; + vbuf1 = _mm512_loadu_si512((__m512i*)src); + + src += 64; k -= 64; __m512i vs1_sad = _mm512_sad_epu8(vbuf1, zero); @@ -94,9 +83,9 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf vs1 = adler + sum(c[i]) vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] ) */ - vbuf0 = _mm512_loadu_si512(buf); - vbuf1 = _mm512_loadu_si512(buf + 64); - buf += 128; + vbuf0 = _mm512_loadu_si512((__m512i*)src); + vbuf1 = _mm512_loadu_si512((__m512i*)(src + 64)); + src += 128; k -= 128; __m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero); @@ -117,14 +106,120 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf vs2 = _mm512_add_epi32(vs2, vs3); vs2 = _mm512_add_epi32(vs2, vs2_1); - adler = partial_hsum(vs1) % BASE; - vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler)); - sum2 = _mm512_reduce_add_epu32(vs2) % BASE; - vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(sum2)); + adler0 = partial_hsum(vs1) % BASE; + adler1 = _mm512_reduce_add_epu32(vs2) % BASE; + } + + adler = adler0 | (adler1 << 16); + + /* Process tail (len < 64). */ + if (len) { + goto rem_peel; + } + + return adler; +} + +Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + if (src == NULL) return 1L; + if (len == 0) return adler; + + uint32_t adler0, adler1; + adler1 = (adler >> 16) & 0xffff; + adler0 = adler & 0xffff; + +rem_peel_copy: + if (len < 32) { + /* This handles the remaining copies, just call normal adler checksum after this */ + __mmask32 storemask = (0xFFFFFFFFUL >> (32 - len)); + __m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src); + _mm256_mask_storeu_epi8(dst, storemask, copy_vec); + +#if defined(X86_SSSE3_ADLER32) + return adler32_ssse3(adler, src, len); +#else + return adler32_len_16(adler0, src, len, adler1); +#endif + } + + const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + + const __m256i zero = _mm256_setzero_si256(); + __m256i vs1, vs2; + + while (len >= 32) { + vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0)); + vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1)); + size_t k = MIN(len, NMAX); + k -= k % 32; + len -= k; + __m256i vs1_0 = vs1; + __m256i vs3 = _mm256_setzero_si256(); + /* We might get a tad bit more ILP here if we sum to a second register in the loop */ + __m256i vs2_1 = _mm256_setzero_si256(); + __m256i vbuf0, vbuf1; + + /* Remainder peeling */ + if (k % 64) { + vbuf1 = _mm256_loadu_si256((__m256i*)src); + _mm256_storeu_si256((__m256i*)dst, vbuf1); + dst += 32; + + src += 32; + k -= 32; + + __m256i vs1_sad = _mm256_sad_epu8(vbuf1, zero); + vs1 = _mm256_add_epi32(vs1, vs1_sad); + vs3 = _mm256_add_epi32(vs3, vs1_0); + vs2 = _mm256_dpbusd_epi32(vs2, vbuf1, dot2v); + vs1_0 = vs1; + } + + /* Manually unrolled this loop by 2 for an decent amount of ILP */ + while (k >= 64) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] ) + */ + vbuf0 = _mm256_loadu_si256((__m256i*)src); + vbuf1 = _mm256_loadu_si256((__m256i*)(src + 32)); + _mm256_storeu_si256((__m256i*)dst, 
vbuf0); + _mm256_storeu_si256((__m256i*)(dst + 32), vbuf1); + dst += 64; + src += 64; + k -= 64; + + __m256i vs1_sad = _mm256_sad_epu8(vbuf0, zero); + vs1 = _mm256_add_epi32(vs1, vs1_sad); + vs3 = _mm256_add_epi32(vs3, vs1_0); + /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp + * instructions to eliminate them */ + vs2 = _mm256_dpbusd_epi32(vs2, vbuf0, dot2v); + + vs3 = _mm256_add_epi32(vs3, vs1); + vs1_sad = _mm256_sad_epu8(vbuf1, zero); + vs1 = _mm256_add_epi32(vs1, vs1_sad); + vs2_1 = _mm256_dpbusd_epi32(vs2_1, vbuf1, dot2v); + vs1_0 = vs1; + } + + vs3 = _mm256_slli_epi32(vs3, 5); + vs2 = _mm256_add_epi32(vs2, vs3); + vs2 = _mm256_add_epi32(vs2, vs2_1); + + adler0 = partial_hsum256(vs1) % BASE; + adler1 = hsum256(vs2) % BASE; } + adler = adler0 | (adler1 << 16); + /* Process tail (len < 64). */ - return adler32_len_16(adler, buf, len, sum2); + if (len) { + goto rem_peel_copy; + } + + return adler; } #endif diff --git a/arch/x86/adler32_sse42.c b/arch/x86/adler32_sse42.c index 4f21702aa..92efe4d8d 100644 --- a/arch/x86/adler32_sse42.c +++ b/arch/x86/adler32_sse42.c @@ -1,4 +1,4 @@ -/* adler32_sse4.c -- compute the Adler-32 checksum of a data stream +/* adler32_sse42.c -- compute the Adler-32 checksum of a data stream * Copyright (C) 1995-2011 Mark Adler * Authors: * Adam Stylinski @@ -15,7 +15,6 @@ #ifdef X86_SSE42_ADLER32 Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { - uint32_t adler0, adler1; adler1 = (adler >> 16) & 0xffff; adler0 = adler & 0xffff; diff --git a/arch/x86/adler32_ssse3_tpl.h b/arch/x86/adler32_ssse3_tpl.h deleted file mode 100644 index aedfa8124..000000000 --- a/arch/x86/adler32_ssse3_tpl.h +++ /dev/null @@ -1,188 +0,0 @@ -/* adler32_ssse3_tpl.h -- adler32 ssse3 vectorized function templates - * Copyright (C) 2022 Adam Stylinski - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -#ifndef ADLER32_SSSE3_TPL_H_ -#define ADLER32_SSSE3_TPL_H_ - -#include "../../zbuild.h" -#include -#include "../../adler32_fold.h" -#include "../../adler32_p.h" -#include "adler32_ssse3_p.h" - -#ifdef COPY -Z_INTERNAL void adler32_fold_copy_ssse3(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len) { -#else -Z_INTERNAL void adler32_fold_ssse3(adler32_fold *adler, const uint8_t *src, size_t len) { -#endif - uint32_t adler0, adler1; - - /* split Adler-32 into component sums */ - adler1 = (adler->nsums >> 16) & 0xffff; - adler0 = adler->nsums & 0xffff; - - /* in case user likes doing a byte at a time, keep it fast */ - if (UNLIKELY(len == 1)) { -#ifdef COPY - *(dst++) = *src; -#endif - adler->nsums = adler32_len_1(adler0, src, adler1); - return; - } - - /* initial Adler-32 value (deferred check for len == 1 speed) */ - if (UNLIKELY(src == NULL)) { - adler->nsums = 1L; - return; - } - - /* in case short lengths are provided, keep it somewhat fast */ - if (UNLIKELY(len < 16)) { - goto sub16; - } - - const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17); - const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - const __m128i dot3v = _mm_set1_epi16(1); - const __m128i zero = _mm_setzero_si128(); - - __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0, - vbuf_0, v_sad_sum2, vsum2, vsum2_0; - - /* If our buffer is unaligned (likely), make the determination whether - * or not there's enough of a buffer to consume to make the 
scalar, aligning - * additions worthwhile or if it's worth it to just eat the cost of an unaligned - * load. This is a pretty simple test, just test if 16 - the remainder + len is - * < 16 */ - size_t max_iters = NMAX; - size_t rem = (uintptr_t)src & 15; - size_t align_offset = 16 - rem; - size_t k = 0; - if (rem) { - if (len < 16 + align_offset) { - /* Let's eat the cost of this one unaligned load so that - * we don't completely skip over the vectorization. Doing - * 16 bytes at a time unaligned is is better than 16 + <= 15 - * sums */ - vbuf = _mm_loadu_si128((__m128i*)src); - len -= 16; - src += 16; -#ifdef COPY - _mm_storeu_si128((__m128i*)dst, vbuf); - dst += 16; -#endif - vs1 = _mm_cvtsi32_si128(adler0); - vs2 = _mm_cvtsi32_si128(adler1); - vs3 = _mm_setzero_si128(); - vs1_0 = vs1; - goto unaligned_jmp; - } - -#ifdef COPY - memcpy(dst, src, align_offset); - dst += align_offset; -#endif - for (size_t i = 0; i < align_offset; ++i) { - adler0 += *(src++); - adler1 += adler0; - } - - /* lop off the max number of sums based on the scalar sums done - * above */ - len -= align_offset; - max_iters -= align_offset; - } - - - while (len >= 16) { - vs1 = _mm_cvtsi32_si128(adler0); - vs2 = _mm_cvtsi32_si128(adler1); - vs3 = _mm_setzero_si128(); - vs2_0 = _mm_setzero_si128(); - vs1_0 = vs1; - - k = (len < max_iters ? len : max_iters); - k -= k % 16; - len -= k; - - while (k >= 32) { - /* - vs1 = adler + sum(c[i]) - vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) - */ - vbuf = _mm_load_si128((__m128i*)src); - vbuf_0 = _mm_load_si128((__m128i*)(src + 16)); - src += 32; - k -= 32; - - v_sad_sum1 = _mm_sad_epu8(vbuf, zero); - v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero); -#ifdef COPY - _mm_storeu_si128((__m128i*)dst, vbuf); - _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0); - dst += 32; -#endif - vs1 = _mm_add_epi32(v_sad_sum1, vs1); - vs3 = _mm_add_epi32(vs1_0, vs3); - - vs1 = _mm_add_epi32(v_sad_sum2, vs1); - v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v); - vsum2 = _mm_madd_epi16(v_short_sum2, dot3v); - v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0); - vs2 = _mm_add_epi32(vsum2, vs2); - vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v); - vs2_0 = _mm_add_epi32(vsum2_0, vs2_0); - vs1_0 = vs1; - } - - vs2 = _mm_add_epi32(vs2_0, vs2); - vs3 = _mm_slli_epi32(vs3, 5); - vs2 = _mm_add_epi32(vs3, vs2); - vs3 = _mm_setzero_si128(); - - while (k >= 16) { - /* - vs1 = adler + sum(c[i]) - vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) - */ - vbuf = _mm_load_si128((__m128i*)src); - src += 16; - k -= 16; - -unaligned_jmp: - v_sad_sum1 = _mm_sad_epu8(vbuf, zero); -#ifdef COPY - _mm_storeu_si128((__m128i*)dst, vbuf); - dst += 16; -#endif - vs1 = _mm_add_epi32(v_sad_sum1, vs1); - vs3 = _mm_add_epi32(vs1_0, vs3); - v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0); - vsum2 = _mm_madd_epi16(v_short_sum2, dot3v); - vs2 = _mm_add_epi32(vsum2, vs2); - vs1_0 = vs1; - } - - vs3 = _mm_slli_epi32(vs3, 4); - vs2 = _mm_add_epi32(vs2, vs3); - - /* We don't actually need to do a full horizontal sum, since psadbw is actually doing - * a partial reduction sum implicitly and only summing to integers in vector positions - * 0 and 2. This saves us some contention on the shuffle port(s) */ - adler0 = partial_hsum(vs1) % BASE; - adler1 = hsum(vs2) % BASE; - max_iters = NMAX; - } - -sub16: -#ifdef COPY - adler->nsums = adler32_copy_len_16(adler0, src, dst, len, adler1); -#else - /* Process tail (len < 16). 
*/ - adler->nsums = adler32_len_16(adler0, src, len, adler1); -#endif -} - -#endif diff --git a/cpu_features.h b/cpu_features.h index 9e0d5cb95..fc1b5d733 100644 --- a/cpu_features.h +++ b/cpu_features.h @@ -6,8 +6,8 @@ #ifndef CPU_FEATURES_H_ #define CPU_FEATURES_H_ -#include "crc32_fold.h" #include "adler32_fold.h" +#include "crc32_fold.h" #if defined(X86_FEATURES) # include "arch/x86/x86_features.h" @@ -35,16 +35,11 @@ extern uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len #ifdef X86_SSSE3_ADLER32 extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len); #endif -#ifdef X86_SSE42_ADLER32 -extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); -#endif #ifdef X86_AVX2_ADLER32 extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len); -extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); #endif #ifdef X86_AVX512_ADLER32 extern uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len); -extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); #endif #ifdef X86_AVX512VNNI_ADLER32 extern uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len); @@ -53,6 +48,20 @@ extern uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, si extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len); #endif +/* adler32 folding */ +#ifdef X86_SSE42_ADLER32 +extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +#endif +#ifdef X86_AVX2_ADLER32 +extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +#endif +#ifdef X86_AVX512_ADLER32 +extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +#endif +#ifdef X86_AVX512VNNI_ADLER32 +extern uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +#endif + /* CRC32 folding */ #ifdef X86_PCLMULQDQ_CRC extern uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc); diff --git a/deflate.c b/deflate.c index 006803c32..c2700f2b5 100644 --- a/deflate.c +++ b/deflate.c @@ -52,7 +52,6 @@ #include "deflate.h" #include "deflate_p.h" #include "functable.h" -#include const char PREFIX(deflate_copyright)[] = " deflate 1.2.11.f Copyright 1995-2016 Jean-loup Gailly and Mark Adler "; /* @@ -446,7 +445,6 @@ int32_t Z_EXPORT PREFIX(deflateResetKeep)(PREFIX3(stream) *strm) { #ifdef GZIP if (s->wrap == 2) { - /* Ensure that there's always a reset, regardless of "wrap" */ strm->adler = functable.crc32_fold_reset(&s->crc_fold); } else #endif diff --git a/deflate.h b/deflate.h index abc87d8b8..1b59c7539 100644 --- a/deflate.h +++ b/deflate.h @@ -10,9 +10,9 @@ subject to change. Applications should only use zlib.h. 
*/ -#include "adler32_fold.h" #include "zutil.h" #include "zendian.h" +#include "adler32_fold.h" #include "crc32_fold.h" /* define NO_GZIP when compiling if you want to disable gzip header and diff --git a/functable.c b/functable.c index 832835970..394532312 100644 --- a/functable.c +++ b/functable.c @@ -204,7 +204,7 @@ Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_ Z_INTERNAL uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { functable.adler32_fold_copy = &adler32_fold_copy_c; -#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX512VNNI_ADLER32) +#if (defined X86_SSE42_ADLER32) if (x86_cpu_has_sse42) functable.adler32_fold_copy = &adler32_fold_copy_sse42; #endif @@ -215,6 +215,10 @@ Z_INTERNAL uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t *dst, const u #ifdef X86_AVX512_ADLER32 if (x86_cpu_has_avx512) functable.adler32_fold_copy = &adler32_fold_copy_avx512; +#endif +#ifdef X86_AVX512VNNI_ADLER32 + if (x86_cpu_has_avx512vnni) + functable.adler32_fold_copy = &adler32_fold_copy_avx512_vnni; #endif return functable.adler32_fold_copy(adler, dst, src, len); } diff --git a/inflate.c b/inflate.c index 8611e4bd4..b9c3aa797 100644 --- a/inflate.c +++ b/inflate.c @@ -609,7 +609,6 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { #endif case DICTID: NEEDBITS(32); - //strm->adler = state->check = ZSWAP32(hold); strm->adler = state->check = ZSWAP32(hold); INITBITS(); state->mode = DICT; diff --git a/inflate.h b/inflate.h index 5761077a6..941e8b0a2 100644 --- a/inflate.h +++ b/inflate.h @@ -11,8 +11,8 @@ #ifndef INFLATE_H_ #define INFLATE_H_ -#include "crc32_fold.h" #include "adler32_fold.h" +#include "crc32_fold.h" /* define NO_GZIP when compiling if you want to disable gzip header and trailer decoding by inflate(). NO_GZIP would be used to avoid linking in the crc code when it is not needed. diff --git a/test/benchmarks/CMakeLists.txt b/test/benchmarks/CMakeLists.txt index df1df4973..19762fc73 100644 --- a/test/benchmarks/CMakeLists.txt +++ b/test/benchmarks/CMakeLists.txt @@ -24,6 +24,7 @@ endif() add_executable(benchmark_zlib benchmark_adler32.cc + benchmark_adler32_copy.cc benchmark_compare256.cc benchmark_crc32.cc benchmark_main.cc diff --git a/test/benchmarks/benchmark_adler32_copy.cc b/test/benchmarks/benchmark_adler32_copy.cc new file mode 100644 index 000000000..fac4c7f1c --- /dev/null +++ b/test/benchmarks/benchmark_adler32_copy.cc @@ -0,0 +1,117 @@ +/* benchmark_adler32_copy.cc -- benchmark adler32 (elided copy) variants + * Copyright (C) 2022 Nathan Moinvaziri, Adam Stylinski + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include +#include +#include + +#include + +extern "C" { +# include "zbuild.h" +# include "zutil_p.h" +# include "cpu_features.h" +} + +#define MAX_RANDOM_INTS (1024 * 1024) +#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t)) + +typedef uint32_t (*adler32_cpy_func)(uint32_t adler, unsigned char *dst, const unsigned char *buf, size_t len); + +class adler32_copy: public benchmark::Fixture { +private: + uint32_t *random_ints_src; + uint32_t *random_ints_dst; + +public: + void SetUp(const ::benchmark::State& state) { + /* Control the alignment so that we have the best case scenario for loads. With + * AVX512, unaligned loads can mean we're crossing a cacheline boundary at every load. 
+ * And while this is a realistic scenario, it makes it difficult to compare benchmark + * to benchmark because one allocation could have been aligned perfectly for the loads + * while the subsequent one happened to not be. This is not done to give AVX512 an advantage + * (indeed, all lesser SIMD implementations benefit from this aligned allocation), but to + * control the _consistency_ of the results */ + random_ints_src = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE); + random_ints_dst = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE); + assert(random_ints_src != NULL && random_ints_dst != NULL); + + for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) { + random_ints_src[i] = rand(); + } + } + + void Bench(benchmark::State& state, adler32_cpy_func adler32_func) { + uint32_t hash = 0; + + for (auto _ : state) { + hash = adler32_func(hash, (unsigned char *)random_ints_dst, + (const unsigned char*)random_ints_src, state.range(0)); + } + + benchmark::DoNotOptimize(hash); + } + + void TearDown(const ::benchmark::State& state) { + zng_free(random_ints_src); + zng_free(random_ints_dst); + } +}; + +#define BENCHMARK_ADLER32_COPY(name, fptr, support_flag) \ + BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \ + if (!support_flag) { \ + state.SkipWithError("CPU does not support " #name); \ + } \ + Bench(state, fptr); \ + } \ + BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE); + +#define BENCHMARK_ADLER32_BASELINE_COPY(name, fptr, support_flag) \ + BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \ + if (!support_flag) { \ + state.SkipWithError("CPU does not support " #name); \ + } \ + Bench(state, [](uint32_t init_sum, unsigned char *dst, \ + const unsigned char *buf, size_t len) -> uint32_t { \ + memcpy(dst, buf, len); \ + return fptr(init_sum, buf, len); \ + }); \ + } \ + BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE); + +BENCHMARK_ADLER32_BASELINE_COPY(c, adler32_c, 1); + +#ifdef ARM_NEON_ADLER32 +/* If we inline this copy for neon, the function would go here */ +//BENCHMARK_ADLER32_COPY(neon, adler32_neon, arm_cpu_has_neon); +BENCHMARK_ADLER32_BASELINE_COPY(neon_copy_baseline, adler32_neon, arm_cpu_has_neon); +#endif + +#ifdef PPC_VMX_ADLER32 +//BENCHMARK_ADLER32_COPY(vmx_inline_copy, adler32_fold_copy_vmx, power_cpu_has_altivec); +BENCHMARK_ADLER32_BASELINE_COPY(vmx_copy_baseline, adler32_vmx, power_cpu_has_altivec); +#endif +#ifdef POWER8_VSX_ADLER32 +//BENCHMARK_ADLER32_COPY(power8_inline_copy, adler32_fold_copy_power8, power_cpu_has_arch_2_07); +BENCHMARK_ADLER32_BASELINE_COPY(power8, adler32_power8, power_cpu_has_arch_2_07); +#endif + +#ifdef X86_SSE42_ADLER32 +BENCHMARK_ADLER32_BASELINE_COPY(sse42_baseline, adler32_ssse3, x86_cpu_has_ssse3); +BENCHMARK_ADLER32_COPY(sse42, adler32_fold_copy_sse42, x86_cpu_has_sse42); +#endif +#ifdef X86_AVX2_ADLER32 +BENCHMARK_ADLER32_BASELINE_COPY(avx2_baseline, adler32_avx2, x86_cpu_has_avx2); +BENCHMARK_ADLER32_COPY(avx2, adler32_fold_copy_avx2, x86_cpu_has_avx2); +#endif +#ifdef X86_AVX512_ADLER32 +BENCHMARK_ADLER32_BASELINE_COPY(avx512_baseline, adler32_avx512, x86_cpu_has_avx512); +BENCHMARK_ADLER32_COPY(avx512, adler32_fold_copy_avx512, x86_cpu_has_avx512); +#endif +#ifdef X86_AVX512VNNI_ADLER32 +BENCHMARK_ADLER32_BASELINE_COPY(avx512_vnni_baseline, adler32_avx512_vnni, x86_cpu_has_avx512vnni); +BENCHMARK_ADLER32_COPY(avx512_vnni, adler32_fold_copy_avx512_vnni, x86_cpu_has_avx512vnni); +#endif diff --git a/win32/Makefile.msc b/win32/Makefile.msc index 8db2633b4..8a398e499 100644 --- a/win32/Makefile.msc
+++ b/win32/Makefile.msc @@ -30,15 +30,15 @@ WFLAGS = \ -DX86_FEATURES \ -DX86_PCLMULQDQ_CRC \ -DX86_SSE2 \ - -DX86_SSE42_ADLER32 \ + -DX86_SSE42_ADLER32 \ -DX86_SSE42_CRC_INTRIN \ -DX86_SSE42_CRC_HASH \ - -DX86_SSSE3_ADLER32 \ + -DX86_SSSE3_ADLER32 \ -DX86_AVX2 \ - -DX86_AVX2_ADLER32 \ + -DX86_AVX2_ADLER32 \ -DX86_AVX_CHUNKSET \ - -DX86_SSE2_CHUNKSET \ - # + -DX86_SSE2_CHUNKSET + LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest ARFLAGS = -nologo RCFLAGS = /dWIN32 /r @@ -51,12 +51,12 @@ SUFFIX = OBJS = \ adler32.obj \ - adler32_avx2.obj \ - adler32_avx512.obj \ - adler32_avx512_vnni.obj \ - adler32_sse42.obj \ - adler32_ssse3.obj \ - adler32_fold.obj \ + adler32_avx2.obj \ + adler32_avx512.obj \ + adler32_avx512_vnni.obj \ + adler32_sse42.obj \ + adler32_ssse3.obj \ + adler32_fold.obj \ chunkset.obj \ chunkset_avx.obj \ chunkset_sse2.obj \
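
For reference, here is a minimal scalar sketch of the two strategies the new benchmark_adler32_copy.cc fixture compares: an explicit copy followed by a checksum (what the *_baseline variants time via memcpy plus a plain adler32 call) versus a folded copy that checksums each byte as it is stored, which is the idea behind adler32_copy_len_16() and the *_fold_copy_* kernels in this patch. It is illustrative only: the names copy_then_adler32() and fold_copy_scalar() are invented for the sketch, and the per-byte "% BASE" is a simplification, since the real kernels defer the reduction over NMAX-sized blocks and vectorize the inner sums.

/* fold_copy_sketch.c -- stand-alone illustration, not part of the patch above. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BASE 65521U  /* largest prime smaller than 65536 */

/* Explicit copy + checksum: two passes over the data. */
static uint32_t copy_then_adler32(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    uint32_t sum1 = adler & 0xffff, sum2 = (adler >> 16) & 0xffff;
    memcpy(dst, src, len);
    for (size_t i = 0; i < len; i++) {
        sum1 = (sum1 + src[i]) % BASE;
        sum2 = (sum2 + sum1) % BASE;
    }
    return sum1 | (sum2 << 16);
}

/* Folded copy: a single pass that stores each byte and feeds it straight into the checksum. */
static uint32_t fold_copy_scalar(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    uint32_t sum1 = adler & 0xffff, sum2 = (adler >> 16) & 0xffff;
    while (len--) {
        *dst = *src++;
        sum1 = (sum1 + *dst++) % BASE;
        sum2 = (sum2 + sum1) % BASE;
    }
    return sum1 | (sum2 << 16);
}

int main(void) {
    uint8_t src[256], a[256], b[256];
    for (size_t i = 0; i < sizeof(src); i++)
        src[i] = (uint8_t)(i * 7 + 3);

    uint32_t c1 = copy_then_adler32(1, a, src, sizeof(src));
    uint32_t c2 = fold_copy_scalar(1, b, src, sizeof(src));
    printf("copy+adler32 = %08x, folded copy = %08x\n", (unsigned)c1, (unsigned)c2);  /* must match */
    return (c1 == c2 && memcmp(a, b, sizeof(a)) == 0) ? 0 : 1;
}

The point of folding is that the copy rides along with the checksum in one pass, so each byte is touched while it is already in registers, whereas the baseline pays for a second full pass over the buffer; the benchmark added in this commit measures that difference directly across the SIMD variants.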