From: Adam Stylinski
Date: Sun, 3 Apr 2022 16:18:12 +0000 (-0400)
Subject: Adding an SSE42 optimized copy + adler checksum implementation
X-Git-Tag: 2.1.0-beta1~243
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=21f461e23829a0ce8ec4b267a885b5406c1d9b99;p=thirdparty%2Fzlib-ng.git

Adding an SSE42 optimized copy + adler checksum implementation

We are guarding its usage behind a number of preprocessor macros, as the
other methods are not yet implemented and calling this version implicitly
bypasses the faster adler implementations. When more vectorized versions
are written, the functable entries will be populated and the preprocessor
macros removed.

For this round, the copy + checksum does not employ as many tricks as one
would hope for a "folded" checksum routine. The reason is the particularly
tricky case of dealing with unaligned buffers. The implementations
targeting CPUs that carry no large penalty for unaligned loads will be
much faster. Fancier methods that minimized rebasing, while potentially
faster, ended up being slower because the compiler structured the code in
a way that either spilled to the stack or trampolined out of the loop and
back into it, instead of simply jumping over the first load and store.
Revisiting this for AVX512, where registers are more abundant and more
advanced loads exist, may be prudent.
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 247af86d2..9719468bf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -779,8 +779,8 @@ if(WITH_OPTIM)
         if(WITH_SSE42)
             check_sse42_intrinsics()
             if(HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN)
-                add_definitions(-DX86_SSE42_CRC_HASH)
-                set(SSE42_SRCS ${ARCHDIR}/insert_string_sse42.c)
+                add_definitions(-DX86_SSE42_CRC_HASH -DX86_SSE42_ADLER32)
+                set(SSE42_SRCS ${ARCHDIR}/adler32_sse42.c ${ARCHDIR}/insert_string_sse42.c)
                 add_feature_info(SSE42_CRC 1 "Support SSE4.2 optimized CRC hash generation, using \"${SSE42FLAG}\"")
                 list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS})
                 set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${NOLTOFLAG}")
diff --git a/adler32_fold.h b/adler32_fold.h
index d93a51053..ec4270a7a 100644
--- a/adler32_fold.h
+++ b/adler32_fold.h
@@ -11,7 +11,9 @@
 typedef struct adler32_fold_s {
     uint8_t adler[64];       // First half of component sums
     uint8_t sum2[64];        // Second half of component sums
-    uint32_t nsums;          // The number of scalar sums performed
+    uint8_t leftover[16];    // A buffer for sub-16-byte carry-over, sized for full loads and alignment
+    uint32_t nsums;          // The number of scalar sums leftover
+    uint32_t bytes_leftover; // The number of leftover bytes from the previous sum
 } adler32_fold;

 Z_INTERNAL void adler32_fold_reset_c(adler32_fold *adler, uint32_t init_adler);
diff --git a/adler32_p.h b/adler32_p.h
index 1adc1ccb0..5a14172f7 100644
--- a/adler32_p.h
+++ b/adler32_p.h
@@ -26,6 +26,18 @@ static inline uint32_t adler32_len_1(uint32_t adler, const unsigned char *buf, u
     return adler | (sum2 << 16);
 }

+static inline uint32_t adler32_copy_len_16(uint32_t adler, const unsigned char *buf, uint8_t *dst, size_t len, uint32_t sum2) {
+    while (len--) {
+        *dst = *buf++;
+        adler += *dst++;
+        sum2 += adler;
+    }
+    adler %= BASE;
+    sum2 %= BASE;            /* only added so many BASE's */
+    /* return recombined sums */
+    return adler | (sum2 << 16);
+}
+
 static inline uint32_t adler32_len_16(uint32_t adler, const unsigned char *buf, size_t len, uint32_t sum2) {
     while (len) {
         --len;
diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in
index 689e3a0c2..f9aedf82b 100644
--- a/arch/x86/Makefile.in
+++ b/arch/x86/Makefile.in
@@ -28,6 +28,7 @@ all: \
 	adler32_avx2.o adler32_avx2.lo \
 	adler32_avx512.o adler32_avx512.lo \
 	adler32_avx512_vnni.o adler32_avx512_vnni.lo \
+	adler32_sse42.o adler32_sse42.lo \
 	adler32_ssse3.o adler32_ssse3.lo \
 	chunkset_avx.o chunkset_avx.lo \
 	chunkset_sse2.o chunkset_sse2.lo \
@@ -110,25 +111,31 @@ adler32_avx2.o: $(SRCDIR)/adler32_avx2.c
 	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c

 adler32_avx2.lo: $(SRCDIR)/adler32_avx2.c
-	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
+	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c

 adler32_avx512.o: $(SRCDIR)/adler32_avx512.c
 	$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c

 adler32_avx512.lo: $(SRCDIR)/adler32_avx512.c
-	$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
+	$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c

 adler32_avx512_vnni.o: $(SRCDIR)/adler32_avx512_vnni.c
 	$(CC) $(CFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c

 adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c
-	$(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
+	$(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c

 adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
 	$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c

 adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c
-	$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
+	$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
+
+adler32_sse42.o: $(SRCDIR)/adler32_sse42.c
+	$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
+
+adler32_sse42.lo: $(SRCDIR)/adler32_sse42.c
+	$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c

 mostlyclean: clean
 clean:
diff --git a/arch/x86/adler32_sse42.c b/arch/x86/adler32_sse42.c
new file mode 100644
index 000000000..dce155028
--- /dev/null
+++ b/arch/x86/adler32_sse42.c
@@ -0,0 +1,31 @@
+/* adler32_sse42.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ *   Adam Stylinski
+ *   Brian Bockelman
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../adler32_p.h"
+#include "../../adler32_fold.h"
+#include "adler32_ssse3_p.h"
+#include <immintrin.h>
+
+#ifdef X86_SSE42_ADLER32
+
+Z_INTERNAL void adler32_fold_reset_sse42(adler32_fold *adler, uint32_t init_adler) {
+    adler->nsums = init_adler;
+}
+
+Z_INTERNAL uint32_t adler32_fold_final_sse42(adler32_fold *adler) {
+    return adler->nsums;
+}
+
+#include "adler32_sse42_tpl.h"
+#undef ADLER32_SSE42_TPL_H_
+#define COPY
+#include "adler32_sse42_tpl.h"
+#undef COPY
+
+#endif
diff --git a/arch/x86/adler32_sse42_tpl.h b/arch/x86/adler32_sse42_tpl.h
new file mode 100644
index 000000000..71d1db818
--- /dev/null
+++ b/arch/x86/adler32_sse42_tpl.h
@@ -0,0 +1,132 @@
+/* adler32_sse42_tpl.h -- adler32 sse4.2 vectorized function templates
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_SSE42_TPL_H_
+#define ADLER32_SSE42_TPL_H_
+
+#include "../../zbuild.h"
+#include <immintrin.h>
+#include "../../adler32_fold.h"
+#include "../../adler32_p.h"
+#include "adler32_ssse3_p.h"
+
+#ifdef COPY
+Z_INTERNAL void adler32_fold_copy_sse42(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len) {
+#else
+Z_INTERNAL void adler32_fold_sse42(adler32_fold *adler, const uint8_t *src, size_t len) {
+#endif
+
+    uint32_t adler0, adler1;
+    adler1 = (adler->nsums >> 16) & 0xffff;
+    adler0 = adler->nsums & 0xffff;
+
+    if (len < 16) {
+rem_peel:
+#ifdef COPY
+        adler->nsums = adler32_copy_len_16(adler0, src, dst, len, adler1);
+#else
+        adler->nsums = adler32_len_16(adler0, src, len, adler1);
+#endif
+        return;
+    }
+
+    __m128i vbuf, vbuf_0;
+    __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+            v_sad_sum2, vsum2, vsum2_0;
+    __m128i zero = _mm_setzero_si128();
+    const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
+    const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+    const __m128i dot3v = _mm_set1_epi16(1);
+    size_t k;
+
+    while (len >= 16) {
+
+        k = MIN(len, NMAX);
+        k -= k % 16;
+        len -= k;
+
+        vs1 = _mm_cvtsi32_si128(adler0);
+        vs2 = _mm_cvtsi32_si128(adler1);
+
+        vs3 = _mm_setzero_si128();
+        vs2_0 = _mm_setzero_si128();
+        vs1_0 = vs1;
+
+        while (k >= 32) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+            */
+            vbuf = _mm_loadu_si128((__m128i*)src);
+            vbuf_0 = _mm_loadu_si128((__m128i*)(src + 16));
+            src += 32;
+            k -= 32;
+
+            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+            v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
+#ifdef COPY
+            _mm_storeu_si128((__m128i*)dst, vbuf);
+            _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0);
+            dst += 32;
+#endif
+            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+            v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
+
+            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+            vs3 = _mm_add_epi32(vs1_0, vs3);
+
+            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+            vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
+            vs1 = _mm_add_epi32(v_sad_sum2, vs1);
+            vs2 = _mm_add_epi32(vsum2, vs2);
+            vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
+            vs1_0 = vs1;
+        }
+
+        vs2 = _mm_add_epi32(vs2_0, vs2);
+        vs3 = _mm_slli_epi32(vs3, 5);
+        vs2 = _mm_add_epi32(vs3, vs2);
+        vs3 = _mm_setzero_si128();
+
+        while (k >= 16) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+            */
+            vbuf = _mm_loadu_si128((__m128i*)src);
+            src += 16;
+            k -= 16;
+
+            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
+
+            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+            vs3 = _mm_add_epi32(vs1_0, vs3);
+            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+            vs2 = _mm_add_epi32(vsum2, vs2);
+            vs1_0 = vs1;
+
+#ifdef COPY
+            _mm_storeu_si128((__m128i*)dst, vbuf);
+            dst += 16;
+#endif
+        }
+
+        vs3 = _mm_slli_epi32(vs3, 4);
+        vs2 = _mm_add_epi32(vs2, vs3);
+
+        adler0 = partial_hsum(vs1) % BASE;
+        adler1 = hsum(vs2) % BASE;
+    }
+
+    /* If this is true, there's fewer than 16 elements remaining */
+    if (len) {
+        goto rem_peel;
+    }
+
+    adler->nsums = adler0 | (adler1 << 16);
+}
+
+#endif
diff --git a/arch/x86/adler32_ssse3.c b/arch/x86/adler32_ssse3.c
index b2ef0115c..8c55badf9 100644
--- a/arch/x86/adler32_ssse3.c
+++ b/arch/x86/adler32_ssse3.c
@@ -8,25 +8,12 @@

 #include "../../zbuild.h"
 #include "../../adler32_p.h"
+#include "adler32_ssse3_p.h"

 #ifdef X86_SSSE3_ADLER32

 #include <immintrin.h>

-static inline uint32_t partial_hsum(__m128i x) {
-    __m128i second_int = _mm_bsrli_si128(x, 8);
-    __m128i sum = _mm_add_epi32(x, second_int);
-    return _mm_cvtsi128_si32(sum);
-}
-
-static inline uint32_t hsum(__m128i x) {
-    __m128i sum1 = _mm_unpackhi_epi64(x, x);
-    __m128i sum2 = _mm_add_epi32(x, sum1);
-    __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
-    __m128i sum4 = _mm_add_epi32(sum2, sum3);
-    return _mm_cvtsi128_si32(sum4);
-}
-
 Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len) {
     uint32_t sum2;
diff --git a/arch/x86/adler32_ssse3_p.h b/arch/x86/adler32_ssse3_p.h
new file mode 100644
index 000000000..ba914e1df
--- /dev/null
+++ b/arch/x86/adler32_ssse3_p.h
@@ -0,0 +1,29 @@
+/* adler32_ssse3_p.h -- adler32 ssse3 utility functions
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_SSSE3_P_H_
+#define ADLER32_SSSE3_P_H_
+
+#ifdef X86_SSSE3_ADLER32
+
+#include <immintrin.h>
+#include <stdint.h>
+
+static inline uint32_t partial_hsum(__m128i x) {
+    __m128i second_int = _mm_bsrli_si128(x, 8);
+    __m128i sum = _mm_add_epi32(x, second_int);
+    return _mm_cvtsi128_si32(sum);
+}
+
+static inline uint32_t hsum(__m128i x) {
+    __m128i sum1 = _mm_unpackhi_epi64(x, x);
+    __m128i sum2 = _mm_add_epi32(x, sum1);
+    __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
+    __m128i sum4 = _mm_add_epi32(sum2, sum3);
+    return _mm_cvtsi128_si32(sum4);
+}
+#endif
+
+#endif
diff --git a/arch/x86/adler32_ssse3_tpl.h b/arch/x86/adler32_ssse3_tpl.h
new file mode 100644
index 000000000..aedfa8124
--- /dev/null
+++ b/arch/x86/adler32_ssse3_tpl.h
@@ -0,0 +1,188 @@
+/* adler32_ssse3_tpl.h -- adler32 ssse3 vectorized function templates
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_SSSE3_TPL_H_
+#define ADLER32_SSSE3_TPL_H_
+
+#include "../../zbuild.h"
+#include <immintrin.h>
+#include "../../adler32_fold.h"
+#include "../../adler32_p.h"
+#include "adler32_ssse3_p.h"
+
+#ifdef COPY
+Z_INTERNAL void adler32_fold_copy_ssse3(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len) {
+#else
+Z_INTERNAL void adler32_fold_ssse3(adler32_fold *adler, const uint8_t *src, size_t len) {
+#endif
+    uint32_t adler0, adler1;
+
+    /* split Adler-32 into component sums */
+    adler1 = (adler->nsums >> 16) & 0xffff;
+    adler0 = adler->nsums & 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (UNLIKELY(len == 1)) {
+#ifdef COPY
+        *(dst++) = *src;
+#endif
+        adler->nsums = adler32_len_1(adler0, src, adler1);
+        return;
+    }
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (UNLIKELY(src == NULL)) {
+        adler->nsums = 1L;
+        return;
+    }
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (UNLIKELY(len < 16)) {
+        goto sub16;
+    }
+
+    const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
+    const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+    const __m128i dot3v = _mm_set1_epi16(1);
+    const __m128i zero = _mm_setzero_si128();
+
+    __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+            vbuf_0, v_sad_sum2, vsum2, vsum2_0;
+
+    /* If our buffer is unaligned (likely), make the determination whether
+     * or not there's enough of a buffer to consume to make the scalar, aligning
+     * additions worthwhile or if it's worth it to just eat the cost of an unaligned
+     * load. This is a pretty simple test, just test if 16 - the remainder + len is
+     * < 16 */
+    size_t max_iters = NMAX;
+    size_t rem = (uintptr_t)src & 15;
+    size_t align_offset = 16 - rem;
+    size_t k = 0;
+    if (rem) {
+        if (len < 16 + align_offset) {
+            /* Let's eat the cost of this one unaligned load so that
+             * we don't completely skip over the vectorization. Doing
+             * 16 bytes at a time unaligned is better than 16 + <= 15
+             * sums */
+            vbuf = _mm_loadu_si128((__m128i*)src);
+            len -= 16;
+            src += 16;
+#ifdef COPY
+            _mm_storeu_si128((__m128i*)dst, vbuf);
+            dst += 16;
+#endif
+            vs1 = _mm_cvtsi32_si128(adler0);
+            vs2 = _mm_cvtsi32_si128(adler1);
+            vs3 = _mm_setzero_si128();
+            vs1_0 = vs1;
+            goto unaligned_jmp;
+        }
+
+#ifdef COPY
+        memcpy(dst, src, align_offset);
+        dst += align_offset;
+#endif
+        for (size_t i = 0; i < align_offset; ++i) {
+            adler0 += *(src++);
+            adler1 += adler0;
+        }
+
+        /* lop off the max number of sums based on the scalar sums done
+         * above */
+        len -= align_offset;
+        max_iters -= align_offset;
+    }
+
+
+    while (len >= 16) {
+        vs1 = _mm_cvtsi32_si128(adler0);
+        vs2 = _mm_cvtsi32_si128(adler1);
+        vs3 = _mm_setzero_si128();
+        vs2_0 = _mm_setzero_si128();
+        vs1_0 = vs1;
+
+        k = (len < max_iters ? len : max_iters);
+        k -= k % 16;
+        len -= k;
+
+        while (k >= 32) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+            */
+            vbuf = _mm_load_si128((__m128i*)src);
+            vbuf_0 = _mm_load_si128((__m128i*)(src + 16));
+            src += 32;
+            k -= 32;
+
+            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+            v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
+#ifdef COPY
+            _mm_storeu_si128((__m128i*)dst, vbuf);
+            _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0);
+            dst += 32;
+#endif
+            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+            vs3 = _mm_add_epi32(vs1_0, vs3);
+
+            vs1 = _mm_add_epi32(v_sad_sum2, vs1);
+            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+            v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
+            vs2 = _mm_add_epi32(vsum2, vs2);
+            vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
+            vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
+            vs1_0 = vs1;
+        }
+
+        vs2 = _mm_add_epi32(vs2_0, vs2);
+        vs3 = _mm_slli_epi32(vs3, 5);
+        vs2 = _mm_add_epi32(vs3, vs2);
+        vs3 = _mm_setzero_si128();
+
+        while (k >= 16) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+            */
+            vbuf = _mm_load_si128((__m128i*)src);
+            src += 16;
+            k -= 16;
+
+unaligned_jmp:
+            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+#ifdef COPY
+            _mm_storeu_si128((__m128i*)dst, vbuf);
+            dst += 16;
+#endif
+            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+            vs3 = _mm_add_epi32(vs1_0, vs3);
+            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
+            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+            vs2 = _mm_add_epi32(vsum2, vs2);
+            vs1_0 = vs1;
+        }
+
+        vs3 = _mm_slli_epi32(vs3, 4);
+        vs2 = _mm_add_epi32(vs2, vs3);
+
+        /* We don't actually need to do a full horizontal sum, since psadbw is actually doing
+         * a partial reduction sum implicitly and only summing to integers in vector positions
+         * 0 and 2. This saves us some contention on the shuffle port(s) */
+        adler0 = partial_hsum(vs1) % BASE;
+        adler1 = hsum(vs2) % BASE;
+        max_iters = NMAX;
+    }
+
+sub16:
+#ifdef COPY
+    adler->nsums = adler32_copy_len_16(adler0, src, dst, len, adler1);
+#else
+    /* Process tail (len < 16). */
+    adler->nsums = adler32_len_16(adler0, src, len, adler1);
+#endif
+}
+
+#endif
diff --git a/configure b/configure
index 6f79825a7..b90c2342f 100755
--- a/configure
+++ b/configure
@@ -1495,16 +1495,16 @@ case "${ARCH}" in
         check_sse42_intrinsics

         if test ${HAVE_SSE42CRC_INTRIN} -eq 1 || test ${HAVE_SSE42CRC_INLINE_ASM} -eq 1; then
-            CFLAGS="${CFLAGS} -DX86_SSE42_CRC_HASH"
-            SFLAGS="${SFLAGS} -DX86_SSE42_CRC_HASH"
+            CFLAGS="${CFLAGS} -DX86_SSE42_CRC_HASH -DX86_SSE42_ADLER32"
+            SFLAGS="${SFLAGS} -DX86_SSE42_CRC_HASH -DX86_SSE42_ADLER32"

            if test ${HAVE_SSE42CRC_INTRIN} -eq 1; then
                CFLAGS="${CFLAGS} -DX86_SSE42_CRC_INTRIN"
                SFLAGS="${SFLAGS} -DX86_SSE42_CRC_INTRIN"
            fi

-            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} insert_string_sse42.o"
-            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} insert_string_sse42.lo"
+            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_sse42.o insert_string_sse42.o"
+            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_sse42.lo insert_string_sse42.lo"
        fi

        check_sse2_intrinsics
diff --git a/cpu_features.h b/cpu_features.h
index d3df33b9f..3dfaf6ce2 100644
--- a/cpu_features.h
+++ b/cpu_features.h
@@ -7,6 +7,7 @@
 #define CPU_FEATURES_H_

 #include "crc32_fold.h"
+#include "adler32_fold.h"

 #if defined(X86_FEATURES)
 #  include "arch/x86/x86_features.h"
@@ -34,6 +35,12 @@ extern uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len
 #ifdef X86_SSSE3_ADLER32
 extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
 #endif
+#ifdef X86_SSE42_ADLER32
+extern void adler32_fold_reset_sse42(adler32_fold *adler, uint32_t init_adler);
+extern void adler32_fold_copy_sse42(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len);
+extern void adler32_fold_sse42(adler32_fold *adler, const uint8_t *src, size_t len);
+extern uint32_t adler32_fold_final_sse42(adler32_fold *adler);
+#endif
 #ifdef X86_AVX2_ADLER32
 extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
 #endif
diff --git a/deflate.c b/deflate.c
index a98988a83..6818a860a 100644
--- a/deflate.c
+++ b/deflate.c
@@ -448,7 +448,11 @@ int32_t Z_EXPORT PREFIX(deflateResetKeep)(PREFIX3(stream) *strm) {
         strm->adler = functable.crc32_fold_reset(&s->crc_fold);
     else
 #endif
+        //strm->adler = ADLER32_INITIAL_VALUE;
+    {
         strm->adler = ADLER32_INITIAL_VALUE;
+        functable.adler32_fold_reset(&s->adler_fold, ADLER32_INITIAL_VALUE);
+    }
     s->last_flush = -2;

     zng_tr_init(s);
@@ -767,6 +771,7 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) {
             if (s->strstart != 0)
                 put_uint32_msb(s, strm->adler);
             strm->adler = ADLER32_INITIAL_VALUE;
+            functable.adler32_fold_reset(&s->adler_fold, ADLER32_INITIAL_VALUE);
             s->status = BUSY_STATE;

             /* Compression must start with an empty pending buffer */
@@ -973,8 +978,11 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) {
             put_uint32(s, (uint32_t)strm->total_in);
         } else
 #endif
-        if (s->wrap == 1)
-            put_uint32_msb(s, strm->adler);
+        {
+            strm->adler = functable.adler32_fold_final(&s->adler_fold);
+            if (s->wrap == 1)
+                put_uint32_msb(s, strm->adler);
+        }
         PREFIX(flush_pending)(strm);
         /* If avail_out is zero, the application will call deflate again
          * to flush the rest.
@@ -1083,9 +1091,11 @@ Z_INTERNAL unsigned read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned
             functable.crc32_fold_copy(&strm->state->crc_fold, buf, strm->next_in, len);
 #endif
     } else {
-        memcpy(buf, strm->next_in, len);
         if (strm->state->wrap == 1)
-            strm->adler = functable.adler32(strm->adler, buf, len);
+            functable.adler32_fold_copy(&strm->state->adler_fold, buf, strm->next_in, len);
+        else
+            memcpy(buf, strm->next_in, len);
+        //strm->adler = functable.adler32(strm->adler, buf, len);
     }
     strm->next_in += len;
     strm->total_in += len;
diff --git a/deflate.h b/deflate.h
index b5ae2f8e6..2d34c95ed 100644
--- a/deflate.h
+++ b/deflate.h
@@ -10,6 +10,7 @@
    subject to change. Applications should only use zlib.h.
  */

+#include "adler32_fold.h"
 #include "zutil.h"
 #include "zendian.h"
 #include "crc32_fold.h"
@@ -211,6 +212,7 @@ struct internal_state {
     int nice_match;              /* Stop searching when current match exceeds this */

+    struct adler32_fold_s ALIGNED_(64) adler_fold;
     struct crc32_fold_s ALIGNED_(16) crc_fold;

     /* used by trees.c: */
diff --git a/functable.c b/functable.c
index 317349f30..a2c75c9fc 100644
--- a/functable.c
+++ b/functable.c
@@ -204,21 +204,37 @@ Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_

 Z_INTERNAL void adler32_fold_reset_stub(adler32_fold *adler, uint32_t init_adler) {
     functable.adler32_fold_reset = &adler32_fold_reset_c;
+#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX2_ADLER32) && !defined(X86_AVX512_ADLER32) && !defined(X86_AVX512VNNI_ADLER32)
+    if (x86_cpu_has_sse42)
+        functable.adler32_fold_reset = &adler32_fold_reset_sse42;
+#endif
     functable.adler32_fold_reset(adler, init_adler);
 }

 Z_INTERNAL void adler32_fold_copy_stub(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len) {
     functable.adler32_fold_copy = &adler32_fold_copy_c;
+#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX2_ADLER32) && !defined(X86_AVX512_ADLER32) && !defined(X86_AVX512VNNI_ADLER32)
+    if (x86_cpu_has_sse42)
+        functable.adler32_fold_copy = &adler32_fold_copy_sse42;
+#endif
     functable.adler32_fold_copy(adler, dst, src, len);
 }

 Z_INTERNAL void adler32_fold_stub(adler32_fold *adler, const uint8_t *src, size_t len) {
     functable.adler32_fold = &adler32_fold_c;
+#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX2_ADLER32) && !defined(X86_AVX512_ADLER32) && !defined(X86_AVX512VNNI_ADLER32)
+    if (x86_cpu_has_sse42)
+        functable.adler32_fold = &adler32_fold_sse42;
+#endif
     functable.adler32_fold(adler, src, len);
 }

 Z_INTERNAL uint32_t adler32_fold_final_stub(adler32_fold *adler) {
     functable.adler32_fold_final = &adler32_fold_final_c;
+#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX2_ADLER32) && !defined(X86_AVX512_ADLER32) && !defined(X86_AVX512VNNI_ADLER32)
+    if (x86_cpu_has_sse42)
+        functable.adler32_fold_final = &adler32_fold_final_sse42;
+#endif
     return functable.adler32_fold_final(adler);
 }
diff --git a/inflate.c b/inflate.c
index 9e816a40d..864ca8882 100644
--- a/inflate.c
+++ b/inflate.c
@@ -20,6 +20,7 @@ static uint32_t syncsearch(uint32_t *have, const unsigned char *buf, uint32_t le

 static inline void inf_chksum_cpy(PREFIX3(stream) *strm, uint8_t *dst, const uint8_t *src, uint32_t copy) {
+    if (!copy) return;
     struct inflate_state *state = (struct inflate_state*)strm->state;
 #ifdef GUNZIP
     if (state->flags) {
diff --git a/win32/Makefile.msc b/win32/Makefile.msc
index b121cde22..bce5c0044 100644
--- a/win32/Makefile.msc
+++ b/win32/Makefile.msc
@@ -30,9 +30,12 @@ WFLAGS = \
 	-DX86_FEATURES \
 	-DX86_PCLMULQDQ_CRC \
 	-DX86_SSE2 \
+	-DX86_SSE42_ADLER32 \
 	-DX86_SSE42_CRC_INTRIN \
 	-DX86_SSE42_CRC_HASH \
+	-DX86_SSSE3_ADLER32 \
 	-DX86_AVX2 \
+	-DX86_AVX2_ADLER32 \
 	-DX86_AVX_CHUNKSET \
 	-DX86_SSE2_CHUNKSET \
 	#
@@ -48,6 +51,11 @@ SUFFIX =

 OBJS = \
 	adler32.obj \
+	adler32_avx2.obj \
+	adler32_avx512.obj \
+	adler32_avx512_vnni.obj \
+	adler32_sse42.obj \
+	adler32_ssse3.obj \
 	adler32_fold.obj \
 	chunkset.obj \
 	chunkset_avx.obj \
@@ -182,6 +190,13 @@ $(TOP)/zconf$(SUFFIX).h: zconf
 SRCDIR = $(TOP)
 # Keep the dependences in sync with top-level Makefile.in
 adler32.obj: $(SRCDIR)/adler32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/functable.h $(SRCDIR)/adler32_p.h
+adler32_avx2.obj: $(SRCDIR)/arch/x86/adler32_avx2.c $(SRCDIR)/zbuild.h $(SRCDIR)/cpu_features.h $(SRCDIR)/adler32_p.h $(SRCDIR)/fallback_builtins.h
+adler32_avx512.obj: $(SRCDIR)/arch/x86/adler32_avx512.c $(SRCDIR)/zbuild.h $(SRCDIR)/cpu_features.h $(SRCDIR)/adler32_p.h $(SRCDIR)/arch/x86/adler32_avx512_p.h
+adler32_avx512_vnni.obj: $(SRCDIR)/arch/x86/adler32_avx512_vnni.c $(SRCDIR)/zbuild.h $(SRCDIR)/cpu_features.h $(SRCDIR)/adler32_p.h $(SRCDIR)/arch/x86/adler32_avx512_p.h
+adler32_sse42.obj: $(SRCDIR)/arch/x86/adler32_sse42.c $(SRCDIR)/zbuild.h $(SRCDIR)/cpu_features.h $(SRCDIR)/adler32_p.h $(SRCDIR)/adler32_fold.h \
+	$(SRCDIR)/arch/x86/adler32_ssse3_p.h $(SRCDIR)/arch/x86/adler32_sse42_tpl.h
+adler32_ssse3.obj: $(SRCDIR)/arch/x86/adler32_ssse3.c $(SRCDIR)/zbuild.h $(SRCDIR)/cpu_features.h $(SRCDIR)/adler32_p.h $(SRCDIR)/adler32_fold.h \
+	$(SRCDIR)/arch/x86/adler32_ssse3_p.h
 adler32_fold.obj: $(SRCDIR)/adler32_fold.c $(SRCDIR)/zbuild.h $(SRCDIR)/adler32_fold.h $(SRCDIR)/functable.h
 functable.obj: $(SRCDIR)/functable.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/zendian.h $(SRCDIR)/arch/x86/x86_features.h
 gzlib.obj: $(SRCDIR)/gzlib.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h $(SRCDIR)/zutil_p.h
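
---

For reference, the sketch below (illustrative only, not part of the commit) shows how the folded Adler-32 entry points wired up above are meant to be driven -- reset, then copy + checksum, then final -- mirroring the deflate.c changes. The harness function name is made up, and it assumes compilation inside the zlib-ng tree so that the internal adler32_fold.h header and the generic _c fallbacks are visible; at runtime the functable replaces those fallbacks with the SSE4.2 versions added here when the CPU supports them.

    /* Illustrative harness only -- not part of this commit. */
    #include <stddef.h>
    #include <stdint.h>
    #include "adler32_fold.h"   /* internal zlib-ng header; declares the fold API */

    /* Copy len bytes from src to dst while accumulating their Adler-32,
     * then recombine into the classic 32-bit checksum (s1 | s2 << 16). */
    static uint32_t copy_with_adler32(uint8_t *dst, const uint8_t *src, size_t len) {
        adler32_fold fold;
        adler32_fold_reset_c(&fold, 1);            /* 1 == ADLER32_INITIAL_VALUE */
        adler32_fold_copy_c(&fold, dst, src, len); /* copy + running checksum */
        return adler32_fold_final_c(&fold);
    }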