From: Adam Stylinski Date: Sun, 10 Apr 2022 17:01:22 +0000 (-0400) Subject: Improved chunkset substantially where it's heavily used X-Git-Tag: 2.1.0-beta1~245 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ef0cf5ca17057062a895b187ff8bef9ad7762c2b;p=thirdparty%2Fzlib-ng.git Improved chunkset substantially where it's heavily used For most realistic use cases, this doesn't make a ton of difference. However, for things which are highly compressible and enjoy very large run length encodes in the window, this is a huge win. We leverage a permutation table to swizzle the contents of the memory chunk into a vector register and then splat that over memory with a fast copy loop. In essence, where this helps, it helps a lot. Where it doesn't, it does no measurable damage to the runtime. This commit also simplifies a chunkcopy_safe call for determining a distance. Using labs is enough to give the same behavior as before, with the added benefit that no predication is required _and_, most importantly, static analysis by GCC's string fortification can't throw a fit because it conveys better to the compiler that the input into builtin_memcpy will always be in range. --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 6c45fd844..24fe98b3e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -769,6 +769,9 @@ if(WITH_OPTIM) check_sse41_intrinsics() if(HAVE_SSE41_INTRIN) add_definitions(-DX86_SSE41) + list(APPEND SSE41_SRCS ${ARCHDIR}/chunkset_sse41.c) + list(APPEND ZLIB_ARCH_SRCS ${SSE41_SRCS}) + set_property(SOURCE ${SSE41_SRCS} PROPERTY COMPILE_FLAGS "${SSE41FLAG} ${NOLTOFLAG}") else() set(WITH_SSE41 OFF) endif() diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in index 05cf144b3..689e3a0c2 100644 --- a/arch/x86/Makefile.in +++ b/arch/x86/Makefile.in @@ -31,6 +31,7 @@ all: \ adler32_ssse3.o adler32_ssse3.lo \ chunkset_avx.o chunkset_avx.lo \ chunkset_sse2.o chunkset_sse2.lo \ + chunkset_sse41.o chunkset_sse41.lo \ compare256_avx2.o compare256_avx2.lo \ compare256_sse2.o compare256_sse2.lo \ insert_string_sse42.o insert_string_sse42.lo \ @@ -57,6 +58,12 @@ chunkset_sse2.o: chunkset_sse2.lo: $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c +chunkset_sse41.o: + $(CC) $(CFLAGS) $(SSE41FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse41.c + +chunkset_sse41.lo: + $(CC) $(SFLAGS) $(SSE41FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse41.c + compare256_avx2.o: $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c diff --git a/arch/x86/chunk_permute_table.h b/arch/x86/chunk_permute_table.h new file mode 100644 index 000000000..c7b2d2de7 --- /dev/null +++ b/arch/x86/chunk_permute_table.h @@ -0,0 +1,53 @@ +/* chunk_permute_table.h - shared AVX/SSE4 permutation table for use with chunkmemset family of functions. 
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef CHUNK_PERMUTE_TABLE_H_ +#define CHUNK_PERMUTE_TABLE_H_ + +#include "zbuild.h" + +/* Need entries for all numbers not an even modulus for 1, 2, 4, 8, 16 & 32 */ +static const ALIGNED_(32) uint8_t permute_table[26*32] = { + 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, /* dist 3 */ + 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, /* dist 5 */ + 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, /* dist 6 */ + 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, /* dist 7 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, /* dist 9 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, /* dist 10 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 11 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 12 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, /* dist 13 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, /* dist 14 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, /* dist 15 */ + + /* Beyond dists of 15 means we have to permute from a vector > len(m128i). Because AVX couldn't permute + * beyond 128 bit lanes until AVX512 for sub 4-byte sequences, we have to do some math here for an eventual + * blend with a comparison. That means we need to wrap the indices with yet another derived table. For simplicity, + * we'll use absolute indexing here to derive a blend vector. This is actually a lot simpler with ARM's TBL, but, + * this is what we're dealt. 
+ */ + + 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* dist 17 */ + 16, 17, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, /* dist 18 */ + 16, 17, 18, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, /* dist 19 */ + 16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, /* dist 20 */ + 16, 17, 18, 19, 20, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, /* dist 21 */ + 16, 17, 18, 19, 20, 21, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 22 */ + 16, 17, 18, 19, 20, 21, 22, 0, 1, 2, 3, 4, 5, 6, 7, 8, /* dist 23 */ + 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 24 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 0, 1, 2, 3, 4, 5, 6, /* dist 25 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 1, 2, 3, 4, 5, /* dist 26 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 1, 2, 3, 4, /* dist 27 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3, /* dist 28 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 0, 1, 2, /* dist 29 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, /* dist 30 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, /* dist 31 */ +}; + +typedef struct lut_rem_pair_s { + uint16_t idx; + uint16_t remval; +} lut_rem_pair; + +#endif diff --git a/arch/x86/chunkset_avx.c b/arch/x86/chunkset_avx.c index 6643b0468..91aaa458e 100644 --- a/arch/x86/chunkset_avx.c +++ b/arch/x86/chunkset_avx.c @@ -5,6 +5,7 @@ #ifdef X86_AVX_CHUNKSET #include +#include "chunk_permute_table.h" typedef __m256i chunk_t; @@ -13,6 +14,41 @@ typedef __m256i chunk_t; #define HAVE_CHUNKMEMSET_2 #define HAVE_CHUNKMEMSET_4 #define HAVE_CHUNKMEMSET_8 +#define HAVE_CHUNK_MAG + +/* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can + * never be 0 - 2, we'll start with an offset, subtracting 3 from the input */ +static const lut_rem_pair perm_idx_lut[29] = { + { 0, 2}, /* 3 */ + { 0, 0}, /* don't care */ + { 1 * 32, 2}, /* 5 */ + { 2 * 32, 2}, /* 6 */ + { 3 * 32, 4}, /* 7 */ + { 0 * 32, 0}, /* don't care */ + { 4 * 32, 5}, /* 9 */ + { 5 * 32, 22}, /* 10 */ + { 6 * 32, 21}, /* 11 */ + { 7 * 32, 20}, /* 12 */ + { 8 * 32, 6}, /* 13 */ + { 9 * 32, 4}, /* 14 */ + {10 * 32, 2}, /* 15 */ + { 0 * 32, 0}, /* don't care */ + {11 * 32, 15}, /* 17 */ + {11 * 32 + 16, 14}, /* 18 */ + {11 * 32 + 16 * 2, 13}, /* 19 */ + {11 * 32 + 16 * 3, 12}, /* 20 */ + {11 * 32 + 16 * 4, 11}, /* 21 */ + {11 * 32 + 16 * 5, 10}, /* 22 */ + {11 * 32 + 16 * 6, 9}, /* 23 */ + {11 * 32 + 16 * 7, 8}, /* 24 */ + {11 * 32 + 16 * 8, 7}, /* 25 */ + {11 * 32 + 16 * 9, 6}, /* 26 */ + {11 * 32 + 16 * 10, 5}, /* 27 */ + {11 * 32 + 16 * 11, 4}, /* 28 */ + {11 * 32 + 16 * 12, 3}, /* 29 */ + {11 * 32 + 16 * 13, 2}, /* 30 */ + {11 * 32 + 16 * 14, 1} /* 31 */ +}; static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { int16_t tmp; @@ -40,6 +76,50 @@ static inline void storechunk(uint8_t *out, chunk_t *chunk) { _mm256_storeu_si256((__m256i *)out, *chunk); } +static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) { + lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; + __m256i ret_vec; + /* While technically we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is + * compiling this to a shared load for all branches, preferring the simpler code. 
Given that the buf value isn't in + * GPRs to begin with the 256 bit load is _probably_ just as inexpensive */ + *chunk_rem = lut_rem.remval; + +#ifdef Z_MEMORY_SANITIZER + /* See note in chunkset_sse4.c for why this is ok */ + __msan_unpoison(buf + dist, 32 - dist); +#endif + + if (dist < 16) { + /* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after + * broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate + * shuffles and combining the halves later */ + const __m256i permute_xform = + _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); + __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx)); + __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf); + perm_vec = _mm256_add_epi8(perm_vec, permute_xform); + ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1); + ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec); + } else if (dist == 16) { + __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf); + return _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1); + } else { + __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf); + __m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16)); + /* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */ + __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx)); + __m128i xlane_permutes = _mm_cmpgt_epi8(_mm_set1_epi8(16), perm_vec1); + __m128i xlane_res = _mm_shuffle_epi8(ret_vec0, perm_vec1); + /* Since we can't wrap twice, we can simply keep the later half exactly how it is instead of having to _also_ + * shuffle those values */ + __m128i latter_half = _mm_blendv_epi8(ret_vec1, xlane_res, xlane_permutes); + ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1); + } + + return ret_vec; +} + #define CHUNKSIZE chunksize_avx #define CHUNKCOPY chunkcopy_avx #define CHUNKCOPY_SAFE chunkcopy_safe_avx diff --git a/arch/x86/chunkset_sse41.c b/arch/x86/chunkset_sse41.c new file mode 100644 index 000000000..c6f982183 --- /dev/null +++ b/arch/x86/chunkset_sse41.c @@ -0,0 +1,98 @@ +/* chunkset_sse41.c -- SSE4 inline functions to copy small data chunks. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" + +/* This requires SSE2 support. 
While it's implicit with SSE4, we can minimize + * code size by sharing the chunkcopy functions, which will certainly compile + * to identical machine code */ +#if defined(X86_SSE41) && defined(X86_SSE2) +#include +#include "chunk_permute_table.h" + +typedef __m128i chunk_t; + +#define CHUNK_SIZE 16 + +#define HAVE_CHUNKMEMSET_2 +#define HAVE_CHUNKMEMSET_4 +#define HAVE_CHUNKMEMSET_8 +#define HAVE_CHUNK_MAG +#define HAVE_CHUNKCOPY +#define HAVE_CHUNKUNROLL + +static const lut_rem_pair perm_idx_lut[13] = { + {0, 1}, /* 3 */ + {0, 0}, /* don't care */ + {1 * 32, 1}, /* 5 */ + {2 * 32, 4}, /* 6 */ + {3 * 32, 2}, /* 7 */ + {0 * 32, 0}, /* don't care */ + {4 * 32, 7}, /* 9 */ + {5 * 32, 6}, /* 10 */ + {6 * 32, 5}, /* 11 */ + {7 * 32, 4}, /* 12 */ + {8 * 32, 3}, /* 13 */ + {9 * 32, 2}, /* 14 */ + {10 * 32, 1},/* 15 */ +}; + + +static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { + int16_t tmp; + zmemcpy_2(&tmp, from); + *chunk = _mm_set1_epi16(tmp); +} + +static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { + int32_t tmp; + zmemcpy_4(&tmp, from); + *chunk = _mm_set1_epi32(tmp); +} + +static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { + int64_t tmp; + zmemcpy_8(&tmp, from); + *chunk = _mm_set1_epi64x(tmp); +} + +static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { + *chunk = _mm_loadu_si128((__m128i *)s); +} + +static inline void storechunk(uint8_t *out, chunk_t *chunk) { + _mm_storeu_si128((__m128i *)out, *chunk); +} + +static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) { + lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; + __m128i perm_vec, ret_vec; +#ifdef Z_MEMORY_SANITIZER + /* Important to note: + * This is _not_ to subvert the memory sanitizer but to instead unpoison some + * bytes we willingly and purposefully load unitialized that we swizzle over + * in a vector register, anyway. If what we assume is wrong about what is used, + * the memory sanitizer will still usefully flag it */ + __msan_unpoison(buf + dist, 16 - dist); +#endif + ret_vec = _mm_loadu_si128((__m128i*)buf); + *chunk_rem = lut_rem.remval; + + perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx)); + ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec); + + return ret_vec; +} + +extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len); + +#define CHUNKSIZE chunksize_sse41 +#define CHUNKMEMSET chunkmemset_sse41 +#define CHUNKMEMSET_SAFE chunkmemset_safe_sse41 +#define CHUNKCOPY(a, b, c) chunkcopy_sse2(a, b, c) +#define CHUNKUNROLL(a, b, c) chunkunroll_sse2(a, b, c) + +#include "chunkset_tpl.h" + +#endif diff --git a/chunkset_tpl.h b/chunkset_tpl.h index 25ee95596..f70ef42cd 100644 --- a/chunkset_tpl.h +++ b/chunkset_tpl.h @@ -5,6 +5,10 @@ #include "zbuild.h" #include +#if CHUNK_SIZE == 32 && defined(X86_SSE41) && defined(X86_SSE2) +extern uint8_t* chunkmemset_sse41(uint8_t *out, unsigned dist, unsigned len); +#endif + /* Returns the chunk size */ Z_INTERNAL uint32_t CHUNKSIZE(void) { return sizeof(chunk_t); @@ -20,6 +24,7 @@ Z_INTERNAL uint32_t CHUNKSIZE(void) { (chunk_t bytes or fewer) will fall straight through the loop without iteration, which will hopefully make the branch prediction more reliable. 
*/ +#ifndef HAVE_CHUNKCOPY Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { Assert(len > 0, "chunkcopy should never have a length 0"); chunk_t chunk; @@ -38,6 +43,7 @@ Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { } return out; } +#endif /* Perform short copies until distance can be rewritten as being at least sizeof chunk_t. @@ -47,6 +53,7 @@ Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { This assumption holds because inflate_fast() starts every iteration with at least 258 bytes of output space available (258 being the maximum length output from a single token; see inflate_fast()'s assumptions below). */ +#ifndef HAVE_CHUNKUNROLL Z_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) { unsigned char const *from = out - *dist; chunk_t chunk; @@ -59,6 +66,30 @@ Z_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) { } return out; } +#endif + +#ifndef HAVE_CHUNK_MAG +/* Loads a magazine to feed into memory of the pattern */ +static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) { + /* This code takes string of length dist from "from" and repeats + * it for as many times as can fit in a chunk_t (vector register) */ + uint32_t cpy_dist; + uint32_t bytes_remaining = sizeof(chunk_t); + chunk_t chunk_load; + uint8_t *cur_chunk = (uint8_t *)&chunk_load; + while (bytes_remaining) { + cpy_dist = MIN(dist, bytes_remaining); + memcpy(cur_chunk, buf, cpy_dist); + bytes_remaining -= cpy_dist; + cur_chunk += cpy_dist; + /* This allows us to bypass an expensive integer division since we're effectively + * counting in this loop, anyway */ + *chunk_rem = cpy_dist; + } + + return chunk_load; +} +#endif /* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST. Return OUT + LEN. */ @@ -66,6 +97,12 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) { /* Debug performance related issues when len < sizeof(uint64_t): Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */ Assert(dist > 0, "chunkmemset cannot have a distance 0"); + /* Only AVX2 */ +#if CHUNK_SIZE == 32 && defined(X86_SSE41) && defined(X86_SSE2) + if (len <= 16) { + return chunkmemset_sse41(out, dist, len); + } +#endif uint8_t *from = out - dist; @@ -98,22 +135,7 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) { } else #endif { - /* This code takes string of length dist from "from" and repeats - * it for as many times as can fit in a chunk_t (vector register) */ - uint32_t cpy_dist; - uint32_t bytes_remaining = sizeof(chunk_t); - uint8_t *cur_chunk = (uint8_t *)&chunk_load; - while (bytes_remaining) { - cpy_dist = MIN(dist, bytes_remaining); - memcpy(cur_chunk, from, cpy_dist); - bytes_remaining -= cpy_dist; - cur_chunk += cpy_dist; - /* This allows us to bypass an expensive integer division since we're effectively - * counting in this loop, anyway. 
However, we may have to derive a similarly - * sensible solution for if we use a permutation table that allows us to construct - * this vector in one load and one permute instruction */ - chunk_mod = cpy_dist; - } + chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist); } /* If we're lucky enough and dist happens to be an even modulus of our vector length, diff --git a/configure b/configure index ff657e698..6f79825a7 100755 --- a/configure +++ b/configure @@ -1487,6 +1487,9 @@ case "${ARCH}" in if test ${HAVE_SSE41_INTRIN} -eq 1; then CFLAGS="${CFLAGS} -DX86_SSE41" SFLAGS="${SFLAGS} -DX86_SSE41" + + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} chunkset_sse41.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} chunkset_sse41.lo" fi check_sse42_intrinsics diff --git a/cpu_features.h b/cpu_features.h index 861ae0c4d..d3df33b9f 100644 --- a/cpu_features.h +++ b/cpu_features.h @@ -71,6 +71,10 @@ extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len); extern uint8_t* chunkmemset_sse2(uint8_t *out, unsigned dist, unsigned len); extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left); #endif +#ifdef X86_SSE41 +extern uint8_t* chunkmemset_sse41(uint8_t *out, unsigned dist, unsigned len); +extern uint8_t* chunkmemset_safe_sse41(uint8_t *out, unsigned dist, unsigned len, unsigned left); +#endif #ifdef X86_AVX_CHUNKSET extern uint32_t chunksize_avx(void); extern uint8_t* chunkcopy_avx(uint8_t *out, uint8_t const *from, unsigned len); diff --git a/functable.c b/functable.c index 64992bc7b..960c51f1b 100644 --- a/functable.c +++ b/functable.c @@ -331,6 +331,10 @@ Z_INTERNAL uint8_t* chunkmemset_stub(uint8_t *out, unsigned dist, unsigned len) # endif functable.chunkmemset = &chunkmemset_sse2; #endif +#if defined(X86_SSE41) && defined(X86_SSE2) + if (x86_cpu_has_sse41) + functable.chunkmemset = &chunkmemset_sse41; +#endif #ifdef X86_AVX_CHUNKSET if (x86_cpu_has_avx2) functable.chunkmemset = &chunkmemset_avx; @@ -358,6 +362,10 @@ Z_INTERNAL uint8_t* chunkmemset_safe_stub(uint8_t *out, unsigned dist, unsigned # endif functable.chunkmemset_safe = &chunkmemset_safe_sse2; #endif +#if defined(X86_SSE41) && defined(X86_SSE2) + if (x86_cpu_has_sse41) + functable.chunkmemset_safe = &chunkmemset_safe_sse41; +#endif #ifdef X86_AVX_CHUNKSET if (x86_cpu_has_avx2) functable.chunkmemset_safe = &chunkmemset_safe_avx; diff --git a/inflate_p.h b/inflate_p.h index a0b6aa8fb..65bbd4448 100644 --- a/inflate_p.h +++ b/inflate_p.h @@ -5,6 +5,8 @@ #ifndef INFLATE_P_H #define INFLATE_P_H +#include + /* Architecture-specific hooks. */ #ifdef S390_DFLTCC_INFLATE # include "arch/s390/dfltcc_inflate.h" @@ -145,8 +147,8 @@ static inline uint8_t* chunkcopy_safe(uint8_t *out, uint8_t *from, size_t len, u * we have to get a bit clever. First if the overlap is such that src falls between dst and dst+len, we can do the * initial bulk memcpy of the nonoverlapping region. Then, we can leverage the size of this to determine the safest * atomic memcpy size we can pick such that we have non-overlapping regions. This effectively becomes a safe look - * behind or lookahead distance */ - size_t non_olap_size = ((from > out) ? from - out : out - from); + * behind or lookahead distance. */ + size_t non_olap_size = llabs(from - out); // llabs vs labs for compatibility with windows memcpy(out, from, non_olap_size); out += non_olap_size;
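
Illustrative sketch (not part of the patch): the permutation-table idea can be exercised in a tiny standalone C program. The scalar_chunk_mag() helper below mirrors the generic GET_CHUNK_MAG fallback loop added to chunkset_tpl.h, and the SSE4.1 path shows how one unaligned load plus one pshufb with the corresponding permute_table row produces the same replicated chunk. The file name, helper name, and demo buffer are invented for illustration only; the real code reads from the inflate window, covers every distance, and additionally handles the AVX2 cross-lane cases shown above.

/* chunk_mag_demo.c -- hypothetical standalone demo; build with: gcc -O2 -msse4.1 chunk_mag_demo.c */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <smmintrin.h>   /* SSE4.1 header; also pulls in SSSE3's _mm_shuffle_epi8 */

#define CHUNK_SIZE 16

/* Same row layout as arch/x86/chunk_permute_table.h, shown only for dist 3:
 * indices 0,1,2 repeated across the 16-byte register. */
static const uint8_t perm_dist3[16] = {
    0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0
};

/* Scalar reference: the fallback GET_CHUNK_MAG loop from chunkset_tpl.h. */
static void scalar_chunk_mag(const uint8_t *buf, uint8_t *chunk,
                             uint32_t *chunk_rem, uint32_t dist) {
    uint32_t bytes_remaining = CHUNK_SIZE;
    uint8_t *cur = chunk;
    while (bytes_remaining) {
        uint32_t cpy_dist = dist < bytes_remaining ? dist : bytes_remaining;
        memcpy(cur, buf, cpy_dist);
        bytes_remaining -= cpy_dist;
        cur += cpy_dist;
        /* The last (possibly partial) copy length is the remainder, so no
         * integer division by dist is ever needed. */
        *chunk_rem = cpy_dist;
    }
}

int main(void) {
    /* 16 readable bytes so the unaligned vector load below is safe; only the
     * first dist (3) bytes carry the pattern, mirroring how the real code
     * over-reads the window and unpoisons the tail for MSan. */
    uint8_t pattern[CHUNK_SIZE] = { 'a', 'b', 'c' };
    uint8_t ref[CHUNK_SIZE], vec_out[CHUNK_SIZE];
    uint32_t rem = 0;

    scalar_chunk_mag(pattern, ref, &rem, 3);

    /* SSE4.1 path: one load and one byte shuffle replace the whole loop. */
    __m128i src  = _mm_loadu_si128((const __m128i *)pattern);
    __m128i perm = _mm_loadu_si128((const __m128i *)perm_dist3);
    __m128i vec  = _mm_shuffle_epi8(src, perm);
    _mm_storeu_si128((__m128i *)vec_out, vec);

    printf("scalar: %.16s rem=%u\n", (const char *)ref, (unsigned)rem);
    printf("pshufb: %.16s\n", (const char *)vec_out);
    printf("match : %s\n", memcmp(ref, vec_out, CHUNK_SIZE) == 0 ? "yes" : "no");
    return 0;
}

Running it should print two identical "abcabcabcabcabca" chunks and rem=1, which lines up with the {0, 1} entry for dist 3 in the SSE4.1 perm_idx_lut above.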