--- /dev/null
+/* chunk_permute_table.h - shared AVX/SSE4 permutation table for use with the chunkmemset family of functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef CHUNK_PERMUTE_TABLE_H_
+#define CHUNK_PERMUTE_TABLE_H_
+
+#include "zbuild.h"
+
+/* Need entries for every distance that is not an even divisor of the vector widths, i.e. everything except 1, 2, 4, 8, 16 & 32 */
+static const ALIGNED_(32) uint8_t permute_table[26*32] = {
+ 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, /* dist 3 */
+ 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, /* dist 5 */
+ 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, /* dist 6 */
+ 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, /* dist 7 */
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, /* dist 9 */
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, /* dist 10 */
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 11 */
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 12 */
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, /* dist 13 */
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, /* dist 14 */
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, /* dist 15 */
+
+ /* Distances beyond 15 mean we have to permute from a vector wider than a single __m128i. Because AVX2 cannot
+ * permute sub 4-byte elements across its 128-bit lanes (that only arrived with AVX512), we have to do some math
+ * here for an eventual blend with a comparison. That means we need to wrap the indices with yet another derived
+ * table. For simplicity, we'll use absolute indexing here to derive a blend vector. This is actually a lot
+ * simpler with ARM's TBL, but it is the hand we're dealt.
+ */
+
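+ /* Worked example, dist 17: output bytes 16..31 must be buf[16], buf[0], buf[1], ..., buf[14], so the row below
+ * is {16, 0, 1, ..., 14}. Entries below 16 get shuffled out of the first 16-byte load, while entries of 16 and
+ * above are resolved by a compare + blend against the second 16-byte load (see the AVX2 GET_CHUNK_MAG). */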
+ 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* dist 17 */
+ 16, 17, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, /* dist 18 */
+ 16, 17, 18, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, /* dist 19 */
+ 16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, /* dist 20 */
+ 16, 17, 18, 19, 20, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, /* dist 21 */
+ 16, 17, 18, 19, 20, 21, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 22 */
+ 16, 17, 18, 19, 20, 21, 22, 0, 1, 2, 3, 4, 5, 6, 7, 8, /* dist 23 */
+ 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 24 */
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 0, 1, 2, 3, 4, 5, 6, /* dist 25 */
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 1, 2, 3, 4, 5, /* dist 26 */
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 1, 2, 3, 4, /* dist 27 */
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3, /* dist 28 */
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 0, 1, 2, /* dist 29 */
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, /* dist 30 */
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, /* dist 31 */
+};
+
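+/* Layout note: the 32-byte rows above cover dists 3 through 15 (skipping 4 and 8), and the fifteen packed 16-byte
+ * rows for dists 17 through 31 start at byte offset 11 * 32. Each consumer records those byte offsets, along with
+ * the pattern remainder for its chunk width, in a small table of the pair type below. */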
+typedef struct lut_rem_pair_s {
+ uint16_t idx;
+ uint16_t remval;
+} lut_rem_pair;
+
+#endif
#ifdef X86_AVX_CHUNKSET
#include <immintrin.h>
+#include "chunk_permute_table.h"
typedef __m256i chunk_t;
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNK_MAG
+
+/* Populate don't-cares so that this is a direct lookup (with some indirection into the permute table). Since dist
+ * can never be 0..2, we start at an offset and subtract 3 from the input */
+static const lut_rem_pair perm_idx_lut[29] = {
+ { 0, 2}, /* 3 */
+ { 0, 0}, /* don't care */
+ { 1 * 32, 2}, /* 5 */
+ { 2 * 32, 2}, /* 6 */
+ { 3 * 32, 4}, /* 7 */
+ { 0 * 32, 0}, /* don't care */
+ { 4 * 32, 5}, /* 9 */
+ { 5 * 32, 22}, /* 10 */
+ { 6 * 32, 21}, /* 11 */
+ { 7 * 32, 20}, /* 12 */
+ { 8 * 32, 6}, /* 13 */
+ { 9 * 32, 4}, /* 14 */
+ {10 * 32, 2}, /* 15 */
+ { 0 * 32, 0}, /* don't care */
+ {11 * 32, 15}, /* 17 */
+ {11 * 32 + 16, 14}, /* 18 */
+ {11 * 32 + 16 * 2, 13}, /* 19 */
+ {11 * 32 + 16 * 3, 12}, /* 20 */
+ {11 * 32 + 16 * 4, 11}, /* 21 */
+ {11 * 32 + 16 * 5, 10}, /* 22 */
+ {11 * 32 + 16 * 6, 9}, /* 23 */
+ {11 * 32 + 16 * 7, 8}, /* 24 */
+ {11 * 32 + 16 * 8, 7}, /* 25 */
+ {11 * 32 + 16 * 9, 6}, /* 26 */
+ {11 * 32 + 16 * 10, 5}, /* 27 */
+ {11 * 32 + 16 * 11, 4}, /* 28 */
+ {11 * 32 + 16 * 12, 3}, /* 29 */
+ {11 * 32 + 16 * 13, 2}, /* 30 */
+ {11 * 32 + 16 * 14, 1} /* 31 */
+};
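+
+/* Example lookup: dist 19 hits perm_idx_lut[19 - 3] = {11 * 32 + 16 * 2, 13}, i.e. the third of the packed 16-byte
+ * rows, with 13 leftover bytes since 32 = 1 * 19 + 13. The leftover count is reported back through chunk_rem so the
+ * caller in chunkset_tpl.h knows how many trailing bytes of the chunk are only a partial copy of the pattern. */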
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t tmp;
    zmemcpy_2(&tmp, from);
    *chunk = _mm256_set1_epi16(tmp);
}
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    _mm256_storeu_si256((__m256i *)out, *chunk);
}
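+/* Replicate the first dist bytes of buf across a 32-byte chunk and report the leftover byte count through
+ * chunk_rem. Three cases: dist < 16 broadcasts the low 16 bytes to both lanes and shuffles them, dist == 16 is a
+ * plain broadcast, and dist > 16 shuffles the first 16-byte load and blends it with the second for the upper half. */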
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m256i ret_vec;
+ /* While technically we only need to read 4 or 8 bytes into this vector register for many of these cases, GCC
+ * compiles this to a shared load for all branches, preferring the simpler code. Given that the buf value isn't
+ * in GPRs to begin with, the 256-bit load is _probably_ just as inexpensive */
+ *chunk_rem = lut_rem.remval;
+
+#ifdef Z_MEMORY_SANITIZER
+ /* See note in chunkset_sse4.c for why this is ok */
+ __msan_unpoison(buf + dist, 32 - dist);
+#endif
+
+ if (dist < 16) {
+ /* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after
+ * broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate
+ * shuffles and combining the halves later */
+ const __m256i permute_xform =
+ _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
+ __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
+ __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+ perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
+ ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
+ ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
+ } else if (dist == 16) {
+ __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+ return _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
+ } else {
+ __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+ __m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16));
+ /* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */
+ __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+ __m128i xlane_permutes = _mm_cmpgt_epi8(_mm_set1_epi8(16), perm_vec1);
+ __m128i xlane_res = _mm_shuffle_epi8(ret_vec0, perm_vec1);
+ /* Since we can't wrap twice, we can simply keep the latter half exactly as it is instead of having to _also_
+ * shuffle those values */
+ __m128i latter_half = _mm_blendv_epi8(ret_vec1, xlane_res, xlane_permutes);
+ ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
+ }
+
+ return ret_vec;
+}
+
#define CHUNKSIZE chunksize_avx
#define CHUNKCOPY chunkcopy_avx
#define CHUNKCOPY_SAFE chunkcopy_safe_avx
--- /dev/null
+/* chunkset_sse41.c -- SSE4 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+
+/* This requires SSE2 support. While it's implicit with SSE4, we can minimize
+ * code size by sharing the chunkcopy functions, which will certainly compile
+ * to identical machine code */
+#if defined(X86_SSE41) && defined(X86_SSE2)
+#include <immintrin.h>
+#include "chunk_permute_table.h"
+
+typedef __m128i chunk_t;
+
+#define CHUNK_SIZE 16
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNK_MAG
+#define HAVE_CHUNKCOPY
+#define HAVE_CHUNKUNROLL
+
+static const lut_rem_pair perm_idx_lut[13] = {
+ {0, 1}, /* 3 */
+ {0, 0}, /* don't care */
+ {1 * 32, 1}, /* 5 */
+ {2 * 32, 4}, /* 6 */
+ {3 * 32, 2}, /* 7 */
+ {0 * 32, 0}, /* don't care */
+ {4 * 32, 7}, /* 9 */
+ {5 * 32, 6}, /* 10 */
+ {6 * 32, 5}, /* 11 */
+ {7 * 32, 4}, /* 12 */
+ {8 * 32, 3}, /* 13 */
+ {9 * 32, 2}, /* 14 */
+ {10 * 32, 1}, /* 15 */
+};
+
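+/* Same idea as the AVX2 table but for a 16-byte chunk: e.g. dist 6 maps to {2 * 32, 4} because its permute row
+ * starts at the third 32-byte row and 16 = 2 * 6 + 4 leaves 4 leftover pattern bytes in the register. */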
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ int16_t tmp;
+ zmemcpy_2(&tmp, from);
+ *chunk = _mm_set1_epi16(tmp);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ int32_t tmp;
+ zmemcpy_4(&tmp, from);
+ *chunk = _mm_set1_epi32(tmp);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ int64_t tmp;
+ zmemcpy_8(&tmp, from);
+ *chunk = _mm_set1_epi64x(tmp);
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
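+/* Load 16 bytes at buf and use the permute table to replicate the first dist bytes across the whole register.
+ * For dist 5 the permute row is {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0}: three full copies of the
+ * pattern plus one leftover byte, so *chunk_rem is set to 1 (the remval stored in perm_idx_lut). */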
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m128i perm_vec, ret_vec;
+#ifdef Z_MEMORY_SANITIZER
+ /* Important to note:
+ * This is _not_ to subvert the memory sanitizer but to instead unpoison some
+ * bytes we willingly and purposefully load uninitialized that we swizzle over
+ * in a vector register, anyway. If what we assume is wrong about what is used,
+ * the memory sanitizer will still usefully flag it */
+ __msan_unpoison(buf + dist, 16 - dist);
+#endif
+ ret_vec = _mm_loadu_si128((__m128i*)buf);
+ *chunk_rem = lut_rem.remval;
+
+ perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+ ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
+
+ return ret_vec;
+}
+
+extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
+
+#define CHUNKSIZE chunksize_sse41
+#define CHUNKMEMSET chunkmemset_sse41
+#define CHUNKMEMSET_SAFE chunkmemset_safe_sse41
+#define CHUNKCOPY(a, b, c) chunkcopy_sse2(a, b, c)
+#define CHUNKUNROLL(a, b, c) chunkunroll_sse2(a, b, c)
+
+#include "chunkset_tpl.h"
+
+#endif
#include "zbuild.h"
#include <stdlib.h>
+#if CHUNK_SIZE == 32 && defined(X86_SSE41) && defined(X86_SSE2)
+extern uint8_t* chunkmemset_sse41(uint8_t *out, unsigned dist, unsigned len);
+#endif
+
/* Returns the chunk size */
Z_INTERNAL uint32_t CHUNKSIZE(void) {
return sizeof(chunk_t);
(chunk_t bytes or fewer) will fall straight through the loop
without iteration, which will hopefully make the branch prediction more
reliable. */
+#ifndef HAVE_CHUNKCOPY
Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
Assert(len > 0, "chunkcopy should never have a length 0");
chunk_t chunk;
}
return out;
}
+#endif
/* Perform short copies until distance can be rewritten as being at least
sizeof chunk_t.
This assumption holds because inflate_fast() starts every iteration with at
least 258 bytes of output space available (258 being the maximum length
output from a single token; see inflate_fast()'s assumptions below). */
+#ifndef HAVE_CHUNKUNROLL
Z_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
unsigned char const *from = out - *dist;
chunk_t chunk;
}
return out;
}
+#endif
+
+#ifndef HAVE_CHUNK_MAG
+/* Loads a "magazine", i.e. a chunk_t filled with as many repetitions of the pattern as fit */
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+ /* This code takes a string of length dist from "buf" and repeats
+ * it as many times as will fit in a chunk_t (vector register) */
+ uint32_t cpy_dist;
+ uint32_t bytes_remaining = sizeof(chunk_t);
+ chunk_t chunk_load;
+ uint8_t *cur_chunk = (uint8_t *)&chunk_load;
+ while (bytes_remaining) {
+ cpy_dist = MIN(dist, bytes_remaining);
+ memcpy(cur_chunk, buf, cpy_dist);
+ bytes_remaining -= cpy_dist;
+ cur_chunk += cpy_dist;
+ /* This allows us to bypass an expensive integer division since we're effectively
+ * counting in this loop, anyway */
+ *chunk_rem = cpy_dist;
+ }
+
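+ /* At this point chunk_rem holds the size of the last (possibly partial) copy; e.g. with a 16-byte chunk_t and
+ * dist 6 the loop copies 6, 6 and then 4 bytes, so chunk_rem ends up as 4, matching the remval the SIMD
+ * variants read straight from their lookup tables. */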
+ return chunk_load;
+}
+#endif
/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST.
Return OUT + LEN. */
/* Debug performance related issues when len < sizeof(uint64_t):
Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
Assert(dist > 0, "chunkmemset cannot have a distance 0");
+ /* Only AVX2 */
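+ /* For lengths this short a 16-byte chunk already covers the whole copy, so deferring to the SSE4.1 build
+ * presumably avoids the wider 32-byte load/permute/store where it can't be amortized. */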
+#if CHUNK_SIZE == 32 && defined(X86_SSE41) && defined(X86_SSE2)
+ if (len <= 16) {
+ return chunkmemset_sse41(out, dist, len);
+ }
+#endif
uint8_t *from = out - dist;
} else
#endif
{
- /* This code takes string of length dist from "from" and repeats
- * it for as many times as can fit in a chunk_t (vector register) */
- uint32_t cpy_dist;
- uint32_t bytes_remaining = sizeof(chunk_t);
- uint8_t *cur_chunk = (uint8_t *)&chunk_load;
- while (bytes_remaining) {
- cpy_dist = MIN(dist, bytes_remaining);
- memcpy(cur_chunk, from, cpy_dist);
- bytes_remaining -= cpy_dist;
- cur_chunk += cpy_dist;
- /* This allows us to bypass an expensive integer division since we're effectively
- * counting in this loop, anyway. However, we may have to derive a similarly
- * sensible solution for if we use a permutation table that allows us to construct
- * this vector in one load and one permute instruction */
- chunk_mod = cpy_dist;
- }
+ chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist);
}
/* If we're lucky enough and dist happens to be an even modulus of our vector length,