Make an AVX512 inflate fast with low cost masked writes

author Adam Stylinski <kungfujesus06@gmail.com>

Wed, 25 Sep 2024 21:56:36 +0000 (17:56 -0400)

committer Hans Kristian Rosbach <hk-github@circlestorm.org>

Wed, 20 Nov 2024 21:14:44 +0000 (22:14 +0100)
author Adam Stylinski <kungfujesus06@gmail.com>
Wed, 25 Sep 2024 21:56:36 +0000 (17:56 -0400)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Wed, 20 Nov 2024 21:14:44 +0000 (22:14 +0100)
diff --git a/CMakeLists.txt b/CMakeLists.txt

index fe83ceb9b63b3c1178db9bb067b039b70661d690..92dc2d4d61a37e64ed43226450a9a3c66a3f1a2e 100644 (file)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -981,8 +981,10 @@ if(WITH_OPTIM)
                  add_definitions(-DX86_AVX512)
                  list(APPEND AVX512_SRCS ${ARCHDIR}/adler32_avx512.c)
                  add_feature_info(AVX512_ADLER32 1 "Support AVX512-accelerated adler32, using \"${AVX512FLAG}\"")
-                list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS})
+                list(APPEND AVX512_SRCS ${ARCHDIR}/chunkset_avx512.c)
+                add_feature_info(AVX512_CHUNKSET 1 "Support AVX512 optimized chunkset, using \"${AVX512FLAG}\"")
                  list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_avx512_p.h)
+                list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS})
                  set_property(SOURCE ${AVX512_SRCS} PROPERTY COMPILE_FLAGS "${AVX512FLAG} ${NOLTOFLAG}")
              else()
                  set(WITH_AVX512 OFF)
diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in

index c13cd179c0cc72111c1f89ef8f22a9f8da88c288..a012e61ea7270951904e107142c4e85886b7ea38 100644 (file)
--- a/arch/x86/Makefile.in
+++ b/arch/x86/Makefile.in
@@ -8,8 +8,8 @@ SFLAGS=
  INCLUDES=
  SUFFIX=
  
-AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw
-AVX512VNNIFLAG=-mavx512vnni
+AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw -mbmi2
+AVX512VNNIFLAG=-mavx512vnni -mbmi2
  AVX2FLAG=-mavx2
  SSE2FLAG=-msse2
  SSSE3FLAG=-mssse3
@@ -31,6 +31,7 @@ all: \
         adler32_sse42.o adler32_sse42.lo \
         adler32_ssse3.o adler32_ssse3.lo \
         chunkset_avx2.o chunkset_avx2.lo \
+       chunkset_avx512.o chunkset_avx512.lo \
         chunkset_sse2.o chunkset_sse2.lo \
         chunkset_ssse3.o chunkset_ssse3.lo \
         compare256_avx2.o compare256_avx2.lo \
@@ -52,6 +53,12 @@ chunkset_avx2.o:
  chunkset_avx2.lo:
         $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c
  
+chunkset_avx512.o:
+       $(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx512.c
+
+chunkset_avx512.lo:
+       $(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx512.c
+
  chunkset_sse2.o:
         $(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
  
diff --git a/arch/x86/avx2_tables.h b/arch/x86/avx2_tables.h

new file mode 100644 (file)

index 0000000..5075999
--- /dev/null
+++ b/arch/x86/avx2_tables.h
@@ -0,0 +1,44 @@
+#ifndef _AVX2_TABLES_H
+#define _AVX2_TABLES_H
+
+#include "../generic/chunk_permute_table.h"
+
+/* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can
+ * never be 0 - 2, we'll start with an offset, subtracting 3 from the input */
+static const lut_rem_pair perm_idx_lut[29] = {
+    { 0, 2},                /* 3 */
+    { 0, 0},                /* don't care */
+    { 1 * 32, 2},           /* 5 */
+    { 2 * 32, 2},           /* 6 */
+    { 3 * 32, 4},           /* 7 */
+    { 0 * 32, 0},           /* don't care */
+    { 4 * 32, 5},           /* 9 */
+    { 5 * 32, 22},          /* 10 */
+    { 6 * 32, 21},          /* 11 */
+    { 7 * 32, 20},          /* 12 */
+    { 8 * 32, 6},           /* 13 */
+    { 9 * 32, 4},           /* 14 */
+    {10 * 32, 2},           /* 15 */
+    { 0 * 32, 0},           /* don't care */
+    {11 * 32, 15},          /* 17 */
+    {11 * 32 + 16, 14},     /* 18 */
+    {11 * 32 + 16 * 2, 13}, /* 19 */
+    {11 * 32 + 16 * 3, 12}, /* 20 */
+    {11 * 32 + 16 * 4, 11}, /* 21 */
+    {11 * 32 + 16 * 5, 10}, /* 22 */
+    {11 * 32 + 16 * 6,  9}, /* 23 */
+    {11 * 32 + 16 * 7,  8}, /* 24 */
+    {11 * 32 + 16 * 8,  7}, /* 25 */
+    {11 * 32 + 16 * 9,  6}, /* 26 */
+    {11 * 32 + 16 * 10, 5}, /* 27 */
+    {11 * 32 + 16 * 11, 4}, /* 28 */
+    {11 * 32 + 16 * 12, 3}, /* 29 */
+    {11 * 32 + 16 * 13, 2}, /* 30 */
+    {11 * 32 + 16 * 14, 1}  /* 31 */
+};
+
+static const uint16_t half_rem_vals[13] = {
+    1, 0, 1, 4, 2, 0, 7, 6, 5, 4, 3, 2, 1
+};
+
+#endif
diff --git a/arch/x86/chunkset_avx2.c b/arch/x86/chunkset_avx2.c

index 8cc17103203ec3ff87c4cba1e3d2f0acbe888ced..8f29e5b292e46713402c66aed78ebf2c8faa4067 100644 (file)
--- a/arch/x86/chunkset_avx2.c
+++ b/arch/x86/chunkset_avx2.c
@@ -4,8 +4,8 @@
  #include "zbuild.h"
  
  #ifdef X86_AVX2
+#include "avx2_tables.h"
  #include <immintrin.h>
-#include "../generic/chunk_permute_table.h"
  #include "x86_intrins.h"
  
  typedef __m256i chunk_t;
@@ -19,44 +19,6 @@ typedef __m128i halfchunk_t;
  #define HAVE_CHUNK_MAG
  #define HAVE_HALF_CHUNK
  
-/* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can
- * never be 0 - 2, we'll start with an offset, subtracting 3 from the input */
-static const lut_rem_pair perm_idx_lut[29] = {
-    { 0, 2},                /* 3 */
-    { 0, 0},                /* don't care */
-    { 1 * 32, 2},           /* 5 */
-    { 2 * 32, 2},           /* 6 */
-    { 3 * 32, 4},           /* 7 */
-    { 0 * 32, 0},           /* don't care */
-    { 4 * 32, 5},           /* 9 */
-    { 5 * 32, 22},          /* 10 */
-    { 6 * 32, 21},          /* 11 */
-    { 7 * 32, 20},          /* 12 */
-    { 8 * 32, 6},           /* 13 */
-    { 9 * 32, 4},           /* 14 */
-    {10 * 32, 2},           /* 15 */
-    { 0 * 32, 0},           /* don't care */
-    {11 * 32, 15},          /* 17 */
-    {11 * 32 + 16, 14},     /* 18 */
-    {11 * 32 + 16 * 2, 13}, /* 19 */
-    {11 * 32 + 16 * 3, 12}, /* 20 */
-    {11 * 32 + 16 * 4, 11}, /* 21 */
-    {11 * 32 + 16 * 5, 10}, /* 22 */
-    {11 * 32 + 16 * 6,  9}, /* 23 */
-    {11 * 32 + 16 * 7,  8}, /* 24 */
-    {11 * 32 + 16 * 8,  7}, /* 25 */
-    {11 * 32 + 16 * 9,  6}, /* 26 */
-    {11 * 32 + 16 * 10, 5}, /* 27 */
-    {11 * 32 + 16 * 11, 4}, /* 28 */
-    {11 * 32 + 16 * 12, 3}, /* 29 */
-    {11 * 32 + 16 * 13, 2}, /* 30 */
-    {11 * 32 + 16 * 14, 1}  /* 31 */
-};
-
-static const uint16_t half_rem_vals[13] = {
-    1, 0, 1, 4, 2, 0, 7, 6, 5, 4, 3, 2, 1
-};
-
  static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
      int16_t tmp;
      memcpy(&tmp, from, sizeof(tmp));
diff --git a/arch/x86/chunkset_avx512.c b/arch/x86/chunkset_avx512.c

new file mode 100644 (file)

index 0000000..551df02
--- /dev/null
+++ b/arch/x86/chunkset_avx512.c
@@ -0,0 +1,189 @@
+/* chunkset_avx512.c -- AVX512 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "zbuild.h"
+
+#ifdef X86_AVX512
+
+#include "avx2_tables.h"
+#include <immintrin.h>
+#include "x86_intrins.h"
+
+typedef __m256i chunk_t;
+typedef __m128i halfchunk_t;
+typedef __mmask32 mask_t;
+typedef __mmask16 halfmask_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNKMEMSET_16
+#define HAVE_CHUNKMEMSET_1
+#define HAVE_CHUNK_MAG
+#define HAVE_HALF_CHUNK
+#define HAVE_MASKED_READWRITE
+#define HAVE_CHUNKCOPY
+#define HAVE_HALFCHUNKCOPY
+
+static inline halfmask_t gen_half_mask(unsigned len) {
+   return (halfmask_t)_bzhi_u32(0xFFFF, len);
+}
+
+static inline mask_t gen_mask(unsigned len) {
+   return (mask_t)_bzhi_u32(0xFFFFFFFF, len);
+}
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    int16_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm256_set1_epi16(tmp);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    int32_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm256_set1_epi32(tmp);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    int64_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm256_set1_epi64x(tmp);
+}
+
+static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
+    *chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)from));
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = _mm256_loadu_si256((__m256i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    _mm256_storeu_si256((__m256i *)out, *chunk);
+}
+
+static inline void storechunk_mask(uint8_t *out, mask_t mask, chunk_t *chunk) {
+    _mm256_mask_storeu_epi8(out, mask, *chunk);
+}
+
+static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
+    Assert(len > 0, "chunkcopy should never have a length 0");
+
+    unsigned rem = len % sizeof(chunk_t);
+    mask_t rem_mask = gen_mask(rem);
+
+    /* Since this is only ever called if dist >= a chunk, we don't need a masked load */
+    chunk_t chunk;
+    loadchunk(from, &chunk);
+    _mm256_mask_storeu_epi8(out, rem_mask, chunk);
+    out += rem;
+    from += rem;
+    len -= rem;
+
+    while (len > 0) {
+        loadchunk(from, &chunk);
+        storechunk(out, &chunk);
+        out += sizeof(chunk_t);
+        from += sizeof(chunk_t);
+        len -= sizeof(chunk_t);
+    }
+
+    return out;
+}
+
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+    __m256i ret_vec;
+    *chunk_rem = lut_rem.remval;
+
+    /* See the AVX2 implementation for more detailed comments. This is that + some masked
+     * loads to avoid an out of bounds read on the heap */
+
+    if (dist < 16) {
+        const __m256i permute_xform =
+            _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
+        __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
+        halfmask_t load_mask = gen_half_mask(dist);
+        __m128i ret_vec0 = _mm_maskz_loadu_epi8(load_mask, buf);
+        perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
+        ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
+        ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
+    }  else {
+        halfmask_t load_mask = gen_half_mask(dist - 16);
+        __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+        __m128i ret_vec1 = _mm_maskz_loadu_epi8(load_mask, (__m128i*)(buf + 16));
+        __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+        halfmask_t xlane_mask = _mm_cmp_epi8_mask(perm_vec1, _mm_set1_epi8(15), _MM_CMPINT_LE);
+        __m128i latter_half = _mm_mask_shuffle_epi8(ret_vec1, xlane_mask, ret_vec0, perm_vec1);
+        ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
+    }
+
+    return ret_vec;
+}
+
+static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) {
+    *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
+    _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+static inline chunk_t halfchunk2whole(halfchunk_t *chunk) {
+    /* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
+     * unlikely to be actually written or read from */
+    return _mm256_zextsi128_si256(*chunk);
+}
+
+static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+    __m128i perm_vec, ret_vec;
+    halfmask_t load_mask = gen_half_mask(dist);
+    ret_vec = _mm_maskz_loadu_epi8(load_mask, buf);
+    *chunk_rem = half_rem_vals[dist - 3];
+
+    perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+    ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
+
+    return ret_vec;
+}
+
+static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
+    Assert(len > 0, "chunkcopy should never have a length 0");
+
+    unsigned rem = len % sizeof(halfchunk_t);
+    halfmask_t rem_mask = gen_half_mask(rem);
+
+    /* Since this is only ever called if dist >= a chunk, we don't need a masked load */
+    halfchunk_t chunk;
+    loadhalfchunk(from, &chunk);
+    _mm_mask_storeu_epi8(out, rem_mask, chunk);
+    out += rem;
+    from += rem;
+    len -= rem;
+
+    while (len > 0) {
+        loadhalfchunk(from, &chunk);
+        storehalfchunk(out, &chunk);
+        out += sizeof(halfchunk_t);
+        from += sizeof(halfchunk_t);
+        len -= sizeof(halfchunk_t);
+    }
+
+    return out;
+}
+
+#define CHUNKSIZE        chunksize_avx512
+#define CHUNKUNROLL      chunkunroll_avx512
+#define CHUNKMEMSET      chunkmemset_avx512
+#define CHUNKMEMSET_SAFE chunkmemset_safe_avx512
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST     inflate_fast_avx512
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/arch/x86/x86_features.c b/arch/x86/x86_features.c

index 58cb4df341f2a5b45fce0ab52efc01610f4d84df..9491a00730831fc8658cfb1afe5bde254c631398 100644 (file)
--- a/arch/x86/x86_features.c
+++ b/arch/x86/x86_features.c
@@ -97,6 +97,8 @@ void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
              features->has_avx2 = ebx & 0x20;
          }
  
+        features->has_bmi2 = ebx & 0x8;
+
          // check AVX512 bits if the OS supports saving ZMM registers
          if (features->has_os_save_zmm) {
              features->has_avx512f = ebx & 0x00010000;
@@ -108,7 +110,7 @@ void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
                  features->has_avx512vl = ebx & 0x80000000;
              }
              features->has_avx512_common = features->has_avx512f && features->has_avx512dq && features->has_avx512bw \
-              && features->has_avx512vl;
+              && features->has_avx512vl && features->has_bmi2;
              features->has_avx512vnni = ecx & 0x800;
          }
      }
diff --git a/arch/x86/x86_features.h b/arch/x86/x86_features.h

index 6daa5e38282ffc5b318e2381ffe139aa3ee50ecd..3901ad75becc2b928827010ad9c55e1ccaf3d7ff 100644 (file)
--- a/arch/x86/x86_features.h
+++ b/arch/x86/x86_features.h
@@ -14,6 +14,7 @@ struct x86_cpu_features {
      int has_avx512vl;
      int has_avx512_common; // Enabled when AVX512(F,DQ,BW,VL) are all enabled.
      int has_avx512vnni;
+    int has_bmi2;
      int has_sse2;
      int has_ssse3;
      int has_sse42;
diff --git a/arch/x86/x86_functions.h b/arch/x86/x86_functions.h

index 5f8fcf63fce116d14784c0b228764641d30305fe..fc62daeae15af6f1ef44b0d6731f8f7296b090b4 100644 (file)
--- a/arch/x86/x86_functions.h
+++ b/arch/x86/x86_functions.h
@@ -46,6 +46,9 @@ uint8_t* chunkmemset_safe_avx2(uint8_t *out, uint8_t *from, unsigned len, unsign
  #ifdef X86_AVX512
  uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
  uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint32_t chunksize_avx512(void);
+uint8_t* chunkmemset_safe_avx512(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+void inflate_fast_avx512(PREFIX3(stream)* strm, uint32_t start);
  #endif
  #ifdef X86_AVX512VNNI
  uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
@@ -146,6 +149,12 @@ uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
  #    define native_adler32 adler32_avx512
  #    undef native_adler32_fold_copy
  #    define native_adler32_fold_copy adler32_fold_copy_avx512
+#    undef native_chunkmemset_safe
+#    define native_chunkmemset_safe chunkmemset_safe_avx512
+#    undef native_chunksize
+#    define native_chunksize chunksize_avx512
+#    undef native_inflate_fast
+#    define native_inflate_fast inflate_fast_avx512
  // X86 - AVX512 (VNNI)
  #    if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__)
  #      undef native_adler32
diff --git a/chunkset_tpl.h b/chunkset_tpl.h

index fc9f755e753f494a42de11b610df1d971b906bc1..5af1fbe8aa12907aada08a17625fb23706ae0922 100644 (file)
--- a/chunkset_tpl.h
+++ b/chunkset_tpl.h
@@ -4,7 +4,6 @@
  
  #include "zbuild.h"
  #include <stdlib.h>
-#include <stdio.h>
  
  /* Returns the chunk size */
  Z_INTERNAL uint32_t CHUNKSIZE(void) {
@@ -88,7 +87,7 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
  }
  #endif
  
-#ifdef HAVE_HALF_CHUNK
+#if defined(HAVE_HALF_CHUNK) && !defined(HAVE_HALFCHUNKCOPY)
  static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
      halfchunk_t chunk;
      int32_t align = ((len - 1) % sizeof(halfchunk_t)) + 1;
@@ -126,6 +125,15 @@ static inline uint8_t* CHUNKMEMSET(uint8_t *out, uint8_t *from, unsigned len) {
       * always needed to be handled here or if we're just now seeing it because we are
       * dispatching to this function, more */
      if (sdist < 0 && dist < len) {
+#ifdef HAVE_MASKED_READWRITE
+        /* We can still handle this case if we can mitigate over writing _and_ we
+         * fit the entirety of the copy length with one load */
+        if (len <= sizeof(chunk_t)) {
+            /* Tempting to add a goto to the block below but hopefully most compilers
+             * collapse these identical code segments as one label to jump to */
+            return CHUNKCOPY(out, from, len);
+        }
+#endif
          /* Here the memmove semantics match perfectly, as when this happens we are
           * effectively sliding down the contents of memory by dist bytes */
          memmove(out, from, len);
@@ -139,7 +147,7 @@ static inline uint8_t* CHUNKMEMSET(uint8_t *out, uint8_t *from, unsigned len) {
          return CHUNKCOPY(out, from, len);
      }
  
-    /* Only AVX2 as there's 128 bit vectors and 256 bit. We allow for shorter vector
+    /* Only AVX2+ as there's 128 bit vectors and 256 bit. We allow for shorter vector
       * lengths because they serve to allow more cases to fall into chunkcopy, as the
       * distance of the shorter length is still deemed a safe distance. We rewrite this
       * here rather than calling the ssse3 variant directly now because doing so required
@@ -154,11 +162,10 @@ static inline uint8_t* CHUNKMEMSET(uint8_t *out, uint8_t *from, unsigned len) {
          if ((dist % 2) != 0 || dist == 6) {
              halfchunk_t halfchunk_load = GET_HALFCHUNK_MAG(from, &chunk_mod, (unsigned)dist);
  
-            adv_amount = sizeof(halfchunk_t) - chunk_mod;
              if (len == sizeof(halfchunk_t)) {
                  storehalfchunk(out, &halfchunk_load);
-                len -= adv_amount;
-                out += adv_amount;
+                len -= sizeof(halfchunk_t);
+                out += sizeof(halfchunk_t);
              }
  
              chunk_load = halfchunk2whole(&halfchunk_load);
@@ -212,7 +219,11 @@ static inline uint8_t* CHUNKMEMSET(uint8_t *out, uint8_t *from, unsigned len) {
  rem_bytes:
  #endif
      if (len) {
+#ifndef HAVE_MASKED_READWRITE
          memcpy(out, &chunk_load, len);
+#else
+        storechunk_mask(out, gen_mask(len), &chunk_load);
+#endif
          out += len;
      }
  
@@ -237,6 +248,8 @@ Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, uint8_t *from, unsigned len,
          --left;
      }
  #endif
+
+#ifndef HAVE_MASKED_READWRITE
      if (UNLIKELY(left < sizeof(chunk_t))) {
          while (len > 0) {
              *out++ = *from++;
@@ -245,6 +258,7 @@ Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, uint8_t *from, unsigned len,
  
          return out;
      }
+#endif
  
      if (len)
          out = CHUNKMEMSET(out, from, len);
@@ -252,14 +266,15 @@ Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, uint8_t *from, unsigned len,
      return out;
  }
  
-static inline uint8_t *CHUNKCOPY_SAFE(uint8_t *out, uint8_t *from, unsigned len, uint8_t *safe)
+static inline uint8_t *CHUNKCOPY_SAFE(uint8_t *out, uint8_t *from, uint64_t len, uint8_t *safe)
  {
      if (out == from)
          return out + len;
  
      uint64_t safelen = (safe - out);
-    len = MIN(len, (unsigned)safelen);
+    len = MIN(len, safelen);
  
+#ifndef HAVE_MASKED_READWRITE
      uint64_t from_dist = (uint64_t)llabs(safe - from);
      if (UNLIKELY(from_dist < sizeof(chunk_t) || safelen < sizeof(chunk_t))) {
          while (len--) {
@@ -268,6 +283,7 @@ static inline uint8_t *CHUNKCOPY_SAFE(uint8_t *out, uint8_t *from, unsigned len,
  
          return out;
      }
+#endif
  
-    return CHUNKMEMSET(out, from, len);
+    return CHUNKMEMSET(out, from, (unsigned)len);
  }
diff --git a/cmake/detect-intrinsics.cmake b/cmake/detect-intrinsics.cmake

index 1906f21547b5c439bb3915ee6806c74a8aa9d622..b8eabe8e2198887a3234dfe8f280c753b43531f4 100644 (file)
--- a/cmake/detect-intrinsics.cmake
+++ b/cmake/detect-intrinsics.cmake
@@ -76,14 +76,14 @@ macro(check_avx512_intrinsics)
      if(NOT NATIVEFLAG)
          if(CMAKE_C_COMPILER_ID MATCHES "Intel")
              if(CMAKE_HOST_UNIX OR APPLE)
-                set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
+                set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2")
              else()
                  set(AVX512FLAG "/arch:AVX512")
              endif()
          elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
              # For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
              # instruction scheduling unless you specify a reasonable -mtune= target
-            set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
+            set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2")
              if(NOT MSVC)
                  check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
                  if(HAVE_CASCADE_LAKE)
@@ -114,12 +114,12 @@ macro(check_avx512vnni_intrinsics)
      if(NOT NATIVEFLAG)
          if(CMAKE_C_COMPILER_ID MATCHES "Intel")
              if(CMAKE_HOST_UNIX OR APPLE OR CMAKE_C_COMPILER_ID MATCHES "IntelLLVM")
-                set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni")
+                set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mbmi2")
              else()
                  set(AVX512VNNIFLAG "/arch:AVX512")
              endif()
          elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
-            set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni")
+            set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mbmi2")
              if(NOT MSVC)
                  check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
                  if(HAVE_CASCADE_LAKE)
diff --git a/configure b/configure

index 04d962e1dd535e8fd0465cf01fb4838b2da3cf36..738e5f928b02319c805e3ad6e6c9a3a4c3d4049f 100755 (executable)
--- a/configure
+++ b/configure
@@ -106,7 +106,7 @@ floatabi=
  forcesse2=0
  # For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
  # instruction scheduling unless you specify a reasonable -mtune= target
-avx512flag="-mavx512f -mavx512dq -mavx512bw -mavx512vl"
+avx512flag="-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2"
  avx512vnniflag="${avx512flag} -mavx512vnni"
  avx2flag="-mavx2"
  sse2flag="-msse2"
@@ -1589,8 +1589,8 @@ case "${ARCH}" in
              if test ${HAVE_AVX512_INTRIN} -eq 1; then
                  CFLAGS="${CFLAGS} -DX86_AVX512"
                  SFLAGS="${SFLAGS} -DX86_AVX512"
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_avx512.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512.lo"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_avx512.o chunkset_avx512.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512.lo chunkset_avx512.lo"
              fi
  
              check_mtune_cascadelake_compiler_flag
diff --git a/functable.c b/functable.c

index 832a57e78afcc3ca51c2fcf4b2ec449899d0152c..c8b11b5fa16b44b09cb7bdd8f9948b15a4c704cd 100644 (file)
--- a/functable.c
+++ b/functable.c
@@ -129,6 +129,9 @@ static void init_functable(void) {
      if (cf.x86.has_avx512_common) {
          ft.adler32 = &adler32_avx512;
          ft.adler32_fold_copy = &adler32_fold_copy_avx512;
+        ft.chunkmemset_safe = &chunkmemset_safe_avx512;
+        ft.chunksize = &chunksize_avx512;
+        ft.inflate_fast = &inflate_fast_avx512;
      }
  #endif
  #ifdef X86_AVX512VNNI
diff --git a/inffast_tpl.h b/inffast_tpl.h

index afa5e04ec4a613185991dfb5c5badb35c02f7af3..2ec865dbff6c1f38090ff30d0dce159b6f1d60f1 100644 (file)
--- a/inffast_tpl.h
+++ b/inffast_tpl.h
@@ -254,14 +254,18 @@ void Z_INTERNAL INFLATE_FAST(PREFIX3(stream) *strm, uint32_t start) {
                              out = chunkcopy_safe(out, out - dist, len, safe);
                          }
                      } else {
-                        if (!extra_safe)
-                            out = CHUNKCOPY_SAFE(out, from, len, safe);
-                        else
+#ifndef HAVE_MASKED_READWRITE
+                        if (extra_safe)
                              out = chunkcopy_safe(out, from, len, safe);
+                        else
+#endif
+                            out = CHUNKCOPY_SAFE(out, from, len, safe);
                      }
+#ifndef HAVE_MASKED_READWRITE
                  } else if (extra_safe) {
                      /* Whole reference is in range of current output. */
                          out = chunkcopy_safe(out, out - dist, len, safe);
+#endif
                  } else {
                      /* Whole reference is in range of current output.  No range checks are
                         necessary because we start with room for at least 258 bytes of output,
author	Adam Stylinski <kungfujesus06@gmail.com>
	Wed, 25 Sep 2024 21:56:36 +0000 (17:56 -0400)
committer	Hans Kristian Rosbach <hk-github@circlestorm.org>
	Wed, 20 Nov 2024 21:14:44 +0000 (22:14 +0100)
CMakeLists.txt		patch \| blob \| blame \| history
arch/x86/Makefile.in		patch \| blob \| blame \| history
arch/x86/avx2_tables.h	[new file with mode: 0644]	patch \| blob
arch/x86/chunkset_avx2.c		patch \| blob \| blame \| history
arch/x86/chunkset_avx512.c	[new file with mode: 0644]	patch \| blob
arch/x86/x86_features.c		patch \| blob \| blame \| history
arch/x86/x86_features.h		patch \| blob \| blame \| history
arch/x86/x86_functions.h		patch \| blob \| blame \| history
chunkset_tpl.h		patch \| blob \| blame \| history
cmake/detect-intrinsics.cmake		patch \| blob \| blame \| history
configure		patch \| blob \| blame \| history
functable.c		patch \| blob \| blame \| history
inffast_tpl.h		patch \| blob \| blame \| history