add_definitions(-DX86_AVX512)
list(APPEND AVX512_SRCS ${ARCHDIR}/adler32_avx512.c)
add_feature_info(AVX512_ADLER32 1 "Support AVX512-accelerated adler32, using \"${AVX512FLAG}\"")
- list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS})
+ list(APPEND AVX512_SRCS ${ARCHDIR}/chunkset_avx512.c)
+ add_feature_info(AVX512_CHUNKSET 1 "Support AVX512 optimized chunkset, using \"${AVX512FLAG}\"")
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_avx512_p.h)
+ list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS})
set_property(SOURCE ${AVX512_SRCS} PROPERTY COMPILE_FLAGS "${AVX512FLAG} ${NOLTOFLAG}")
else()
set(WITH_AVX512 OFF)
INCLUDES=
SUFFIX=
-AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw
-AVX512VNNIFLAG=-mavx512vnni
+AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw -mbmi2
+AVX512VNNIFLAG=-mavx512vnni -mbmi2
AVX2FLAG=-mavx2
SSE2FLAG=-msse2
SSSE3FLAG=-mssse3
adler32_sse42.o adler32_sse42.lo \
adler32_ssse3.o adler32_ssse3.lo \
chunkset_avx2.o chunkset_avx2.lo \
+ chunkset_avx512.o chunkset_avx512.lo \
chunkset_sse2.o chunkset_sse2.lo \
chunkset_ssse3.o chunkset_ssse3.lo \
compare256_avx2.o compare256_avx2.lo \
chunkset_avx2.lo:
$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c
+chunkset_avx512.o:
+ $(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx512.c
+
+chunkset_avx512.lo:
+ $(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx512.c
+
chunkset_sse2.o:
$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
--- /dev/null
+#ifndef _AVX2_TABLES_H
+#define _AVX2_TABLES_H
+
+#include "../generic/chunk_permute_table.h"
+
+/* Populate the don't-care entries so that this is a direct lookup (with some indirection into the
+ * permute table). Because dist can never be 0 - 2, we start at an offset, subtracting 3 from the input */
+static const lut_rem_pair perm_idx_lut[29] = {
+ { 0, 2}, /* 3 */
+ { 0, 0}, /* don't care */
+ { 1 * 32, 2}, /* 5 */
+ { 2 * 32, 2}, /* 6 */
+ { 3 * 32, 4}, /* 7 */
+ { 0 * 32, 0}, /* don't care */
+ { 4 * 32, 5}, /* 9 */
+ { 5 * 32, 22}, /* 10 */
+ { 6 * 32, 21}, /* 11 */
+ { 7 * 32, 20}, /* 12 */
+ { 8 * 32, 6}, /* 13 */
+ { 9 * 32, 4}, /* 14 */
+ {10 * 32, 2}, /* 15 */
+ { 0 * 32, 0}, /* don't care */
+ {11 * 32, 15}, /* 17 */
+ {11 * 32 + 16, 14}, /* 18 */
+ {11 * 32 + 16 * 2, 13}, /* 19 */
+ {11 * 32 + 16 * 3, 12}, /* 20 */
+ {11 * 32 + 16 * 4, 11}, /* 21 */
+ {11 * 32 + 16 * 5, 10}, /* 22 */
+ {11 * 32 + 16 * 6, 9}, /* 23 */
+ {11 * 32 + 16 * 7, 8}, /* 24 */
+ {11 * 32 + 16 * 8, 7}, /* 25 */
+ {11 * 32 + 16 * 9, 6}, /* 26 */
+ {11 * 32 + 16 * 10, 5}, /* 27 */
+ {11 * 32 + 16 * 11, 4}, /* 28 */
+ {11 * 32 + 16 * 12, 3}, /* 29 */
+ {11 * 32 + 16 * 13, 2}, /* 30 */
+ {11 * 32 + 16 * 14, 1} /* 31 */
+};
+
+static const uint16_t half_rem_vals[13] = {
+ 1, 0, 1, 4, 2, 0, 7, 6, 5, 4, 3, 2, 1
+};
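+
+/* For illustration, both tables are indexed by dist - 3: a distance of 5 selects
+ * perm_idx_lut[5 - 3] == { 1 * 32, 2 }, i.e. the shuffle pattern starts at permute_table + 32
+ * and the caller's chunk_rem is set to 2. Likewise half_rem_vals[5 - 3] == 1 is the leftover
+ * when a 16-byte half chunk is filled with a 5-byte period (16 == 3 * 5 + 1). */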
+
+#endif
#include "zbuild.h"
#ifdef X86_AVX2
+#include "avx2_tables.h"
#include <immintrin.h>
-#include "../generic/chunk_permute_table.h"
#include "x86_intrins.h"
typedef __m256i chunk_t;
#define HAVE_CHUNK_MAG
#define HAVE_HALF_CHUNK
-/* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can
- * never be 0 - 2, we'll start with an offset, subtracting 3 from the input */
-static const lut_rem_pair perm_idx_lut[29] = {
- { 0, 2}, /* 3 */
- { 0, 0}, /* don't care */
- { 1 * 32, 2}, /* 5 */
- { 2 * 32, 2}, /* 6 */
- { 3 * 32, 4}, /* 7 */
- { 0 * 32, 0}, /* don't care */
- { 4 * 32, 5}, /* 9 */
- { 5 * 32, 22}, /* 10 */
- { 6 * 32, 21}, /* 11 */
- { 7 * 32, 20}, /* 12 */
- { 8 * 32, 6}, /* 13 */
- { 9 * 32, 4}, /* 14 */
- {10 * 32, 2}, /* 15 */
- { 0 * 32, 0}, /* don't care */
- {11 * 32, 15}, /* 17 */
- {11 * 32 + 16, 14}, /* 18 */
- {11 * 32 + 16 * 2, 13}, /* 19 */
- {11 * 32 + 16 * 3, 12}, /* 20 */
- {11 * 32 + 16 * 4, 11}, /* 21 */
- {11 * 32 + 16 * 5, 10}, /* 22 */
- {11 * 32 + 16 * 6, 9}, /* 23 */
- {11 * 32 + 16 * 7, 8}, /* 24 */
- {11 * 32 + 16 * 8, 7}, /* 25 */
- {11 * 32 + 16 * 9, 6}, /* 26 */
- {11 * 32 + 16 * 10, 5}, /* 27 */
- {11 * 32 + 16 * 11, 4}, /* 28 */
- {11 * 32 + 16 * 12, 3}, /* 29 */
- {11 * 32 + 16 * 13, 2}, /* 30 */
- {11 * 32 + 16 * 14, 1} /* 31 */
-};
-
-static const uint16_t half_rem_vals[13] = {
- 1, 0, 1, 4, 2, 0, 7, 6, 5, 4, 3, 2, 1
-};
-
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
int16_t tmp;
memcpy(&tmp, from, sizeof(tmp));
--- /dev/null
+/* chunkset_avx512.c -- AVX512 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "zbuild.h"
+
+#ifdef X86_AVX512
+
+#include "avx2_tables.h"
+#include <immintrin.h>
+#include "x86_intrins.h"
+
+typedef __m256i chunk_t;
+typedef __m128i halfchunk_t;
+typedef __mmask32 mask_t;
+typedef __mmask16 halfmask_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNKMEMSET_16
+#define HAVE_CHUNKMEMSET_1
+#define HAVE_CHUNK_MAG
+#define HAVE_HALF_CHUNK
+#define HAVE_MASKED_READWRITE
+#define HAVE_CHUNKCOPY
+#define HAVE_HALFCHUNKCOPY
+
+static inline halfmask_t gen_half_mask(unsigned len) {
+ return (halfmask_t)_bzhi_u32(0xFFFF, len);
+}
+
+static inline mask_t gen_mask(unsigned len) {
+ return (mask_t)_bzhi_u32(0xFFFFFFFF, len);
+}
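+
+/* For illustration: _bzhi_u32 clears every bit at position >= len, so gen_mask(6) is 0x3F and
+ * gen_mask(0) is 0. A masked store with such a mask touches only the selected bytes, e.g.
+ * _mm256_mask_storeu_epi8(out, gen_mask(6), chunk) writes out[0..5] and nothing past it. */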
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ int16_t tmp;
+ memcpy(&tmp, from, sizeof(tmp));
+ *chunk = _mm256_set1_epi16(tmp);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ int32_t tmp;
+ memcpy(&tmp, from, sizeof(tmp));
+ *chunk = _mm256_set1_epi32(tmp);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ int64_t tmp;
+ memcpy(&tmp, from, sizeof(tmp));
+ *chunk = _mm256_set1_epi64x(tmp);
+}
+
+static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)from));
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = _mm256_loadu_si256((__m256i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ _mm256_storeu_si256((__m256i *)out, *chunk);
+}
+
+static inline void storechunk_mask(uint8_t *out, mask_t mask, chunk_t *chunk) {
+ _mm256_mask_storeu_epi8(out, mask, *chunk);
+}
+
+static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
+ Assert(len > 0, "chunkcopy should never have a length 0");
+
+ unsigned rem = len % sizeof(chunk_t);
+ mask_t rem_mask = gen_mask(rem);
+
+ /* Since this is only ever called if dist >= a chunk, we don't need a masked load */
+ chunk_t chunk;
+ loadchunk(from, &chunk);
+ _mm256_mask_storeu_epi8(out, rem_mask, chunk);
+ out += rem;
+ from += rem;
+ len -= rem;
+
+ while (len > 0) {
+ loadchunk(from, &chunk);
+ storechunk(out, &chunk);
+ out += sizeof(chunk_t);
+ from += sizeof(chunk_t);
+ len -= sizeof(chunk_t);
+ }
+
+ return out;
+}
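+
+/* Worked example of the order above: with len == 70 and a 32-byte chunk_t, rem == 6, so the
+ * masked store writes bytes 0..5 first and the loop then issues two full 32-byte stores
+ * (6 + 32 + 32 == 70). Copying the remainder first keeps the trailing stores full width and
+ * writes exactly len bytes, so no masked tail store is needed. */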
+
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m256i ret_vec;
+ *chunk_rem = lut_rem.remval;
+
+    /* See the AVX2 implementation for more detailed comments. This is the same approach, plus
+     * some masked loads to avoid an out-of-bounds read on the heap */
+
+ if (dist < 16) {
+ const __m256i permute_xform =
+ _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
+ __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
+ halfmask_t load_mask = gen_half_mask(dist);
+ __m128i ret_vec0 = _mm_maskz_loadu_epi8(load_mask, buf);
+ perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
+ ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
+ ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
+ } else {
+ halfmask_t load_mask = gen_half_mask(dist - 16);
+ __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+ __m128i ret_vec1 = _mm_maskz_loadu_epi8(load_mask, (__m128i*)(buf + 16));
+ __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+ halfmask_t xlane_mask = _mm_cmp_epi8_mask(perm_vec1, _mm_set1_epi8(15), _MM_CMPINT_LE);
+ __m128i latter_half = _mm_mask_shuffle_epi8(ret_vec1, xlane_mask, ret_vec0, perm_vec1);
+ ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
+ }
+
+ return ret_vec;
+}
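+
+/* Example of the dist >= 16 path, assuming the usual layout of the shared permute_table: with
+ * dist == 20, ret_vec0 holds pattern bytes 0..15 and the masked load picks up bytes 16..19.
+ * The masked shuffle fills the remaining upper-lane bytes from ret_vec0, so the returned chunk
+ * holds bytes 0..19 followed by bytes 0..11, i.e. one full 20-byte period plus the 12 leftover
+ * bytes recorded as the remval for dist 20 in perm_idx_lut. */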
+
+static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) {
+ *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
+ _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+static inline chunk_t halfchunk2whole(halfchunk_t *chunk) {
+    /* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
+     * unlikely to actually be written to or read from */
+ return _mm256_zextsi128_si256(*chunk);
+}
+
+static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m128i perm_vec, ret_vec;
+ halfmask_t load_mask = gen_half_mask(dist);
+ ret_vec = _mm_maskz_loadu_epi8(load_mask, buf);
+ *chunk_rem = half_rem_vals[dist - 3];
+
+ perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+ ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
+
+ return ret_vec;
+}
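+
+/* For example, with dist == 5 the masked load reads the five pattern bytes and the shuffle
+ * (assuming the shared permute_table repeats the period across the lane) replicates them as
+ * 0,1,2,3,4,0,1,2,3,4,0,1,2,3,4,0, while chunk_rem is set to half_rem_vals[2] == 1. */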
+
+static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
+    Assert(len > 0, "halfchunkcopy should never have a length 0");
+
+ unsigned rem = len % sizeof(halfchunk_t);
+ halfmask_t rem_mask = gen_half_mask(rem);
+
+ /* Since this is only ever called if dist >= a chunk, we don't need a masked load */
+ halfchunk_t chunk;
+ loadhalfchunk(from, &chunk);
+ _mm_mask_storeu_epi8(out, rem_mask, chunk);
+ out += rem;
+ from += rem;
+ len -= rem;
+
+ while (len > 0) {
+ loadhalfchunk(from, &chunk);
+ storehalfchunk(out, &chunk);
+ out += sizeof(halfchunk_t);
+ from += sizeof(halfchunk_t);
+ len -= sizeof(halfchunk_t);
+ }
+
+ return out;
+}
+
+#define CHUNKSIZE chunksize_avx512
+#define CHUNKUNROLL chunkunroll_avx512
+#define CHUNKMEMSET chunkmemset_avx512
+#define CHUNKMEMSET_SAFE chunkmemset_safe_avx512
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_avx512
+
+#include "inffast_tpl.h"
+
+#endif
features->has_avx2 = ebx & 0x20;
}
+        features->has_bmi2 = ebx & 0x100; // BMI2 is CPUID leaf 7, EBX bit 8
+
// check AVX512 bits if the OS supports saving ZMM registers
if (features->has_os_save_zmm) {
features->has_avx512f = ebx & 0x00010000;
features->has_avx512vl = ebx & 0x80000000;
}
features->has_avx512_common = features->has_avx512f && features->has_avx512dq && features->has_avx512bw \
- && features->has_avx512vl;
+ && features->has_avx512vl && features->has_bmi2;
features->has_avx512vnni = ecx & 0x800;
}
}
int has_avx512vl;
int has_avx512_common; // Enabled when AVX512(F,DQ,BW,VL) are all enabled.
int has_avx512vnni;
+ int has_bmi2;
int has_sse2;
int has_ssse3;
int has_sse42;
#ifdef X86_AVX512
uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint32_t chunksize_avx512(void);
+uint8_t* chunkmemset_safe_avx512(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+void inflate_fast_avx512(PREFIX3(stream)* strm, uint32_t start);
#endif
#ifdef X86_AVX512VNNI
uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
# define native_adler32 adler32_avx512
# undef native_adler32_fold_copy
# define native_adler32_fold_copy adler32_fold_copy_avx512
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_avx512
+# undef native_chunksize
+# define native_chunksize chunksize_avx512
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_avx512
// X86 - AVX512 (VNNI)
# if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__)
# undef native_adler32
#include "zbuild.h"
#include <stdlib.h>
-#include <stdio.h>
/* Returns the chunk size */
Z_INTERNAL uint32_t CHUNKSIZE(void) {
}
#endif
-#ifdef HAVE_HALF_CHUNK
+#if defined(HAVE_HALF_CHUNK) && !defined(HAVE_HALFCHUNKCOPY)
static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
halfchunk_t chunk;
int32_t align = ((len - 1) % sizeof(halfchunk_t)) + 1;
* always needed to be handled here or if we're just now seeing it because we are
* dispatching to this function, more */
if (sdist < 0 && dist < len) {
+#ifdef HAVE_MASKED_READWRITE
+        /* We can still handle this case if we can mitigate overwriting _and_ the
+         * entirety of the copy length fits in one load */
+ if (len <= sizeof(chunk_t)) {
+            /* It is tempting to add a goto to the block below, but hopefully most compilers
+             * collapse these identical code segments into one label to jump to */
+ return CHUNKCOPY(out, from, len);
+ }
+#endif
/* Here the memmove semantics match perfectly, as when this happens we are
* effectively sliding down the contents of memory by dist bytes */
memmove(out, from, len);
return CHUNKCOPY(out, from, len);
}
- /* Only AVX2 as there's 128 bit vectors and 256 bit. We allow for shorter vector
+    /* Only AVX2 and above, as there are both 128 bit and 256 bit vectors. We allow for shorter vector
* lengths because they serve to allow more cases to fall into chunkcopy, as the
* distance of the shorter length is still deemed a safe distance. We rewrite this
* here rather than calling the ssse3 variant directly now because doing so required
if ((dist % 2) != 0 || dist == 6) {
halfchunk_t halfchunk_load = GET_HALFCHUNK_MAG(from, &chunk_mod, (unsigned)dist);
- adv_amount = sizeof(halfchunk_t) - chunk_mod;
if (len == sizeof(halfchunk_t)) {
storehalfchunk(out, &halfchunk_load);
- len -= adv_amount;
- out += adv_amount;
+ len -= sizeof(halfchunk_t);
+ out += sizeof(halfchunk_t);
}
chunk_load = halfchunk2whole(&halfchunk_load);
rem_bytes:
#endif
if (len) {
+#ifndef HAVE_MASKED_READWRITE
memcpy(out, &chunk_load, len);
+#else
+ storechunk_mask(out, gen_mask(len), &chunk_load);
+#endif
out += len;
}
--left;
}
#endif
+
+#ifndef HAVE_MASKED_READWRITE
if (UNLIKELY(left < sizeof(chunk_t))) {
while (len > 0) {
*out++ = *from++;
return out;
}
+#endif
if (len)
out = CHUNKMEMSET(out, from, len);
return out;
}
-static inline uint8_t *CHUNKCOPY_SAFE(uint8_t *out, uint8_t *from, unsigned len, uint8_t *safe)
+static inline uint8_t *CHUNKCOPY_SAFE(uint8_t *out, uint8_t *from, uint64_t len, uint8_t *safe)
{
if (out == from)
return out + len;
uint64_t safelen = (safe - out);
- len = MIN(len, (unsigned)safelen);
+ len = MIN(len, safelen);
+#ifndef HAVE_MASKED_READWRITE
uint64_t from_dist = (uint64_t)llabs(safe - from);
if (UNLIKELY(from_dist < sizeof(chunk_t) || safelen < sizeof(chunk_t))) {
while (len--) {
return out;
}
+#endif
- return CHUNKMEMSET(out, from, len);
+ return CHUNKMEMSET(out, from, (unsigned)len);
}
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
- set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
+ set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2")
else()
set(AVX512FLAG "/arch:AVX512")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
# For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
# instruction scheduling unless you specify a reasonable -mtune= target
- set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
+ set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2")
if(NOT MSVC)
check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
if(HAVE_CASCADE_LAKE)
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE OR CMAKE_C_COMPILER_ID MATCHES "IntelLLVM")
- set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni")
+ set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mbmi2")
else()
set(AVX512VNNIFLAG "/arch:AVX512")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
- set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni")
+ set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mbmi2")
if(NOT MSVC)
check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
if(HAVE_CASCADE_LAKE)
forcesse2=0
# For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
# instruction scheduling unless you specify a reasonable -mtune= target
-avx512flag="-mavx512f -mavx512dq -mavx512bw -mavx512vl"
+avx512flag="-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2"
avx512vnniflag="${avx512flag} -mavx512vnni"
avx2flag="-mavx2"
sse2flag="-msse2"
if test ${HAVE_AVX512_INTRIN} -eq 1; then
CFLAGS="${CFLAGS} -DX86_AVX512"
SFLAGS="${SFLAGS} -DX86_AVX512"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_avx512.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_avx512.o chunkset_avx512.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512.lo chunkset_avx512.lo"
fi
check_mtune_cascadelake_compiler_flag
if (cf.x86.has_avx512_common) {
ft.adler32 = &adler32_avx512;
ft.adler32_fold_copy = &adler32_fold_copy_avx512;
+ ft.chunkmemset_safe = &chunkmemset_safe_avx512;
+ ft.chunksize = &chunksize_avx512;
+ ft.inflate_fast = &inflate_fast_avx512;
}
#endif
#ifdef X86_AVX512VNNI
out = chunkcopy_safe(out, out - dist, len, safe);
}
} else {
- if (!extra_safe)
- out = CHUNKCOPY_SAFE(out, from, len, safe);
- else
+#ifndef HAVE_MASKED_READWRITE
+ if (extra_safe)
out = chunkcopy_safe(out, from, len, safe);
+ else
+#endif
+ out = CHUNKCOPY_SAFE(out, from, len, safe);
}
+#ifndef HAVE_MASKED_READWRITE
} else if (extra_safe) {
/* Whole reference is in range of current output. */
out = chunkcopy_safe(out, out - dist, len, safe);
+#endif
} else {
/* Whole reference is in range of current output. No range checks are
necessary because we start with room for at least 258 bytes of output,