list(APPEND ZLIB_ARCH_HDRS fallback_builtins.h)
endif()
if(WITH_AVX2 AND HAVE_AVX2_INTRIN)
- add_definitions(-DX86_AVX2 -DX86_AVX2_ADLER32)
+ add_definitions(-DX86_AVX2 -DX86_AVX2_ADLER32 -DX86_AVX_CHUNKSET)
set(AVX2_SRCS ${ARCHDIR}/slide_avx.c)
add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2 optimized slide_hash, using \"${AVX2FLAG}\"")
+ list(APPEND AVX2_SRCS ${ARCHDIR}/chunkset_avx.c)
+ add_feature_info(AVX_CHUNKSET 1 "Support AVX optimized chunkset, using \"${AVX2FLAG}\"")
list(APPEND AVX2_SRCS ${ARCHDIR}/compare258_avx.c)
add_feature_info(AVX2_COMPARE258 1 "Support AVX2 optimized compare258, using \"${AVX2FLAG}\"")
list(APPEND AVX2_SRCS ${ARCHDIR}/adler32_avx.c)
x86.o x86.lo \
adler32_avx.o adler32.lo \
adler32_ssse3.o adler32_ssse3.lo \
+ chunkset_avx.o chunkset_avx.lo \
chunkset_sse.o chunkset_sse.lo \
compare258_avx.o compare258_avx.lo \
compare258_sse.o compare258_sse.lo \
x86.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
+chunkset_avx.o:
+ $(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx.c
+
+chunkset_avx.lo:
+ $(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx.c
+
chunkset_sse.o:
$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse.c
--- /dev/null
+/* chunkset_avx.c -- AVX inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "zbuild.h"
+#include "zutil.h"
+
+#ifdef X86_AVX_CHUNKSET
+#include <immintrin.h>
+
+typedef __m256i chunk_t;
+
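+/* Advertise to the chunkset template that specialised broadcast fills
+ * exist below for 1-, 2-, 4- and 8-byte repeat distances. */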
+#define HAVE_CHUNKMEMSET_1
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm256_set1_epi8(*(int8_t *)from);
+}
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm256_set1_epi16(*(int16_t *)from);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm256_set1_epi32(*(int32_t *)from);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm256_set1_epi64x(*(int64_t *)from);
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = _mm256_loadu_si256((__m256i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ _mm256_storeu_si256((__m256i *)out, *chunk);
+}
+
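+/* Rename the template entry points so chunkset_tpl.h emits the _avx
+ * variants of the generic chunk copy/memset functions. */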
+#define CHUNKSIZE chunksize_avx
+#define CHUNKCOPY chunkcopy_avx
+#define CHUNKCOPY_SAFE chunkcopy_safe_avx
+#define CHUNKUNROLL chunkunroll_avx
+#define CHUNKMEMSET chunkmemset_avx
+#define CHUNKMEMSET_SAFE chunkmemset_safe_avx
+
+#include "chunkset_tpl.h"
+
+#endif
/* Behave like chunkcopy, but avoid writing beyond the legal output. */
Z_INTERNAL uint8_t* CHUNKCOPY_SAFE(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) {
if ((safe - out) < (ptrdiff_t)sizeof(chunk_t)) {
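+        /* chunk_t is 32 bytes for AVX, so the leftover length can include a
+         * 16-byte piece that the 8/4/2/1-byte copies below cannot cover. */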
+ if (sizeof(chunk_t) > 16 && (len & 16)) {
+ memcpy(out, from, 16);
+ out += 16;
+ from += 16;
+ }
if (len & 8) {
memcpy(out, from, 8);
out += 8;
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo"
if test ${HAVE_AVX2_INTRIN} -eq 1; then
- CFLAGS="${CFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32"
- SFLAGS="${SFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_avx.o compare258_avx.o adler32_avx.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_avx.lo compare258_avx.lo adler32_avx.lo"
+ CFLAGS="${CFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32 -DX86_AVX_CHUNKSET"
+ SFLAGS="${SFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32 -DX86_AVX_CHUNKSET"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_avx.o chunkset_avx.o compare258_avx.o adler32_avx.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_avx.lo chunkset_avx.lo compare258_avx.lo adler32_avx.lo"
fi
if test ${HAVE_SSE42CRC_INTRIN} -eq 1 || test ${HAVE_SSE42CRC_INLINE_ASM} -eq 1; then
extern uint8_t* chunkmemset_sse2(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
+#ifdef X86_AVX_CHUNKSET
+extern uint32_t chunksize_avx(void);
+extern uint8_t* chunkcopy_avx(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkcopy_safe_avx(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+extern uint8_t* chunkunroll_avx(uint8_t *out, unsigned *dist, unsigned *len);
+extern uint8_t* chunkmemset_avx(uint8_t *out, unsigned dist, unsigned len);
+extern uint8_t* chunkmemset_safe_avx(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#endif
#ifdef ARM_NEON_CHUNKSET
extern uint32_t chunksize_neon(void);
extern uint8_t* chunkcopy_neon(uint8_t *out, uint8_t const *from, unsigned len);
# endif
functable.chunksize = &chunksize_sse2;
#endif
+#ifdef X86_AVX_CHUNKSET
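+    /* Prefer the AVX chunkset implementation when the CPU reports AVX2 support. */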
+ if (x86_cpu_has_avx2)
+ functable.chunksize = &chunksize_avx;
+#endif
#ifdef ARM_NEON_CHUNKSET
if (arm_cpu_has_neon)
functable.chunksize = &chunksize_neon;
# endif
functable.chunkcopy = &chunkcopy_sse2;
#endif
+#ifdef X86_AVX_CHUNKSET
+ if (x86_cpu_has_avx2)
+ functable.chunkcopy = &chunkcopy_avx;
+#endif
#ifdef ARM_NEON_CHUNKSET
if (arm_cpu_has_neon)
functable.chunkcopy = &chunkcopy_neon;
# endif
functable.chunkcopy_safe = &chunkcopy_safe_sse2;
#endif
+#ifdef X86_AVX_CHUNKSET
+ if (x86_cpu_has_avx2)
+ functable.chunkcopy_safe = &chunkcopy_safe_avx;
+#endif
#ifdef ARM_NEON_CHUNKSET
if (arm_cpu_has_neon)
functable.chunkcopy_safe = &chunkcopy_safe_neon;
# endif
functable.chunkunroll = &chunkunroll_sse2;
#endif
+#ifdef X86_AVX_CHUNKSET
+ if (x86_cpu_has_avx2)
+ functable.chunkunroll = &chunkunroll_avx;
+#endif
#ifdef ARM_NEON_CHUNKSET
if (arm_cpu_has_neon)
functable.chunkunroll = &chunkunroll_neon;
# endif
functable.chunkmemset = &chunkmemset_sse2;
#endif
+#ifdef X86_AVX_CHUNKSET
+ if (x86_cpu_has_avx2)
+ functable.chunkmemset = &chunkmemset_avx;
+#endif
#ifdef ARM_NEON_CHUNKSET
if (arm_cpu_has_neon)
functable.chunkmemset = &chunkmemset_neon;
# endif
functable.chunkmemset_safe = &chunkmemset_safe_sse2;
#endif
+#ifdef X86_AVX_CHUNKSET
+ if (x86_cpu_has_avx2)
+ functable.chunkmemset_safe = &chunkmemset_safe_avx;
+#endif
#ifdef ARM_NEON_CHUNKSET
if (arm_cpu_has_neon)
functable.chunkmemset_safe = &chunkmemset_safe_neon;
-DX86_SSE42_CRC_INTRIN \
-DX86_SSE42_CRC_HASH \
-DX86_AVX2 \
+ -DX86_AVX_CHUNKSET \
-DX86_SSE2_CHUNKSET \
-DUNALIGNED_OK \
-DUNALIGNED64_OK \
OBJS = \
adler32.obj \
chunkset.obj \
+ chunkset_avx.obj \
chunkset_sse.obj \
compare258.obj \
compare258_avx.obj \
compress.obj: $(SRCDIR)/compress.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h
uncompr.obj: $(SRCDIR)/uncompr.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h
chunkset.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
+chunkset_avx.obj: $(SRCDIR)/arch/x86/chunkset_avx.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
chunkset_sse.obj: $(SRCDIR)/arch/x86/chunkset_sse.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
crc32.obj: $(SRCDIR)/crc32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_tbl.h
deflate.obj: $(SRCDIR)/deflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h