add_definitions(-DX86_AVX2)
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/slide_avx.c)
add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2 optimized slide_hash, using \"${AVX2FLAG}\"")
+ list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/compare258_avx.c)
+ add_feature_info(AVX2_COMPARE258 1 "Support AVX2 optimized compare258, using \"${AVX2FLAG}\"")
add_intrinsics_option("${AVX2FLAG}")
endif()
if(WITH_SSE4 AND (HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN))
SRCTOP=../..
TOPDIR=$(SRCTOP)
-all: x86.o x86.lo compare258_sse.o compare258_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_avx.o slide_avx.lo slide_sse.o slide_sse.lo
+all: x86.o x86.lo compare258_avx.o compare258_avx.lo compare258_sse.o compare258_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_avx.o slide_avx.lo slide_sse.o slide_sse.lo
x86.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
x86.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
+compare258_avx.o:
+ $(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx.c
+
+compare258_avx.lo:
+ $(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx.c
+
compare258_sse.o:
$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c
--- /dev/null
+++ b/arch/x86/compare258_avx.c
+/* compare258_avx.c -- AVX2 version of compare258
+ * Copyright Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../zutil.h"
+
+#include "fallback_builtins.h"
+
+#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+
+#include <immintrin.h>
+#ifdef _MSC_VER
+# include <nmmintrin.h>
+#endif
+
+/* UNALIGNED_OK, AVX2 intrinsic comparison.
+ * Returns the length of the common prefix of src0 and src1, up to 258 bytes. */
+int32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1) {
+ const unsigned char *src0start = src0;
+ const unsigned char *src0end = src0 + 256;
+
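+    /* Compare 256 bytes in 32-byte chunks, two unaligned loads per iteration.
+     * _mm256_cmpeq_epi8 sets a byte lane to 0xFF where the inputs match, and
+     * _mm256_movemask_epi8 packs those lanes into a 32-bit mask, one bit per byte.
+     * Example: if bytes 0..4 match and byte 5 differs, bit 5 of the mask is 0,
+     * so ~mask has bit 5 set and __builtin_ctz(~mask) returns 5. */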
+ do {
+ __m256i ymm_src0, ymm_src1, ymm_cmp;
+ ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
+ ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
+ ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
+ int mask = _mm256_movemask_epi8(ymm_cmp);
+ if ((unsigned int)mask != 0xFFFFFFFF) {
+ int match_byte = __builtin_ctz(~mask); /* Invert bits so identical = 0 */
+ return (int32_t)(src0 - src0start + match_byte);
+ }
+
+ src0 += 32, src1 += 32;
+
+ ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
+ ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
+ ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1);
+ mask = _mm256_movemask_epi8(ymm_cmp);
+ if ((unsigned int)mask != 0xFFFFFFFF) {
+ int match_byte = __builtin_ctz(~mask);
+ return (int32_t)(src0 - src0start + match_byte);
+ }
+
+ src0 += 32, src1 += 32;
+ } while (src0 < src0end);
+
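+    /* 256 identical bytes seen; check the remaining 2 bytes so matches of
+     * length 257 and 258 are reported correctly. */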
+ if (*(uint16_t *)src0 == *(uint16_t *)src1)
+ src0 += 2, src1 += 2;
+ else if (*src0 == *src1)
+ src0 += 1, src1 += 1;
+
+ return (int32_t)(src0 - src0start);
+}
+
+#endif
if test ${HAVE_AVX2_INTRIN} -eq 1; then
CFLAGS="${CFLAGS} -DX86_AVX2"
SFLAGS="${SFLAGS} -DX86_AVX2"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_avx.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_avx.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare258_avx.o slide_avx.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare258_avx.lo slide_avx.lo"
fi
CFLAGS="${CFLAGS} -DX86_SSE42_CRC_HASH"
if test ${HAVE_AVX2_INTRIN} -eq 1; then
CFLAGS="${CFLAGS} -DX86_AVX2"
SFLAGS="${SFLAGS} -DX86_AVX2"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_avx.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_avx.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare258_avx.o slide_avx.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare258_avx.lo slide_avx.lo"
fi
if test ${HAVE_SSE42CMPSTR_INTRIN} -eq 1; then
#ifdef X86_SSE42_CMP_STR
extern int32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1);
#endif
+#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+extern int32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1);
+#endif
#endif
/* stub definitions */
if (x86_cpu_has_sse42)
functable.compare258 = &compare258_unaligned_sse4;
# endif
+# if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
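+    /* Checked after SSE4.2 so that, on CPUs supporting both, the wider AVX2 compare is selected. */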
+ if (x86_cpu_has_avx2)
+ functable.compare258 = &compare258_unaligned_avx2;
+# endif
#endif
return functable.compare258(src0, src1);
ZLIB_COMPAT =
SUFFIX =
-OBJS = adler32.obj compare258.obj compare258_sse.obj compress.obj crc32.obj \
+OBJS = adler32.obj compare258.obj compare258_avx.obj compare258_sse.obj compress.obj crc32.obj \
deflate.obj deflate_fast.obj deflate_quick.obj deflate_slow.obj deflate_medium.obj \
functable.obj infback.obj inflate.obj inftrees.obj inffast.obj insert_string.obj \
slide_avx.obj slide_sse.obj trees.obj uncompr.obj zutil.obj \