From: Nathan Moinvaziri Date: Thu, 7 May 2020 14:54:37 +0000 (-0700) Subject: Added support for AVX2 intrinsics to compare258. X-Git-Tag: 1.9.9-b1~298 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0f34903697be3723f1cb3cee492f892f542e8f14;p=thirdparty%2Fzlib-ng.git Added support for AVX2 intrinsics to compare258. --- diff --git a/CMakeLists.txt b/CMakeLists.txt index c3b3edcb0..ee223e421 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -715,6 +715,8 @@ if(WITH_OPTIM) add_definitions(-DX86_AVX2) list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/slide_avx.c) add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2 optimized slide_hash, using \"${AVX2FLAG}\"") + list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/compare258_avx.c) + add_feature_info(AVX2_COMPARE258 1 "Support AVX2 optimized compare258, using \"${AVX2FLAG}\"") add_intrinsics_option("${AVX2FLAG}") endif() if(WITH_SSE4 AND (HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN)) diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in index 365f58a68..c11959107 100644 --- a/arch/x86/Makefile.in +++ b/arch/x86/Makefile.in @@ -17,7 +17,7 @@ SRCDIR=. SRCTOP=../.. TOPDIR=$(SRCTOP) -all: x86.o x86.lo compare258_sse.o compare258_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_avx.o slide_avx.lo slide_sse.o slide_sse.lo +all: x86.o x86.lo compare258_avx.o compare258_avx.lo compare258_sse.o compare258_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_avx.o slide_avx.lo slide_sse.o slide_sse.lo x86.o: $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c @@ -25,6 +25,12 @@ x86.o: x86.lo: $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c +compare258_avx.o: + $(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx.c + +compare258_avx.lo: + $(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx.c + compare258_sse.o: $(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c diff --git a/arch/x86/compare258_avx.c b/arch/x86/compare258_avx.c new file mode 100644 index 000000000..10096eab6 --- /dev/null +++ b/arch/x86/compare258_avx.c @@ -0,0 +1,56 @@ +/* compare258_avx.c -- AVX2 version of compare258 + * Copyright Mika T. Lindqvist + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "../../zbuild.h" +#include "../../zutil.h" + +#include "fallback_builtins.h" + +#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) + +#include +#ifdef _MSC_VER +# include +#endif + +/* UNALIGNED_OK, AVX2 intrinsic comparison */ +int32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1) { + const unsigned char *src0start = src0; + const unsigned char *src0end = src0 + 256; + + do { + __m256i ymm_src0, ymm_src1, ymm_cmp; + ymm_src0 = _mm256_loadu_si256((__m256i*)src0); + ymm_src1 = _mm256_loadu_si256((__m256i*)src1); + ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */ + int mask = _mm256_movemask_epi8(ymm_cmp); + if ((unsigned int)mask != 0xFFFFFFFF) { + int match_byte = __builtin_ctz(~mask); /* Invert bits so identical = 0 */ + return (int32_t)(src0 - src0start + match_byte); + } + + src0 += 32, src1 += 32; + + ymm_src0 = _mm256_loadu_si256((__m256i*)src0); + ymm_src1 = _mm256_loadu_si256((__m256i*)src1); + ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); + mask = _mm256_movemask_epi8(ymm_cmp); + if ((unsigned int)mask != 0xFFFFFFFF) { + int match_byte = __builtin_ctz(~mask); + return (int32_t)(src0 - src0start + match_byte); + } + + src0 += 32, src1 += 32; + } while (src0 < src0end); + + if (*(uint16_t *)src0 == *(uint16_t *)src1) + src0 += 2, src1 += 2; + else if (*src0 == *src1) + src0 += 1, src1 += 1; + + return (int32_t)(src0 - src0start); +} + +#endif diff --git a/configure b/configure index 463ae44c1..50172fddc 100755 --- a/configure +++ b/configure @@ -1095,8 +1095,8 @@ case "${ARCH}" in if test ${HAVE_AVX2_INTRIN} -eq 1; then CFLAGS="${CFLAGS} -DX86_AVX2" SFLAGS="${SFLAGS} -DX86_AVX2" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_avx.o" - ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_avx.lo" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare258_avx.o slide_avx.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare258_avx.lo slide_avx.lo" fi CFLAGS="${CFLAGS} -DX86_SSE42_CRC_HASH" @@ -1144,8 +1144,8 @@ case "${ARCH}" in if test ${HAVE_AVX2_INTRIN} -eq 1; then CFLAGS="${CFLAGS} -DX86_AVX2" SFLAGS="${SFLAGS} -DX86_AVX2" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_avx.o" - ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_avx.lo" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare258_avx.o slide_avx.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare258_avx.lo slide_avx.lo" fi if test ${HAVE_SSE42CMPSTR_INTRIN} -eq 1; then diff --git a/functable.c b/functable.c index 943c2b41f..a95cfc550 100644 --- a/functable.c +++ b/functable.c @@ -72,6 +72,9 @@ extern int32_t compare258_unaligned_64(const unsigned char *src0, const unsigned #ifdef X86_SSE42_CMP_STR extern int32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1); #endif +#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) +extern int32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1); +#endif #endif /* stub definitions */ @@ -223,6 +226,10 @@ ZLIB_INTERNAL int32_t compare258_stub(const unsigned char *src0, const unsigned if (x86_cpu_has_sse42) functable.compare258 = &compare258_unaligned_sse4; # endif +# if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) + if (x86_cpu_has_avx2) + functable.compare258 = &compare258_unaligned_avx2; +# endif #endif return functable.compare258(src0, src1); diff --git a/win32/Makefile.msc b/win32/Makefile.msc index abb9808b9..ec24b45f0 100644 --- a/win32/Makefile.msc +++ b/win32/Makefile.msc @@ -34,7 +34,7 @@ WITH_GZFILEOP = ZLIB_COMPAT = SUFFIX = -OBJS = adler32.obj compare258.obj compare258_sse.obj compress.obj crc32.obj \ +OBJS = adler32.obj compare258.obj compare258_avx.obj compare258_sse.obj compress.obj crc32.obj \ deflate.obj deflate_fast.obj deflate_quick.obj deflate_slow.obj deflate_medium.obj \ functable.obj infback.obj inflate.obj inftrees.obj inffast.obj insert_string.obj \ slide_avx.obj slide_sse.obj trees.obj uncompr.obj zutil.obj \