From: Nathan Moinvaziri Date: Tue, 25 Feb 2020 22:36:56 +0000 (-0800) Subject: Abstracted out architecture specific implementations of 258 byte comparison to compar... X-Git-Tag: 1.9.9-b1~304 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=9bd28d93813db93aa2fb1b14309cced1efe7d721;p=thirdparty%2Fzlib-ng.git Abstracted out architecture specific implementations of 258 byte comparison to compare258. --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 5d186440d..71374e07f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -418,6 +418,22 @@ if(HAVE_ATTRIBUTE_VISIBILITY_INTERNAL) add_definitions(-DHAVE_VISIBILITY_INTERNAL) endif() +# +# check for __builtin_ctz() support in the compiler +# +check_c_source_compiles( + "int main(void) + { + unsigned int zero = 0; + long test = __builtin_ctz(zero); + (void)test; + return 0; + }" + HAVE_BUILTIN_CTZ +) +if(HAVE_BUILTIN_CTZ) + add_definitions(-DHAVE_BUILTIN_CTZ) +endif() # # check for __builtin_ctzl() support in the compiler # @@ -434,6 +450,22 @@ check_c_source_compiles( if(HAVE_BUILTIN_CTZL) add_definitions(-DHAVE_BUILTIN_CTZL) endif() +# +# check for __builtin_ctzll() support in the compiler +# +check_c_source_compiles( + "int main(void) + { + unsigned int zero = 0; + long test = __builtin_ctzll(zero); + (void)test; + return 0; + }" + HAVE_BUILTIN_CTZLL +) +if(HAVE_BUILTIN_CTZLL) + add_definitions(-DHAVE_BUILTIN_CTZLL) +endif() # # check for ptrdiff_t support @@ -531,6 +563,20 @@ if(BASEARCH_X86_FOUND) }" HAVE_SSE42CRC_INTRIN ) + # Check whether compiler supports SSE4.2 compare string instrinics + check_c_source_compile_or_run( + "#include + int main(void) + { + unsigned char a[64] = { 0 }; + unsigned char b[64] = { 0 }; + __m128i xmm_src0, xmm_src1; + xmm_src0 = _mm_loadu_si128((__m128i *)(char *)a); + xmm_src1 = _mm_loadu_si128((__m128i *)(char *)b); + return _mm_cmpestri(xmm_src0, 16, xmm_src1, 16, 0); + }" + HAVE_SSE42CMPSTR_INTRIN + ) set(CMAKE_REQUIRED_FLAGS) # Check whether compiler supports PCLMULQDQ intrinics @@ -659,7 +705,7 @@ if(WITH_OPTIM) if("${ARCH}" MATCHES "arm" OR NOT WITH_NEON) add_intrinsics_option("${ACLEFLAG}") endif() - add_feature_info(ACLE_CRC 1 "Support CRC hash generation using the ACLE instruction set, using \"${ACLEFLAG}\"") + add_feature_info(ACLE_CRC 1 "Support ACLE optimized CRC hash generation, using \"${ACLEFLAG}\"") endif() elseif(BASEARCH_S360_FOUND AND "${ARCH}" MATCHES "s390x") if(WITH_DFLTCC_DEFLATE OR WITH_DFLTCC_INFLATE) @@ -684,13 +730,13 @@ if(WITH_OPTIM) if(WITH_AVX2 AND HAVE_AVX2_INTRIN) add_definitions(-DX86_AVX2) list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/slide_avx.c) - add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2-optimized slide_hash, using \"${AVX2FLAG}\"") + add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2 optimized slide_hash, using \"${AVX2FLAG}\"") add_intrinsics_option("${AVX2FLAG}") endif() if(WITH_SSE4 AND (HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN)) add_definitions(-DX86_SSE42_CRC_HASH) list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/insert_string_sse.c) - add_feature_info(SSE42_CRC 1 "Support CRC hash generation using the SSE4.2 instruction set, using \"${SSE4FLAG}\"") + add_feature_info(SSE42_CRC 1 "Support SSE4.2 optimized CRC hash generation, using \"${SSE4FLAG}\"") add_intrinsics_option("${SSE4FLAG}") if(HAVE_SSE42CRC_INTRIN) add_definitions(-DX86_SSE42_CRC_INTRIN) @@ -698,9 +744,14 @@ if(WITH_OPTIM) if(WITH_NEW_STRATEGIES) add_definitions(-DX86_QUICK_STRATEGY) list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/deflate_quick.c) - add_feature_info(SSE42_DEFLATE_QUICK 1 "Support SSE4.2-accelerated quick compression") + add_feature_info(SSE42_DEFLATE_QUICK 1 "Support SSE4.2 accelerated quick compression") endif() endif() + if(HAVE_SSE42CMPSTR_INTRIN) + add_definitions(-DX86_SSE42_CMP_STR) + list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/compare258_sse.c) + add_feature_info(SSE42_COMPARE258 1 "Support SSE4.2 optimized compare258, using \"${SSE4FLAG}\"") + endif() if(WITH_SSE2 AND HAVE_SSE2_INTRIN) add_definitions(-DX86_SSE2) list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/slide_sse.c) @@ -809,6 +860,7 @@ set(ZLIB_PRIVATE_HDRS ) set(ZLIB_SRCS adler32.c + compare258.c compress.c crc32.c deflate.c diff --git a/Makefile.in b/Makefile.in index 26185a12e..1063fa963 100644 --- a/Makefile.in +++ b/Makefile.in @@ -71,11 +71,11 @@ mandir = ${prefix}/share/man man3dir = ${mandir}/man3 pkgconfigdir = ${libdir}/pkgconfig -OBJZ = adler32.o compress.o crc32.o deflate.o deflate_fast.o deflate_medium.o deflate_slow.o functable.o infback.o inffast.o inflate.o inftrees.o insert_string.o trees.o uncompr.o zutil.o $(ARCH_STATIC_OBJS) +OBJZ = adler32.o compare258.o compress.o crc32.o deflate.o deflate_fast.o deflate_medium.o deflate_slow.o functable.o infback.o inffast.o inflate.o inftrees.o insert_string.o trees.o uncompr.o zutil.o $(ARCH_STATIC_OBJS) OBJG = gzclose.o gzlib.o gzread.o gzwrite.o OBJC = $(OBJZ) $(OBJG) -PIC_OBJZ = adler32.lo compress.lo crc32.lo deflate.lo deflate_fast.lo deflate_medium.lo deflate_slow.lo functable.lo infback.lo inffast.lo inflate.lo inftrees.lo insert_string.lo trees.lo uncompr.lo zutil.lo $(ARCH_SHARED_OBJS) +PIC_OBJZ = adler32.lo compare258.lo compress.lo crc32.lo deflate.lo deflate_fast.lo deflate_medium.lo deflate_slow.lo functable.lo infback.lo inffast.lo inflate.lo inftrees.lo insert_string.lo trees.lo uncompr.lo zutil.lo $(ARCH_SHARED_OBJS) PIC_OBJG = gzclose.lo gzlib.lo gzread.lo gzwrite.lo PIC_OBJC = $(PIC_OBJZ) $(PIC_OBJG) diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in index 8da40bf7c..365f58a68 100644 --- a/arch/x86/Makefile.in +++ b/arch/x86/Makefile.in @@ -17,7 +17,7 @@ SRCDIR=. SRCTOP=../.. TOPDIR=$(SRCTOP) -all: x86.o x86.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_avx.o slide_avx.lo slide_sse.o slide_sse.lo +all: x86.o x86.lo compare258_sse.o compare258_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_avx.o slide_avx.lo slide_sse.o slide_sse.lo x86.o: $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c @@ -25,6 +25,12 @@ x86.o: x86.lo: $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c +compare258_sse.o: + $(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c + +compare258_sse.lo: + $(CC) $(SFLAGS) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c + deflate_quick.o: $(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/deflate_quick.c diff --git a/arch/x86/compare258_sse.c b/arch/x86/compare258_sse.c new file mode 100644 index 000000000..916e383b1 --- /dev/null +++ b/arch/x86/compare258_sse.c @@ -0,0 +1,115 @@ +/* compare258_sse.c -- SSE4.2 version of compare258 + * + * Copyright (C) 2013 Intel Corporation. All rights reserved. + * Authors: + * Wajdi Feghali + * Jim Guilford + * Vinodh Gopal + * Erdinc Ozturk + * Jim Kukunas + * + * Portions are Copyright (C) 2016 12Sided Technology, LLC. + * Author: + * Phil Vachon + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "../../zbuild.h" +#include "../../zutil.h" + +#ifdef X86_SSE42_CMP_STR + +#include +#ifdef _MSC_VER +# include +#endif + +/* UNALIGNED_OK, SSE4.2 intrinsic comparison */ +int32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1) { +#ifdef _MSC_VER + const unsigned char *src0start = src0; + const unsigned char *src0end = src0 + 256; + + do { + #define mode _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY + __m128i xmm_src0, xmm_src1; + int ret; + + xmm_src0 = _mm_loadu_si128((__m128i *)src0); + xmm_src1 = _mm_loadu_si128((__m128i *)src1); + ret = _mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode); + if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) { + return (int32_t)(src0 - src0start + ret); + } + src0 += 16, src1 += 16; + + xmm_src0 = _mm_loadu_si128((__m128i *)src0); + xmm_src1 = _mm_loadu_si128((__m128i *)src1); + ret = _mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode); + if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) { + return (int32_t)(src0 - src0start + ret); + } + src0 += 16, src1 += 16; + } while (src0 < src0end); + + if (*(uint16_t *)src0 == *(uint16_t *)src1) + src0 += 2, src1 += 2; + else if (*src0 == *src1) + src0 += 1, src1 += 1; + + return (int32_t)(src0 - src0start); +#else + uintptr_t ax, dx, cx; + __m128i xmm_src0; + + ax = 16; + dx = 16; + /* Set cx to something, otherwise gcc thinks it's used + uninitalised */ + cx = 0; + + __asm__ __volatile__ ( + "1:" + "movdqu -16(%[src0], %[ax]), %[xmm_src0]\n\t" + "pcmpestri $0x18, -16(%[src1], %[ax]), %[xmm_src0]\n\t" + "jc 2f\n\t" + "add $16, %[ax]\n\t" + + "movdqu -16(%[src0], %[ax]), %[xmm_src0]\n\t" + "pcmpestri $0x18, -16(%[src1], %[ax]), %[xmm_src0]\n\t" + "jc 2f\n\t" + "add $16, %[ax]\n\t" + + "cmp $256 + 16, %[ax]\n\t" + "jb 1b\n\t" + +# if !defined(__x86_64__) + "movzwl -16(%[src0], %[ax]), %[dx]\n\t" +# else + "movzwq -16(%[src0], %[ax]), %[dx]\n\t" +# endif + "xorw -16(%[src1], %[ax]), %%dx\n\t" + "jnz 3f\n\t" + + "add $2, %[ax]\n\t" + "jmp 4f\n\t" + "3:\n\t" + "rep; bsf %[dx], %[cx]\n\t" + "shr $3, %[cx]\n\t" + "2:" + "add %[cx], %[ax]\n\t" + "4:" + : [ax] "+a" (ax), + [cx] "+c" (cx), + [dx] "+d" (dx), + [xmm_src0] "=x" (xmm_src0) + : [src0] "r" (src0), + [src1] "r" (src1) + : "cc" + ); + return (int32_t)(ax - 16); +#endif +} + +#endif diff --git a/compare258.c b/compare258.c new file mode 100644 index 000000000..b538277a1 --- /dev/null +++ b/compare258.c @@ -0,0 +1,122 @@ +/* compare258.c -- aligned and unaligned versions of compare258 + * Copyright (C) 2020 Nathan Moinvaziri + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" +#include "zutil.h" + +#include "fallback_builtins.h" + +/* ALIGNED, byte comparison */ +int32_t compare258_c(const unsigned char *src0, const unsigned char *src1) { + const unsigned char *src0start = src0; + const unsigned char *src0end = src0 + 258; + + do { + if (*src0 != *src1) + break; + src0 += 1, src1 += 1; + if (*src0 != *src1) + break; + src0 += 1, src1 += 1; + if (*src0 != *src1) + break; + src0 += 1, src1 += 1; + if (*src0 != *src1) + break; + src0 += 1, src1 += 1; + if (*src0 != *src1) + break; + src0 += 1, src1 += 1; + if (*src0 != *src1) + break; + src0 += 1, src1 += 1; + } while (src0 < src0end); + + return (int32_t)(src0 - src0start); +} + +#ifdef UNALIGNED_OK +/* UNALIGNED_OK, 16-bit integer comparison */ +int32_t compare258_unaligned_16(const unsigned char *src0, const unsigned char *src1) { + const unsigned char *src0start = src0; + const unsigned char *src0end = src0 + 258; + + do { + if (*(uint16_t *)src0 != *(uint16_t *)src1) + break; + src0 += 2, src1 += 2; + if (*(uint16_t *)src0 != *(uint16_t *)src1) + break; + src0 += 2, src1 += 2; + if (*(uint16_t *)src0 != *(uint16_t *)src1) + break; + src0 += 2, src1 += 2; + } while (src0 < src0end); + + if (*src0 == *src1) + src0 += 1; + + return (int32_t)(src0 - src0start); +} + +#ifdef HAVE_BUILTIN_CTZ +/* UNALIGNED_OK, 32-bit integer comparison */ +int32_t compare258_unaligned_32(const unsigned char *src0, const unsigned char *src1) { + const unsigned char *src0start = src0; + const unsigned char *src0end = src0 + 256; + + do { + uint32_t sv = *(uint32_t *)src0; + uint32_t mv = *(uint32_t *)src1; + uint32_t xor = sv ^ mv; + + if (xor) { + uint32_t match_byte = __builtin_ctz(xor) / 8; + return (int32_t)(src0 - src0start + match_byte); + } + + src0 += 4, src1 += 4; + } while (src0 < src0end); + + if (*(uint16_t *)src0 == *(uint16_t *)src1) + src0 += 2, src1 += 2; + else if (*src0 == *src1) + src0 += 1, src1 += 1; + + return (int32_t)(src0 - src0start); +} + +#endif + +#ifdef HAVE_BUILTIN_CTZLL +/* UNALIGNED_OK, 64-bit integer comparison */ +int32_t compare258_unaligned_64(const unsigned char *src0, const unsigned char *src1) { + const unsigned char *src0start = src0; + const unsigned char *src0end = src0 + 256; + + do { + uint64_t sv = *(uint64_t *)src0; + uint64_t mv = *(uint64_t *)src1; + uint64_t xor = sv ^ mv; + + if (xor) { + uint64_t match_byte = __builtin_ctzll(xor) / 8; + return (int32_t)(src0 - src0start + match_byte); + } + + src0 += 8, src1 += 8; + } while (src0 < src0end); + + if (*(uint16_t *)src0 == *(uint16_t *)src1) + src0 += 2, src1 += 2; + else if (*src0 == *src1) + src0 += 1, src1 += 1; + + return (int32_t)(src0 - src0start); +} + +#endif + +#endif diff --git a/configure b/configure index 5177d1715..a0c8f60fe 100755 --- a/configure +++ b/configure @@ -852,6 +852,23 @@ EOF fi fi +# Check for __builtin_ctz() support in compiler +cat > $test.c << EOF +int main(void) { + unsigned int zero = 0; + long test = __builtin_ctz(zero); + (void)test; + return 0; +} +EOF +if try ${CC} ${CFLAGS} $test.c $LDSHAREDLIBC; then + echo "Checking for __builtin_ctz ... Yes." | tee -a configure.log + CFLAGS="$CFLAGS -DHAVE_BUILTIN_CTZ" + SFLAGS="$SFLAGS -DHAVE_BUILTIN_CTZ" +else + echo "Checking for __builtin_ctz ... No." | tee -a configure.log +fi + # Check for __builtin_ctzl() support in compiler cat > $test.c << EOF int main(void) { @@ -869,6 +886,23 @@ else echo "Checking for __builtin_ctzl ... No." | tee -a configure.log fi +# Check for __builtin_ctzll() support in compiler +cat > $test.c << EOF +int main(void) { + unsigned long long zero = 0; + long test = __builtin_ctzll(zero); + (void)test; + return 0; +} +EOF +if try ${CC} ${CFLAGS} $test.c $LDSHAREDLIBC; then + echo "Checking for __builtin_ctzll ... Yes." | tee -a configure.log + CFLAGS="$CFLAGS -DHAVE_BUILTIN_CTZLL" + SFLAGS="$SFLAGS -DHAVE_BUILTIN_CTZLL" +else + echo "Checking for __builtin_ctzll ... No." | tee -a configure.log +fi + # Check for SSE2 intrinsics case "${ARCH}" in i386 | i486 | i586 | i686) @@ -912,6 +946,31 @@ EOF ;; esac +# Check for SSE4.2 compare string intrinsics +case "${ARCH}" in + i386 | i486 | i586 | i686 | x86_64) + cat > $test.c << EOF +#include +int main(void) +{ + unsigned char a[64] = { 0 }; + unsigned char b[64] = { 0 }; + __m128i xmm_src0, xmm_src1; + xmm_src0 = _mm_loadu_si128((__m128i *)(char *)a); + xmm_src1 = _mm_loadu_si128((__m128i *)(char *)b); + return _mm_cmpestri(xmm_src0, 16, xmm_src1, 16, 0); +} +EOF + if try ${CC} ${CFLAGS} ${sse42flag} $test.c; then + echo "Checking for SSE4.2 compare string intrinsics ... Yes." | tee -a configure.log + HAVE_SSE42CMPSTR_INTRIN=1 + else + echo "Checking for SSE4.2 compare string intrinsics ... No." | tee -a configure.log + HAVE_SSE42CMPSTR_INTRIN=0 + fi + ;; +esac + # Check for PCLMULQDQ intrinsics case "${ARCH}" in i386 | i486 | i586 | i686 | x86_64) @@ -1042,6 +1101,14 @@ case "${ARCH}" in SFLAGS="${SFLAGS} -DX86_SSE42_CRC_INTRIN" fi + if test ${HAVE_SSE42CMPSTR_INTRIN} -eq 1; then + CFLAGS="${CFLAGS} -DX86_SSE42_CMP_STR" + SFLAGS="${SFLAGS} -DX86_SSE42_CMP_STR" + + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare258_sse.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare258_sse.lo" + fi + if test ${HAVE_AVX2_INTRIN} -eq 1; then CFLAGS="${CFLAGS} -DX86_AVX2" SFLAGS="${SFLAGS} -DX86_AVX2" @@ -1098,6 +1165,14 @@ case "${ARCH}" in ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_avx.lo" fi + if test ${HAVE_SSE42CMPSTR_INTRIN} -eq 1; then + CFLAGS="${CFLAGS} -DX86_SSE42_CMP_STR" + SFLAGS="${SFLAGS} -DX86_SSE42_CMP_STR" + + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare258_sse.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare258_sse.lo" + fi + # Enable deflate_quick at level 1? if test $without_new_strategies -eq 0; then CFLAGS="${CFLAGS} -DX86_QUICK_STRATEGY" diff --git a/fallback_builtins.h b/fallback_builtins.h index 8bd16ed87..cd597f622 100644 --- a/fallback_builtins.h +++ b/fallback_builtins.h @@ -1,12 +1,28 @@ -#ifndef X86_CTZL_H -#define X86_CTZL_H +#ifndef X86_BUILTIN_CTZ_H +#define X86_BUILTIN_CTZ_H + +#if defined(_MSC_VER) && !defined(__clang__) +#if defined(_M_IX86) || defined(_M_AMD64) || defined(_M_IA64) || defined(_M_ARM) || defined(_M_ARM64) #include #ifdef X86_CPUID # include "arch/x86/x86.h" #endif -#if defined(_MSC_VER) && !defined(__clang__) +/* This is not a general purpose replacement for __builtin_ctz. The function expects that value is != 0 + * Because of that assumption trailing_zero is not initialized and the return value of _BitScanForward is not checked + */ +static __forceinline unsigned long __builtin_ctz(uint32_t value) { +#ifdef X86_CPUID + if (x86_cpu_has_tzcnt) + return _tzcnt_u32(value); +#endif + unsigned long trailing_zero; + _BitScanForward(&trailing_zero, value); + return trailing_zero; +} +#define HAVE_BUILTIN_CTZ + /* This is not a general purpose replacement for __builtin_ctzl. The function expects that value is != 0 * Because of that assumption trailing_zero is not initialized and the return value of _BitScanForward is not checked */ @@ -20,6 +36,23 @@ static __forceinline unsigned long __builtin_ctzl(unsigned long value) { _BitScanForward(&trailing_zero, value); return trailing_zero; } + +#ifdef _M_AMD64 +/* This is not a general purpose replacement for __builtin_ctzll. The function expects that value is != 0 + * Because of that assumption trailing_zero is not initialized and the return value of _BitScanForward64 is not checked + */ +static __forceinline unsigned long long __builtin_ctzll(uint64_t value) { +#ifdef X86_CPUID + if (x86_cpu_has_tzcnt) + return _tzcnt_u64(value); +#endif + unsigned long trailing_zero; + _BitScanForward64(&trailing_zero, value); + return trailing_zero; +} +#define HAVE_BUILTIN_CTZLL #endif #endif +#endif +#endif diff --git a/functable.c b/functable.c index ca9b82f14..943c2b41f 100644 --- a/functable.c +++ b/functable.c @@ -9,6 +9,11 @@ #include "deflate_p.h" #include "functable.h" + +#ifdef X86_CPUID +# include "fallback_builtins.h" +#endif + /* insert_string */ extern Pos insert_string_c(deflate_state *const s, const Pos str, unsigned int count); #ifdef X86_SSE42_CRC_HASH @@ -58,6 +63,16 @@ extern uint32_t crc32_little(uint32_t, const unsigned char *, uint64_t); extern uint32_t crc32_big(uint32_t, const unsigned char *, uint64_t); #endif +/* compare258 */ +extern int32_t compare258_c(const unsigned char *src0, const unsigned char *src1); +#ifdef UNALIGNED_OK +extern int32_t compare258_unaligned_16(const unsigned char *src0, const unsigned char *src1); +extern int32_t compare258_unaligned_32(const unsigned char *src0, const unsigned char *src1); +extern int32_t compare258_unaligned_64(const unsigned char *src0, const unsigned char *src1); +#ifdef X86_SSE42_CMP_STR +extern int32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1); +#endif +#endif /* stub definitions */ ZLIB_INTERNAL Pos insert_string_stub(deflate_state *const s, const Pos str, unsigned int count); @@ -65,6 +80,7 @@ ZLIB_INTERNAL Pos quick_insert_string_stub(deflate_state *const s, const Pos str ZLIB_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len); ZLIB_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64_t len); ZLIB_INTERNAL void slide_hash_stub(deflate_state *s); +ZLIB_INTERNAL int32_t compare258_stub(const unsigned char *src0, const unsigned char *src1); /* functable init */ ZLIB_INTERNAL __thread struct functable_s functable = { @@ -72,7 +88,8 @@ ZLIB_INTERNAL __thread struct functable_s functable = { quick_insert_string_stub, adler32_stub, crc32_stub, - slide_hash_stub + slide_hash_stub, + compare258_stub }; ZLIB_INTERNAL void cpu_check_features(void) @@ -189,3 +206,25 @@ ZLIB_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64 return functable.crc32(crc, buf, len); } + +ZLIB_INTERNAL int32_t compare258_stub(const unsigned char *src0, const unsigned char *src1) { + + functable.compare258 = &compare258_c; + +#ifdef UNALIGNED_OK +# ifdef HAVE_BUILTIN_CTZLL + functable.compare258 = &compare258_unaligned_64; +# elif defined(HAVE_BUILTIN_CTZ) + functable.compare258 = &compare258_unaligned_32; +# else + functable.compare258 = &compare258_unaligned_16; +# endif +# ifdef X86_SSE42_CMP_STR + if (x86_cpu_has_sse42) + functable.compare258 = &compare258_unaligned_sse4; +# endif +#endif + + return functable.compare258(src0, src1); +} + diff --git a/functable.h b/functable.h index a03c1e40c..42881d5b8 100644 --- a/functable.h +++ b/functable.h @@ -14,6 +14,7 @@ struct functable_s { uint32_t (* adler32) (uint32_t adler, const unsigned char *buf, size_t len); uint32_t (* crc32) (uint32_t crc, const unsigned char *buf, uint64_t len); void (* slide_hash) (deflate_state *s); + int32_t (* compare258) (const unsigned char *src0, const unsigned char *src1); }; ZLIB_INTERNAL extern __thread struct functable_s functable; diff --git a/win32/Makefile.msc b/win32/Makefile.msc index 9f9cd31dd..abb9808b9 100644 --- a/win32/Makefile.msc +++ b/win32/Makefile.msc @@ -34,8 +34,8 @@ WITH_GZFILEOP = ZLIB_COMPAT = SUFFIX = -OBJS = adler32.obj compress.obj crc32.obj deflate.obj deflate_fast.obj deflate_quick.obj deflate_slow.obj \ - deflate_medium.obj \ +OBJS = adler32.obj compare258.obj compare258_sse.obj compress.obj crc32.obj \ + deflate.obj deflate_fast.obj deflate_quick.obj deflate_slow.obj deflate_medium.obj \ functable.obj infback.obj inflate.obj inftrees.obj inffast.obj insert_string.obj \ slide_avx.obj slide_sse.obj trees.obj uncompr.obj zutil.obj \ x86.obj insert_string_sse.obj crc_folding.obj