]> git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Abstracted out architecture specific implementations of 258 byte comparison to compar...
authorNathan Moinvaziri <nathan@nathanm.com>
Tue, 25 Feb 2020 22:36:56 +0000 (14:36 -0800)
committerHans Kristian Rosbach <hk-github@circlestorm.org>
Sun, 24 May 2020 11:53:25 +0000 (13:53 +0200)
CMakeLists.txt
Makefile.in
arch/x86/Makefile.in
arch/x86/compare258_sse.c [new file with mode: 0644]
compare258.c [new file with mode: 0644]
configure
fallback_builtins.h
functable.c
functable.h
win32/Makefile.msc

index 5d186440d79e2132f8e31bcdccaae4bba727d6bb..71374e07ff16f0a8aeb897c2ff2724dd02a52f1f 100644 (file)
@@ -418,6 +418,22 @@ if(HAVE_ATTRIBUTE_VISIBILITY_INTERNAL)
     add_definitions(-DHAVE_VISIBILITY_INTERNAL)
 endif()
 
+#
+# check for __builtin_ctz() support in the compiler
+#
+check_c_source_compiles(
+    "int main(void)
+    {
+        unsigned int zero = 0;
+        long test = __builtin_ctz(zero);
+        (void)test;
+        return 0;
+    }"
+    HAVE_BUILTIN_CTZ
+)
+if(HAVE_BUILTIN_CTZ)
+    add_definitions(-DHAVE_BUILTIN_CTZ)
+endif()
 #
 # check for __builtin_ctzl() support in the compiler
 #
@@ -434,6 +450,22 @@ check_c_source_compiles(
 if(HAVE_BUILTIN_CTZL)
     add_definitions(-DHAVE_BUILTIN_CTZL)
 endif()
+#
+# check for __builtin_ctzll() support in the compiler
+#
+check_c_source_compiles(
+    "int main(void)
+    {
+        unsigned int zero = 0;
+        long test = __builtin_ctzll(zero);
+        (void)test;
+        return 0;
+    }"
+    HAVE_BUILTIN_CTZLL
+)
+if(HAVE_BUILTIN_CTZLL)
+    add_definitions(-DHAVE_BUILTIN_CTZLL)
+endif()
 
 #
 # check for ptrdiff_t support
@@ -531,6 +563,20 @@ if(BASEARCH_X86_FOUND)
         }"
         HAVE_SSE42CRC_INTRIN
     )
+    # Check whether compiler supports SSE4.2 compare string intrinsics
+    check_c_source_compile_or_run(
+        "#include <immintrin.h>
+        int main(void)
+        {
+            unsigned char a[64] = { 0 };
+            unsigned char b[64] = { 0 };
+            __m128i xmm_src0, xmm_src1;
+            xmm_src0 = _mm_loadu_si128((__m128i *)(char *)a);
+            xmm_src1 = _mm_loadu_si128((__m128i *)(char *)b);
+            return _mm_cmpestri(xmm_src0, 16, xmm_src1, 16, 0);
+        }"
+        HAVE_SSE42CMPSTR_INTRIN
+    )
     set(CMAKE_REQUIRED_FLAGS)
 
 # Check whether compiler supports PCLMULQDQ intrinsics
@@ -659,7 +705,7 @@ if(WITH_OPTIM)
             if("${ARCH}" MATCHES "arm" OR NOT WITH_NEON)
                 add_intrinsics_option("${ACLEFLAG}")
             endif()
-            add_feature_info(ACLE_CRC 1 "Support CRC hash generation using the ACLE instruction set, using \"${ACLEFLAG}\"")
+            add_feature_info(ACLE_CRC 1 "Support ACLE optimized CRC hash generation, using \"${ACLEFLAG}\"")
         endif()
     elseif(BASEARCH_S360_FOUND AND "${ARCH}" MATCHES "s390x")
         if(WITH_DFLTCC_DEFLATE OR WITH_DFLTCC_INFLATE)
@@ -684,13 +730,13 @@ if(WITH_OPTIM)
         if(WITH_AVX2 AND HAVE_AVX2_INTRIN)
             add_definitions(-DX86_AVX2)
             list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/slide_avx.c)
-            add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2-optimized slide_hash, using \"${AVX2FLAG}\"")
+            add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2 optimized slide_hash, using \"${AVX2FLAG}\"")
             add_intrinsics_option("${AVX2FLAG}")
         endif()
         if(WITH_SSE4 AND (HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN))
             add_definitions(-DX86_SSE42_CRC_HASH)
             list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/insert_string_sse.c)
-            add_feature_info(SSE42_CRC 1 "Support CRC hash generation using the SSE4.2 instruction set, using \"${SSE4FLAG}\"")
+            add_feature_info(SSE42_CRC 1 "Support SSE4.2 optimized CRC hash generation, using \"${SSE4FLAG}\"")
             add_intrinsics_option("${SSE4FLAG}")
             if(HAVE_SSE42CRC_INTRIN)
                 add_definitions(-DX86_SSE42_CRC_INTRIN)
@@ -698,9 +744,14 @@ if(WITH_OPTIM)
             if(WITH_NEW_STRATEGIES)
                 add_definitions(-DX86_QUICK_STRATEGY)
                 list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/deflate_quick.c)
-                add_feature_info(SSE42_DEFLATE_QUICK 1 "Support SSE4.2-accelerated quick compression")
+                add_feature_info(SSE42_DEFLATE_QUICK 1 "Support SSE4.2 accelerated quick compression")
             endif()
         endif()
+        if(HAVE_SSE42CMPSTR_INTRIN)
+            add_definitions(-DX86_SSE42_CMP_STR)
+            list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/compare258_sse.c)
+            add_feature_info(SSE42_COMPARE258 1 "Support SSE4.2 optimized compare258, using \"${SSE4FLAG}\"")
+        endif()
         if(WITH_SSE2 AND HAVE_SSE2_INTRIN)
             add_definitions(-DX86_SSE2)
             list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/slide_sse.c)
@@ -809,6 +860,7 @@ set(ZLIB_PRIVATE_HDRS
 )
 set(ZLIB_SRCS
     adler32.c
+    compare258.c
     compress.c
     crc32.c
     deflate.c
index 26185a12e35d500b76313a9224d43a38ead527f0..1063fa9636a85b6ecf979631e951eb677d5eaa0d 100644 (file)
@@ -71,11 +71,11 @@ mandir = ${prefix}/share/man
 man3dir = ${mandir}/man3
 pkgconfigdir = ${libdir}/pkgconfig
 
-OBJZ = adler32.o compress.o crc32.o deflate.o deflate_fast.o deflate_medium.o deflate_slow.o functable.o infback.o inffast.o inflate.o inftrees.o insert_string.o trees.o uncompr.o zutil.o $(ARCH_STATIC_OBJS)
+OBJZ = adler32.o compare258.o compress.o crc32.o deflate.o deflate_fast.o deflate_medium.o deflate_slow.o functable.o infback.o inffast.o inflate.o inftrees.o insert_string.o trees.o uncompr.o zutil.o $(ARCH_STATIC_OBJS)
 OBJG = gzclose.o gzlib.o gzread.o gzwrite.o
 OBJC = $(OBJZ) $(OBJG)
 
-PIC_OBJZ = adler32.lo compress.lo crc32.lo deflate.lo deflate_fast.lo deflate_medium.lo deflate_slow.lo functable.lo infback.lo inffast.lo inflate.lo inftrees.lo insert_string.lo trees.lo uncompr.lo zutil.lo $(ARCH_SHARED_OBJS)
+PIC_OBJZ = adler32.lo compare258.lo compress.lo crc32.lo deflate.lo deflate_fast.lo deflate_medium.lo deflate_slow.lo functable.lo infback.lo inffast.lo inflate.lo inftrees.lo insert_string.lo trees.lo uncompr.lo zutil.lo $(ARCH_SHARED_OBJS)
 PIC_OBJG = gzclose.lo gzlib.lo gzread.lo gzwrite.lo
 PIC_OBJC = $(PIC_OBJZ) $(PIC_OBJG)
 
index 8da40bf7c3de3cd37255b93af6be63f72485388c..365f58a6838b54f442c41d5e2ffb912f99e0fe9a 100644 (file)
@@ -17,7 +17,7 @@ SRCDIR=.
 SRCTOP=../..
 TOPDIR=$(SRCTOP)
 
-all: x86.o x86.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_avx.o slide_avx.lo slide_sse.o slide_sse.lo
+all: x86.o x86.lo compare258_sse.o compare258_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_avx.o slide_avx.lo slide_sse.o slide_sse.lo
 
 x86.o:
        $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
@@ -25,6 +25,12 @@ x86.o:
 x86.lo:
        $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
 
+compare258_sse.o:
+       $(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c
+
+compare258_sse.lo:
+       $(CC) $(SFLAGS) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c
+
 deflate_quick.o:
        $(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/deflate_quick.c
 
diff --git a/arch/x86/compare258_sse.c b/arch/x86/compare258_sse.c
new file mode 100644 (file)
index 0000000..916e383
--- /dev/null
@@ -0,0 +1,115 @@
+/* compare258_sse.c -- SSE4.2 version of compare258
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Authors:
+ *  Wajdi Feghali   <wajdi.k.feghali@intel.com>
+ *  Jim Guilford    <james.guilford@intel.com>
+ *  Vinodh Gopal    <vinodh.gopal@intel.com>
+ *     Erdinc Ozturk   <erdinc.ozturk@intel.com>
+ *  Jim Kukunas     <james.t.kukunas@linux.intel.com>
+ *
+ * Portions are Copyright (C) 2016 12Sided Technology, LLC.
+ * Author:
+ *  Phil Vachon     <pvachon@12sidedtech.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../zutil.h"
+
+#ifdef X86_SSE42_CMP_STR
+
+#include <immintrin.h>
+#ifdef _MSC_VER
+#  include <nmmintrin.h>
+#endif
+
+/* UNALIGNED_OK, SSE4.2 intrinsic comparison */
+int32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1) {
+#ifdef _MSC_VER
+    const unsigned char *src0start = src0;
+    const unsigned char *src0end = src0 + 256;
+
+    do {
+        #define mode _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY
+        __m128i xmm_src0, xmm_src1;
+        int ret;
+
+        xmm_src0 = _mm_loadu_si128((__m128i *)src0);
+        xmm_src1 = _mm_loadu_si128((__m128i *)src1);
+        ret = _mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode);
+        if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) {
+            return (int32_t)(src0 - src0start + ret);
+        }
+        src0 += 16, src1 += 16;
+
+        xmm_src0 = _mm_loadu_si128((__m128i *)src0);
+        xmm_src1 = _mm_loadu_si128((__m128i *)src1);
+        ret = _mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode);
+        if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) {
+            return (int32_t)(src0 - src0start + ret);
+        }
+        src0 += 16, src1 += 16;
+    } while (src0 < src0end);
+
+    if (*(uint16_t *)src0 == *(uint16_t *)src1)
+        src0 += 2, src1 += 2;
+    else if (*src0 == *src1)
+        src0 += 1, src1 += 1;
+
+    return (int32_t)(src0 - src0start);
+#else
+    uintptr_t ax, dx, cx;
+    __m128i xmm_src0;
+
+    ax = 16;
+    dx = 16;
+    /* Set cx to something, otherwise gcc thinks it's used
+       uninitialised */
+    cx = 0;
+
+    __asm__ __volatile__ (
+    "1:"
+        "movdqu     -16(%[src0], %[ax]), %[xmm_src0]\n\t"
+        "pcmpestri  $0x18, -16(%[src1], %[ax]), %[xmm_src0]\n\t"
+        "jc         2f\n\t"
+        "add        $16, %[ax]\n\t"
+
+        "movdqu     -16(%[src0], %[ax]), %[xmm_src0]\n\t"
+        "pcmpestri  $0x18, -16(%[src1], %[ax]), %[xmm_src0]\n\t"
+        "jc         2f\n\t"
+        "add        $16, %[ax]\n\t"
+
+        "cmp        $256 + 16, %[ax]\n\t"
+        "jb         1b\n\t"
+
+#  if !defined(__x86_64__)
+        "movzwl     -16(%[src0], %[ax]), %[dx]\n\t"
+#  else
+        "movzwq     -16(%[src0], %[ax]), %[dx]\n\t"
+#  endif
+        "xorw       -16(%[src1], %[ax]), %%dx\n\t"
+        "jnz        3f\n\t"
+
+        "add        $2, %[ax]\n\t"
+        "jmp        4f\n\t"
+    "3:\n\t"
+        "rep; bsf   %[dx], %[cx]\n\t"
+        "shr        $3, %[cx]\n\t"
+    "2:"
+        "add        %[cx], %[ax]\n\t"
+    "4:"
+    : [ax] "+a" (ax),
+      [cx] "+c" (cx),
+      [dx] "+d" (dx),
+      [xmm_src0] "=x" (xmm_src0)
+    : [src0] "r" (src0),
+      [src1] "r" (src1)
+    : "cc"
+    );
+    return (int32_t)(ax - 16);
+#endif
+}
+
+#endif
diff --git a/compare258.c b/compare258.c
new file mode 100644 (file)
index 0000000..b538277
--- /dev/null
@@ -0,0 +1,122 @@
+/* compare258.c -- aligned and unaligned versions of compare258
+ * Copyright (C) 2020 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zutil.h"
+
+#include "fallback_builtins.h"
+
+/* ALIGNED, byte comparison */
+int32_t compare258_c(const unsigned char *src0, const unsigned char *src1) {
+    const unsigned char *src0start = src0;
+    const unsigned char *src0end = src0 + 258;
+
+    do {
+        if (*src0 != *src1)
+            break;
+        src0 += 1, src1 += 1;
+        if (*src0 != *src1)
+            break;
+        src0 += 1, src1 += 1;
+        if (*src0 != *src1)
+            break;
+        src0 += 1, src1 += 1;
+        if (*src0 != *src1)
+            break;
+        src0 += 1, src1 += 1;
+        if (*src0 != *src1)
+            break;
+        src0 += 1, src1 += 1;
+        if (*src0 != *src1)
+            break;
+        src0 += 1, src1 += 1;
+    } while (src0 < src0end);
+    return (int32_t)(src0 - src0start);
+}
+
+#ifdef UNALIGNED_OK
+/* UNALIGNED_OK, 16-bit integer comparison */
+int32_t compare258_unaligned_16(const unsigned char *src0, const unsigned char *src1) {
+    const unsigned char *src0start = src0;
+    const unsigned char *src0end = src0 + 258;
+
+    do {
+        if (*(uint16_t *)src0 != *(uint16_t *)src1)
+            break;
+        src0 += 2, src1 += 2;
+        if (*(uint16_t *)src0 != *(uint16_t *)src1)
+            break;
+        src0 += 2, src1 += 2;
+        if (*(uint16_t *)src0 != *(uint16_t *)src1)
+            break;
+        src0 += 2, src1 += 2;
+    } while (src0 < src0end);
+
+    if (*src0 == *src1)
+        src0 += 1;
+
+    return (int32_t)(src0 - src0start);
+}
+
+#ifdef HAVE_BUILTIN_CTZ
+/* UNALIGNED_OK, 32-bit integer comparison */
+int32_t compare258_unaligned_32(const unsigned char *src0, const unsigned char *src1) {
+    const unsigned char *src0start = src0;
+    const unsigned char *src0end = src0 + 256;
+
+    do {
+        uint32_t sv = *(uint32_t *)src0;
+        uint32_t mv = *(uint32_t *)src1;
+        uint32_t xor = sv ^ mv;
+
+        if (xor) {
+            uint32_t match_byte = __builtin_ctz(xor) / 8;
+            return (int32_t)(src0 - src0start + match_byte);
+        }
+
+        src0 += 4, src1 += 4;
+    } while (src0 < src0end);
+
+    if (*(uint16_t *)src0 == *(uint16_t *)src1)
+        src0 += 2, src1 += 2;
+    else if (*src0 == *src1)
+        src0 += 1, src1 += 1;
+
+    return (int32_t)(src0 - src0start);
+}
+
+#endif
+
+#ifdef HAVE_BUILTIN_CTZLL
+/* UNALIGNED_OK, 64-bit integer comparison */
+int32_t compare258_unaligned_64(const unsigned char *src0, const unsigned char *src1) {
+    const unsigned char *src0start = src0;
+    const unsigned char *src0end = src0 + 256;
+
+    do {
+        uint64_t sv = *(uint64_t *)src0;
+        uint64_t mv = *(uint64_t *)src1;
+        uint64_t xor = sv ^ mv;
+
+        if (xor) {
+            uint64_t match_byte = __builtin_ctzll(xor) / 8;
+            return (int32_t)(src0 - src0start + match_byte);
+        }
+
+        src0 += 8, src1 += 8;
+    } while (src0 < src0end);
+
+    if (*(uint16_t *)src0 == *(uint16_t *)src1)
+        src0 += 2, src1 += 2;
+    else if (*src0 == *src1)
+        src0 += 1, src1 += 1;
+
+    return (int32_t)(src0 - src0start);
+}
+
+#endif
+
+#endif
index 5177d1715364239b31f39b10a03d2583e4d6a854..a0c8f60fe491e6590391069386b1fb9374552e74 100755 (executable)
--- a/configure
+++ b/configure
@@ -852,6 +852,23 @@ EOF
   fi
 fi
 
+# Check for __builtin_ctz() support in compiler
+cat > $test.c << EOF
+int main(void) {
+    unsigned int zero = 0;
+    long test = __builtin_ctz(zero);
+    (void)test;
+    return 0;
+}
+EOF
+if try ${CC} ${CFLAGS} $test.c $LDSHAREDLIBC; then
+    echo "Checking for __builtin_ctz ... Yes." | tee -a configure.log
+    CFLAGS="$CFLAGS -DHAVE_BUILTIN_CTZ"
+    SFLAGS="$SFLAGS -DHAVE_BUILTIN_CTZ"
+else
+    echo "Checking for __builtin_ctz ... No." | tee -a configure.log
+fi
+
 # Check for __builtin_ctzl() support in compiler
 cat > $test.c << EOF
 int main(void) {
@@ -869,6 +886,23 @@ else
     echo "Checking for __builtin_ctzl ... No." | tee -a configure.log
 fi
 
+# Check for __builtin_ctzll() support in compiler
+cat > $test.c << EOF
+int main(void) {
+    unsigned long long zero = 0;
+    long test = __builtin_ctzll(zero);
+    (void)test;
+    return 0;
+}
+EOF
+if try ${CC} ${CFLAGS} $test.c $LDSHAREDLIBC; then
+    echo "Checking for __builtin_ctzll ... Yes." | tee -a configure.log
+    CFLAGS="$CFLAGS -DHAVE_BUILTIN_CTZLL"
+    SFLAGS="$SFLAGS -DHAVE_BUILTIN_CTZLL"
+else
+    echo "Checking for __builtin_ctzll ... No." | tee -a configure.log
+fi
+
 # Check for SSE2 intrinsics
 case "${ARCH}" in
     i386 | i486 | i586 | i686)
@@ -912,6 +946,31 @@ EOF
         ;;
 esac
 
+# Check for SSE4.2 compare string intrinsics
+case "${ARCH}" in
+    i386 | i486 | i586 | i686 | x86_64)
+        cat > $test.c << EOF
+#include <immintrin.h>
+int main(void)
+{
+    unsigned char a[64] = { 0 };
+    unsigned char b[64] = { 0 };
+    __m128i xmm_src0, xmm_src1;
+    xmm_src0 = _mm_loadu_si128((__m128i *)(char *)a);
+    xmm_src1 = _mm_loadu_si128((__m128i *)(char *)b);
+    return _mm_cmpestri(xmm_src0, 16, xmm_src1, 16, 0);
+}
+EOF
+        if try ${CC} ${CFLAGS} ${sse42flag} $test.c; then
+            echo "Checking for SSE4.2 compare string intrinsics ... Yes." | tee -a configure.log
+            HAVE_SSE42CMPSTR_INTRIN=1
+        else
+            echo "Checking for SSE4.2 compare string intrinsics ... No." | tee -a configure.log
+            HAVE_SSE42CMPSTR_INTRIN=0
+        fi
+        ;;
+esac
+
 # Check for PCLMULQDQ intrinsics
 case "${ARCH}" in
     i386 | i486 | i586 | i686 | x86_64)
@@ -1042,6 +1101,14 @@ case "${ARCH}" in
                 SFLAGS="${SFLAGS} -DX86_SSE42_CRC_INTRIN"
             fi
 
+            if test ${HAVE_SSE42CMPSTR_INTRIN} -eq 1; then
+                CFLAGS="${CFLAGS} -DX86_SSE42_CMP_STR"
+                SFLAGS="${SFLAGS} -DX86_SSE42_CMP_STR"
+
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare258_sse.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare258_sse.lo"
+            fi
+
             if test ${HAVE_AVX2_INTRIN} -eq 1; then
                 CFLAGS="${CFLAGS} -DX86_AVX2"
                 SFLAGS="${SFLAGS} -DX86_AVX2"
@@ -1098,6 +1165,14 @@ case "${ARCH}" in
                 ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_avx.lo"
             fi
 
+            if test ${HAVE_SSE42CMPSTR_INTRIN} -eq 1; then
+                CFLAGS="${CFLAGS} -DX86_SSE42_CMP_STR"
+                SFLAGS="${SFLAGS} -DX86_SSE42_CMP_STR"
+
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare258_sse.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare258_sse.lo"
+            fi
+
             # Enable deflate_quick at level 1?
             if test $without_new_strategies -eq 0; then
                 CFLAGS="${CFLAGS} -DX86_QUICK_STRATEGY"
index 8bd16ed874319ff828966741dc9227d419a1b7a1..cd597f622a0c98deee6774591b875acd673179ed 100644 (file)
@@ -1,12 +1,28 @@
-#ifndef X86_CTZL_H
-#define X86_CTZL_H
+#ifndef X86_BUILTIN_CTZ_H
+#define X86_BUILTIN_CTZ_H
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#if defined(_M_IX86) || defined(_M_AMD64) || defined(_M_IA64) ||  defined(_M_ARM) || defined(_M_ARM64)
 
 #include <intrin.h>
 #ifdef X86_CPUID
 #  include "arch/x86/x86.h"
 #endif
 
-#if defined(_MSC_VER) && !defined(__clang__)
+/* This is not a general purpose replacement for __builtin_ctz. The function expects that value is != 0
+ * Because of that assumption trailing_zero is not initialized and the return value of _BitScanForward is not checked
+ */
+static __forceinline unsigned long __builtin_ctz(uint32_t value) {
+#ifdef X86_CPUID
+    if (x86_cpu_has_tzcnt)
+        return _tzcnt_u32(value);
+#endif
+    unsigned long trailing_zero;
+    _BitScanForward(&trailing_zero, value);
+    return trailing_zero;
+}
+#define HAVE_BUILTIN_CTZ
+
 /* This is not a general purpose replacement for __builtin_ctzl. The function expects that value is != 0
  * Because of that assumption trailing_zero is not initialized and the return value of _BitScanForward is not checked
  */
@@ -20,6 +36,23 @@ static __forceinline unsigned long __builtin_ctzl(unsigned long value) {
     _BitScanForward(&trailing_zero, value);
     return trailing_zero;
 }
+
+#ifdef _M_AMD64
+/* This is not a general purpose replacement for __builtin_ctzll. The function expects that value is != 0
+ * Because of that assumption trailing_zero is not initialized and the return value of _BitScanForward64 is not checked
+ */
+static __forceinline unsigned long long __builtin_ctzll(uint64_t value) {
+#ifdef X86_CPUID
+    if (x86_cpu_has_tzcnt)
+        return _tzcnt_u64(value);
+#endif
+    unsigned long trailing_zero;
+    _BitScanForward64(&trailing_zero, value);
+    return trailing_zero;
+}
+#define HAVE_BUILTIN_CTZLL
 #endif
 
 #endif
+#endif
+#endif
index ca9b82f14fa945cfa6a2b95c9c44943480361d5f..943c2b41fd691be84cfd21330e298951767a8661 100644 (file)
@@ -9,6 +9,11 @@
 #include "deflate_p.h"
 
 #include "functable.h"
+
+#ifdef X86_CPUID
+#  include "fallback_builtins.h"
+#endif
+
 /* insert_string */
 extern Pos insert_string_c(deflate_state *const s, const Pos str, unsigned int count);
 #ifdef X86_SSE42_CRC_HASH
@@ -58,6 +63,16 @@ extern uint32_t crc32_little(uint32_t, const unsigned char *, uint64_t);
 extern uint32_t crc32_big(uint32_t, const unsigned char *, uint64_t);
 #endif
 
+/* compare258 */
+extern int32_t compare258_c(const unsigned char *src0, const unsigned char *src1);
+#ifdef UNALIGNED_OK
+extern int32_t compare258_unaligned_16(const unsigned char *src0, const unsigned char *src1);
+extern int32_t compare258_unaligned_32(const unsigned char *src0, const unsigned char *src1);
+extern int32_t compare258_unaligned_64(const unsigned char *src0, const unsigned char *src1);
+#ifdef X86_SSE42_CMP_STR
+extern int32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1);
+#endif
+#endif
 
 /* stub definitions */
 ZLIB_INTERNAL Pos insert_string_stub(deflate_state *const s, const Pos str, unsigned int count);
@@ -65,6 +80,7 @@ ZLIB_INTERNAL Pos quick_insert_string_stub(deflate_state *const s, const Pos str
 ZLIB_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len);
 ZLIB_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64_t len);
 ZLIB_INTERNAL void slide_hash_stub(deflate_state *s);
+ZLIB_INTERNAL int32_t compare258_stub(const unsigned char *src0, const unsigned char *src1);
 
 /* functable init */
 ZLIB_INTERNAL __thread struct functable_s functable = {
@@ -72,7 +88,8 @@ ZLIB_INTERNAL __thread struct functable_s functable = {
     quick_insert_string_stub,
     adler32_stub,
     crc32_stub,
-    slide_hash_stub
+    slide_hash_stub,
+    compare258_stub
 };
 
 ZLIB_INTERNAL void cpu_check_features(void)
@@ -189,3 +206,25 @@ ZLIB_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64
 
     return functable.crc32(crc, buf, len);
 }
+
+ZLIB_INTERNAL int32_t compare258_stub(const unsigned char *src0, const unsigned char *src1) {
+
+    functable.compare258 = &compare258_c;
+
+#ifdef UNALIGNED_OK
+#  ifdef HAVE_BUILTIN_CTZLL
+    functable.compare258 = &compare258_unaligned_64;
+#  elif defined(HAVE_BUILTIN_CTZ)
+    functable.compare258 = &compare258_unaligned_32;
+#  else
+    functable.compare258 = &compare258_unaligned_16;
+#  endif
+#  ifdef X86_SSE42_CMP_STR
+    if (x86_cpu_has_sse42)
+        functable.compare258 = &compare258_unaligned_sse4;
+#  endif
+#endif
+
+    return functable.compare258(src0, src1);
+}
+
index a03c1e40c39af0d82b7bf10d0aa08b47f7c7cc92..42881d5b826008b44c437d45fbc85ad9c8f114ac 100644 (file)
@@ -14,6 +14,7 @@ struct functable_s {
     uint32_t (* adler32)            (uint32_t adler, const unsigned char *buf, size_t len);
     uint32_t (* crc32)              (uint32_t crc, const unsigned char *buf, uint64_t len);
     void     (* slide_hash)         (deflate_state *s);
+    int32_t  (* compare258)         (const unsigned char *src0, const unsigned char *src1);
 };
 
 ZLIB_INTERNAL extern __thread struct functable_s functable;
index 9f9cd31dd8ce0bfec4f880158589dbb5a1aee78d..abb9808b92d2b0d98340ebb9d16cd04056d711c3 100644 (file)
@@ -34,8 +34,8 @@ WITH_GZFILEOP =
 ZLIB_COMPAT =
 SUFFIX =
 
-OBJS = adler32.obj compress.obj crc32.obj deflate.obj deflate_fast.obj deflate_quick.obj deflate_slow.obj \
-       deflate_medium.obj \
+OBJS = adler32.obj compare258.obj compare258_sse.obj compress.obj crc32.obj \
+       deflate.obj deflate_fast.obj deflate_quick.obj deflate_slow.obj deflate_medium.obj \
        functable.obj infback.obj inflate.obj inftrees.obj inffast.obj insert_string.obj \
        slide_avx.obj slide_sse.obj trees.obj uncompr.obj zutil.obj \
        x86.obj insert_string_sse.obj crc_folding.obj