]> git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Axe the SSE4 compare256 functions
author: Adam Stylinski <kungfujesus06@gmail.com>
Sun, 23 Jan 2022 05:18:17 +0000 (00:18 -0500)
committer: Hans Kristian Rosbach <hk-github@circlestorm.org>
Fri, 11 Feb 2022 08:56:19 +0000 (09:56 +0100)
CMakeLists.txt
README.md
arch/x86/Makefile.in
arch/x86/compare256_sse2.c
arch/x86/compare256_sse42.c [deleted file]
configure
cpu_features.h
functable.c
test/benchmarks/benchmark_compare256.cc
win32/Makefile.msc

index 8d340087fbb01d37947c1f8212f55adc02bed28d..000f3ad9bd3d858a5a4ca3bf31339b8022c49698 100644 (file)
@@ -787,13 +787,6 @@ if(WITH_OPTIM)
                     add_definitions(-DX86_SSE42_CRC_INTRIN)
                 endif()
             endif()
-            if(HAVE_SSE42CMPSTR_INTRIN)
-                add_definitions(-DX86_SSE42_CMP_STR)
-                set(SSE42_SRCS ${ARCHDIR}/compare256_sse42.c)
-                add_feature_info(SSE42_COMPARE256 1 "Support SSE4.2 optimized compare256, using \"${SSE42FLAG}\"")
-                list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS})
-                set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${NOLTOFLAG}")
-            endif()
             if(NOT HAVE_SSE42CRC_INLINE_ASM AND NOT HAVE_SSE42CRC_INTRIN AND NOT HAVE_SSE42CMPSTR_INTRIN)
                 set(WITH_SSE4 OFF)
             endif()
index d70ac82729dc729bd2bde2ba1ab10fb4bbca15ba..3c4fd00b6459bf750323fc74f84a09dbc892ffef 100644 (file)
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ Features
   * CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
   * Hash table implementation using CRC32-C intrinsics on x86 and ARM
   * Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX
-  * Compare256 implementations using SSE4.2 & AVX2
+  * Compare256 implementations using SSE2 & AVX2
   * Inflate chunk copying using SSE2, AVX, Neon & VSX
   * Support for hardware-accelerated deflate using IBM Z DFLTCC
 * Unaligned memory read/writes and large bit buffer improvements
index 389fc2f328689fa73217448c59399fe264ce9f95..2b90e2ad83fc2c47db56dc73e8fd0164562e8608 100644 (file)
@@ -34,7 +34,6 @@ all: \
        chunkset_sse2.o chunkset_sse2.lo \
        compare256_avx2.o compare256_avx2.lo \
        compare256_sse2.o compare256_sse2.lo \
-       compare256_sse42.o compare256_sse42.lo \
        insert_string_sse42.o insert_string_sse42.lo \
        crc32_fold_pclmulqdq.o crc32_fold_pclmulqdq.lo \
        crc32_fold_vpclmulqdq.o crc32_fold_vpclmulqdq.lo \
@@ -71,12 +70,6 @@ compare256_sse2.o:
 compare256_sse2.lo:
        $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
 
-compare256_sse42.o:
-       $(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse42.c
-
-compare256_sse42.lo:
-       $(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse42.c
-
 insert_string_sse42.o:
        $(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c
 
index 44d893d982e826a7393c90fba3720df4fb517849..bd5d62cf7b391e1452cf30aaa3148e227368e2dd 100644 (file)
@@ -4,7 +4,6 @@
  */
 
 #include "../../zbuild.h"
-#include "../../zutil.h"
 
 #include "fallback_builtins.h"
 
diff --git a/arch/x86/compare256_sse42.c b/arch/x86/compare256_sse42.c
deleted file mode 100644 (file)
index 37a847c..0000000
+++ /dev/null
@@ -1,71 +0,0 @@
-/* compare256_sse42.c -- SSE4.2 version of compare256
- *
- * Copyright (C) 2013 Intel Corporation. All rights reserved.
- * Authors:
- *  Wajdi Feghali   <wajdi.k.feghali@intel.com>
- *  Jim Guilford    <james.guilford@intel.com>
- *  Vinodh Gopal    <vinodh.gopal@intel.com>
- *     Erdinc Ozturk   <erdinc.ozturk@intel.com>
- *  Jim Kukunas     <james.t.kukunas@linux.intel.com>
- *
- * Portions are Copyright (C) 2016 12Sided Technology, LLC.
- * Author:
- *  Phil Vachon     <pvachon@12sidedtech.com>
- *
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#include "../../zbuild.h"
-
-#ifdef X86_SSE42_CMP_STR
-
-#include <immintrin.h>
-#ifdef _MSC_VER
-#  include <nmmintrin.h>
-#endif
-
-/* UNALIGNED_OK, SSE4.2 intrinsic comparison */
-static inline uint32_t compare256_unaligned_sse4_static(const uint8_t *src0, const uint8_t *src1) {
-    uint32_t len = 0;
-
-    do {
-        #define cmp_mode _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY
-        __m128i xmm_src0, xmm_src1;
-        uint32_t ret;
-
-        xmm_src0 = _mm_loadu_si128((__m128i *)src0);
-        xmm_src1 = _mm_loadu_si128((__m128i *)src1);
-        ret = (uint32_t)_mm_cmpestri(xmm_src0, 16, xmm_src1, 16, cmp_mode);
-        if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, cmp_mode)) {
-            return len + ret;
-        }
-        src0 += 16, src1 += 16, len += 16;
-
-        xmm_src0 = _mm_loadu_si128((__m128i *)src0);
-        xmm_src1 = _mm_loadu_si128((__m128i *)src1);
-        ret = (uint32_t)_mm_cmpestri(xmm_src0, 16, xmm_src1, 16, cmp_mode);
-        if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, cmp_mode)) {
-            return len + ret;
-        }
-        src0 += 16, src1 += 16, len += 16;
-    } while (len < 256);
-
-    return 256;
-}
-
-Z_INTERNAL uint32_t compare256_unaligned_sse4(const uint8_t *src0, const uint8_t *src1) {
-    return compare256_unaligned_sse4_static(src0, src1);
-}
-
-#define LONGEST_MATCH       longest_match_unaligned_sse4
-#define COMPARE256          compare256_unaligned_sse4_static
-
-#include "match_tpl.h"
-
-#define LONGEST_MATCH_SLOW
-#define LONGEST_MATCH       longest_match_slow_unaligned_sse4
-#define COMPARE256          compare256_unaligned_sse4_static
-
-#include "match_tpl.h"
-
-#endif
index 3ea2fe65e9908e37a0a8d98f1d3aec7b94875d3b..328e1821d85b639f609a136f5787f3d51daa4c48 100755 (executable)
--- a/configure
+++ b/configure
@@ -1540,14 +1540,6 @@ case "${ARCH}" in
                 ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} insert_string_sse42.lo"
             fi
 
-            if test ${HAVE_SSE42CMPSTR_INTRIN} -eq 1; then
-                CFLAGS="${CFLAGS} -DX86_SSE42_CMP_STR"
-                SFLAGS="${SFLAGS} -DX86_SSE42_CMP_STR"
-
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare256_sse42.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare256_sse42.lo"
-            fi
-
             check_sse2_intrinsics
 
             if test ${HAVE_SSE2_INTRIN} -eq 1; then
index 1f25433660de20eab1b7c58e006f44c18a83f11a..c0223ae1594929bd42c3d25737ce31ec68ebc7c9 100644 (file)
@@ -120,10 +120,7 @@ extern uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1
 extern uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1);
 #endif
 #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
-extern uint32_t compare256_unaligned_sse2(const unsigned char *src0, const unsigned char *src1);
-#endif
-#ifdef X86_SSE42_CMP_STR
-extern uint32_t compare256_unaligned_sse4(const uint8_t *src0, const uint8_t *src1);
+extern uint32_t compare256_unaligned_sse2(const uint8_t *src0, const uint8_t *src1);
 #endif
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
 extern uint32_t compare256_unaligned_avx2(const uint8_t *src0, const uint8_t *src1);
@@ -150,9 +147,6 @@ extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match
 #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
 extern uint32_t longest_match_unaligned_sse2(deflate_state *const s, Pos cur_match);
 #endif
-#ifdef X86_SSE42_CMP_STR
-extern uint32_t longest_match_unaligned_sse4(deflate_state *const s, Pos cur_match);
-#endif
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
 extern uint32_t longest_match_unaligned_avx2(deflate_state *const s, Pos cur_match);
 #endif
@@ -169,9 +163,6 @@ extern uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_
 #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
 extern uint32_t longest_match_slow_unaligned_sse2(deflate_state *const s, Pos cur_match);
 #endif
-#ifdef X86_SSE42_CMP_STR
-extern uint32_t longest_match_slow_unaligned_sse4(deflate_state *const s, Pos cur_match);
-#endif
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
 extern uint32_t longest_match_slow_unaligned_avx2(deflate_state *const s, Pos cur_match);
 #endif
index 78866a7922e27f19147e3318f599a0ea5f664532..c84f55e42b5aadad05b75e29afff87617e391803 100644 (file)
@@ -110,10 +110,6 @@ Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) {
     if (x86_cpu_has_sse2)
         functable.longest_match = &longest_match_unaligned_sse2;
 #  endif
-#  ifdef X86_SSE42_CMP_STR
-    if (x86_cpu_has_sse42)
-        functable.longest_match = &longest_match_unaligned_sse4;
-#  endif
 #  if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
     if (x86_cpu_has_avx2)
         functable.longest_match = &longest_match_unaligned_avx2;
@@ -139,10 +135,6 @@ Z_INTERNAL uint32_t longest_match_slow_stub(deflate_state *const s, Pos cur_matc
     if (x86_cpu_has_sse2)
         functable.longest_match = &longest_match_slow_unaligned_sse2;
 #  endif
-#  ifdef X86_SSE42_CMP_STR
-    if (x86_cpu_has_sse42)
-        functable.longest_match_slow = &longest_match_slow_unaligned_sse4;
-#  endif
 #  if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
     if (x86_cpu_has_avx2)
         functable.longest_match_slow = &longest_match_slow_unaligned_avx2;
@@ -420,10 +412,6 @@ Z_INTERNAL uint32_t compare256_stub(const uint8_t *src0, const uint8_t *src1) {
     if (x86_cpu_has_sse2)
         functable.compare256 = &compare256_unaligned_sse2;
 #  endif
-#  ifdef X86_SSE42_CMP_STR
-    if (x86_cpu_has_sse42)
-        functable.compare256 = &compare256_unaligned_sse4;
-#  endif
 #  if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
     if (x86_cpu_has_avx2)
         functable.compare256 = &compare256_unaligned_avx2;
index 010453497a66799d5a607697f14bf93c9a3872af..5a8a98b2ddfa9f98a1e86ff8dcbf73475c34b9b6 100644 (file)
@@ -75,9 +75,6 @@ BENCHMARK_COMPARE256(unaligned_64, compare256_unaligned_64, 1);
 #ifdef X86_SSE2
 BENCHMARK_COMPARE256(unaligned_sse2, compare256_unaligned_sse2, x86_cpu_has_sse2);
 #endif
-#ifdef X86_SSE42_CMP_STR
-BENCHMARK_COMPARE256(unaligned_sse4, compare256_unaligned_sse4, x86_cpu_has_sse42);
-#endif
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
 BENCHMARK_COMPARE256(unaligned_avx2, compare256_unaligned_avx2, x86_cpu_has_avx2);
 #endif
index 56b97dbc8eac47ea6d77b3890e3420de98fa869b..3d8f1b2e3ed97bd42d336952147e6c9c0bd71f8c 100644 (file)
@@ -55,7 +55,7 @@ OBJS = \
        chunkset_sse2.obj \
        compare256.obj \
        compare256_avx2.obj \
-       compare256_sse42.obj \
+       compare256_sse2.obj \
        compress.obj \
        cpu_features.obj \
        crc32.obj \