From: Adam Stylinski
Date: Sun, 23 Jan 2022 05:18:17 +0000 (-0500)
Subject: Axe the SSE4 compare256 functions
X-Git-Tag: 2.1.0-beta1~376
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b3260fd0c82c8210668399d53c3277b01cb18a07;p=thirdparty%2Fzlib-ng.git

Axe the SSE4 compare256 functions
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8d340087..000f3ad9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -787,13 +787,6 @@ if(WITH_OPTIM)
             add_definitions(-DX86_SSE42_CRC_INTRIN)
         endif()
     endif()
-    if(HAVE_SSE42CMPSTR_INTRIN)
-        add_definitions(-DX86_SSE42_CMP_STR)
-        set(SSE42_SRCS ${ARCHDIR}/compare256_sse42.c)
-        add_feature_info(SSE42_COMPARE256 1 "Support SSE4.2 optimized compare256, using \"${SSE42FLAG}\"")
-        list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS})
-        set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${NOLTOFLAG}")
-    endif()
     if(NOT HAVE_SSE42CRC_INLINE_ASM AND NOT HAVE_SSE42CRC_INTRIN AND NOT HAVE_SSE42CMPSTR_INTRIN)
         set(WITH_SSE4 OFF)
     endif()
diff --git a/README.md b/README.md
index d70ac827..3c4fd00b 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ Features
  * CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
  * Hash table implementation using CRC32-C intrinsics on x86 and ARM
  * Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX
- * Compare256 implementations using SSE4.2 & AVX2
+ * Compare256 implementations using SSE2 & AVX2
  * Inflate chunk copying using SSE2, AVX, Neon & VSX
  * Support for hardware-accelerated deflate using IBM Z DFLTCC
  * Unaligned memory read/writes and large bit buffer improvements
diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in
index 389fc2f3..2b90e2ad 100644
--- a/arch/x86/Makefile.in
+++ b/arch/x86/Makefile.in
@@ -34,7 +34,6 @@ all: \
 	chunkset_sse2.o chunkset_sse2.lo \
 	compare256_avx2.o compare256_avx2.lo \
 	compare256_sse2.o compare256_sse2.lo \
-	compare256_sse42.o compare256_sse42.lo \
 	insert_string_sse42.o insert_string_sse42.lo \
 	crc32_fold_pclmulqdq.o crc32_fold_pclmulqdq.lo \
 	crc32_fold_vpclmulqdq.o crc32_fold_vpclmulqdq.lo \
@@ -71,12 +70,6 @@ compare256_sse2.o:
 compare256_sse2.lo:
 	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
 
-compare256_sse42.o:
-	$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse42.c
-
-compare256_sse42.lo:
-	$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse42.c
-
 insert_string_sse42.o:
 	$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c
diff --git a/arch/x86/compare256_sse2.c b/arch/x86/compare256_sse2.c
index 44d893d9..bd5d62cf 100644
--- a/arch/x86/compare256_sse2.c
+++ b/arch/x86/compare256_sse2.c
@@ -4,7 +4,6 @@
  */
 
 #include "../../zbuild.h"
-#include "../../zutil.h"
 
 #include "fallback_builtins.h"
 
diff --git a/arch/x86/compare256_sse42.c b/arch/x86/compare256_sse42.c
deleted file mode 100644
index 37a847c4..00000000
--- a/arch/x86/compare256_sse42.c
+++ /dev/null
@@ -1,71 +0,0 @@
-/* compare256_sse42.c -- SSE4.2 version of compare256
- *
- * Copyright (C) 2013 Intel Corporation. All rights reserved.
- * Authors:
- *  Wajdi Feghali
- *  Jim Guilford
- *  Vinodh Gopal
- *  Erdinc Ozturk
- *  Jim Kukunas
- *
- * Portions are Copyright (C) 2016 12Sided Technology, LLC.
- * Author:
- *  Phil Vachon
- *
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#include "../../zbuild.h"
-
-#ifdef X86_SSE42_CMP_STR
-
-#include <immintrin.h>
-#ifdef _MSC_VER
-#  include <nmmintrin.h>
-#endif
-
-/* UNALIGNED_OK, SSE4.2 intrinsic comparison */
-static inline uint32_t compare256_unaligned_sse4_static(const uint8_t *src0, const uint8_t *src1) {
-    uint32_t len = 0;
-
-    do {
-        #define cmp_mode _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY
-        __m128i xmm_src0, xmm_src1;
-        uint32_t ret;
-
-        xmm_src0 = _mm_loadu_si128((__m128i *)src0);
-        xmm_src1 = _mm_loadu_si128((__m128i *)src1);
-        ret = (uint32_t)_mm_cmpestri(xmm_src0, 16, xmm_src1, 16, cmp_mode);
-        if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, cmp_mode)) {
-            return len + ret;
-        }
-        src0 += 16, src1 += 16, len += 16;
-
-        xmm_src0 = _mm_loadu_si128((__m128i *)src0);
-        xmm_src1 = _mm_loadu_si128((__m128i *)src1);
-        ret = (uint32_t)_mm_cmpestri(xmm_src0, 16, xmm_src1, 16, cmp_mode);
-        if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, cmp_mode)) {
-            return len + ret;
-        }
-        src0 += 16, src1 += 16, len += 16;
-    } while (len < 256);
-
-    return 256;
-}
-
-Z_INTERNAL uint32_t compare256_unaligned_sse4(const uint8_t *src0, const uint8_t *src1) {
-    return compare256_unaligned_sse4_static(src0, src1);
-}
-
-#define LONGEST_MATCH longest_match_unaligned_sse4
-#define COMPARE256 compare256_unaligned_sse4_static
-
-#include "match_tpl.h"
-
-#define LONGEST_MATCH_SLOW
-#define LONGEST_MATCH longest_match_slow_unaligned_sse4
-#define COMPARE256 compare256_unaligned_sse4_static
-
-#include "match_tpl.h"
-
-#endif
diff --git a/configure b/configure
index 3ea2fe65..328e1821 100755
--- a/configure
+++ b/configure
@@ -1540,14 +1540,6 @@ case "${ARCH}" in
             ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} insert_string_sse42.lo"
         fi
 
-        if test ${HAVE_SSE42CMPSTR_INTRIN} -eq 1; then
-            CFLAGS="${CFLAGS} -DX86_SSE42_CMP_STR"
-            SFLAGS="${SFLAGS} -DX86_SSE42_CMP_STR"
-
-            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare256_sse42.o"
-            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare256_sse42.lo"
-        fi
-
         check_sse2_intrinsics
 
         if test ${HAVE_SSE2_INTRIN} -eq 1; then
diff --git a/cpu_features.h b/cpu_features.h
index 1f254336..c0223ae1 100644
--- a/cpu_features.h
+++ b/cpu_features.h
@@ -120,10 +120,7 @@ extern uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1
 extern uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1);
 #endif
 #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
-extern uint32_t compare256_unaligned_sse2(const unsigned char *src0, const unsigned char *src1);
-#endif
-#ifdef X86_SSE42_CMP_STR
-extern uint32_t compare256_unaligned_sse4(const uint8_t *src0, const uint8_t *src1);
+extern uint32_t compare256_unaligned_sse2(const uint8_t *src0, const uint8_t *src1);
 #endif
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
 extern uint32_t compare256_unaligned_avx2(const uint8_t *src0, const uint8_t *src1);
@@ -150,9 +147,6 @@ extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match
 #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
 extern uint32_t longest_match_unaligned_sse2(deflate_state *const s, Pos cur_match);
 #endif
-#ifdef X86_SSE42_CMP_STR
-extern uint32_t longest_match_unaligned_sse4(deflate_state *const s, Pos cur_match);
-#endif
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
 extern uint32_t longest_match_unaligned_avx2(deflate_state *const s, Pos cur_match);
 #endif
@@ -169,9 +163,6 @@ extern uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_
 #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
 extern uint32_t longest_match_slow_unaligned_sse2(deflate_state *const s, Pos cur_match);
 #endif
-#ifdef X86_SSE42_CMP_STR
-extern uint32_t longest_match_slow_unaligned_sse4(deflate_state *const s, Pos cur_match);
-#endif
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
 extern uint32_t longest_match_slow_unaligned_avx2(deflate_state *const s, Pos cur_match);
 #endif
diff --git a/functable.c b/functable.c
index 78866a79..c84f55e4 100644
--- a/functable.c
+++ b/functable.c
@@ -110,10 +106,6 @@ Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) {
     if (x86_cpu_has_sse2)
         functable.longest_match = &longest_match_unaligned_sse2;
 #  endif
-#  ifdef X86_SSE42_CMP_STR
-    if (x86_cpu_has_sse42)
-        functable.longest_match = &longest_match_unaligned_sse4;
-#  endif
 #  if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
     if (x86_cpu_has_avx2)
         functable.longest_match = &longest_match_unaligned_avx2;
@@ -139,10 +135,6 @@ Z_INTERNAL uint32_t longest_match_slow_stub(deflate_state *const s, Pos cur_matc
     if (x86_cpu_has_sse2)
         functable.longest_match = &longest_match_slow_unaligned_sse2;
 #  endif
-#  ifdef X86_SSE42_CMP_STR
-    if (x86_cpu_has_sse42)
-        functable.longest_match_slow = &longest_match_slow_unaligned_sse4;
-#  endif
 #  if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
     if (x86_cpu_has_avx2)
         functable.longest_match_slow = &longest_match_slow_unaligned_avx2;
@@ -420,10 +412,6 @@ Z_INTERNAL uint32_t compare256_stub(const uint8_t *src0, const uint8_t *src1) {
     if (x86_cpu_has_sse2)
         functable.compare256 = &compare256_unaligned_sse2;
 #  endif
-#  ifdef X86_SSE42_CMP_STR
-    if (x86_cpu_has_sse42)
-        functable.compare256 = &compare256_unaligned_sse4;
-#  endif
 #  if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
     if (x86_cpu_has_avx2)
         functable.compare256 = &compare256_unaligned_avx2;
diff --git a/test/benchmarks/benchmark_compare256.cc b/test/benchmarks/benchmark_compare256.cc
index 01045349..5a8a98b2 100644
--- a/test/benchmarks/benchmark_compare256.cc
+++ b/test/benchmarks/benchmark_compare256.cc
@@ -75,9 +75,6 @@ BENCHMARK_COMPARE256(unaligned_64, compare256_unaligned_64, 1);
 #ifdef X86_SSE2
 BENCHMARK_COMPARE256(unaligned_sse2, compare256_unaligned_sse2, x86_cpu_has_sse2);
 #endif
-#ifdef X86_SSE42_CMP_STR
-BENCHMARK_COMPARE256(unaligned_sse4, compare256_unaligned_sse4, x86_cpu_has_sse42);
-#endif
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
 BENCHMARK_COMPARE256(unaligned_avx2, compare256_unaligned_avx2, x86_cpu_has_avx2);
 #endif
diff --git a/win32/Makefile.msc b/win32/Makefile.msc
index 56b97dbc..3d8f1b2e 100644
--- a/win32/Makefile.msc
+++ b/win32/Makefile.msc
@@ -55,7 +55,7 @@ OBJS = \
 	chunkset_sse2.obj \
 	compare256.obj \
 	compare256_avx2.obj \
-	compare256_sse42.obj \
+	compare256_sse2.obj \
 	compress.obj \
 	cpu_features.obj \
 	crc32.obj \
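
Note on the replacement path: the SSE2 compare256 kept in arch/x86/compare256_sse2.c
gets by without SSE4.2 string-compare instructions entirely. A byte-equality compare
(pcmpeqb) plus a mask extraction (pmovmskb) and a trailing-zero count locate the
first mismatching byte using cheap single-uop instructions, whereas the
pcmpestri/pcmpestrc pair deleted above is multi-uop and high-latency on most x86
cores. The sketch below is illustrative only (it is not the tree's
compare256_sse2.c, and the function name is hypothetical); it assumes a
GCC/Clang-style __builtin_ctz, for which zlib-ng supplies an MSVC fallback via
fallback_builtins.h:

#include <emmintrin.h> /* SSE2: _mm_loadu_si128, _mm_cmpeq_epi8, _mm_movemask_epi8 */
#include <stdint.h>

/* Return the length (0..256) of the common prefix of two 256-byte buffers. */
static inline uint32_t compare256_sse2_sketch(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;

    do {
        __m128i a = _mm_loadu_si128((const __m128i *)src0);
        __m128i b = _mm_loadu_si128((const __m128i *)src1);
        /* eq holds 0xFF in every byte lane where a and b agree. */
        __m128i eq = _mm_cmpeq_epi8(a, b);
        /* One bit per lane; 0xFFFF means all 16 bytes matched. */
        unsigned mask = (unsigned)_mm_movemask_epi8(eq);

        if (mask != 0xFFFF) {
            /* First zero bit in mask is the offset of the first mismatch. */
            return len + (uint32_t)__builtin_ctz(~mask);
        }
        src0 += 16, src1 += 16, len += 16;
    } while (len < 256);

    return 256;
}

With the SSE4.2 variant gone, the dispatch in functable.c simply falls through from
the SSE2 implementation to AVX2 on CPUs that support it.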