From: Adam Stylinski
Date: Sun, 23 Jan 2022 05:18:17 +0000 (-0500)
Subject: Axe the SSE4 compare256 functions
X-Git-Tag: 2.1.0-beta1~376
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b3260fd0c82c8210668399d53c3277b01cb18a07;p=thirdparty%2Fzlib-ng.git

Axe the SSE4 compare256 functions
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8d340087..000f3ad9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -787,13 +787,6 @@ if(WITH_OPTIM)
             add_definitions(-DX86_SSE42_CRC_INTRIN)
         endif()
     endif()
-    if(HAVE_SSE42CMPSTR_INTRIN)
-        add_definitions(-DX86_SSE42_CMP_STR)
-        set(SSE42_SRCS ${ARCHDIR}/compare256_sse42.c)
-        add_feature_info(SSE42_COMPARE256 1 "Support SSE4.2 optimized compare256, using \"${SSE42FLAG}\"")
-        list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS})
-        set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${NOLTOFLAG}")
-    endif()
     if(NOT HAVE_SSE42CRC_INLINE_ASM AND NOT HAVE_SSE42CRC_INTRIN AND NOT HAVE_SSE42CMPSTR_INTRIN)
         set(WITH_SSE4 OFF)
     endif()
diff --git a/README.md b/README.md
index d70ac827..3c4fd00b 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ Features
  * CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
  * Hash table implementation using CRC32-C intrinsics on x86 and ARM
  * Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX
- * Compare256 implementations using SSE4.2 & AVX2
+ * Compare256 implementations using SSE2 & AVX2
  * Inflate chunk copying using SSE2, AVX, Neon & VSX
  * Support for hardware-accelerated deflate using IBM Z DFLTCC
  * Unaligned memory read/writes and large bit buffer improvements
diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in
index 389fc2f3..2b90e2ad 100644
--- a/arch/x86/Makefile.in
+++ b/arch/x86/Makefile.in
@@ -34,7 +34,6 @@ all: \
 	chunkset_sse2.o chunkset_sse2.lo \
 	compare256_avx2.o compare256_avx2.lo \
 	compare256_sse2.o compare256_sse2.lo \
-	compare256_sse42.o compare256_sse42.lo \
 	insert_string_sse42.o insert_string_sse42.lo \
 	crc32_fold_pclmulqdq.o crc32_fold_pclmulqdq.lo \
 	crc32_fold_vpclmulqdq.o crc32_fold_vpclmulqdq.lo \
@@ -71,12 +70,6 @@ compare256_sse2.o:
 compare256_sse2.lo:
 	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
 
-compare256_sse42.o:
-	$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse42.c
-
-compare256_sse42.lo:
-	$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse42.c
-
 insert_string_sse42.o:
 	$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c
diff --git a/arch/x86/compare256_sse2.c b/arch/x86/compare256_sse2.c
index 44d893d9..bd5d62cf 100644
--- a/arch/x86/compare256_sse2.c
+++ b/arch/x86/compare256_sse2.c
@@ -4,7 +4,6 @@
  */
 
 #include "../../zbuild.h"
-#include "../../zutil.h"
 
 #include "fallback_builtins.h"
 
diff --git a/arch/x86/compare256_sse42.c b/arch/x86/compare256_sse42.c
deleted file mode 100644
index 37a847c4..00000000
--- a/arch/x86/compare256_sse42.c
+++ /dev/null
@@ -1,71 +0,0 @@
-/* compare256_sse42.c -- SSE4.2 version of compare256
- *
- * Copyright (C) 2013 Intel Corporation. All rights reserved.
- * Authors:
- *  Wajdi Feghali
- *  Jim Guilford
- *  Vinodh Gopal
- *  Erdinc Ozturk
- *  Jim Kukunas
- *
- * Portions are Copyright (C) 2016 12Sided Technology, LLC.
- * Author:
- *  Phil Vachon
- *
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#include "../../zbuild.h"
-
-#ifdef X86_SSE42_CMP_STR
-
-#include <immintrin.h>
-#ifdef _MSC_VER
-#  include <nmmintrin.h>
-#endif
-
-/* UNALIGNED_OK, SSE4.2 intrinsic comparison */
-static inline uint32_t compare256_unaligned_sse4_static(const uint8_t *src0, const uint8_t *src1) {
-    uint32_t len = 0;
-
-    do {
-        #define cmp_mode _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY
-        __m128i xmm_src0, xmm_src1;
-        uint32_t ret;
-
-        xmm_src0 = _mm_loadu_si128((__m128i *)src0);
-        xmm_src1 = _mm_loadu_si128((__m128i *)src1);
-        ret = (uint32_t)_mm_cmpestri(xmm_src0, 16, xmm_src1, 16, cmp_mode);
-        if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, cmp_mode)) {
-            return len + ret;
-        }
-        src0 += 16, src1 += 16, len += 16;
-
-        xmm_src0 = _mm_loadu_si128((__m128i *)src0);
-        xmm_src1 = _mm_loadu_si128((__m128i *)src1);
-        ret = (uint32_t)_mm_cmpestri(xmm_src0, 16, xmm_src1, 16, cmp_mode);
-        if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, cmp_mode)) {
-            return len + ret;
-        }
-        src0 += 16, src1 += 16, len += 16;
-    } while (len < 256);
-
-    return 256;
-}
-
-Z_INTERNAL uint32_t compare256_unaligned_sse4(const uint8_t *src0, const uint8_t *src1) {
-    return compare256_unaligned_sse4_static(src0, src1);
-}
-
-#define LONGEST_MATCH longest_match_unaligned_sse4
-#define COMPARE256 compare256_unaligned_sse4_static
-
-#include "match_tpl.h"
-
-#define LONGEST_MATCH_SLOW
-#define LONGEST_MATCH longest_match_slow_unaligned_sse4
-#define COMPARE256 compare256_unaligned_sse4_static
-
-#include "match_tpl.h"
-
-#endif
diff --git a/configure b/configure
index 3ea2fe65..328e1821 100755
--- a/configure
+++ b/configure
@@ -1540,14 +1540,6 @@ case "${ARCH}" in
             ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} insert_string_sse42.lo"
         fi
 
-        if test ${HAVE_SSE42CMPSTR_INTRIN} -eq 1; then
-            CFLAGS="${CFLAGS} -DX86_SSE42_CMP_STR"
-            SFLAGS="${SFLAGS} -DX86_SSE42_CMP_STR"
-
-            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare256_sse42.o"
-            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare256_sse42.lo"
-        fi
-
         check_sse2_intrinsics
 
         if test ${HAVE_SSE2_INTRIN} -eq 1; then
diff --git a/cpu_features.h b/cpu_features.h
index 1f254336..c0223ae1 100644
--- a/cpu_features.h
+++ b/cpu_features.h
@@ -120,10 +120,7 @@ extern uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1
 extern uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1);
 #endif
 #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
-extern uint32_t compare256_unaligned_sse2(const unsigned char *src0, const unsigned char *src1);
-#endif
-#ifdef X86_SSE42_CMP_STR
-extern uint32_t compare256_unaligned_sse4(const uint8_t *src0, const uint8_t *src1);
+extern uint32_t compare256_unaligned_sse2(const uint8_t *src0, const uint8_t *src1);
 #endif
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
 extern uint32_t compare256_unaligned_avx2(const uint8_t *src0, const uint8_t *src1);
@@ -150,9 +147,6 @@ extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match
 #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
 extern uint32_t longest_match_unaligned_sse2(deflate_state *const s, Pos cur_match);
 #endif
-#ifdef X86_SSE42_CMP_STR
-extern uint32_t longest_match_unaligned_sse4(deflate_state *const s, Pos cur_match);
-#endif
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
 extern uint32_t longest_match_unaligned_avx2(deflate_state *const s, Pos cur_match);
 #endif
@@ -169,9 +163,6 @@ extern uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_
 #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
 extern uint32_t longest_match_slow_unaligned_sse2(deflate_state *const s, Pos cur_match);
 #endif
-#ifdef X86_SSE42_CMP_STR
-extern uint32_t longest_match_slow_unaligned_sse4(deflate_state *const s, Pos cur_match);
-#endif
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
 extern uint32_t longest_match_slow_unaligned_avx2(deflate_state *const s, Pos cur_match);
 #endif
diff --git a/functable.c b/functable.c
index 78866a79..c84f55e4 100644
--- a/functable.c
+++ b/functable.c
@@ -110,10 +106,6 @@ Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) {
     if (x86_cpu_has_sse2)
         functable.longest_match = &longest_match_unaligned_sse2;
 #  endif
-#  ifdef X86_SSE42_CMP_STR
-    if (x86_cpu_has_sse42)
-        functable.longest_match = &longest_match_unaligned_sse4;
-#  endif
 #  if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
     if (x86_cpu_has_avx2)
         functable.longest_match = &longest_match_unaligned_avx2;
@@ -139,10 +135,6 @@ Z_INTERNAL uint32_t longest_match_slow_stub(deflate_state *const s, Pos cur_matc
     if (x86_cpu_has_sse2)
         functable.longest_match = &longest_match_slow_unaligned_sse2;
 #  endif
-#  ifdef X86_SSE42_CMP_STR
-    if (x86_cpu_has_sse42)
-        functable.longest_match_slow = &longest_match_slow_unaligned_sse4;
-#  endif
 #  if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
     if (x86_cpu_has_avx2)
         functable.longest_match_slow = &longest_match_slow_unaligned_avx2;
@@ -420,10 +412,6 @@ Z_INTERNAL uint32_t compare256_stub(const uint8_t *src0, const uint8_t *src1) {
     if (x86_cpu_has_sse2)
         functable.compare256 = &compare256_unaligned_sse2;
 #  endif
-#  ifdef X86_SSE42_CMP_STR
-    if (x86_cpu_has_sse42)
-        functable.compare256 = &compare256_unaligned_sse4;
-#  endif
 #  if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
     if (x86_cpu_has_avx2)
         functable.compare256 = &compare256_unaligned_avx2;
diff --git a/test/benchmarks/benchmark_compare256.cc b/test/benchmarks/benchmark_compare256.cc
index 01045349..5a8a98b2 100644
--- a/test/benchmarks/benchmark_compare256.cc
+++ b/test/benchmarks/benchmark_compare256.cc
@@ -75,9 +75,6 @@ BENCHMARK_COMPARE256(unaligned_64, compare256_unaligned_64, 1);
 #ifdef X86_SSE2
 BENCHMARK_COMPARE256(unaligned_sse2, compare256_unaligned_sse2, x86_cpu_has_sse2);
 #endif
-#ifdef X86_SSE42_CMP_STR
-BENCHMARK_COMPARE256(unaligned_sse4, compare256_unaligned_sse4, x86_cpu_has_sse42);
-#endif
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
 BENCHMARK_COMPARE256(unaligned_avx2, compare256_unaligned_avx2, x86_cpu_has_avx2);
 #endif
diff --git a/win32/Makefile.msc b/win32/Makefile.msc
index 56b97dbc..3d8f1b2e 100644
--- a/win32/Makefile.msc
+++ b/win32/Makefile.msc
@@ -55,7 +55,7 @@ OBJS = \
 	chunkset_sse2.obj \
 	compare256.obj \
 	compare256_avx2.obj \
-	compare256_sse42.obj \
+	compare256_sse2.obj \
 	compress.obj \
 	cpu_features.obj \
 	crc32.obj \
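
Note on the replacement path: the SSE2 compare256 kept in arch/x86/compare256_sse2.c
gets by without SSE4.2 string-compare instructions entirely. A byte-equality compare
(pcmpeqb) plus a mask extraction (pmovmskb) and a trailing-zero count locate the
first mismatching byte using cheap single-uop instructions, whereas the
pcmpestri/pcmpestrc pair deleted above is multi-uop and high-latency on most x86
cores. The sketch below is illustrative only (it is not the tree's
compare256_sse2.c, and the function name is hypothetical); it assumes a
GCC/Clang-style __builtin_ctz, for which zlib-ng supplies an MSVC fallback via
fallback_builtins.h:

#include <emmintrin.h> /* SSE2: _mm_loadu_si128, _mm_cmpeq_epi8, _mm_movemask_epi8 */
#include <stdint.h>

/* Return the length (0..256) of the common prefix of two 256-byte buffers. */
static inline uint32_t compare256_sse2_sketch(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;

    do {
        __m128i a = _mm_loadu_si128((const __m128i *)src0);
        __m128i b = _mm_loadu_si128((const __m128i *)src1);
        /* eq holds 0xFF in every byte lane where a and b agree. */
        __m128i eq = _mm_cmpeq_epi8(a, b);
        /* One bit per lane; 0xFFFF means all 16 bytes matched. */
        unsigned mask = (unsigned)_mm_movemask_epi8(eq);

        if (mask != 0xFFFF) {
            /* First zero bit in mask is the offset of the first mismatch. */
            return len + (uint32_t)__builtin_ctz(~mask);
        }
        src0 += 16, src1 += 16, len += 16;
    } while (len < 256);

    return 256;
}

With the SSE4.2 variant gone, the dispatch in functable.c simply falls through from
the SSE2 implementation to AVX2 on CPUs that support it.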