add_definitions(-DX86_SSE42_CRC_INTRIN)
endif()
endif()
- if(HAVE_SSE42CMPSTR_INTRIN)
- add_definitions(-DX86_SSE42_CMP_STR)
- set(SSE42_SRCS ${ARCHDIR}/compare256_sse42.c)
- add_feature_info(SSE42_COMPARE256 1 "Support SSE4.2 optimized compare256, using \"${SSE42FLAG}\"")
- list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS})
- set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${NOLTOFLAG}")
- endif()
if(NOT HAVE_SSE42CRC_INLINE_ASM AND NOT HAVE_SSE42CRC_INTRIN AND NOT HAVE_SSE42CMPSTR_INTRIN)
set(WITH_SSE4 OFF)
endif()
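For context, the HAVE_SSE42CMPSTR_INTRIN check referenced above probes whether the toolchain supports the SSE4.2 string-compare intrinsics. A hypothetical probe along these lines (the exact snippet the build system compiles may differ) would be:

    /* Probe for SSE4.2 PCMPESTRI/PCMPESTRC support: compiles only if the
     * compiler knows the _mm_cmpestr* intrinsics and _SIDD_* mode flags. */
    #include <immintrin.h>

    int main(void) {
        __m128i a = _mm_setzero_si128();
        __m128i b = _mm_setzero_si128();
        return _mm_cmpestrc(a, 16, b, 16,
                            _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY);
    }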
* CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
* Hash table implementation using CRC32-C intrinsics on x86 and ARM
* Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX
- * Compare256 implementations using SSE4.2 & AVX2
+ * Compare256 implementations using SSE2 & AVX2
* Inflate chunk copying using SSE2, AVX, Neon & VSX
* Support for hardware-accelerated deflate using IBM Z DFLTCC
* Unaligned memory read/writes and large bit buffer improvements
chunkset_sse2.o chunkset_sse2.lo \
compare256_avx2.o compare256_avx2.lo \
compare256_sse2.o compare256_sse2.lo \
- compare256_sse42.o compare256_sse42.lo \
insert_string_sse42.o insert_string_sse42.lo \
crc32_fold_pclmulqdq.o crc32_fold_pclmulqdq.lo \
crc32_fold_vpclmulqdq.o crc32_fold_vpclmulqdq.lo \
compare256_sse2.lo:
$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
-compare256_sse42.o:
- $(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse42.c
-
-compare256_sse42.lo:
- $(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse42.c
-
insert_string_sse42.o:
$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c
*/
#include "../../zbuild.h"
-#include "../../zutil.h"
#include "fallback_builtins.h"
+++ /dev/null
-/* compare256_sse42.c -- SSE4.2 version of compare256
- *
- * Copyright (C) 2013 Intel Corporation. All rights reserved.
- * Authors:
- * Wajdi Feghali <wajdi.k.feghali@intel.com>
- * Jim Guilford <james.guilford@intel.com>
- * Vinodh Gopal <vinodh.gopal@intel.com>
- * Erdinc Ozturk <erdinc.ozturk@intel.com>
- * Jim Kukunas <james.t.kukunas@linux.intel.com>
- *
- * Portions are Copyright (C) 2016 12Sided Technology, LLC.
- * Author:
- * Phil Vachon <pvachon@12sidedtech.com>
- *
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#include "../../zbuild.h"
-
-#ifdef X86_SSE42_CMP_STR
-
-#include <immintrin.h>
-#ifdef _MSC_VER
-# include <nmmintrin.h>
-#endif
-
-/* UNALIGNED_OK, SSE4.2 intrinsic comparison */
-static inline uint32_t compare256_unaligned_sse4_static(const uint8_t *src0, const uint8_t *src1) {
- uint32_t len = 0;
-
- do {
- #define cmp_mode _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY
- __m128i xmm_src0, xmm_src1;
- uint32_t ret;
-
- xmm_src0 = _mm_loadu_si128((__m128i *)src0);
- xmm_src1 = _mm_loadu_si128((__m128i *)src1);
- ret = (uint32_t)_mm_cmpestri(xmm_src0, 16, xmm_src1, 16, cmp_mode);
- if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, cmp_mode)) {
- return len + ret;
- }
- src0 += 16, src1 += 16, len += 16;
-
- xmm_src0 = _mm_loadu_si128((__m128i *)src0);
- xmm_src1 = _mm_loadu_si128((__m128i *)src1);
- ret = (uint32_t)_mm_cmpestri(xmm_src0, 16, xmm_src1, 16, cmp_mode);
- if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, cmp_mode)) {
- return len + ret;
- }
- src0 += 16, src1 += 16, len += 16;
- } while (len < 256);
-
- return 256;
-}
-
-Z_INTERNAL uint32_t compare256_unaligned_sse4(const uint8_t *src0, const uint8_t *src1) {
- return compare256_unaligned_sse4_static(src0, src1);
-}
-
-#define LONGEST_MATCH longest_match_unaligned_sse4
-#define COMPARE256 compare256_unaligned_sse4_static
-
-#include "match_tpl.h"
-
-#define LONGEST_MATCH_SLOW
-#define LONGEST_MATCH longest_match_slow_unaligned_sse4
-#define COMPARE256 compare256_unaligned_sse4_static
-
-#include "match_tpl.h"
-
-#endif
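The SSE2 compare256 that remains (compare256_sse2.c, referenced in the build changes above and below) does the same job with plain byte-equality compares instead of PCMPESTRI. A minimal sketch of that approach, assuming a GCC/Clang-style __builtin_ctz as implied by the HAVE_BUILTIN_CTZ guards; the shipped compare256_sse2.c may differ in detail:

    #include <stdint.h>
    #include <immintrin.h>

    /* Return how many of the first 256 bytes of src0 and src1 match,
     * scanning 16 bytes per iteration. */
    static inline uint32_t compare256_sse2_sketch(const uint8_t *src0, const uint8_t *src1) {
        uint32_t len = 0;

        do {
            __m128i xmm_src0 = _mm_loadu_si128((const __m128i *)src0);
            __m128i xmm_src1 = _mm_loadu_si128((const __m128i *)src1);
            /* 0xFF in each lane where the bytes are equal */
            __m128i xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
            /* One bit per lane; 0xFFFF means all 16 bytes matched */
            uint32_t mask = (uint32_t)_mm_movemask_epi8(xmm_cmp);

            if (mask != 0xFFFF)
                return len + (uint32_t)__builtin_ctz(~mask); /* first mismatching byte */

            src0 += 16, src1 += 16, len += 16;
        } while (len < 256);

        return 256;
    }

On most cores the PCMPESTRI/PCMPESTRC pair in the deleted file costs more uops per 16-byte block than CMPEQ plus MOVEMASK plus a bit scan, which is the usual rationale for this kind of removal.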
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} insert_string_sse42.lo"
fi
- if test ${HAVE_SSE42CMPSTR_INTRIN} -eq 1; then
- CFLAGS="${CFLAGS} -DX86_SSE42_CMP_STR"
- SFLAGS="${SFLAGS} -DX86_SSE42_CMP_STR"
-
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare256_sse42.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare256_sse42.lo"
- fi
-
check_sse2_intrinsics
if test ${HAVE_SSE2_INTRIN} -eq 1; then
extern uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1);
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
-extern uint32_t compare256_unaligned_sse2(const unsigned char *src0, const unsigned char *src1);
-#endif
-#ifdef X86_SSE42_CMP_STR
-extern uint32_t compare256_unaligned_sse4(const uint8_t *src0, const uint8_t *src1);
+extern uint32_t compare256_unaligned_sse2(const uint8_t *src0, const uint8_t *src1);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t compare256_unaligned_avx2(const uint8_t *src0, const uint8_t *src1);
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_unaligned_sse2(deflate_state *const s, Pos cur_match);
#endif
-#ifdef X86_SSE42_CMP_STR
-extern uint32_t longest_match_unaligned_sse4(deflate_state *const s, Pos cur_match);
-#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_unaligned_avx2(deflate_state *const s, Pos cur_match);
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_slow_unaligned_sse2(deflate_state *const s, Pos cur_match);
#endif
-#ifdef X86_SSE42_CMP_STR
-extern uint32_t longest_match_slow_unaligned_sse4(deflate_state *const s, Pos cur_match);
-#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_slow_unaligned_avx2(deflate_state *const s, Pos cur_match);
#endif
if (x86_cpu_has_sse2)
functable.longest_match = &longest_match_unaligned_sse2;
# endif
-# ifdef X86_SSE42_CMP_STR
- if (x86_cpu_has_sse42)
- functable.longest_match = &longest_match_unaligned_sse4;
-# endif
# if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
if (x86_cpu_has_avx2)
functable.longest_match = &longest_match_unaligned_avx2;
if (x86_cpu_has_sse2)
        functable.longest_match_slow = &longest_match_slow_unaligned_sse2;
# endif
-# ifdef X86_SSE42_CMP_STR
- if (x86_cpu_has_sse42)
- functable.longest_match_slow = &longest_match_slow_unaligned_sse4;
-# endif
# if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
if (x86_cpu_has_avx2)
functable.longest_match_slow = &longest_match_slow_unaligned_avx2;
if (x86_cpu_has_sse2)
functable.compare256 = &compare256_unaligned_sse2;
# endif
-# ifdef X86_SSE42_CMP_STR
- if (x86_cpu_has_sse42)
- functable.compare256 = &compare256_unaligned_sse4;
-# endif
# if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
if (x86_cpu_has_avx2)
functable.compare256 = &compare256_unaligned_avx2;
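All of the functable hunks above share one cascading-selection pattern: the guarded assignments run from least to most capable ISA, and each passing CPU check overwrites the previous choice, so with the SSE4.2 tier deleted, x86 falls through directly from SSE2 to AVX2. A stripped-down illustration of the pattern (names here are illustrative, not zlib-ng's actual API):

    #include <stdint.h>

    typedef uint32_t (*compare256_func)(const uint8_t *src0, const uint8_t *src1);

    /* Later, more capable ISA checks deliberately overwrite earlier picks,
     * so the best supported implementation wins. */
    static compare256_func select_compare256(int has_sse2, int has_avx2,
                                             compare256_func generic,
                                             compare256_func sse2,
                                             compare256_func avx2) {
        compare256_func best = generic;  /* portable fallback */
        if (has_sse2)
            best = sse2;
        if (has_avx2)
            best = avx2;                 /* overrides the SSE2 pick */
        return best;
    }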
#ifdef X86_SSE2
BENCHMARK_COMPARE256(unaligned_sse2, compare256_unaligned_sse2, x86_cpu_has_sse2);
#endif
-#ifdef X86_SSE42_CMP_STR
-BENCHMARK_COMPARE256(unaligned_sse4, compare256_unaligned_sse4, x86_cpu_has_sse42);
-#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
BENCHMARK_COMPARE256(unaligned_avx2, compare256_unaligned_avx2, x86_cpu_has_avx2);
#endif
chunkset_sse2.obj \
compare256.obj \
compare256_avx2.obj \
- compare256_sse42.obj \
+ compare256_sse2.obj \
compress.obj \
cpu_features.obj \
crc32.obj \