From: Adam Stylinski
Date: Sun, 23 Jan 2022 03:49:04 +0000 (-0500)
Subject: Write an SSE2 optimized compare256
X-Git-Tag: 2.1.0-beta1~377
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=eaa00cd791b01fc9712c66d8d523fc088480e22f;p=thirdparty%2Fzlib-ng.git

Write an SSE2 optimized compare256

The SSE4 variant uses the unfortunate string comparison instructions from
SSE4.2, which not only don't work on as many CPUs but are often slower than
the SSE2 counterparts, except in very specific circumstances.

This version should be ~2x faster than unaligned_64 for larger strings and
about half the performance of AVX2 comparisons on identical hardware. This
version is meant to supplement pre-AVX hardware. Because of this, we're
performing 1 extra load + compare at the beginning. In the event that we're
doing a full 256-byte comparison (completely equal strings), this will
result in 2 extra SIMD comparisons if the inputs are unaligned. Given that
the loads will be absorbed by L1, this isn't super likely to be a giant
penalty, but for something like a first or second generation Core i CPU,
where unaligned loads aren't nearly as expensive, this is going to be
_marginally_ slower in the worst case. This allows us to have half the
loads be aligned, so that the compiler can elide the load and compare by
using a register-relative pcmpeqb.
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 472cf2f0..8d340087 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -802,7 +802,7 @@ if(WITH_OPTIM)
         check_sse2_intrinsics()
         if(HAVE_SSE2_INTRIN)
             add_definitions(-DX86_SSE2 -DX86_SSE2_CHUNKSET -DX86_SSE2_SLIDEHASH)
-            set(SSE2_SRCS ${ARCHDIR}/chunkset_sse2.c ${ARCHDIR}/slide_hash_sse2.c)
+            set(SSE2_SRCS ${ARCHDIR}/chunkset_sse2.c ${ARCHDIR}/compare256_sse2.c ${ARCHDIR}/slide_hash_sse2.c)
             list(APPEND ZLIB_ARCH_SRCS ${SSE2_SRCS})
             if(NOT ${ARCH} MATCHES "x86_64")
                 set_property(SOURCE ${SSE2_SRCS} PROPERTY COMPILE_FLAGS "${SSE2FLAG} ${NOLTOFLAG}")
diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in
index 9ef328eb..389fc2f3 100644
--- a/arch/x86/Makefile.in
+++ b/arch/x86/Makefile.in
@@ -33,6 +33,7 @@ all: \
 	chunkset_avx.o chunkset_avx.lo \
 	chunkset_sse2.o chunkset_sse2.lo \
 	compare256_avx2.o compare256_avx2.lo \
+	compare256_sse2.o compare256_sse2.lo \
 	compare256_sse42.o compare256_sse42.lo \
 	insert_string_sse42.o insert_string_sse42.lo \
 	crc32_fold_pclmulqdq.o crc32_fold_pclmulqdq.lo \
@@ -64,6 +65,12 @@ compare256_avx2.o:
 compare256_avx2.lo:
 	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
 
+compare256_sse2.o:
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
+
+compare256_sse2.lo:
+	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
+
 compare256_sse42.o:
 	$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse42.c
 
diff --git a/arch/x86/compare256_sse2.c b/arch/x86/compare256_sse2.c
new file mode 100644
index 00000000..44d893d9
--- /dev/null
+++ b/arch/x86/compare256_sse2.c
@@ -0,0 +1,97 @@
+/* compare256_sse2.c -- SSE2 version of compare256
+ * Copyright Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../zutil.h"
+
+#include "fallback_builtins.h"
+
+#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+
+#include <emmintrin.h>
+
+static inline uint32_t compare256_unaligned_sse2_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+    int align_offset = ((uintptr_t)src0) & 15;
+    const uint8_t *end0 = src0 + 256;
+    const uint8_t *end1 = src1 + 256;
+    __m128i xmm_src0, xmm_src1, xmm_cmp;
+
+    /* Do the first load unaligned, then for all subsequent ones we have
+     * at least one aligned load. Sadly aligning both loads is probably unrealistic */
+    xmm_src0 = _mm_loadu_si128((__m128i*)src0);
+    xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+    xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+    unsigned mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+    /* Compiler _may_ turn this branch into a ptest + movemask,
+     * since a lot of those uops are shared and fused */
+    if (mask != 0xFFFF) {
+        uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+        return len + match_byte;
+    }
+
+    int align_adv = 16 - align_offset;
+    len += align_adv;
+    src0 += align_adv;
+    src1 += align_adv;
+
+    /* Do a flooring division (should just be a shift right) */
+    int num_iter = (256 - len) / 16;
+
+    for (int i = 0; i < num_iter; ++i) {
+        xmm_src0 = _mm_load_si128((__m128i*)src0);
+        xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+        xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+        mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+        /* Compiler _may_ turn this branch into a ptest + movemask,
+         * since a lot of those uops are shared and fused */
+        if (mask != 0xFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+            return len + match_byte;
+        }
+
+        len += 16, src0 += 16, src1 += 16;
+    }
+
+    if (align_offset) {
+        src0 = end0 - 16;
+        src1 = end1 - 16;
+        len = 256 - 16;
+
+        xmm_src0 = _mm_loadu_si128((__m128i*)src0);
+        xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+        xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+        mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+        if (mask != 0xFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+            return len + match_byte;
+        }
+    }
+
+    return 256;
+}
+
+Z_INTERNAL uint32_t compare256_unaligned_sse2(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_unaligned_sse2_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_unaligned_sse2
+#define COMPARE256 compare256_unaligned_sse2_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_unaligned_sse2
+#define COMPARE256 compare256_unaligned_sse2_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/configure b/configure
index 126124a6..3ea2fe65 100755
--- a/configure
+++ b/configure
@@ -1553,8 +1553,8 @@ case "${ARCH}" in
         if test ${HAVE_SSE2_INTRIN} -eq 1; then
             CFLAGS="${CFLAGS} -DX86_SSE2 -DX86_SSE2_CHUNKSET"
             SFLAGS="${SFLAGS} -DX86_SSE2 -DX86_SSE2_CHUNKSET"
-            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} chunkset_sse2.o slide_hash_sse2.o"
-            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} chunkset_sse2.lo slide_hash_sse2.lo"
+            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} chunkset_sse2.o compare256_sse2.o slide_hash_sse2.o"
+            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} chunkset_sse2.lo compare256_sse2.lo slide_hash_sse2.lo"
 
             if test $forcesse2 -eq 1; then
                 CFLAGS="${CFLAGS} -DX86_NOCHECK_SSE2"
diff --git a/cpu_features.h b/cpu_features.h
index 51a2f39e..1f254336 100644
--- a/cpu_features.h
+++ b/cpu_features.h
@@ -119,6 +119,9 @@ extern uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1
 #ifdef UNALIGNED64_OK
 extern uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1);
 #endif
+#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+extern uint32_t compare256_unaligned_sse2(const uint8_t *src0, const uint8_t *src1);
+#endif
 #ifdef X86_SSE42_CMP_STR
 extern uint32_t compare256_unaligned_sse4(const uint8_t *src0, const uint8_t *src1);
 #endif
@@ -144,6 +147,9 @@ extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match
 #ifdef UNALIGNED64_OK
 extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
 #endif
+#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+extern uint32_t longest_match_unaligned_sse2(deflate_state *const s, Pos cur_match);
+#endif
 #ifdef X86_SSE42_CMP_STR
 extern uint32_t longest_match_unaligned_sse4(deflate_state *const s, Pos cur_match);
 #endif
@@ -160,6 +166,9 @@ extern uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_
 #ifdef UNALIGNED64_OK
 extern uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match);
 #endif
+#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+extern uint32_t longest_match_slow_unaligned_sse2(deflate_state *const s, Pos cur_match);
+#endif
 #ifdef X86_SSE42_CMP_STR
 extern uint32_t longest_match_slow_unaligned_sse4(deflate_state *const s, Pos cur_match);
 #endif
diff --git a/functable.c b/functable.c
index 19d7258e..78866a79 100644
--- a/functable.c
+++ b/functable.c
@@ -106,6 +106,10 @@ Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) {
 # else
     functable.longest_match = &longest_match_unaligned_16;
 # endif
+# if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+    if (x86_cpu_has_sse2)
+        functable.longest_match = &longest_match_unaligned_sse2;
+# endif
 # ifdef X86_SSE42_CMP_STR
     if (x86_cpu_has_sse42)
         functable.longest_match = &longest_match_unaligned_sse4;
@@ -131,6 +135,10 @@ Z_INTERNAL uint32_t longest_match_slow_stub(deflate_state *const s, Pos cur_matc
 # else
     functable.longest_match_slow = &longest_match_slow_unaligned_16;
 # endif
+# if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+    if (x86_cpu_has_sse2)
+        functable.longest_match_slow = &longest_match_slow_unaligned_sse2;
+# endif
 # ifdef X86_SSE42_CMP_STR
     if (x86_cpu_has_sse42)
         functable.longest_match_slow = &longest_match_slow_unaligned_sse4;
@@ -408,6 +416,10 @@ Z_INTERNAL uint32_t compare256_stub(const uint8_t *src0, const uint8_t *src1) {
 # else
     functable.compare256 = &compare256_unaligned_16;
 # endif
+# if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+    if (x86_cpu_has_sse2)
+        functable.compare256 = &compare256_unaligned_sse2;
+# endif
 # ifdef X86_SSE42_CMP_STR
     if (x86_cpu_has_sse42)
         functable.compare256 = &compare256_unaligned_sse4;
diff --git a/test/benchmarks/benchmark_compare256.cc b/test/benchmarks/benchmark_compare256.cc
index 09a81883..01045349 100644
--- a/test/benchmarks/benchmark_compare256.cc
+++ b/test/benchmarks/benchmark_compare256.cc
@@ -72,6 +72,9 @@ BENCHMARK_COMPARE256(unaligned_64, compare256_unaligned_64, 1);
 #endif
 #endif
 
+#ifdef X86_SSE2
+BENCHMARK_COMPARE256(unaligned_sse2, compare256_unaligned_sse2, x86_cpu_has_sse2);
+#endif
 #ifdef X86_SSE42_CMP_STR
 BENCHMARK_COMPARE256(unaligned_sse4, compare256_unaligned_sse4, x86_cpu_has_sse42);
 #endif
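
For readers who want to poke at the core trick in isolation: the kernel reduces
each 16-byte block to a bitmask with _mm_cmpeq_epi8 + _mm_movemask_epi8 (bit i
is set when byte i matches), so the first mismatch inside a block is
__builtin_ctz(~mask). The standalone sketch below is not part of this patch;
the file name, the scalar helper, and the test values are made up for
illustration. It assumes GCC or Clang (for __builtin_ctz) and an SSE2-capable
x86 target.

/* compare256_sse2_demo.c -- hypothetical demo, not part of this patch.
 * Illustrates the movemask + ctz first-mismatch trick used by
 * compare256_unaligned_sse2 above.
 */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Scalar reference: number of leading bytes that match, capped at 256. */
static uint32_t compare256_scalar_ref(const uint8_t *a, const uint8_t *b) {
    uint32_t n = 0;
    while (n < 256 && a[n] == b[n])
        n++;
    return n;
}

int main(void) {
    uint8_t a[256], b[256];
    memset(a, 0x55, sizeof(a));
    memcpy(b, a, sizeof(b));
    b[200] = 0xAA; /* force a mismatch at offset 200 */

    /* Compare the 16-byte block starting at offset 192: matching bytes
     * become 0xFF, movemask packs those into bits 0..15, and ctz of the
     * inverted mask is the offset of the first mismatch in the block. */
    __m128i x = _mm_loadu_si128((const __m128i *)(a + 192));
    __m128i y = _mm_loadu_si128((const __m128i *)(b + 192));
    unsigned mask = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(x, y));
    if (mask != 0xFFFF)
        printf("SIMD:   first mismatch at byte %u\n",
               192 + (unsigned)__builtin_ctz(~mask));

    printf("scalar: first mismatch at byte %u\n",
           (unsigned)compare256_scalar_ref(a, b));
    return 0;
}

Compiled with something like cc -O2 -msse2 compare256_sse2_demo.c, both lines
should report byte 200, which is also the length compare256_unaligned_sse2
would return for the same inputs.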