Write an SSE2 optimized compare256
author    Adam Stylinski <kungfujesus06@gmail.com>
          Sun, 23 Jan 2022 03:49:04 +0000 (22:49 -0500)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
          Fri, 11 Feb 2022 08:56:19 +0000 (09:56 +0100)
The SSE4 variant uses the unfortunate string-comparison instructions from
SSE4.2, which not only work on fewer CPUs but are often slower than their
SSE2 counterparts except in very specific circumstances.

This version should be ~2x faster than unaligned_64 for larger strings
and about half the performance of AVX2 comparisons on identical
hardware.

This version is meant to supplement pre-AVX hardware. Because of this,
we're performing 1 extra load + compare at the beginning. In the event
that we're doing a full 256-byte comparison (completely equal strings),
this will result in 2 extra SIMD comparisons if the inputs are unaligned.
Given that the loads will be absorbed by L1, this isn't likely to be a
large penalty, but for something like a first- or second-gen Core i,
where unaligned loads aren't nearly as expensive, this is going to be
_marginally_ slower in the worst case. This allows us to have half the
loads be aligned, so that the compiler can elide the load and compare by
using a register-relative pcmpeqb.
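
For readers unfamiliar with the idiom, the sketch below (not part of this
commit; the match16 helper is a hypothetical name, and it assumes SSE2 plus
a GCC/Clang-style __builtin_ctz) shows the core mechanism the new variant is
built on: pcmpeqb produces a byte-wise equality vector, pmovmskb collapses
it into a 16-bit mask, and counting the trailing ones of that mask locates
the first mismatching byte.

/* Illustrative sketch only -- not part of the zlib-ng tree. */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Return how many leading bytes (0..16) of a and b are equal. */
static uint32_t match16(const uint8_t *a, const uint8_t *b) {
    __m128i va = _mm_loadu_si128((const __m128i *)a);
    __m128i vb = _mm_loadu_si128((const __m128i *)b);
    /* 0xFF per equal byte, 0x00 per differing byte, gathered into bits 0..15 */
    unsigned mask = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(va, vb));
    if (mask == 0xFFFF)
        return 16;                              /* all 16 bytes equal */
    return (uint32_t)__builtin_ctz(~mask);      /* index of first mismatch */
}

int main(void) {
    uint8_t a[16], b[16];
    memset(a, 'x', sizeof(a));
    memcpy(b, a, sizeof(b));
    b[11] = 'y';                                /* mismatch at offset 11 */
    printf("%u\n", match16(a, b));              /* prints 11 */
    return 0;
}

The full function in the diff below applies this 16 bytes at a time: one
unaligned load/compare up front, then a loop in which the src0 side is
aligned (so the compiler can fold that load into the pcmpeqb), and finally
an overlapping unaligned tail load when the input wasn't 16-byte aligned to
begin with.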

CMakeLists.txt
arch/x86/Makefile.in
arch/x86/compare256_sse2.c [new file with mode: 0644]
configure
cpu_features.h
functable.c
test/benchmarks/benchmark_compare256.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 472cf2f092d9ae2bba4e5d421144f93fe91b74ae..8d340087fbb01d37947c1f8212f55adc02bed28d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -802,7 +802,7 @@ if(WITH_OPTIM)
             check_sse2_intrinsics()
             if(HAVE_SSE2_INTRIN)
                 add_definitions(-DX86_SSE2 -DX86_SSE2_CHUNKSET -DX86_SSE2_SLIDEHASH)
-                set(SSE2_SRCS ${ARCHDIR}/chunkset_sse2.c ${ARCHDIR}/slide_hash_sse2.c)
+                set(SSE2_SRCS ${ARCHDIR}/chunkset_sse2.c ${ARCHDIR}/compare256_sse2.c ${ARCHDIR}/slide_hash_sse2.c)
                 list(APPEND ZLIB_ARCH_SRCS ${SSE2_SRCS})
                 if(NOT ${ARCH} MATCHES "x86_64")
                     set_property(SOURCE ${SSE2_SRCS} PROPERTY COMPILE_FLAGS "${SSE2FLAG} ${NOLTOFLAG}")
diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in
index 9ef328ebef284bde56311660b58e58d46b06a767..389fc2f328689fa73217448c59399fe264ce9f95 100644
--- a/arch/x86/Makefile.in
+++ b/arch/x86/Makefile.in
@@ -33,6 +33,7 @@ all: \
        chunkset_avx.o chunkset_avx.lo \
        chunkset_sse2.o chunkset_sse2.lo \
        compare256_avx2.o compare256_avx2.lo \
+       compare256_sse2.o compare256_sse2.lo \
        compare256_sse42.o compare256_sse42.lo \
        insert_string_sse42.o insert_string_sse42.lo \
        crc32_fold_pclmulqdq.o crc32_fold_pclmulqdq.lo \
@@ -64,6 +65,12 @@ compare256_avx2.o:
 compare256_avx2.lo:
        $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
 
+compare256_sse2.o:
+       $(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
+
+compare256_sse2.lo:
+       $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
+
 compare256_sse42.o:
        $(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse42.c
 
diff --git a/arch/x86/compare256_sse2.c b/arch/x86/compare256_sse2.c
new file mode 100644
index 0000000..44d893d
--- /dev/null
+++ b/arch/x86/compare256_sse2.c
@@ -0,0 +1,97 @@
+/* compare256_sse2.c -- SSE2 version of compare256
+ * Copyright Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../zutil.h"
+
+#include "fallback_builtins.h"
+
+#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+
+#include <emmintrin.h>
+
+static inline uint32_t compare256_unaligned_sse2_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+    int align_offset = ((uintptr_t)src0) & 15;
+    const uint8_t *end0 = src0 + 256;
+    const uint8_t *end1 = src1 + 256;
+    __m128i xmm_src0, xmm_src1, xmm_cmp;
+
+    /* Do the first load unaligned, then for every subsequent iteration we have
+     * at least one aligned load. Sadly, aligning both loads is probably unrealistic. */
+    xmm_src0 = _mm_loadu_si128((__m128i*)src0);
+    xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+    xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+    unsigned mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+    /* Compiler _may_ turn this branch into a ptest + movemask,
+     * since a lot of those uops are shared and fused */
+    if (mask != 0xFFFF) {
+        uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+        return len + match_byte;
+    }
+
+    int align_adv = 16 - align_offset;
+    len += align_adv;
+    src0 += align_adv;
+    src1 += align_adv;
+
+    /* Do a flooring division (should just be a shift right) */
+    int num_iter = (256 - len) / 16;
+
+    for (int i = 0; i < num_iter; ++i) {
+        xmm_src0 = _mm_load_si128((__m128i*)src0);
+        xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+        xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+        mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+        /* Compiler _may_ turn this branch into a ptest + movemask,
+         * since a lot of those uops are shared and fused */
+        if (mask != 0xFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+            return len + match_byte;
+        }
+
+        len += 16, src0 += 16, src1 += 16;
+    }
+
+    if (align_offset) {
+        src0 = end0 - 16;
+        src1 = end1 - 16;
+        len = 256 - 16;
+
+        xmm_src0 = _mm_loadu_si128((__m128i*)src0);
+        xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+        xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+        mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+        if (mask != 0xFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+            return len + match_byte;
+        }
+    }
+
+    return 256;
+}
+
+Z_INTERNAL uint32_t compare256_unaligned_sse2(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_unaligned_sse2_static(src0, src1);
+}
+
+#define LONGEST_MATCH       longest_match_unaligned_sse2
+#define COMPARE256          compare256_unaligned_sse2_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH       longest_match_slow_unaligned_sse2
+#define COMPARE256          compare256_unaligned_sse2_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/configure b/configure
index 126124a6c4f892f7c7f8ae58f1b3b3b720ff4233..3ea2fe65e9908e37a0a8d98f1d3aec7b94875d3b 100755
--- a/configure
+++ b/configure
@@ -1553,8 +1553,8 @@ case "${ARCH}" in
             if test ${HAVE_SSE2_INTRIN} -eq 1; then
                 CFLAGS="${CFLAGS} -DX86_SSE2 -DX86_SSE2_CHUNKSET"
                 SFLAGS="${SFLAGS} -DX86_SSE2 -DX86_SSE2_CHUNKSET"
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} chunkset_sse2.o slide_hash_sse2.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} chunkset_sse2.lo slide_hash_sse2.lo"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} chunkset_sse2.o compare256_sse2.o slide_hash_sse2.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} chunkset_sse2.lo compare256_sse2.lo slide_hash_sse2.lo"
 
                 if test $forcesse2 -eq 1; then
                     CFLAGS="${CFLAGS} -DX86_NOCHECK_SSE2"
diff --git a/cpu_features.h b/cpu_features.h
index 51a2f39e9225fd3e47cb1840ae12059de7b23f1d..1f25433660de20eab1b7c58e006f44c18a83f11a 100644
--- a/cpu_features.h
+++ b/cpu_features.h
@@ -119,6 +119,9 @@ extern uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1
 #ifdef UNALIGNED64_OK
 extern uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1);
 #endif
+#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+extern uint32_t compare256_unaligned_sse2(const uint8_t *src0, const uint8_t *src1);
+#endif
 #ifdef X86_SSE42_CMP_STR
 extern uint32_t compare256_unaligned_sse4(const uint8_t *src0, const uint8_t *src1);
 #endif
@@ -144,6 +147,9 @@ extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match
 #ifdef UNALIGNED64_OK
 extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
 #endif
+#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+extern uint32_t longest_match_unaligned_sse2(deflate_state *const s, Pos cur_match);
+#endif
 #ifdef X86_SSE42_CMP_STR
 extern uint32_t longest_match_unaligned_sse4(deflate_state *const s, Pos cur_match);
 #endif
@@ -160,6 +166,9 @@ extern uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_
 #ifdef UNALIGNED64_OK
 extern uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match);
 #endif
+#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+extern uint32_t longest_match_slow_unaligned_sse2(deflate_state *const s, Pos cur_match);
+#endif
 #ifdef X86_SSE42_CMP_STR
 extern uint32_t longest_match_slow_unaligned_sse4(deflate_state *const s, Pos cur_match);
 #endif
diff --git a/functable.c b/functable.c
index 19d7258e1eb72b666db139c3f780994b6ee4bbee..78866a7922e27f19147e3318f599a0ea5f664532 100644
--- a/functable.c
+++ b/functable.c
@@ -106,6 +106,10 @@ Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) {
 #  else
     functable.longest_match = &longest_match_unaligned_16;
 #  endif
+#  if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+    if (x86_cpu_has_sse2)
+        functable.longest_match = &longest_match_unaligned_sse2;
+#  endif
 #  ifdef X86_SSE42_CMP_STR
     if (x86_cpu_has_sse42)
         functable.longest_match = &longest_match_unaligned_sse4;
@@ -131,6 +135,10 @@ Z_INTERNAL uint32_t longest_match_slow_stub(deflate_state *const s, Pos cur_matc
 #  else
     functable.longest_match_slow = &longest_match_slow_unaligned_16;
 #  endif
+#  if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+    if (x86_cpu_has_sse2)
+        functable.longest_match_slow = &longest_match_slow_unaligned_sse2;
+#  endif
 #  ifdef X86_SSE42_CMP_STR
     if (x86_cpu_has_sse42)
         functable.longest_match_slow = &longest_match_slow_unaligned_sse4;
@@ -408,6 +416,10 @@ Z_INTERNAL uint32_t compare256_stub(const uint8_t *src0, const uint8_t *src1) {
 #  else
     functable.compare256 = &compare256_unaligned_16;
 #  endif
+#  if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+    if (x86_cpu_has_sse2)
+        functable.compare256 = &compare256_unaligned_sse2;
+#  endif
 #  ifdef X86_SSE42_CMP_STR
     if (x86_cpu_has_sse42)
         functable.compare256 = &compare256_unaligned_sse4;
diff --git a/test/benchmarks/benchmark_compare256.cc b/test/benchmarks/benchmark_compare256.cc
index 09a81883d35f3f37a4194834bf6694df7e4a68dc..010453497a66799d5a607697f14bf93c9a3872af 100644
--- a/test/benchmarks/benchmark_compare256.cc
+++ b/test/benchmarks/benchmark_compare256.cc
@@ -72,6 +72,9 @@ BENCHMARK_COMPARE256(unaligned_64, compare256_unaligned_64, 1);
 #endif
 #endif
 
+#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+BENCHMARK_COMPARE256(unaligned_sse2, compare256_unaligned_sse2, x86_cpu_has_sse2);
+#endif
 #ifdef X86_SSE42_CMP_STR
 BENCHMARK_COMPARE256(unaligned_sse4, compare256_unaligned_sse4, x86_cpu_has_sse42);
 #endif
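
As a side note (not part of this change): every compare256 variant implements
the same contract -- return the length of the common prefix of two 256-byte
windows, capped at 256. A plain-C reference such as the sketch below (the
compare256_reference name is illustrative, not something in the zlib-ng tree)
is a convenient cross-check for a new SIMD variant, alongside the benchmark
entry added above.

/* Illustrative scalar reference for the compare256 contract -- not in the tree. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t compare256_reference(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;
    while (len < 256 && src0[len] == src1[len])
        len++;
    return len;    /* leading bytes that match, at most 256 */
}

int main(void) {
    uint8_t a[256], b[256];
    memset(a, 0xAB, sizeof(a));
    memcpy(b, a, sizeof(b));
    b[100] ^= 1;                                  /* force a mismatch at offset 100 */
    printf("%u\n", compare256_reference(a, b));   /* prints 100 */
    return 0;
}

Comparing its result against compare256_unaligned_sse2 for buffer pairs that
differ at each offset from 0 to 255, plus one fully equal pair, exercises the
unaligned head, the aligned loop, and the overlapping tail path of the new code.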