From: Adam Stylinski
Date: Sun, 23 Jan 2022 03:49:04 +0000 (-0500)
Subject: Write an SSE2 optimized compare256
X-Git-Tag: 2.1.0-beta1~377
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=eaa00cd791b01fc9712c66d8d523fc088480e22f;p=thirdparty%2Fzlib-ng.git

Write an SSE2 optimized compare256

The SSE4 variant uses the unfortunate string comparison instructions from
SSE4.2, which not only don't work on as many CPUs but are often slower than
the SSE2 counterparts, except in very specific circumstances.

This version should be ~2x faster than unaligned_64 for larger strings and
about half the performance of AVX2 comparisons on identical hardware. This
version is meant to supplement pre-AVX hardware. Because of this, we're
performing 1 extra load + compare at the beginning. In the event that we're
doing a full 256-byte comparison (completely equal strings), this will
result in 2 extra SIMD comparisons if the inputs are unaligned. Given that
the loads will be absorbed by L1, this isn't super likely to be a giant
penalty, but for something like a first or second generation Core i CPU,
where unaligned loads aren't nearly as expensive, this is going to be
_marginally_ slower in the worst case. This allows us to have half the
loads be aligned, so that the compiler can elide the load and compare by
using a register-relative pcmpeqb.
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 472cf2f0..8d340087 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -802,7 +802,7 @@ if(WITH_OPTIM)
         check_sse2_intrinsics()
         if(HAVE_SSE2_INTRIN)
             add_definitions(-DX86_SSE2 -DX86_SSE2_CHUNKSET -DX86_SSE2_SLIDEHASH)
-            set(SSE2_SRCS ${ARCHDIR}/chunkset_sse2.c ${ARCHDIR}/slide_hash_sse2.c)
+            set(SSE2_SRCS ${ARCHDIR}/chunkset_sse2.c ${ARCHDIR}/compare256_sse2.c ${ARCHDIR}/slide_hash_sse2.c)
             list(APPEND ZLIB_ARCH_SRCS ${SSE2_SRCS})
             if(NOT ${ARCH} MATCHES "x86_64")
                 set_property(SOURCE ${SSE2_SRCS} PROPERTY COMPILE_FLAGS "${SSE2FLAG} ${NOLTOFLAG}")
diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in
index 9ef328eb..389fc2f3 100644
--- a/arch/x86/Makefile.in
+++ b/arch/x86/Makefile.in
@@ -33,6 +33,7 @@ all: \
 	chunkset_avx.o chunkset_avx.lo \
 	chunkset_sse2.o chunkset_sse2.lo \
 	compare256_avx2.o compare256_avx2.lo \
+	compare256_sse2.o compare256_sse2.lo \
 	compare256_sse42.o compare256_sse42.lo \
 	insert_string_sse42.o insert_string_sse42.lo \
 	crc32_fold_pclmulqdq.o crc32_fold_pclmulqdq.lo \
@@ -64,6 +65,12 @@ compare256_avx2.o:
 compare256_avx2.lo:
 	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
 
+compare256_sse2.o:
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
+
+compare256_sse2.lo:
+	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
+
 compare256_sse42.o:
 	$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse42.c
 
diff --git a/arch/x86/compare256_sse2.c b/arch/x86/compare256_sse2.c
new file mode 100644
index 00000000..44d893d9
--- /dev/null
+++ b/arch/x86/compare256_sse2.c
@@ -0,0 +1,97 @@
+/* compare256_sse2.c -- SSE2 version of compare256
+ * Copyright Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../zutil.h"
+
+#include "fallback_builtins.h"
+
+#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+
+#include <emmintrin.h>
+
+static inline uint32_t compare256_unaligned_sse2_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+    int align_offset = ((uintptr_t)src0) & 15;
+    const uint8_t *end0 = src0 + 256;
+    const uint8_t *end1 = src1 + 256;
+    __m128i xmm_src0, xmm_src1, xmm_cmp;
+
+    /* Do the first load unaligned, then for all subsequent ones we have
+     * at least one aligned load. Sadly aligning both loads is probably unrealistic */
+    xmm_src0 = _mm_loadu_si128((__m128i*)src0);
+    xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+    xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+    unsigned mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+    /* Compiler _may_ turn this branch into a ptest + movemask,
+     * since a lot of those uops are shared and fused */
+    if (mask != 0xFFFF) {
+        uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+        return len + match_byte;
+    }
+
+    int align_adv = 16 - align_offset;
+    len += align_adv;
+    src0 += align_adv;
+    src1 += align_adv;
+
+    /* Do a flooring division (should just be a shift right) */
+    int num_iter = (256 - len) / 16;
+
+    for (int i = 0; i < num_iter; ++i) {
+        xmm_src0 = _mm_load_si128((__m128i*)src0);
+        xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+        xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+        mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+        /* Compiler _may_ turn this branch into a ptest + movemask,
+         * since a lot of those uops are shared and fused */
+        if (mask != 0xFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+            return len + match_byte;
+        }
+
+        len += 16, src0 += 16, src1 += 16;
+    }
+
+    if (align_offset) {
+        src0 = end0 - 16;
+        src1 = end1 - 16;
+        len = 256 - 16;
+
+        xmm_src0 = _mm_loadu_si128((__m128i*)src0);
+        xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+        xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+        mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+        if (mask != 0xFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+            return len + match_byte;
+        }
+    }
+
+    return 256;
+}
+
+Z_INTERNAL uint32_t compare256_unaligned_sse2(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_unaligned_sse2_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_unaligned_sse2
+#define COMPARE256 compare256_unaligned_sse2_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_unaligned_sse2
+#define COMPARE256 compare256_unaligned_sse2_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/configure b/configure
index 126124a6..3ea2fe65 100755
--- a/configure
+++ b/configure
@@ -1553,8 +1553,8 @@ case "${ARCH}" in
         if test ${HAVE_SSE2_INTRIN} -eq 1; then
             CFLAGS="${CFLAGS} -DX86_SSE2 -DX86_SSE2_CHUNKSET"
             SFLAGS="${SFLAGS} -DX86_SSE2 -DX86_SSE2_CHUNKSET"
-            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} chunkset_sse2.o slide_hash_sse2.o"
-            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} chunkset_sse2.lo slide_hash_sse2.lo"
+            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} chunkset_sse2.o compare256_sse2.o slide_hash_sse2.o"
+            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} chunkset_sse2.lo compare256_sse2.lo slide_hash_sse2.lo"
 
             if test $forcesse2 -eq 1; then
                 CFLAGS="${CFLAGS} -DX86_NOCHECK_SSE2"
diff --git a/cpu_features.h b/cpu_features.h
index 51a2f39e..1f254336 100644
--- a/cpu_features.h
+++ b/cpu_features.h
@@ -119,6 +119,9 @@ extern uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1
 #ifdef UNALIGNED64_OK
 extern uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1);
 #endif
+#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+extern uint32_t compare256_unaligned_sse2(const uint8_t *src0, const uint8_t *src1);
+#endif
 #ifdef X86_SSE42_CMP_STR
 extern uint32_t compare256_unaligned_sse4(const uint8_t *src0, const uint8_t *src1);
 #endif
@@ -144,6 +147,9 @@ extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match
 #ifdef UNALIGNED64_OK
 extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
 #endif
+#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+extern uint32_t longest_match_unaligned_sse2(deflate_state *const s, Pos cur_match);
+#endif
 #ifdef X86_SSE42_CMP_STR
 extern uint32_t longest_match_unaligned_sse4(deflate_state *const s, Pos cur_match);
 #endif
@@ -160,6 +166,9 @@ extern uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_
 #ifdef UNALIGNED64_OK
 extern uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match);
 #endif
+#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+extern uint32_t longest_match_slow_unaligned_sse2(deflate_state *const s, Pos cur_match);
+#endif
 #ifdef X86_SSE42_CMP_STR
 extern uint32_t longest_match_slow_unaligned_sse4(deflate_state *const s, Pos cur_match);
 #endif
diff --git a/functable.c b/functable.c
index 19d7258e..78866a79 100644
--- a/functable.c
+++ b/functable.c
@@ -106,6 +106,10 @@ Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) {
 # else
     functable.longest_match = &longest_match_unaligned_16;
 # endif
+# if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+    if (x86_cpu_has_sse2)
+        functable.longest_match = &longest_match_unaligned_sse2;
+# endif
 # ifdef X86_SSE42_CMP_STR
     if (x86_cpu_has_sse42)
         functable.longest_match = &longest_match_unaligned_sse4;
@@ -131,6 +135,10 @@ Z_INTERNAL uint32_t longest_match_slow_stub(deflate_state *const s, Pos cur_matc
 # else
     functable.longest_match_slow = &longest_match_slow_unaligned_16;
 # endif
+# if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+    if (x86_cpu_has_sse2)
+        functable.longest_match_slow = &longest_match_slow_unaligned_sse2;
+# endif
 # ifdef X86_SSE42_CMP_STR
     if (x86_cpu_has_sse42)
         functable.longest_match_slow = &longest_match_slow_unaligned_sse4;
@@ -408,6 +416,10 @@ Z_INTERNAL uint32_t compare256_stub(const uint8_t *src0, const uint8_t *src1) {
 # else
     functable.compare256 = &compare256_unaligned_16;
 # endif
+# if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+    if (x86_cpu_has_sse2)
+        functable.compare256 = &compare256_unaligned_sse2;
+# endif
 # ifdef X86_SSE42_CMP_STR
     if (x86_cpu_has_sse42)
         functable.compare256 = &compare256_unaligned_sse4;
diff --git a/test/benchmarks/benchmark_compare256.cc b/test/benchmarks/benchmark_compare256.cc
index 09a81883..01045349 100644
--- a/test/benchmarks/benchmark_compare256.cc
+++ b/test/benchmarks/benchmark_compare256.cc
@@ -72,6 +72,9 @@ BENCHMARK_COMPARE256(unaligned_64, compare256_unaligned_64, 1);
 #endif
 #endif
 
+#ifdef X86_SSE2
+BENCHMARK_COMPARE256(unaligned_sse2, compare256_unaligned_sse2, x86_cpu_has_sse2);
+#endif
 #ifdef X86_SSE42_CMP_STR
 BENCHMARK_COMPARE256(unaligned_sse4, compare256_unaligned_sse4, x86_cpu_has_sse42);
 #endif
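
For readers who want to poke at the core trick in isolation: the kernel reduces
each 16-byte block to a bitmask with _mm_cmpeq_epi8 + _mm_movemask_epi8 (bit i
is set when byte i matches), so the first mismatch inside a block is
__builtin_ctz(~mask). The standalone sketch below is not part of this patch;
the file name, the scalar helper, and the test values are made up for
illustration. It assumes GCC or Clang (for __builtin_ctz) and an SSE2-capable
x86 target.

/* compare256_sse2_demo.c -- hypothetical demo, not part of this patch.
 * Illustrates the movemask + ctz first-mismatch trick used by
 * compare256_unaligned_sse2 above.
 */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Scalar reference: number of leading bytes that match, capped at 256. */
static uint32_t compare256_scalar_ref(const uint8_t *a, const uint8_t *b) {
    uint32_t n = 0;
    while (n < 256 && a[n] == b[n])
        n++;
    return n;
}

int main(void) {
    uint8_t a[256], b[256];
    memset(a, 0x55, sizeof(a));
    memcpy(b, a, sizeof(b));
    b[200] = 0xAA; /* force a mismatch at offset 200 */

    /* Compare the 16-byte block starting at offset 192: matching bytes
     * become 0xFF, movemask packs those into bits 0..15, and ctz of the
     * inverted mask is the offset of the first mismatch in the block. */
    __m128i x = _mm_loadu_si128((const __m128i *)(a + 192));
    __m128i y = _mm_loadu_si128((const __m128i *)(b + 192));
    unsigned mask = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(x, y));
    if (mask != 0xFFFF)
        printf("SIMD:   first mismatch at byte %u\n",
               192 + (unsigned)__builtin_ctz(~mask));

    printf("scalar: first mismatch at byte %u\n",
           (unsigned)compare256_scalar_ref(a, b));
    return 0;
}

Compiled with something like cc -O2 -msse2 compare256_sse2_demo.c, both lines
should report byte 200, which is also the length compare256_unaligned_sse2
would return for the same inputs.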