]> git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Add AVX512 version of compare256
authorHans Kristian Rosbach <hk-git@circlestorm.org>
Thu, 10 Apr 2025 22:46:06 +0000 (00:46 +0200)
committerHans Kristian Rosbach <hk-github@circlestorm.org>
Mon, 14 Apr 2025 21:28:38 +0000 (23:28 +0200)
Improve the speed of sub-16 byte matches by first using a
128-bit intrinsic, after that use only 512-bit intrinsics.
This requires us to overlap on the last run, but this is cheaper than
processing the tail using a 256-bit and then a 128-bit run.

Change benchmark steps to avoid it hitting chunk boundaries
of one or the other function as much, this gives more fair benchmarks.

CMakeLists.txt
arch/x86/Makefile.in
arch/x86/compare256_avx512.c [new file with mode: 0644]
arch/x86/x86_functions.h
configure
functable.c
test/benchmarks/benchmark_compare256.cc
test/test_compare256.cc

index bcf0e4919464ea8b2d91775138c360fc1ed53ba6..4a453ebec7880d2360f00d633381233e7078cc0a 100644 (file)
@@ -1047,6 +1047,8 @@ if(WITH_OPTIM)
                 add_feature_info(AVX512_ADLER32 1 "Support AVX512-accelerated adler32, using \"${AVX512FLAG}\"")
                 list(APPEND AVX512_SRCS ${ARCHDIR}/chunkset_avx512.c)
                 add_feature_info(AVX512_CHUNKSET 1 "Support AVX512 optimized chunkset, using \"${AVX512FLAG}\"")
+                list(APPEND AVX512_SRCS ${ARCHDIR}/compare256_avx512.c)
+                add_feature_info(AVX512_COMPARE256 1 "Support AVX512 optimized compare256, using \"${AVX512FLAG}\"")
                 list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_avx512_p.h)
                 list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS})
                 set_property(SOURCE ${AVX512_SRCS} PROPERTY COMPILE_FLAGS "${AVX512FLAG} ${NOLTOFLAG}")
index 7705cd0913330268270676036896e816a72577d3..3b00c3ed8f6e5ff4d73f9afec777937d091080a9 100644 (file)
@@ -36,6 +36,7 @@ all: \
        chunkset_ssse3.o chunkset_ssse3.lo \
        chorba_sse2.o chorba_sse2.lo \
        compare256_avx2.o compare256_avx2.lo \
+       compare256_avx512.o compare256_avx512.lo \
        compare256_sse2.o compare256_sse2.lo \
        crc32_pclmulqdq.o crc32_pclmulqdq.lo \
        crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \
@@ -84,6 +85,12 @@ compare256_avx2.o:
 compare256_avx2.lo:
        $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
 
+compare256_avx512.o:
+       $(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx512.c
+
+compare256_avx512.lo:
+       $(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx512.c
+
 compare256_sse2.o:
        $(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
 
diff --git a/arch/x86/compare256_avx512.c b/arch/x86/compare256_avx512.c
new file mode 100644 (file)
index 0000000..a1ebe0e
--- /dev/null
@@ -0,0 +1,97 @@
+/* compare256_avx512.c -- AVX512 version of compare256
+ * Copyright (C) 2025 Hans Kristian Rosbach
+ * Based on AVX2 implementation by Mika T. Lindqvist
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#if defined(X86_AVX512) && defined(HAVE_BUILTIN_CTZLL)
+
+#include <immintrin.h>
+#ifdef _MSC_VER
+#  include <nmmintrin.h>
+#endif
+
+static inline uint32_t compare256_avx512_static(const uint8_t *src0, const uint8_t *src1) {
+    __m512i zmm_src0_4, zmm_src1_4;
+    __m512i zmm_src0_3, zmm_src1_3;
+    __m512i zmm_src0_2, zmm_src1_2;
+    __m512i zmm_src0_1, zmm_src1_1;
+    __m128i xmm_src0_0, xmm_src1_0;
+    uint64_t mask_1, mask_2, mask_3, mask_4;
+    uint32_t mask_0;
+
+    // First do a 16byte round before increasing to 64bytes, this reduces the
+    // penalty for the short matches, and those are usually the most common ones.
+    // This requires us to overlap on the last round, giving a small penalty
+    // on matches of 192+ bytes (Still faster than AVX2 though).
+
+    // 16 bytes
+    xmm_src0_0 = _mm_loadu_si128((__m128i*)src0);
+    xmm_src1_0 = _mm_loadu_si128((__m128i*)src1);
+    mask_0 = (uint32_t)_mm_cmpeq_epu8_mask(xmm_src0_0, xmm_src1_0); // zero-extended to use __builtin_ctz
+    if (mask_0 != 0x0000FFFF) {
+        // There is potential for using __builtin_ctzg/__builtin_ctzs/_tzcnt_u16/__tzcnt_u16 here
+        uint32_t match_byte = (uint32_t)__builtin_ctz(~mask_0); /* Invert bits so identical = 0 */
+        return match_byte;
+    }
+
+    // 64 bytes
+    zmm_src0_1 = _mm512_loadu_si512((__m512i*)(src0 + 16));
+    zmm_src1_1 = _mm512_loadu_si512((__m512i*)(src1 + 16));
+    mask_1 = _mm512_cmpeq_epu8_mask(zmm_src0_1, zmm_src1_1);
+    if (mask_1 != 0xFFFFFFFFFFFFFFFF) {
+        uint32_t match_byte = (uint32_t)__builtin_ctzll(~mask_1);
+        return 16 + match_byte;
+    }
+
+    // 64 bytes
+    zmm_src0_2 = _mm512_loadu_si512((__m512i*)(src0 + 80));
+    zmm_src1_2 = _mm512_loadu_si512((__m512i*)(src1 + 80));
+    mask_2 = _mm512_cmpeq_epu8_mask(zmm_src0_2, zmm_src1_2);
+    if (mask_2 != 0xFFFFFFFFFFFFFFFF) {
+        uint32_t match_byte = (uint32_t)__builtin_ctzll(~mask_2);
+        return 80 + match_byte;
+    }
+
+    // 64 bytes
+    zmm_src0_3 = _mm512_loadu_si512((__m512i*)(src0 + 144));
+    zmm_src1_3 = _mm512_loadu_si512((__m512i*)(src1 + 144));
+    mask_3 = _mm512_cmpeq_epu8_mask(zmm_src0_3, zmm_src1_3);
+    if (mask_3 != 0xFFFFFFFFFFFFFFFF) {
+        uint32_t match_byte = (uint32_t)__builtin_ctzll(~mask_3);
+        return 144 + match_byte;
+    }
+
+    // 64 bytes (overlaps the previous 16 bytes for fast tail processing)
+    zmm_src0_4 = _mm512_loadu_si512((__m512i*)(src0 + 192));
+    zmm_src1_4 = _mm512_loadu_si512((__m512i*)(src1 + 192));
+    mask_4 = _mm512_cmpeq_epu8_mask(zmm_src0_4, zmm_src1_4);
+    if (mask_4 != 0xFFFFFFFFFFFFFFFF) {
+        uint32_t match_byte = (uint32_t)__builtin_ctzll(~mask_4);
+        return 192 + match_byte;
+    }
+
+    return 256;
+}
+
+Z_INTERNAL uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_avx512_static(src0, src1);
+}
+
+#define LONGEST_MATCH       longest_match_avx512
+#define COMPARE256          compare256_avx512_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH       longest_match_slow_avx512
+#define COMPARE256          compare256_avx512_static
+
+#include "match_tpl.h"
+
+#endif
index a8de8d9afcf08c8524762ff4a4634e9de4e1c806..8e1943a7d2f2557396aad6a4185d0985d5fd6658 100644 (file)
@@ -60,6 +60,11 @@ uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *s
 uint32_t chunksize_avx512(void);
 uint8_t* chunkmemset_safe_avx512(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
 void inflate_fast_avx512(PREFIX3(stream)* strm, uint32_t start);
+#  ifdef HAVE_BUILTIN_CTZLL
+    uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1);
+    uint32_t longest_match_avx512(deflate_state *const s, Pos cur_match);
+    uint32_t longest_match_slow_avx512(deflate_state *const s, Pos cur_match);
+#  endif
 #endif
 #ifdef X86_AVX512VNNI
 uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
@@ -169,6 +174,14 @@ uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
 #    define native_chunksize chunksize_avx512
 #    undef native_inflate_fast
 #    define native_inflate_fast inflate_fast_avx512
+#    ifdef HAVE_BUILTIN_CTZLL
+#      undef native_compare256
+#      define native_compare256 compare256_avx512
+#      undef native_longest_match
+#      define native_longest_match longest_match_avx512
+#      undef native_longest_match_slow
+#      define native_longest_match_slow longest_match_slow_avx512
+#    endif
 // X86 - AVX512 (VNNI)
 #    if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__)
 #      undef native_adler32
index 4fa8d44e756c215684efceb722f03a4128b0bd63..90f5c7086edacd3a05a28ad70e4b1619d94098c5 100755 (executable)
--- a/configure
+++ b/configure
@@ -1694,8 +1694,8 @@ case "${ARCH}" in
             if test ${HAVE_AVX512_INTRIN} -eq 1; then
                 CFLAGS="${CFLAGS} -DX86_AVX512"
                 SFLAGS="${SFLAGS} -DX86_AVX512"
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_avx512.o chunkset_avx512.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512.lo chunkset_avx512.lo"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_avx512.o chunkset_avx512.o compare256_avx512.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512.lo chunkset_avx512.lo compare256_avx512.lo"
             fi
 
             check_mtune_cascadelake_compiler_flag
index aea7dbb3505d9d43d972adb6b05eae4107ab7a3c..a7c1bd23ccd7f7c83dacdfa441cc47c9d1fed506 100644 (file)
@@ -139,6 +139,11 @@ static void init_functable(void) {
         ft.chunkmemset_safe = &chunkmemset_safe_avx512;
         ft.chunksize = &chunksize_avx512;
         ft.inflate_fast = &inflate_fast_avx512;
+#  ifdef HAVE_BUILTIN_CTZLL
+        ft.compare256 = &compare256_avx512;
+        ft.longest_match = &longest_match_avx512;
+        ft.longest_match_slow = &longest_match_slow_avx512;
+#  endif
     }
 #endif
 #ifdef X86_AVX512VNNI
index c27bff1360b6ee8d96d5364cca91f2a662282356..8ed2d0eb3dbd4e272c3543b21e431437db14ecc7 100644 (file)
@@ -59,7 +59,7 @@ public:
         } \
         Bench(state, fptr); \
     } \
-    BENCHMARK_REGISTER_F(compare256, name)->Range(1, MAX_COMPARE_SIZE);
+    BENCHMARK_REGISTER_F(compare256, name)->Arg(1)->Arg(10)->Arg(40)->Arg(80)->Arg(100)->Arg(175)->Arg(256);
 
 #ifdef DISABLE_RUNTIME_CPU_DETECTION
 BENCHMARK_COMPARE256(native, native_compare256, 1);
@@ -80,6 +80,9 @@ BENCHMARK_COMPARE256(sse2, compare256_sse2, test_cpu_features.x86.has_sse2);
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
 BENCHMARK_COMPARE256(avx2, compare256_avx2, test_cpu_features.x86.has_avx2);
 #endif
+#if defined(X86_AVX512) && defined(HAVE_BUILTIN_CTZLL)
+BENCHMARK_COMPARE256(avx512, compare256_avx512, test_cpu_features.x86.has_avx512_common);
+#endif
 #if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
 BENCHMARK_COMPARE256(neon, compare256_neon, test_cpu_features.arm.has_neon);
 #endif
index 035e63c9660a7d6b151f15ababeece2ae7da7885..f367cd0f4eaa5d107677f5ce23333fa02568f789 100644 (file)
@@ -79,6 +79,9 @@ TEST_COMPARE256(sse2, compare256_sse2, test_cpu_features.x86.has_sse2)
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
 TEST_COMPARE256(avx2, compare256_avx2, test_cpu_features.x86.has_avx2)
 #endif
+#if defined(X86_AVX512) && defined(HAVE_BUILTIN_CTZLL)
+TEST_COMPARE256(avx512, compare256_avx512, test_cpu_features.x86.has_avx512_common)
+#endif
 #if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
 TEST_COMPARE256(neon, compare256_neon, test_cpu_features.arm.has_neon)
 #endif