From: Hans Kristian Rosbach
Date: Thu, 10 Apr 2025 22:46:06 +0000 (+0200)
Subject: Add AVX512 version of compare256
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=refs%2Fpull%2F1901%2Fhead;p=thirdparty%2Fzlib-ng.git

Add AVX512 version of compare256

Improve the speed of sub-16-byte matches by first using a 128-bit
intrinsic; after that, only 512-bit intrinsics are used. This requires
us to overlap on the last run, but that is cheaper than processing the
tail with a 256-bit and then a 128-bit run.

Change the benchmark steps so they do not hit the chunk boundaries of
one function or the other as often; this gives fairer benchmarks.
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bcf0e491..4a453ebe 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1047,6 +1047,8 @@ if(WITH_OPTIM)
         add_feature_info(AVX512_ADLER32 1 "Support AVX512-accelerated adler32, using \"${AVX512FLAG}\"")
         list(APPEND AVX512_SRCS ${ARCHDIR}/chunkset_avx512.c)
         add_feature_info(AVX512_CHUNKSET 1 "Support AVX512 optimized chunkset, using \"${AVX512FLAG}\"")
+        list(APPEND AVX512_SRCS ${ARCHDIR}/compare256_avx512.c)
+        add_feature_info(AVX512_COMPARE256 1 "Support AVX512 optimized compare256, using \"${AVX512FLAG}\"")
         list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_avx512_p.h)
         list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS})
         set_property(SOURCE ${AVX512_SRCS} PROPERTY COMPILE_FLAGS "${AVX512FLAG} ${NOLTOFLAG}")
diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in
index 7705cd09..3b00c3ed 100644
--- a/arch/x86/Makefile.in
+++ b/arch/x86/Makefile.in
@@ -36,6 +36,7 @@ all: \
 	chunkset_ssse3.o chunkset_ssse3.lo \
 	chorba_sse2.o chorba_sse2.lo \
 	compare256_avx2.o compare256_avx2.lo \
+	compare256_avx512.o compare256_avx512.lo \
 	compare256_sse2.o compare256_sse2.lo \
 	crc32_pclmulqdq.o crc32_pclmulqdq.lo \
 	crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \
@@ -84,6 +85,12 @@ compare256_avx2.o:
 compare256_avx2.lo:
 	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
 
+compare256_avx512.o:
+	$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx512.c
+
+compare256_avx512.lo:
+	$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx512.c
+
 compare256_sse2.o:
 	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
 
diff --git a/arch/x86/compare256_avx512.c b/arch/x86/compare256_avx512.c
new file mode 100644
index 00000000..a1ebe0e5
--- /dev/null
+++ b/arch/x86/compare256_avx512.c
@@ -0,0 +1,97 @@
+/* compare256_avx512.c -- AVX512 version of compare256
+ * Copyright (C) 2025 Hans Kristian Rosbach
+ * Based on AVX2 implementation by Mika T. Lindqvist
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#if defined(X86_AVX512) && defined(HAVE_BUILTIN_CTZLL)
+
+#include <immintrin.h>
+#ifdef _MSC_VER
+#  include <intrin.h>
+#endif
+
+static inline uint32_t compare256_avx512_static(const uint8_t *src0, const uint8_t *src1) {
+    __m512i zmm_src0_4, zmm_src1_4;
+    __m512i zmm_src0_3, zmm_src1_3;
+    __m512i zmm_src0_2, zmm_src1_2;
+    __m512i zmm_src0_1, zmm_src1_1;
+    __m128i xmm_src0_0, xmm_src1_0;
+    uint64_t mask_1, mask_2, mask_3, mask_4;
+    uint32_t mask_0;
+
+    // First do a 16-byte round before increasing to 64-byte rounds; this reduces
+    // the penalty for short matches, and those are usually the most common ones.
+    // This requires us to overlap on the last round, giving a small penalty
+    // on matches of 192+ bytes (still faster than AVX2, though).
+
+    // 16 bytes
+    xmm_src0_0 = _mm_loadu_si128((__m128i*)src0);
+    xmm_src1_0 = _mm_loadu_si128((__m128i*)src1);
+    mask_0 = (uint32_t)_mm_cmpeq_epu8_mask(xmm_src0_0, xmm_src1_0); // zero-extended to use __builtin_ctz
+    if (mask_0 != 0x0000FFFF) {
+        // There is potential for using __builtin_ctzg/__builtin_ctzs/_tzcnt_u16/__tzcnt_u16 here
+        uint32_t match_byte = (uint32_t)__builtin_ctz(~mask_0); /* Invert bits so identical = 0 */
+        return match_byte;
+    }
+
+    // 64 bytes
+    zmm_src0_1 = _mm512_loadu_si512((__m512i*)(src0 + 16));
+    zmm_src1_1 = _mm512_loadu_si512((__m512i*)(src1 + 16));
+    mask_1 = _mm512_cmpeq_epu8_mask(zmm_src0_1, zmm_src1_1);
+    if (mask_1 != 0xFFFFFFFFFFFFFFFF) {
+        uint32_t match_byte = (uint32_t)__builtin_ctzll(~mask_1);
+        return 16 + match_byte;
+    }
+
+    // 64 bytes
+    zmm_src0_2 = _mm512_loadu_si512((__m512i*)(src0 + 80));
+    zmm_src1_2 = _mm512_loadu_si512((__m512i*)(src1 + 80));
+    mask_2 = _mm512_cmpeq_epu8_mask(zmm_src0_2, zmm_src1_2);
+    if (mask_2 != 0xFFFFFFFFFFFFFFFF) {
+        uint32_t match_byte = (uint32_t)__builtin_ctzll(~mask_2);
+        return 80 + match_byte;
+    }
+
+    // 64 bytes
+    zmm_src0_3 = _mm512_loadu_si512((__m512i*)(src0 + 144));
+    zmm_src1_3 = _mm512_loadu_si512((__m512i*)(src1 + 144));
+    mask_3 = _mm512_cmpeq_epu8_mask(zmm_src0_3, zmm_src1_3);
+    if (mask_3 != 0xFFFFFFFFFFFFFFFF) {
+        uint32_t match_byte = (uint32_t)__builtin_ctzll(~mask_3);
+        return 144 + match_byte;
+    }
+
+    // 64 bytes (overlaps the previous round by 16 bytes for fast tail processing)
+    zmm_src0_4 = _mm512_loadu_si512((__m512i*)(src0 + 192));
+    zmm_src1_4 = _mm512_loadu_si512((__m512i*)(src1 + 192));
+    mask_4 = _mm512_cmpeq_epu8_mask(zmm_src0_4, zmm_src1_4);
+    if (mask_4 != 0xFFFFFFFFFFFFFFFF) {
+        uint32_t match_byte = (uint32_t)__builtin_ctzll(~mask_4);
+        return 192 + match_byte;
+    }
+
+    return 256;
+}
+
+Z_INTERNAL uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_avx512_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_avx512
+#define COMPARE256 compare256_avx512_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_avx512
+#define COMPARE256 compare256_avx512_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/arch/x86/x86_functions.h b/arch/x86/x86_functions.h
index a8de8d9a..8e1943a7 100644
--- a/arch/x86/x86_functions.h
+++ b/arch/x86/x86_functions.h
@@ -60,6 +60,11 @@ uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *s
 uint32_t chunksize_avx512(void);
 uint8_t* chunkmemset_safe_avx512(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
 void inflate_fast_avx512(PREFIX3(stream)* strm, uint32_t start);
+#  ifdef HAVE_BUILTIN_CTZLL
+uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1);
+uint32_t longest_match_avx512(deflate_state *const s, Pos cur_match);
+uint32_t longest_match_slow_avx512(deflate_state *const s, Pos cur_match);
+#  endif
 #endif
 #ifdef X86_AVX512VNNI
 uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
@@ -169,6 +174,14 @@ uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
 #  define native_chunksize chunksize_avx512
 #  undef native_inflate_fast
 #  define native_inflate_fast inflate_fast_avx512
+#  ifdef HAVE_BUILTIN_CTZLL
+#    undef native_compare256
+#    define native_compare256 compare256_avx512
+#    undef native_longest_match
+#    define native_longest_match longest_match_avx512
+#    undef native_longest_match_slow
+#    define native_longest_match_slow longest_match_slow_avx512
+#  endif
 // X86 - AVX512 (VNNI)
 #  if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__)
 #    undef native_adler32
diff --git a/configure b/configure
index 4fa8d44e..90f5c708 100755
--- a/configure
+++ b/configure
@@ -1694,8 +1694,8 @@ case "${ARCH}" in
         if test ${HAVE_AVX512_INTRIN} -eq 1; then
             CFLAGS="${CFLAGS} -DX86_AVX512"
             SFLAGS="${SFLAGS} -DX86_AVX512"
-            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_avx512.o chunkset_avx512.o"
-            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512.lo chunkset_avx512.lo"
+            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_avx512.o chunkset_avx512.o compare256_avx512.o"
+            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512.lo chunkset_avx512.lo compare256_avx512.lo"
         fi
 
         check_mtune_cascadelake_compiler_flag
diff --git a/functable.c b/functable.c
index aea7dbb3..a7c1bd23 100644
--- a/functable.c
+++ b/functable.c
@@ -139,6 +139,11 @@ static void init_functable(void) {
         ft.chunkmemset_safe = &chunkmemset_safe_avx512;
         ft.chunksize = &chunksize_avx512;
         ft.inflate_fast = &inflate_fast_avx512;
+#  ifdef HAVE_BUILTIN_CTZLL
+        ft.compare256 = &compare256_avx512;
+        ft.longest_match = &longest_match_avx512;
+        ft.longest_match_slow = &longest_match_slow_avx512;
+#  endif
     }
 #endif
 #ifdef X86_AVX512VNNI
diff --git a/test/benchmarks/benchmark_compare256.cc b/test/benchmarks/benchmark_compare256.cc
index c27bff13..8ed2d0eb 100644
--- a/test/benchmarks/benchmark_compare256.cc
+++ b/test/benchmarks/benchmark_compare256.cc
@@ -59,7 +59,7 @@ public:
         } \
         Bench(state, fptr); \
     } \
-    BENCHMARK_REGISTER_F(compare256, name)->Range(1, MAX_COMPARE_SIZE);
+    BENCHMARK_REGISTER_F(compare256, name)->Arg(1)->Arg(10)->Arg(40)->Arg(80)->Arg(100)->Arg(175)->Arg(256);
 
 #ifdef DISABLE_RUNTIME_CPU_DETECTION
 BENCHMARK_COMPARE256(native, native_compare256, 1);
@@ -80,6 +80,9 @@ BENCHMARK_COMPARE256(sse2, compare256_sse2, test_cpu_features.x86.has_sse2);
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
 BENCHMARK_COMPARE256(avx2, compare256_avx2, test_cpu_features.x86.has_avx2);
 #endif
+#if defined(X86_AVX512) && defined(HAVE_BUILTIN_CTZLL)
+BENCHMARK_COMPARE256(avx512, compare256_avx512, test_cpu_features.x86.has_avx512_common);
+#endif
 #if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
 BENCHMARK_COMPARE256(neon, compare256_neon, test_cpu_features.arm.has_neon);
 #endif
diff --git a/test/test_compare256.cc b/test/test_compare256.cc
index 035e63c9..f367cd0f 100644
--- a/test/test_compare256.cc
+++ b/test/test_compare256.cc
@@ -79,6 +79,9 @@ TEST_COMPARE256(sse2, compare256_sse2, test_cpu_features.x86.has_sse2)
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
 TEST_COMPARE256(avx2, compare256_avx2, test_cpu_features.x86.has_avx2)
 #endif
+#if defined(X86_AVX512) && defined(HAVE_BUILTIN_CTZLL)
+TEST_COMPARE256(avx512, compare256_avx512, test_cpu_features.x86.has_avx512_common)
+#endif
 #if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
 TEST_COMPARE256(neon, compare256_neon, test_cpu_features.arm.has_neon)
 #endif
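
A note on the overlap trade-off: the rounds in compare256_avx512_static cover bytes 0..16, 16..80, 80..144, and 144..208, leaving a 48-byte tail. Matching that tail exactly would take a 256-bit and then a 128-bit compare, as the commit message notes; instead, one extra full 512-bit round is loaded at offset 192 so that it ends exactly at byte 256, re-reading 16 bytes the previous round already proved equal. Only matches long enough to reach that final round pay the re-read cost, which is why the in-code comment bounds the penalty at matches of 192+ bytes. The new benchmark arguments (1, 10, 40, 80, 100, 175, 256) likewise sample a spread of match lengths relative to the differing round sizes of the various implementations, so no single variant is measured mostly at its own round edges.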
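The overlapped final round is a general technique, not something AVX512-specific. Below is a minimal, self-contained C sketch of the same idea (an illustration, not zlib-ng code; the name compare_n_overlap is made up for this example) using plain 8-byte loads: compare in fixed-size rounds, locate the first mismatching byte with count-trailing-zeros on the XOR of the two words, and shift the last load back so it ends exactly at the requested length instead of finishing with smaller tail compares. It assumes a little-endian target, the GCC/Clang __builtin_ctzll builtin, and len >= 8.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Length of the common prefix of a and b, up to len bytes (len >= 8).
 * Fixed 8-byte rounds, plus one final round shifted back to end at len. */
static uint32_t compare_n_overlap(const uint8_t *a, const uint8_t *b, uint32_t len) {
    uint32_t off = 0;

    /* Full 8-byte rounds while a whole word still fits. */
    while (off + 8 <= len) {
        uint64_t wa, wb, diff;
        memcpy(&wa, a + off, 8);   /* unaligned-safe loads */
        memcpy(&wb, b + off, 8);
        diff = wa ^ wb;            /* nonzero bits mark mismatching bytes */
        if (diff != 0)             /* lowest set bit = first mismatch (little-endian) */
            return off + ((uint32_t)__builtin_ctzll(diff) >> 3);
        off += 8;
    }

    /* Overlapped final round: rather than byte-wise tail processing, load the
     * last 8 bytes so the round ends exactly at len. The re-read bytes were
     * already proven equal above, so they XOR to zero and cannot set the
     * low bits of diff. */
    if (off < len) {
        uint64_t wa, wb, diff;
        uint32_t last = len - 8;
        memcpy(&wa, a + last, 8);
        memcpy(&wb, b + last, 8);
        diff = wa ^ wb;
        if (diff != 0)
            return last + ((uint32_t)__builtin_ctzll(diff) >> 3);
    }
    return len;
}

int main(void) {
    uint8_t a[256], b[256];
    memset(a, 0x55, sizeof(a));
    memcpy(b, a, sizeof(b));
    b[249] = 0xAA;  /* mismatch that only the overlapped final round sees for len=250 */
    printf("%u\n", compare_n_overlap(a, b, 250));  /* prints 249 */
    printf("%u\n", compare_n_overlap(a, b, 249));  /* prefix identical: prints 249 */
    return 0;
}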