From: Nathan Moinvaziri Date: Mon, 18 Apr 2022 01:47:07 +0000 (-0700) Subject: Implement neon version of compare256. X-Git-Tag: 2.1.0-beta1~252 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=48f346e806ff9c9fd4cb45c57191f972d5a56c35;p=thirdparty%2Fzlib-ng.git Implement neon version of compare256. Co-authored-by: Adam Stylinski --- diff --git a/CMakeLists.txt b/CMakeLists.txt index f0b08808..ed438f79 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -601,8 +601,9 @@ if(WITH_OPTIM) if(WITH_NEON) check_neon_compiler_flag() if(MFPU_NEON_AVAILABLE) - add_definitions(-DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH) - set(NEON_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/chunkset_neon.c ${ARCHDIR}/slide_hash_neon.c) + add_definitions(-DARM_NEON -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH) + set(NEON_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/chunkset_neon.c + ${ARCHDIR}/compare256_neon.c ${ARCHDIR}/slide_hash_neon.c) list(APPEND ZLIB_ARCH_SRCS ${NEON_SRCS}) set_property(SOURCE ${NEON_SRCS} PROPERTY COMPILE_FLAGS "${NEONFLAG} ${NOLTOFLAG}") if(MSVC) diff --git a/README.md b/README.md index e02e04b3..40ad1f85 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ Features * CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z * Hash table implementation using CRC32-C intrinsics on x86 and ARM * Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX - * Compare256 implementations using SSE2 & AVX2 + * Compare256 implementations using SSE2, AVX2, & Neon * Inflate chunk copying using SSE2, AVX, Neon & VSX * Support for hardware-accelerated deflate using IBM Z DFLTCC * Unaligned memory read/writes and large bit buffer improvements diff --git a/arch/arm/Makefile.in b/arch/arm/Makefile.in index f47325c2..abf6193f 100644 --- a/arch/arm/Makefile.in +++ b/arch/arm/Makefile.in @@ -20,6 +20,7 @@ all: \ adler32_neon.o adler32_neon.lo \ arm_features.o arm_features.lo \ chunkset_neon.o chunkset_neon.lo \ + compare256_neon.o compare256_neon.lo \ crc32_acle.o crc32_acle.lo \ slide_hash_neon.o slide_hash_neon.lo \ insert_string_acle.o insert_string_acle.lo @@ -42,6 +43,12 @@ chunkset_neon.o: chunkset_neon.lo: $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c +compare256_neon.o: + $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c + +compare256_neon.lo: + $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c + crc32_acle.o: $(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c diff --git a/arch/arm/compare256_neon.c b/arch/arm/compare256_neon.c new file mode 100644 index 00000000..53a088cc --- /dev/null +++ b/arch/arm/compare256_neon.c @@ -0,0 +1,60 @@ +/* compare256_neon.c - NEON version of compare256 + * Copyright (C) 2022 Nathan Moinvaziri + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) +#ifdef _M_ARM64 +# include +#else +# include +#endif +#include "../../zbuild.h" + +static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t *src1) { + uint32_t len = 0; + + do { + uint8x16_t a, b, cmp; + uint64_t lane; + + a = vld1q_u8(src0); + b = vld1q_u8(src1); + + cmp = veorq_u8(a, b); + + lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 0); + if (lane) { + uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8; + return len + match_byte; + } + len += 8; + lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 1); + if (lane) { + uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8; + return len + match_byte; + } + len += 8; + + src0 += 16, src1 += 16; + } while (len < 256); + + return 256; +} + +Z_INTERNAL uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1) { + return compare256_neon_static(src0, src1); +} + +#define LONGEST_MATCH longest_match_neon +#define COMPARE256 compare256_neon_static + +#include "match_tpl.h" + +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_neon +#define COMPARE256 compare256_neon_static + +#include "match_tpl.h" + +#endif diff --git a/configure b/configure index a357e1c8..836e0307 100755 --- a/configure +++ b/configure @@ -1659,7 +1659,10 @@ EOF fi if test $buildneon -eq 1; then - if test $MFPU_NEON_AVAILABLE -eq 1;then + CFLAGS="${CFLAGS} -DARM_NEON" + SFLAGS="${SFLAGS} -DARM_NEON" + + if test $MFPU_NEON_AVAILABLE -eq 1; then neonflag="-mfpu=neon" fi @@ -1671,8 +1674,8 @@ EOF CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH" SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o slide_hash_neon.o" - ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo slide_hash_neon.lo" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o compare256_neon.o slide_hash_neon.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo compare256_neon.lo slide_hash_neon.lo" fi fi ;; @@ -1683,6 +1686,9 @@ EOF fi if test $buildneon -eq 1; then + CFLAGS="${CFLAGS} -DARM_NEON" + SFLAGS="${SFLAGS} -DARM_NEON" + if test $MFPU_NEON_AVAILABLE -eq 1;then neonflag="-mfpu=neon" fi @@ -1695,8 +1701,8 @@ EOF CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH" SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o slide_hash_neon.o" - ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo slide_hash_neon.lo" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o compare256_neon.o slide_hash_neon.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo compare256_neon.lo slide_hash_neon.lo" fi fi ;; @@ -1713,6 +1719,9 @@ EOF fi if test $buildneon -eq 1; then + CFLAGS="${CFLAGS} -DARM_NEON" + SFLAGS="${SFLAGS} -DARM_NEON" + if test $MFPU_NEON_AVAILABLE -eq 1;then neonflag="-mfpu=neon" fi @@ -1725,8 +1734,8 @@ EOF CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH" SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o slide_hash_neon.o" - ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo slide_hash_neon.lo" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o compare256_neon.o slide_hash_neon.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo compare256_neon.lo slide_hash_neon.lo" fi fi ;; @@ -1786,10 +1795,10 @@ EOF if test $native -eq 0; then ARCH="${ARCH}+simd" fi - CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH" - SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o slide_hash_neon.o" - ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo slide_hash_neon.lo" + CFLAGS="${CFLAGS} -DARM_NEON -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH" + SFLAGS="${SFLAGS} -DARM_NEON -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o compare256_neon.o slide_hash_neon.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo compare256_neon.lo slide_hash_neon.lo" fi fi diff --git a/cpu_features.h b/cpu_features.h index 4dcf8e59..504c6a93 100644 --- a/cpu_features.h +++ b/cpu_features.h @@ -127,6 +127,9 @@ extern uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1); #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) extern uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1); #endif +#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) +extern uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1); +#endif #ifdef DEFLATE_H_ /* insert_string */ @@ -154,6 +157,9 @@ extern uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match); #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) extern uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match); #endif +#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) +extern uint32_t longest_match_neon(deflate_state *const s, Pos cur_match); +#endif /* longest_match_slow */ extern uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match); @@ -170,6 +176,9 @@ extern uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match); #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) extern uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match); #endif +#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) +extern uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match); +#endif /* quick_insert_string */ extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str); diff --git a/functable.c b/functable.c index 68aef1d3..74381e15 100644 --- a/functable.c +++ b/functable.c @@ -117,6 +117,10 @@ Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) { if (x86_cpu_has_avx2) functable.longest_match = &longest_match_avx2; #endif +#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) + if (arm_cpu_has_neon) + functable.longest_match = &longest_match_neon; +#endif return functable.longest_match(s, cur_match); } @@ -142,6 +146,10 @@ Z_INTERNAL uint32_t longest_match_slow_stub(deflate_state *const s, Pos cur_matc if (x86_cpu_has_avx2) functable.longest_match_slow = &longest_match_slow_avx2; #endif +#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) + if (arm_cpu_has_neon) + functable.longest_match_slow = &longest_match_slow_neon; +#endif return functable.longest_match_slow(s, cur_match); } diff --git a/test/benchmarks/benchmark_compare256.cc b/test/benchmarks/benchmark_compare256.cc index cc1ee5c1..c579d9ac 100644 --- a/test/benchmarks/benchmark_compare256.cc +++ b/test/benchmarks/benchmark_compare256.cc @@ -76,3 +76,6 @@ BENCHMARK_COMPARE256(sse2, compare256_sse2, x86_cpu_has_sse2); #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) BENCHMARK_COMPARE256(avx2, compare256_avx2, x86_cpu_has_avx2); #endif +#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) +BENCHMARK_COMPARE256(neon, compare256_neon, arm_cpu_has_neon); +#endif diff --git a/test/test_compare256.cc b/test/test_compare256.cc index c252cfad..61c6e19b 100644 --- a/test/test_compare256.cc +++ b/test/test_compare256.cc @@ -72,3 +72,6 @@ TEST_COMPARE256(sse2, compare256_sse2, x86_cpu_has_sse2) #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) TEST_COMPARE256(avx2, compare256_avx2, x86_cpu_has_avx2) #endif +#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) +TEST_COMPARE256(neon, compare256_neon, arm_cpu_has_neon) +#endif diff --git a/win32/Makefile.a64 b/win32/Makefile.a64 index 29e66cee..b0d7993d 100644 --- a/win32/Makefile.a64 +++ b/win32/Makefile.a64 @@ -93,12 +93,13 @@ OBJS = $(OBJS) gzlib.obj gzread.obj gzwrite.obj WFLAGS = $(WFLAGS) \ -DARM_ACLE_CRC_HASH \ -D__ARM_NEON__=1 \ + -DARM_NEON \ -DARM_NEON_ADLER32 \ -DARM_NEON_CHUNKSET \ -DARM_NEON_SLIDEHASH \ -DARM_NOCHECK_NEON \ # -OBJS = $(OBJS) crc32_acle.obj insert_string_acle.obj adler32_neon.obj chunkset_neon.obj slide_hash_neon.obj +OBJS = $(OBJS) crc32_acle.obj insert_string_acle.obj adler32_neon.obj chunkset_neon.obj compare256_neon.obj slide_hash_neon.obj # targets all: $(STATICLIB) $(SHAREDLIB) $(IMPLIB) \ diff --git a/win32/Makefile.arm b/win32/Makefile.arm index a43dc59b..14df718d 100644 --- a/win32/Makefile.arm +++ b/win32/Makefile.arm @@ -105,12 +105,13 @@ NEON_ARCH = /arch:VFPv3 CFLAGS = $(CFLAGS) $(NEON_ARCH) WFLAGS = $(WFLAGS) \ -D__ARM_NEON__=1 \ + -DARM_NEON \ -DARM_NEON_ADLER32 \ -DARM_NEON_CHUNKSET \ -DARM_NEON_SLIDEHASH \ -DARM_NOCHECK_NEON \ # -OBJS = $(OBJS) adler32_neon.obj chunkset_neon.obj slide_hash_neon.obj +OBJS = $(OBJS) adler32_neon.obj chunkset_neon.obj compare256_neon.obj slide_hash_neon.obj !endif # targets