From: Adam Stylinski Date: Sun, 23 Jan 2022 16:59:57 +0000 (-0500) Subject: Marginal improvement by pipelining loads on NEON X-Git-Tag: 2.1.0-beta1~395 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=9146bd472c23a501725d275f8b7edec93c072466;p=thirdparty%2Fzlib-ng.git Marginal improvement by pipelining loads on NEON The ld1{4 reg} variant saves us instructions and only adds 3 cycles of latency to load 3 more neon/asimd registers worth of data. --- diff --git a/CMakeLists.txt b/CMakeLists.txt index f15d1f9cb..ccac51cbf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -628,6 +628,10 @@ if(WITH_OPTIM) endif() add_feature_info(NEON_ADLER32 1 "Support NEON instructions in adler32, using \"${NEONFLAG}\"") add_feature_info(NEON_SLIDEHASH 1 "Support NEON instructions in slide_hash, using \"${NEONFLAG}\"") + check_neon_ld4_intrinsics() + if(NEON_HAS_LD4) + add_definitions(-DARM_NEON_HASLD4) + endif() else() set(WITH_NEON OFF) endif() diff --git a/arch/arm/slide_hash_neon.c b/arch/arm/slide_hash_neon.c index b90e47021..6ff7a0bb6 100644 --- a/arch/arm/slide_hash_neon.c +++ b/arch/arm/slide_hash_neon.c @@ -16,10 +16,12 @@ #endif #include "../../zbuild.h" #include "../../deflate.h" +#include "../../fallback_builtins.h" /* SIMD version of hash_chain rebase */ static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) { - Z_REGISTER uint16x8_t v, *p; + Z_REGISTER uint16x8_t v; + uint16x8x4_t p0, p1; Z_REGISTER size_t n; size_t size = entries*sizeof(table[0]); @@ -28,18 +30,15 @@ static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize Assert(sizeof(Pos) == 2, "Wrong Pos size"); v = vdupq_n_u16(wsize); - p = (uint16x8_t *)table; n = size / (sizeof(uint16x8_t) * 8); do { - p[0] = vqsubq_u16(p[0], v); - p[1] = vqsubq_u16(p[1], v); - p[2] = vqsubq_u16(p[2], v); - p[3] = vqsubq_u16(p[3], v); - p[4] = vqsubq_u16(p[4], v); - p[5] = vqsubq_u16(p[5], v); - p[6] = vqsubq_u16(p[6], v); - p[7] = vqsubq_u16(p[7], v); - p += 8; + p0 = vld1q_u16_x4(table); + p1 = vld1q_u16_x4(table+32); + vqsubq_u16_x4_x1(p0, p0, v); + vqsubq_u16_x4_x1(p1, p1, v); + vst1q_u16_x4(table, p0); + vst1q_u16_x4(table+32, p1); + table += 64; } while (--n); } diff --git a/cmake/detect-intrinsics.cmake b/cmake/detect-intrinsics.cmake index f7d27dc2b..c4231c26a 100644 --- a/cmake/detect-intrinsics.cmake +++ b/cmake/detect-intrinsics.cmake @@ -151,6 +151,34 @@ macro(check_neon_compiler_flag) set(CMAKE_REQUIRED_FLAGS) endmacro() +macro(check_neon_ld4_intrinsics) + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") + if(NOT NATIVEFLAG) + if("${ARCH}" MATCHES "aarch64") + set(NEONFLAG "-march=armv8-a+simd") + else() + set(NEONFLAG "-mfpu=neon") + endif() + endif() + endif() + # Check whether compiler supports loading 4 neon vecs into a register range + set(CMAKE_REQUIRED_FLAGS "${NEONFLAG}") + check_c_source_compiles( + "#ifdef _M_ARM64 + # include + #else + # include + #endif + int main(void) { + int stack_var[16]; + int32x4x4_t v = vld1q_s32_x4(stack_var); + (void)v; + return 0; + }" + NEON_HAS_LD4) + set(CMAKE_REQUIRED_FLAGS) +endmacro() + macro(check_pclmulqdq_intrinsics) if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") if(NOT NATIVEFLAG) diff --git a/configure b/configure index 1f8c64db0..93b109b7c 100755 --- a/configure +++ b/configure @@ -1169,6 +1169,29 @@ EOF fi } +check_neon_ld4_intrinsics() { + cat > $test.c << EOF +#ifdef _M_ARM64 +# include +#else +# include +#endif +int main(void) { + int stack_var[16]; + int32x4x4_t v = vld1q_s32_x4(stack_var); + (void)v; + return 0; +} +EOF + if try $CC -c $CFLAGS -march=native $test.c; then + NEON_HAS_LD4=1 + echo "check whether compiler supports 4 wide register loads ... Yes." | tee -a configure.log + else + NEON_HAS_LD4=0 + echo "check whether compiler supports 4 wide register loads ... No." | tee -a configure.log + fi +} + check_pclmulqdq_intrinsics() { # Check whether compiler supports PCLMULQDQ intrinsics cat > $test.c << EOF @@ -1658,6 +1681,7 @@ EOF if test $without_optimizations -eq 0; then check_acle_compiler_flag check_neon_compiler_flag + check_neon_ld4_intrinsics fi case "${ARCH}" in @@ -1700,6 +1724,11 @@ EOF neonflag="-mfpu=neon" fi + if test $NEON_HAS_LD4 -eq 1; then + CFLAGS="${CFLAGS} -DARM_NEON_HASLD4" + SFLAGS="${SFLAGS} -DARM_NEON_HASLD4" + fi + CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH" SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH" @@ -1722,6 +1751,11 @@ EOF neonflag="-mfpu=neon" fi + if test $NEON_HAS_LD4 -eq 1; then + CFLAGS="${CFLAGS} -DARM_NEON_HASLD4" + SFLAGS="${SFLAGS} -DARM_NEON_HASLD4" + fi + CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH" SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH" @@ -1750,6 +1784,11 @@ EOF neonflag="-mfpu=neon" fi + if test $NEON_HAS_LD4 -eq 1; then + CFLAGS="${CFLAGS} -DARM_NEON_HASLD4" + SFLAGS="${SFLAGS} -DARM_NEON_HASLD4" + fi + CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH" SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH" @@ -1793,6 +1832,11 @@ EOF fi fi + if test $NEON_HAS_LD4 -eq 1; then + CFLAGS="${CFLAGS} -DARM_NEON_HASLD4" + SFLAGS="${SFLAGS} -DARM_NEON_HASLD4" + fi + if test $buildacle -eq 1; then if test $native -eq 0; then ARCH="${ARCH}+crc" diff --git a/fallback_builtins.h b/fallback_builtins.h index 5aaa2c681..ee623a66f 100644 --- a/fallback_builtins.h +++ b/fallback_builtins.h @@ -37,10 +37,10 @@ static __forceinline unsigned long long __builtin_ctzll(uint64_t value) { return trailing_zero; } #define HAVE_BUILTIN_CTZLL -#endif +#endif // Microsoft AMD64 -#endif -#endif +#endif // Microsoft AMD64/IA64/x86/ARM/ARM64 test +#endif // _MSC_VER & !clang /* Unfortunately GCC didn't support these things until version 10 */ #ifdef __AVX2__ @@ -63,4 +63,43 @@ static inline __m512i _mm512_zextsi128_si512(__m128i a) { #endif // gcc version 10 test #endif // __AVX2__ + +#ifdef ARM_NEON_SLIDEHASH + +#define vqsubq_u16_x4_x1(out, a, b) do { \ + out.val[0] = vqsubq_u16(a.val[0], b); \ + out.val[1] = vqsubq_u16(a.val[1], b); \ + out.val[2] = vqsubq_u16(a.val[2], b); \ + out.val[3] = vqsubq_u16(a.val[3], b); \ +} while (0) + +/* Have to check for hard float ABI on GCC/clang, but not + * on MSVC (we don't compile for the soft float ABI on windows) + */ +#if !defined(ARM_NEON_HASLD4) && (defined(__ARM_FP) || defined(_MSC_VER)) + +#ifdef _M_ARM64 +# include +#else +# include +#endif + +static inline uint16x8x4_t vld1q_u16_x4(uint16_t *a) { + uint16x8x4_t ret = (uint16x8x4_t) {{ + vld1q_u16(a), + vld1q_u16(a+8), + vld1q_u16(a+16), + vld1q_u16(a+24)}}; + return ret; +} + +static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) { + vst1q_u16(p, a.val[0]); + vst1q_u16(p + 8, a.val[1]); + vst1q_u16(p + 16, a.val[2]); + vst1q_u16(p + 24, a.val[3]); +} +#endif // HASLD4 check and hard float +#endif // ARM_NEON_SLIDEHASH + #endif // include guard FALLBACK_BUILTINS_H diff --git a/win32/Makefile.a64 b/win32/Makefile.a64 index e65123360..1dc0815a3 100644 --- a/win32/Makefile.a64 +++ b/win32/Makefile.a64 @@ -25,12 +25,13 @@ RC = rc CP = copy /y CFLAGS = -nologo -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC) WFLAGS = \ + -D_ARM64_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \ -D_CRT_SECURE_NO_DEPRECATE \ -D_CRT_NONSTDC_NO_DEPRECATE \ + -DARM_NEON_HASLD4 \ + -DARM_FEATURES \ -DUNALIGNED_OK \ -DUNALIGNED64_OK \ - -D_ARM64_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \ - -DARM_FEATURES \ # LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest ARFLAGS = -nologo diff --git a/win32/Makefile.arm b/win32/Makefile.arm index 29ce95581..7628b617f 100644 --- a/win32/Makefile.arm +++ b/win32/Makefile.arm @@ -25,11 +25,12 @@ RC = rc CP = copy /y CFLAGS = -nologo -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC) WFLAGS = \ + -D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \ -D_CRT_SECURE_NO_DEPRECATE \ -D_CRT_NONSTDC_NO_DEPRECATE \ - -DUNALIGNED_OK \ - -D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \ -DARM_FEATURES \ + -DARM_NEON_HASLD4 \ + -DUNALIGNED_OK \ # LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest ARFLAGS = -nologo