From: Cameron Cawley Date: Wed, 12 Jul 2023 17:48:48 +0000 (+0100) Subject: Add ARMv6 version of slide_hash X-Git-Tag: 2.1.4~15 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=16fe1f885ef33553ce3bd92762eef58ab526a1ab;p=thirdparty%2Fzlib-ng.git Add ARMv6 version of slide_hash --- diff --git a/CMakeLists.txt b/CMakeLists.txt index f74c0e454..6cccb4659 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -101,6 +101,7 @@ set_property(CACHE WITH_SANITIZER PROPERTY STRINGS "Memory" "Address" "Undefined if(BASEARCH_ARM_FOUND) option(WITH_ACLE "Build with ACLE" ON) option(WITH_NEON "Build with NEON intrinsics" ON) + option(WITH_ARMV6 "Build with ARMv6 SIMD" ON) elseif(BASEARCH_PPC_FOUND) option(WITH_ALTIVEC "Build with AltiVec (VMX) optimisations for PowerPC" ON) option(WITH_POWER8 "Build with optimisations for POWER8" ON) @@ -128,6 +129,7 @@ mark_as_advanced(FORCE ZLIB_SYMBOL_PREFIX WITH_REDUCED_MEM WITH_ACLE WITH_NEON + WITH_ARMV6 WITH_DFLTCC_DEFLATE WITH_DFLTCC_INFLATE WITH_CRC32_VX @@ -292,6 +294,10 @@ endif() # # Check for standard/system includes # +check_include_file(arm_acle.h HAVE_ARM_ACLE_H) +if(HAVE_ARM_ACLE_H) + add_definitions(-DHAVE_ARM_ACLE_H) +endif() check_include_file(sys/auxv.h HAVE_SYS_AUXV_H) if(HAVE_SYS_AUXV_H) add_definitions(-DHAVE_SYS_AUXV_H) @@ -648,6 +654,23 @@ if(WITH_OPTIM) set(WITH_NEON OFF) endif() endif() + if(WITH_ARMV6) + check_armv6_compiler_flag() + if(HAVE_ARMV6_INLINE_ASM OR HAVE_ARMV6_INTRIN) + add_definitions(-DARM_SIMD) + set(ARMV6_SRCS ${ARCHDIR}/slide_hash_armv6.c) + set_property(SOURCE ${ARMV6_SRCS} PROPERTY COMPILE_FLAGS "${ARMV6FLAG} ${NOLTOFLAG}") + list(APPEND ZLIB_ARCH_SRCS ${ARMV6_SRCS}) + add_feature_info(ARMV6 1 "Support ARMv6 SIMD instructions in slide_hash, using \"${ARMV6FLAG}\"") + if(HAVE_ARMV6_INTRIN) + add_definitions(-DARM_SIMD_INTRIN) + endif() + else() + set(WITH_ARMV6 OFF) + endif() + else() + set(WITH_ARMV6 OFF) + endif() elseif(BASEARCH_PPC_FOUND) # Common arch detection code 
if(WITH_ALTIVEC) @@ -1215,6 +1238,7 @@ add_feature_info(WITH_INFLATE_ALLOW_INVALID_DIST WITH_INFLATE_ALLOW_INVALID_DIST if(BASEARCH_ARM_FOUND) add_feature_info(WITH_ACLE WITH_ACLE "Build with ACLE") add_feature_info(WITH_NEON WITH_NEON "Build with NEON intrinsics") + add_feature_info(WITH_ARMV6 WITH_ARMV6 "Build with ARMv6 SIMD") elseif(BASEARCH_PPC_FOUND) add_feature_info(WITH_ALTIVEC WITH_ALTIVEC "Build with AltiVec optimisations") add_feature_info(WITH_POWER8 WITH_POWER8 "Build with optimisations for POWER8") diff --git a/README.md b/README.md index c83b8487f..dfd461a01 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Features * Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX * CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z * Hash table implementation using CRC32-C intrinsics on x86 and ARM - * Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX + * Slide hash implementations using SSE2, AVX2, ARMv6, Neon, VMX & VSX * Compare256 implementations using SSE2, AVX2, Neon, POWER9 & RVV * Inflate chunk copying using SSE2, SSSE3, AVX, Neon & VSX * Support for hardware-accelerated deflate using IBM Z DFLTCC @@ -194,6 +194,7 @@ Advanced Build Options | WITH_VPCLMULQDQ | --without-vpclmulqdq | Build with VPCLMULQDQ intrinsics | ON | | WITH_ACLE | --without-acle | Build with ACLE intrinsics | ON | | WITH_NEON | --without-neon | Build with NEON intrinsics | ON | +| WITH_ARMV6 | --without-armv6 | Build with ARMv6 intrinsics | ON | | WITH_ALTIVEC | --without-altivec | Build with AltiVec (VMX) intrinsics | ON | | WITH_POWER8 | --without-power8 | Build with POWER8 optimisations | ON | | WITH_RVV | | Build with RVV intrinsics | ON | diff --git a/arch/arm/Makefile.in b/arch/arm/Makefile.in index 717754760..9d05b00b5 100644 --- a/arch/arm/Makefile.in +++ b/arch/arm/Makefile.in @@ -10,6 +10,7 @@ SUFFIX= ACLEFLAG= NEONFLAG= +ARMV6FLAG= NOLTOFLAG= SRCDIR=. 
@@ -23,6 +24,7 @@ all: \
 	compare256_neon.o compare256_neon.lo \
 	crc32_acle.o crc32_acle.lo \
 	slide_hash_neon.o slide_hash_neon.lo \
+	slide_hash_armv6.o slide_hash_armv6.lo \
 	insert_string_acle.o insert_string_acle.lo
 
 adler32_neon.o:
@@ -61,6 +63,12 @@ slide_hash_neon.o:
 slide_hash_neon.lo:
 	$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
 
+slide_hash_armv6.o:
+	$(CC) $(CFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
+
+slide_hash_armv6.lo:
+	$(CC) $(SFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
+
 insert_string_acle.o:
 	$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
 
diff --git a/arch/arm/acle_intrins.h b/arch/arm/acle_intrins.h
new file mode 100644
index 000000000..e9eef12df
--- /dev/null
+++ b/arch/arm/acle_intrins.h
@@ -0,0 +1,30 @@
+#ifndef ARM_ACLE_INTRINS_H
+#define ARM_ACLE_INTRINS_H
+
+#include <stdint.h>
+#ifdef _MSC_VER
+#  include <intrin.h>
+#elif defined(HAVE_ARM_ACLE_H)
+#  include <arm_acle.h>
+#endif
+
+#ifdef ARM_SIMD
+#ifdef _MSC_VER
+typedef uint32_t uint16x2_t;
+
+/* MSVC: map the ACLE name onto the _arm_uqsub16 intrinsic. */
+#define __uqsub16 _arm_uqsub16
+#elif !defined(ARM_SIMD_INTRIN)
+typedef uint32_t uint16x2_t;
+
+/* No usable ACLE intrinsic: issue UQSUB16 (per-halfword unsigned saturating
+ * subtract of __b from __a) through inline assembly instead. */
+static inline uint16x2_t __uqsub16(uint16x2_t __a, uint16x2_t __b) {
+    uint16x2_t __c;
+    __asm__ __volatile__("uqsub16\t%0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b));
+    return __c;
+}
+#endif
+#endif
+
+#endif // include guard ARM_ACLE_INTRINS_H
diff --git a/arch/arm/arm_features.c b/arch/arm/arm_features.c
index 8ef820009..a0e070ba9 100644
--- a/arch/arm/arm_features.c
+++ b/arch/arm/arm_features.c
@@ -72,10 +72,31 @@ static inline int arm_has_neon() {
 }
 #endif
 
+/* AArch64 does not have ARMv6 SIMD. */
+#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
+/* Report whether the 32-bit ARMv6 SIMD instructions may be used at runtime. */
+static inline int arm_has_simd() {
+#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+    /* getauxval() returns 0 when AT_PLATFORM is missing; never hand NULL to strncmp(). */
+    const char *platform = (const char *)getauxval(AT_PLATFORM);
+    return platform != NULL
+        && (strncmp(platform, "v6l", 3) == 0
+        || strncmp(platform, "v7l", 3) == 0
+        || strncmp(platform, "v8l", 3) == 0);
+#elif defined(ARM_NOCHECK_SIMD)
+    return 1;
+#else
+    return 0;
+#endif
+}
+#endif
+
 void Z_INTERNAL arm_check_features(struct arm_cpu_features *features) {
 #if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+    features->has_simd = 0; /* never available */
     features->has_neon = 1; /* always available */
 #else
+    features->has_simd = arm_has_simd();
     features->has_neon = arm_has_neon();
 #endif
     features->has_crc32 = arm_has_crc32();
diff --git a/arch/arm/arm_features.h b/arch/arm/arm_features.h
index 6fcd8d3eb..eca078e31 100644
--- a/arch/arm/arm_features.h
+++ b/arch/arm/arm_features.h
@@ -6,6 +6,7 @@
 #define ARM_H_
 
 struct arm_cpu_features {
+    int has_simd;
     int has_neon;
     int has_crc32;
 };
diff --git a/arch/arm/slide_hash_armv6.c b/arch/arm/slide_hash_armv6.c
new file mode 100644
index 000000000..0a2eeccf9
--- /dev/null
+++ b/arch/arm/slide_hash_armv6.c
@@ -0,0 +1,55 @@
+/* slide_hash_armv6.c -- Optimized hash table shifting for ARMv6 with support for SIMD instructions
+ * Copyright (C) 2023 Cameron Cawley
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#if defined(ARM_SIMD)
+#include "acle_intrins.h"
+#include "../../zbuild.h"
+#include "../../deflate.h"
+
+/* SIMD version of hash_chain rebase: subtract wsize from every table entry,
+ * saturating at zero, two 16-bit entries per UQSUB16 and eight entries per
+ * loop iteration. The table must hold a non-zero multiple of eight entries:
+ * the Assert only checks the multiple, and the do/while body runs before the
+ * count is tested.
+ */
+static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+    Z_REGISTER uint16x2_t v;
+    uint16x2_t p0, p1, p2, p3;
+    Z_REGISTER size_t n;
+
+    size_t size = entries*sizeof(table[0]);
+    Assert((size % (sizeof(uint16x2_t) * 4) == 0), "hash table size err");
+
+    Assert(sizeof(Pos) == 2, "Wrong Pos size");
+    /* Replicate wsize into both halfwords. Widen first: wsize promotes to int,
+     * so shifting 0x8000 (a 32K window) left by 16 would overflow a signed int. */
+    v = (uint32_t)wsize | ((uint32_t)wsize << 16);
+
+    n = size / (sizeof(uint16x2_t) * 4);
+    do {
+        p0 = *((const uint16x2_t *)(table));
+        p1 = *((const uint16x2_t *)(table+2));
+        p2 = *((const uint16x2_t *)(table+4));
+        p3 = *((const uint16x2_t *)(table+6));
+        p0 = __uqsub16(p0, v);
+        p1 = __uqsub16(p1, v);
+        p2 = __uqsub16(p2, v);
+        p3 = __uqsub16(p3, v);
+        *((uint16x2_t *)(table)) = p0;
+        *((uint16x2_t *)(table+2)) = p1;
+        *((uint16x2_t *)(table+4)) = p2;
+        *((uint16x2_t *)(table+6)) = p3;
+        table += 8;
+    } while (--n);
+}
+
+/* Slide both deflate hash tables (head and prev) down by the window size. */
+Z_INTERNAL void slide_hash_armv6(deflate_state *s) {
+    unsigned int wsize = s->w_size;
+
+    slide_hash_chain(s->head, HASH_SIZE, wsize);
+    slide_hash_chain(s->prev, wsize, wsize);
+}
+#endif
diff --git a/cmake/detect-intrinsics.cmake b/cmake/detect-intrinsics.cmake
index a4a28b445..04830fe61 100644
--- a/cmake/detect-intrinsics.cmake
+++ b/cmake/detect-intrinsics.cmake
@@ -44,6 +44,49 @@ macro(check_acle_compiler_flag)
     endif()
 endmacro()
 
+# Detect ARMv6 SIMD support. Sets ARMV6FLAG when -march=armv6 is accepted
+# (GNU/Clang without NATIVEFLAG) and probes HAVE_ARMV6_INLINE_ASM and
+# HAVE_ARMV6_INTRIN with those flags; CMAKE_REQUIRED_FLAGS is cleared on exit.
+macro(check_armv6_compiler_flag)
+    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            check_c_compiler_flag("-march=armv6" HAVE_MARCH_ARMV6)
+            if(HAVE_MARCH_ARMV6)
+                set(ARMV6FLAG "-march=armv6" CACHE INTERNAL "Compiler option to enable ARMv6 support")
+            endif()
+        endif()
+    endif()
+    # Check whether compiler supports ARMv6 inline asm
+    set(CMAKE_REQUIRED_FLAGS "${ARMV6FLAG} ${NATIVEFLAG}")
+    check_c_source_compile_or_run(
+        "unsigned int f(unsigned int a, unsigned int b) {
+            unsigned int c;
+            __asm__ __volatile__ ( \"uqsub16 %0, %1, %2\" : \"=r\" (c) : \"r\" (a), \"r\" (b) );
+            return c;
+        }
+        int main(void) { return 0; }"
+        HAVE_ARMV6_INLINE_ASM
+    )
+    # Check whether compiler supports ARMv6 intrinsics
+    check_c_source_compile_or_run(
+        "#if defined(_MSC_VER)
+        #include <intrin.h>
+        #else
+        #include <arm_acle.h>
+        #endif
+        unsigned int f(unsigned int a, unsigned int b) {
+        #if defined(_MSC_VER)
+            return _arm_uqsub16(a, b);
+        #else
+            return __uqsub16(a, b);
+        #endif
+        }
+        int main(void) { return 0; }"
+        HAVE_ARMV6_INTRIN
+    )
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
 macro(check_avx512_intrinsics)
     if(CMAKE_C_COMPILER_ID MATCHES "Intel")
         if(CMAKE_HOST_UNIX OR APPLE)
diff --git a/configure b/configure
index d96455e6c..29de72cec 100755
--- a/configure
+++ b/configure
@@ -93,6 +93,7 @@ build32=0
 build64=0
 buildvpclmulqdq=1
 buildacle=1
+buildarmv6=1
 buildaltivec=1
 buildpower8=1
 buildpower9=1
@@ -115,6 +116,7 @@ vpclmulflag="-mvpclmulqdq -mavx512f"
 xsaveflag="-mxsave"
 acleflag=
 neonflag=
+armv6flag=
 noltoflag="-fno-lto"
 vgfmaflag="-march=z13"
 vmxflag="-maltivec"
@@ -167,6 +169,7 @@ case "$1" in
       echo ' [--without-new-strategies] Compiles without using new additional deflate strategies' | tee -a configure.log
       echo ' [--without-acle] Compiles without ARM C Language Extensions' | tee -a configure.log
       echo ' [--without-neon] Compiles without ARM Neon SIMD instruction set' | tee -a configure.log
+      echo ' [--without-armv6] Compiles without ARMv6 SIMD instruction set' | tee -a configure.log
       echo ' [--without-altivec] Compiles without PPC AltiVec support' | tee -a configure.log
       echo ' [--without-power8] Compiles without Power8 instruction set' | tee -a configure.log
       echo ' [--with-dfltcc-deflate] Use DEFLATE CONVERSION CALL instruction for compression on IBM Z' | tee -a configure.log
@@ -198,6 +201,7 @@ case "$1" in
     --without-vpclmulqdq) buildvpclmulqdq=0; shift ;;
     --without-acle) buildacle=0; shift ;;
     --without-neon) buildneon=0; shift ;;
+    --without-armv6) buildarmv6=0; shift ;;
     --without-altivec) buildaltivec=0 ; shift ;;
     --without-power8) buildpower8=0 ; shift ;;
     --without-power9) buildpower9=0 ; shift ;;
@@ -1177,6 +1181,52 @@ EOF
     fi
 }
 
+check_armv6_intrinsics() {
+    # Check whether -march=armv6 works correctly
+    cat > $test.c << EOF
+int main() { return 0; }
+EOF
+    if try $CC -c $CFLAGS -march=armv6 $test.c; then
+        armv6flag=-march=armv6
+        echo "Check whether -march=armv6 works ... Yes." | tee -a configure.log
+    else
+        echo "Check whether -march=armv6 works ... No." | tee -a configure.log
+    fi
+
+    # Check whether compiler supports ARMv6 inline asm
+    cat > $test.c << EOF
+unsigned int f(unsigned int a, unsigned int b) {
+    unsigned int c;
+    __asm__ __volatile__ ( "uqsub16 %0, %1, %2" : "=r" (c) : "r" (a), "r" (b) );
+    return c;
+}
+int main(void) { return 0; }
+EOF
+    if try ${CC} ${CFLAGS} ${armv6flag} $test.c; then
+        echo "Checking for ARMv6 inline assembly ... Yes." | tee -a configure.log
+        HAVE_ARMV6_INLINE_ASM=1
+    else
+        echo "Checking for ARMv6 inline assembly ... No." | tee -a configure.log
+        HAVE_ARMV6_INLINE_ASM=0
+    fi
+
+    # Check whether compiler supports ARMv6 intrinsics
+    cat > $test.c << EOF
+#include <arm_acle.h>
+unsigned int f(unsigned int a, unsigned int b) {
+    return __uqsub16(a, b);
+}
+int main(void) { return 0; }
+EOF
+    if try ${CC} ${CFLAGS} ${armv6flag} $test.c; then
+        echo "Checking for ARMv6 intrinsics ... Yes." | tee -a configure.log
+        HAVE_ARMV6_INTRIN=1
+    else
+        echo "Checking for ARMv6 intrinsics ... No." | tee -a configure.log
+        HAVE_ARMV6_INTRIN=0
+    fi
+}
+
 check_pclmulqdq_intrinsics() {
     # Check whether compiler supports PCLMULQDQ intrinsics
     cat > $test.c << EOF
@@ -1592,6 +1642,18 @@ EOF
         ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} arm_features.o"
         ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} arm_features.lo"
 
+        cat > $test.c <<EOF
+#include <arm_acle.h>
+EOF
+        if try $CC -c $CFLAGS $test.c; then
+            echo "Checking for arm_acle.h... Yes." | tee -a configure.log
+            CFLAGS="${CFLAGS} -DHAVE_ARM_ACLE_H"
+            SFLAGS="${SFLAGS} -DHAVE_ARM_ACLE_H"
+        else
+            echo "Checking for arm_acle.h... No."
| tee -a configure.log + fi + + if test $LINUX -eq 1; then if test "$ARCH" = "aarch64"; then cat > $test.c <> configure.log echo xsaveflag = $xsaveflag >> configure.log echo acleflag = $acleflag >> configure.log echo neonflag = $neonflag >> configure.log +echo armv6flag = $armv6flag >> configure.log echo ARCHDIR = ${ARCHDIR} >> configure.log echo ARCH_STATIC_OBJS = ${ARCH_STATIC_OBJS} >> configure.log echo ARCH_SHARED_OBJS = ${ARCH_SHARED_OBJS} >> configure.log @@ -2001,6 +2082,7 @@ sed < $SRCDIR/$ARCHDIR/Makefile.in " /^XSAVEFLAG *=/s#=.*#=$xsaveflag# /^ACLEFLAG *=/s#=.*#=$acleflag# /^NEONFLAG *=/s#=.*#=$neonflag# +/^ARMV6FLAG *=/s#=.*#=$armv6flag# /^NOLTOFLAG *=/s#=.*#=$noltoflag# /^VGFMAFLAG *=/s#=.*#=$vgfmaflag# /^PPCFLAGS *=/s#=.*#=$vmxflag# diff --git a/cpu_features.h b/cpu_features.h index 870f6e656..faca52ad4 100644 --- a/cpu_features.h +++ b/cpu_features.h @@ -261,7 +261,11 @@ typedef void (*slide_hash_func)(deflate_state *s); #ifdef X86_SSE2 extern void slide_hash_sse2(deflate_state *s); -#elif defined(ARM_NEON) +#endif +#if defined(ARM_SIMD) +extern void slide_hash_armv6(deflate_state *s); +#endif +#if defined(ARM_NEON) extern void slide_hash_neon(deflate_state *s); #endif #if defined(PPC_VMX) diff --git a/functable.c b/functable.c index 6e195acea..3ef840cff 100644 --- a/functable.c +++ b/functable.c @@ -142,6 +142,15 @@ static void init_functable(void) { #endif + // ARM - SIMD +#ifdef ARM_SIMD +# ifndef ARM_NOCHECK_SIMD + if (cf.arm.has_simd) +# endif + { + ft.slide_hash = &slide_hash_armv6; + } +#endif // ARM - NEON #ifdef ARM_NEON # ifndef ARM_NOCHECK_NEON diff --git a/test/benchmarks/benchmark_slidehash.cc b/test/benchmarks/benchmark_slidehash.cc index b5ab45616..e098c815e 100644 --- a/test/benchmarks/benchmark_slidehash.cc +++ b/test/benchmarks/benchmark_slidehash.cc @@ -68,6 +68,9 @@ public: BENCHMARK_SLIDEHASH(c, slide_hash_c, 1); +#ifdef ARM_SIMD +BENCHMARK_SLIDEHASH(armv6, slide_hash_armv6, test_cpu_features.arm.has_simd); +#endif #ifdef 
ARM_NEON BENCHMARK_SLIDEHASH(neon, slide_hash_neon, test_cpu_features.arm.has_neon); #endif diff --git a/win32/Makefile.arm b/win32/Makefile.arm index 7d3f1b58a..34dfe6bba 100644 --- a/win32/Makefile.arm +++ b/win32/Makefile.arm @@ -41,6 +41,7 @@ WITH_GZFILEOP = yes ZLIB_COMPAT = WITH_ACLE = WITH_NEON = +WITH_ARMV6 = WITH_VFPV3 = NEON_ARCH = /arch:VFPv4 SUFFIX = @@ -110,6 +111,13 @@ WFLAGS = $(WFLAGS) \ # OBJS = $(OBJS) adler32_neon.obj chunkset_neon.obj compare256_neon.obj slide_hash_neon.obj !endif +!if "$(WITH_ARMV6)" != "" +WFLAGS = $(WFLAGS) \ + -DARM_SIMD \ + -DARM_NOCHECK_SIMD \ + # +OBJS = $(OBJS) slide_hash_armv6.obj +!endif # targets all: $(STATICLIB) $(SHAREDLIB) $(IMPLIB) \