From: Matheus Castanho Date: Sun, 17 Apr 2022 00:12:53 +0000 (-0700) Subject: Implement power9 version of compare256. X-Git-Tag: 2.1.0-beta1~250 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=02d10b252cc54159f7c33823048daec4b023fb22;p=thirdparty%2Fzlib-ng.git Implement power9 version of compare256. Co-authored-by: Nathan Moinvaziri --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 5b01f451..6c45fd84 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -105,6 +105,7 @@ if(BASEARCH_ARM_FOUND) elseif(BASEARCH_PPC_FOUND) option(WITH_ALTIVEC "Build with AltiVec (VMX) optimisations for PowerPC" ON) option(WITH_POWER8 "Build with optimisations for POWER8" ON) + option(WITH_POWER9 "Build with optimisations for POWER9" ON) elseif(BASEARCH_S360_FOUND) option(WITH_DFLTCC_DEFLATE "Build with DFLTCC intrinsics for compression on IBM Z" OFF) option(WITH_DFLTCC_INFLATE "Build with DFLTCC intrinsics for decompression on IBM Z" OFF) @@ -138,6 +139,7 @@ mark_as_advanced(FORCE WITH_PCLMULQDQ WITH_ALTIVEC WITH_POWER8 + WITH_POWER9 WITH_INFLATE_STRICT WITH_INFLATE_ALLOW_INVALID_DIST WITH_UNALIGNED @@ -628,7 +630,10 @@ if(WITH_OPTIM) if(WITH_POWER8) check_power8_intrinsics() endif() - if(HAVE_VMX OR HAVE_POWER8_INTRIN) + if(WITH_POWER9) + check_power9_intrinsics() + endif() + if(HAVE_VMX OR HAVE_POWER8_INTRIN OR HAVE_POWER9_INTRIN) list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power_features.h) list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/power_features.c) endif() @@ -667,6 +672,17 @@ if(WITH_OPTIM) set(WITH_POWER8 OFF) endif() endif() + # Power9 specific options and files + if(WITH_POWER9) + if(HAVE_POWER9_INTRIN) + add_definitions(-DPOWER9) + set(POWER9_SRCS ${ARCHDIR}/compare256_power9.c) + list(APPEND ZLIB_ARCH_SRCS ${POWER9_SRCS}) + set_property(SOURCE ${POWER9_SRCS} PROPERTY COMPILE_FLAGS "${POWER9FLAG} ${NOLTOFLAG}") + else() + set(WITH_POWER9 OFF) + endif() + endif() elseif(BASEARCH_S360_FOUND) check_s390_intrinsics() if(HAVE_S390_INTRIN) @@ -1463,6 +1479,7 @@ 
if(BASEARCH_ARM_FOUND) elseif(BASEARCH_PPC_FOUND) add_feature_info(WITH_ALTIVEC WITH_ALTIVEC "Build with AltiVec optimisations") add_feature_info(WITH_POWER8 WITH_POWER8 "Build with optimisations for POWER8") + add_feature_info(WITH_POWER9 WITH_POWER9 "Build with optimisations for POWER9") elseif(BASEARCH_S360_FOUND) add_feature_info(WITH_DFLTCC_DEFLATE WITH_DFLTCC_DEFLATE "Build with DFLTCC intrinsics for compression on IBM Z") add_feature_info(WITH_DFLTCC_INFLATE WITH_DFLTCC_INFLATE "Build with DFLTCC intrinsics for decompression on IBM Z") diff --git a/README.md b/README.md index 40ad1f85..0dd1b1c3 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ Features * CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z * Hash table implementation using CRC32-C intrinsics on x86 and ARM * Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX - * Compare256 implementations using SSE2, AVX2, & Neon + * Compare256 implementations using SSE2, AVX2, Neon, & POWER9 * Inflate chunk copying using SSE2, AVX, Neon & VSX * Support for hardware-accelerated deflate using IBM Z DFLTCC * Unaligned memory read/writes and large bit buffer improvements diff --git a/arch/power/Makefile.in b/arch/power/Makefile.in index ca0e2ba9..e9be6ddd 100644 --- a/arch/power/Makefile.in +++ b/arch/power/Makefile.in @@ -10,6 +10,7 @@ INCLUDES= SUFFIX= P8FLAGS=-mcpu=power8 +P9FLAGS=-mcpu=power9 PPCFLAGS=-maltivec NOLTOFLAG= @@ -25,6 +26,8 @@ all: power_features.o \ adler32_vmx.lo \ chunkset_power8.o \ chunkset_power8.lo \ + compare256_power9.o \ + compare256_power9.lo \ crc32_power8.o \ crc32_power8.lo \ slide_hash_power8.o \ @@ -56,6 +59,12 @@ chunkset_power8.o: chunkset_power8.lo: $(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c +compare256_power9.o: + $(CC) $(CFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c + +compare256_power9.lo: + $(CC) $(SFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ 
$(SRCDIR)/compare256_power9.c + crc32_power8.o: $(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c diff --git a/arch/power/compare256_power9.c b/arch/power/compare256_power9.c new file mode 100644 index 00000000..9b3e6170 --- /dev/null +++ b/arch/power/compare256_power9.c @@ -0,0 +1,66 @@ +/* compare256_power9.c - Power9 version of compare256 + * Copyright (C) 2019 Matheus Castanho , IBM + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef POWER9 +#include <altivec.h> +#include "../../zbuild.h" +#include "../../zendian.h" + +/* Older versions of GCC misimplemented semantics for these bit counting builtins. + * https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */ +#if defined(__GNUC__) && (__GNUC__ < 12) +# define zng_vec_vctzlsbb(vc, len) __asm__ volatile("vctzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc)) +# define zng_vec_vclzlsbb(vc, len) __asm__ volatile("vclzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc)) +#else +# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vctzlsbb(vc) +# define zng_vec_vclzlsbb(vc, len) len = __builtin_vec_vclzlsbb(vc) +#endif + +static inline uint32_t compare256_power9_static(const uint8_t *src0, const uint8_t *src1) { + uint32_t len = 0, cmplen; + + do { + vector unsigned char vsrc0, vsrc1, vc; + + vsrc0 = *((vector unsigned char *)src0); + vsrc1 = *((vector unsigned char *)src1); + + /* Compare 16 bytes at a time. Each byte of vc will be either + * all ones or all zeroes, depending on the result of the comparison. */ + vc = (vector unsigned char)vec_cmpne(vsrc0, vsrc1); + + /* Since the index of matching bytes will contain only zeroes + * on vc (since we used cmpne), counting the number of consecutive + * bytes where LSB == 0 is the same as counting the length of the match.
*/ +#if BYTE_ORDER == LITTLE_ENDIAN + zng_vec_vctzlsbb(vc, cmplen); +#else + zng_vec_vclzlsbb(vc, cmplen); +#endif + if (cmplen != 16) + return len + cmplen; + + src0 += 16, src1 += 16, len += 16; + } while (len < 256); + + return 256; +} + +Z_INTERNAL uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1) { + return compare256_power9_static(src0, src1); +} + +#define LONGEST_MATCH longest_match_power9 +#define COMPARE256 compare256_power9_static + +#include "match_tpl.h" + +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_power9 +#define COMPARE256 compare256_power9_static + +#include "match_tpl.h" + +#endif diff --git a/arch/power/power_features.c b/arch/power/power_features.c index 65599d9a..7c0350c6 100644 --- a/arch/power/power_features.c +++ b/arch/power/power_features.c @@ -12,6 +12,7 @@ Z_INTERNAL int power_cpu_has_altivec = 0; Z_INTERNAL int power_cpu_has_arch_2_07 = 0; +Z_INTERNAL int power_cpu_has_arch_3_00 = 0; void Z_INTERNAL power_check_features(void) { #ifdef PPC_FEATURES @@ -28,5 +29,7 @@ void Z_INTERNAL power_check_features(void) { if (hwcap2 & PPC_FEATURE2_ARCH_2_07) power_cpu_has_arch_2_07 = 1; + if (hwcap2 & PPC_FEATURE2_ARCH_3_00) + power_cpu_has_arch_3_00 = 1; #endif } diff --git a/arch/power/power_features.h b/arch/power/power_features.h index 077bec11..8df9f9e9 100644 --- a/arch/power/power_features.h +++ b/arch/power/power_features.h @@ -9,6 +9,7 @@ extern int power_cpu_has_altivec; extern int power_cpu_has_arch_2_07; +extern int power_cpu_has_arch_3_00; void Z_INTERNAL power_check_features(void); diff --git a/cmake/detect-intrinsics.cmake b/cmake/detect-intrinsics.cmake index 1ea4ec94..c638b3bc 100644 --- a/cmake/detect-intrinsics.cmake +++ b/cmake/detect-intrinsics.cmake @@ -316,6 +316,23 @@ macro(check_s390_intrinsics) ) endmacro() +macro(check_power9_intrinsics) + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") + if(NOT NATIVEFLAG) + set(POWER9FLAG "-mcpu=power9") + endif() 
+ endif() + # Check if we have what we need for POWER9 optimizations + set(CMAKE_REQUIRED_FLAGS "${POWER9FLAG} ${NATIVEFLAG}") + check_c_source_compiles( + "int main() { + return 0; + }" + HAVE_POWER9_INTRIN + ) + set(CMAKE_REQUIRED_FLAGS) +endmacro() + macro(check_sse2_intrinsics) if(CMAKE_C_COMPILER_ID MATCHES "Intel") if(CMAKE_HOST_UNIX OR APPLE) diff --git a/configure b/configure index 836e0307..ff657e69 100755 --- a/configure +++ b/configure @@ -95,6 +95,7 @@ buildvpclmulqdq=1 buildacle=1 buildaltivec=1 buildpower8=1 +buildpower9=1 buildneon=1 builddfltccdeflate=0 builddfltccinflate=0 @@ -202,6 +203,7 @@ case "$1" in --without-neon) buildneon=0; shift ;; --without-altivec) buildaltivec=0 ; shift ;; --without-power8) buildpower8=0 ; shift ;; + --without-power9) buildpower9=0 ; shift ;; --with-dfltcc-deflate) builddfltccdeflate=1; shift ;; --with-dfltcc-inflate) builddfltccinflate=1; shift ;; --without-crc32-vx) buildcrc32vx=0; shift ;; @@ -1227,7 +1229,7 @@ EOF } check_power8_intrinsics() { - # Check whether features needed by POWER optimisations are available + # Check whether features needed by POWER8 optimisations are available cat > $test.c << EOF #include <sys/auxv.h> int main() { return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07); } @@ -1241,6 +1243,20 @@ EOF fi } +check_power9_intrinsics() { + # Check whether features needed by POWER9 optimisations are available + cat > $test.c << EOF +int main() { return 0; } +EOF + if test $buildpower9 -eq 1 && try $CC -c $CFLAGS -mcpu=power9 $test.c; then + HAVE_POWER9_INTRIN=1 + echo "Check whether POWER9 instructions are available ... Yes." | tee -a configure.log + else + HAVE_POWER9_INTRIN=0 + echo "Check whether POWER9 instructions are available ... No."
| tee -a configure.log + fi +} + check_sse2_intrinsics() { # Check whether compiler supports SSE2 intrinsics cat > $test.c << EOF @@ -1824,6 +1840,7 @@ EOF check_ppc_intrinsics check_power8_intrinsics + check_power9_intrinsics if test $HAVE_VMX -eq 1; then CFLAGS="${CFLAGS} -DPPC_FEATURES" @@ -1855,6 +1872,13 @@ EOF ;; esac fi + if test $HAVE_POWER9_INTRIN -eq 1; then + CFLAGS="${CFLAGS} -DPOWER9 -DPOWER_FEATURES" + SFLAGS="${SFLAGS} -DPOWER9 -DPOWER_FEATURES" + + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare256_power9.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare256_power9.lo" + fi fi ;; s390x) diff --git a/cpu_features.h b/cpu_features.h index 504c6a93..861ae0c4 100644 --- a/cpu_features.h +++ b/cpu_features.h @@ -130,6 +130,9 @@ extern uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1); #if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) extern uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1); #endif +#ifdef POWER9 +extern uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1); +#endif #ifdef DEFLATE_H_ /* insert_string */ @@ -160,6 +163,9 @@ extern uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match); #if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) extern uint32_t longest_match_neon(deflate_state *const s, Pos cur_match); #endif +#ifdef POWER9 +extern uint32_t longest_match_power9(deflate_state *const s, Pos cur_match); +#endif /* longest_match_slow */ extern uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match); @@ -179,6 +185,9 @@ extern uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match); #if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) extern uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match); #endif +#ifdef POWER9 +extern uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match); +#endif /* quick_insert_string */ extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str); diff --git 
a/functable.c b/functable.c index 74381e15..64992bc7 100644 --- a/functable.c +++ b/functable.c @@ -121,6 +121,10 @@ Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) { if (arm_cpu_has_neon) functable.longest_match = &longest_match_neon; #endif +#ifdef POWER9 + if (power_cpu_has_arch_3_00) + functable.longest_match = &longest_match_power9; +#endif return functable.longest_match(s, cur_match); } @@ -150,6 +154,10 @@ Z_INTERNAL uint32_t longest_match_slow_stub(deflate_state *const s, Pos cur_matc if (arm_cpu_has_neon) functable.longest_match_slow = &longest_match_slow_neon; #endif +#ifdef POWER9 + if (power_cpu_has_arch_3_00) + functable.longest_match_slow = &longest_match_slow_power9; +#endif return functable.longest_match_slow(s, cur_match); } @@ -410,6 +418,10 @@ Z_INTERNAL uint32_t compare256_stub(const uint8_t *src0, const uint8_t *src1) { if (x86_cpu_has_avx2) functable.compare256 = &compare256_avx2; #endif +#ifdef POWER9 + if (power_cpu_has_arch_3_00) + functable.compare256 = &compare256_power9; +#endif return functable.compare256(src0, src1); } diff --git a/test/benchmarks/benchmark_compare256.cc b/test/benchmarks/benchmark_compare256.cc index c579d9ac..54459dad 100644 --- a/test/benchmarks/benchmark_compare256.cc +++ b/test/benchmarks/benchmark_compare256.cc @@ -79,3 +79,6 @@ BENCHMARK_COMPARE256(avx2, compare256_avx2, x86_cpu_has_avx2); #if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) BENCHMARK_COMPARE256(neon, compare256_neon, arm_cpu_has_neon); #endif +#ifdef POWER9 +BENCHMARK_COMPARE256(power9, compare256_power9, power_cpu_has_arch_3_00); +#endif diff --git a/test/test_compare256.cc b/test/test_compare256.cc index 61c6e19b..7c4dab98 100644 --- a/test/test_compare256.cc +++ b/test/test_compare256.cc @@ -75,3 +75,6 @@ TEST_COMPARE256(avx2, compare256_avx2, x86_cpu_has_avx2) #if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) TEST_COMPARE256(neon, compare256_neon, arm_cpu_has_neon) #endif +#ifdef POWER9 
+TEST_COMPARE256(power9, compare256_power9, power_cpu_has_arch_3_00) +#endif