]> git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Implement power9 version of compare256.
authorMatheus Castanho <msc@linux.ibm.com>
Sun, 17 Apr 2022 00:12:53 +0000 (17:12 -0700)
committerHans Kristian Rosbach <hk-github@circlestorm.org>
Sat, 7 May 2022 12:06:42 +0000 (14:06 +0200)
Co-authored-by: Nathan Moinvaziri <nathan@nathanm.com>
12 files changed:
CMakeLists.txt
README.md
arch/power/Makefile.in
arch/power/compare256_power9.c [new file with mode: 0644]
arch/power/power_features.c
arch/power/power_features.h
cmake/detect-intrinsics.cmake
configure
cpu_features.h
functable.c
test/benchmarks/benchmark_compare256.cc
test/test_compare256.cc

index 5b01f451de7955b4e08954f2807a0210a5a23e35..6c45fd84463ca33d9f7b162a212cd37ba05a1a68 100644 (file)
@@ -105,6 +105,7 @@ if(BASEARCH_ARM_FOUND)
 elseif(BASEARCH_PPC_FOUND)
     option(WITH_ALTIVEC "Build with AltiVec (VMX) optimisations for PowerPC" ON)
     option(WITH_POWER8 "Build with optimisations for POWER8" ON)
+    option(WITH_POWER9 "Build with optimisations for POWER9" ON)
 elseif(BASEARCH_S360_FOUND)
     option(WITH_DFLTCC_DEFLATE "Build with DFLTCC intrinsics for compression on IBM Z" OFF)
     option(WITH_DFLTCC_INFLATE "Build with DFLTCC intrinsics for decompression on IBM Z" OFF)
@@ -138,6 +139,7 @@ mark_as_advanced(FORCE
     WITH_PCLMULQDQ
     WITH_ALTIVEC
     WITH_POWER8
+    WITH_POWER9
     WITH_INFLATE_STRICT
     WITH_INFLATE_ALLOW_INVALID_DIST
     WITH_UNALIGNED
@@ -628,7 +630,10 @@ if(WITH_OPTIM)
         if(WITH_POWER8)
             check_power8_intrinsics()
         endif()
-        if(HAVE_VMX OR HAVE_POWER8_INTRIN)
+        if(WITH_POWER9)
+            check_power9_intrinsics()
+        endif()
+        if(HAVE_VMX OR HAVE_POWER8_INTRIN OR HAVE_POWER9_INTRIN)
             list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power_features.h)
             list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/power_features.c)
         endif()
@@ -667,6 +672,17 @@ if(WITH_OPTIM)
                 set(WITH_POWER8 OFF)
             endif()
         endif()
+        # Power9 specific options and files
+        if(WITH_POWER9)
+            if(HAVE_POWER9_INTRIN)
+                add_definitions(-DPOWER9)
+                set(POWER9_SRCS ${ARCHDIR}/compare256_power9.c)
+                list(APPEND ZLIB_ARCH_SRCS ${POWER9_SRCS})
+                set_property(SOURCE ${POWER9_SRCS} PROPERTY COMPILE_FLAGS "${POWER9FLAG} ${NOLTOFLAG}")
+            else()
+                set(WITH_POWER9 OFF)
+            endif()
+        endif()
     elseif(BASEARCH_S360_FOUND)
         check_s390_intrinsics()
         if(HAVE_S390_INTRIN)
@@ -1463,6 +1479,7 @@ if(BASEARCH_ARM_FOUND)
 elseif(BASEARCH_PPC_FOUND)
     add_feature_info(WITH_ALTIVEC WITH_ALTIVEC "Build with AltiVec optimisations")
     add_feature_info(WITH_POWER8 WITH_POWER8 "Build with optimisations for POWER8")
+    add_feature_info(WITH_POWER9 WITH_POWER9 "Build with optimisations for POWER9")
 elseif(BASEARCH_S360_FOUND)
     add_feature_info(WITH_DFLTCC_DEFLATE WITH_DFLTCC_DEFLATE "Build with DFLTCC intrinsics for compression on IBM Z")
     add_feature_info(WITH_DFLTCC_INFLATE WITH_DFLTCC_INFLATE "Build with DFLTCC intrinsics for decompression on IBM Z")
index 40ad1f858df45f26812fae5982cae64934d1379a..0dd1b1c37e383ee86db0e7b975576303e16fe7a2 100644 (file)
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ Features
   * CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
   * Hash table implementation using CRC32-C intrinsics on x86 and ARM
   * Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX
-  * Compare256 implementations using SSE2, AVX2, & Neon
+  * Compare256 implementations using SSE2, AVX2, Neon, & POWER9
   * Inflate chunk copying using SSE2, AVX, Neon & VSX
   * Support for hardware-accelerated deflate using IBM Z DFLTCC
 * Unaligned memory read/writes and large bit buffer improvements
index ca0e2ba9ca43af145c645b922b814196f2a4e011..e9be6dddba16721e9202321648301c59c82dd14a 100644 (file)
@@ -10,6 +10,7 @@ INCLUDES=
 SUFFIX=
 
 P8FLAGS=-mcpu=power8
+P9FLAGS=-mcpu=power9
 PPCFLAGS=-maltivec
 NOLTOFLAG=
 
@@ -25,6 +26,8 @@ all: power_features.o \
      adler32_vmx.lo \
      chunkset_power8.o \
      chunkset_power8.lo \
+     compare256_power9.o \
+     compare256_power9.lo \
      crc32_power8.o \
      crc32_power8.lo \
      slide_hash_power8.o \
@@ -56,6 +59,12 @@ chunkset_power8.o:
 chunkset_power8.lo:
        $(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
 
+compare256_power9.o:
+       $(CC) $(CFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
+
+compare256_power9.lo:
+       $(CC) $(SFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
+
 crc32_power8.o:
        $(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
 
diff --git a/arch/power/compare256_power9.c b/arch/power/compare256_power9.c
new file mode 100644 (file)
index 0000000..9b3e617
--- /dev/null
@@ -0,0 +1,66 @@
+/* compare256_power9.c - Power9 version of compare256
+ * Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef POWER9
+#include <altivec.h>
+#include "../../zbuild.h"
+#include "../../zendian.h"
+
+/* Older versions of GCC misimplemented semantics for these bit counting builtins.
+ * https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac
+ * When __GNUC__ is defined and below 12, the vctzlsbb/vclzlsbb instructions are
+ * therefore emitted through inline assembly; otherwise the compiler builtins are used.
+ * Either way the macros are used as statements: they take a vector `vc` and
+ * store the resulting count into `len`. */
+#if defined(__GNUC__) && (__GNUC__ < 12)
+#  define zng_vec_vctzlsbb(vc, len) __asm__ volatile("vctzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc))
+#  define zng_vec_vclzlsbb(vc, len) __asm__ volatile("vclzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc))
+#else
+#  define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vctzlsbb(vc)
+#  define zng_vec_vclzlsbb(vc, len) len = __builtin_vec_vclzlsbb(vc)
+#endif
+
+/* Count how many leading bytes of src0 and src1 are equal, up to 256.
+ *
+ * The inputs are compared in 16-byte vector chunks, so both pointers must be
+ * readable in whole 16-byte chunks up to and including the chunk that holds
+ * the first mismatch (256 bytes when everything matches).
+ *
+ * Returns the offset of the first differing byte, or 256 if none differ. */
+static inline uint32_t compare256_power9_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0, cmplen;
+
+    do {
+        vector unsigned char vsrc0, vsrc1, vc;
+
+        /* The inputs are only read, so keep the const qualifier in the casts. */
+        vsrc0 = *((const vector unsigned char *)src0);
+        vsrc1 = *((const vector unsigned char *)src1);
+
+        /* Compare 16 bytes at a time. Each byte of vc will be either
+         * all ones or all zeroes, depending on the result of the comparison. */
+        vc = (vector unsigned char)vec_cmpne(vsrc0, vsrc1);
+
+        /* Because cmpne was used, matching byte positions hold only zeroes in vc.
+         * Counting the consecutive bytes where LSB == 0 (trailing count on
+         * little-endian, leading count on big-endian) therefore gives the
+         * length of the match inside this chunk. */
+#if BYTE_ORDER == LITTLE_ENDIAN
+        zng_vec_vctzlsbb(vc, cmplen);
+#else
+        zng_vec_vclzlsbb(vc, cmplen);
+#endif
+        /* A count below 16 means the first mismatch is inside this chunk. */
+        if (cmplen != 16)
+            return len + cmplen;
+
+        src0 += 16;
+        src1 += 16;
+        len += 16;
+    } while (len < 256);
+
+    return 256;
+}
+
+/* Out-of-line entry point: forwards both buffers to compare256_power9_static
+ * and hands back its result unchanged. */
+Z_INTERNAL uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1) {
+    const uint32_t match_len = compare256_power9_static(src0, src1);
+
+    return match_len;
+}
+
+#define LONGEST_MATCH       longest_match_power9
+#define COMPARE256          compare256_power9_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH       longest_match_slow_power9
+#define COMPARE256          compare256_power9_static
+
+#include "match_tpl.h"
+
+#endif
index 65599d9a79d91cb897e9ff315c07a78ca59d3322..7c0350c66e65f6dbed217b9863cf56d349632ffc 100644 (file)
@@ -12,6 +12,7 @@
 
 Z_INTERNAL int power_cpu_has_altivec = 0;
 Z_INTERNAL int power_cpu_has_arch_2_07 = 0;
+Z_INTERNAL int power_cpu_has_arch_3_00 = 0;
 
 void Z_INTERNAL power_check_features(void) {
 #ifdef PPC_FEATURES
@@ -28,5 +29,7 @@ void Z_INTERNAL power_check_features(void) {
 
     if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
         power_cpu_has_arch_2_07 = 1;
+    if (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+        power_cpu_has_arch_3_00 = 1;
 #endif
 }
index 077bec11588708fb7c831acab06c9cfe0127266a..8df9f9e958266f55894cd579dc67cbb492cfe7e1 100644 (file)
@@ -9,6 +9,7 @@
 
 extern int power_cpu_has_altivec;
 extern int power_cpu_has_arch_2_07;
+extern int power_cpu_has_arch_3_00;
 
 void Z_INTERNAL power_check_features(void);
 
index 1ea4ec9418434aebd8b5e1824148637d3fc4eaac..c638b3bc263415d2d0747e5c1baf10466802b24d 100644 (file)
@@ -316,6 +316,23 @@ macro(check_s390_intrinsics)
     )
 endmacro()
 
+# Probe for POWER9 build support.
+# For GNU/Clang compilers POWER9FLAG is set to -mcpu=power9 unless NATIVEFLAG is set.
+# HAVE_POWER9_INTRIN receives the result of compiling a trivial program with
+# "${POWER9FLAG} ${NATIVEFLAG}"; CMAKE_REQUIRED_FLAGS is cleared again afterwards.
+macro(check_power9_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang" AND NOT NATIVEFLAG)
+        set(POWER9FLAG "-mcpu=power9")
+    endif()
+    # Check if we have what we need for POWER9 optimizations
+    set(CMAKE_REQUIRED_FLAGS "${POWER9FLAG} ${NATIVEFLAG}")
+    check_c_source_compiles(
+        "int main() {
+            return 0;
+        }"
+        HAVE_POWER9_INTRIN
+    )
+    unset(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
 macro(check_sse2_intrinsics)
     if(CMAKE_C_COMPILER_ID MATCHES "Intel")
         if(CMAKE_HOST_UNIX OR APPLE)
index 836e03072b378c8b6b4c07569b4a5f0985a788f9..ff657e698f5eb0498628d10dad855d8d4b90dab0 100755 (executable)
--- a/configure
+++ b/configure
@@ -95,6 +95,7 @@ buildvpclmulqdq=1
 buildacle=1
 buildaltivec=1
 buildpower8=1
+buildpower9=1
 buildneon=1
 builddfltccdeflate=0
 builddfltccinflate=0
@@ -202,6 +203,7 @@ case "$1" in
     --without-neon) buildneon=0; shift ;;
     --without-altivec) buildaltivec=0 ; shift ;;
     --without-power8) buildpower8=0 ; shift ;;
+    --without-power9) buildpower9=0 ; shift ;;
     --with-dfltcc-deflate) builddfltccdeflate=1; shift ;;
     --with-dfltcc-inflate) builddfltccinflate=1; shift ;;
     --without-crc32-vx) buildcrc32vx=0; shift ;;
@@ -1227,7 +1229,7 @@ EOF
 }
 
 check_power8_intrinsics() {
-    # Check whether features needed by POWER optimisations are available
+    # Check whether features needed by POWER8 optimisations are available
     cat > $test.c << EOF
 #include <sys/auxv.h>
 int main() { return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07); }
@@ -1241,6 +1243,20 @@ EOF
     fi
 }
 
+check_power9_intrinsics() {
+    # Check whether features needed by POWER9 optimisations are available.
+    # The probe only verifies that $CC accepts -mcpu=power9 for a trivial program,
+    # and it is not attempted unless buildpower9 is 1.
+    cat > $test.c << EOF
+int main() { return 0; }
+EOF
+    HAVE_POWER9_INTRIN=0
+    if test $buildpower9 -eq 1 && try $CC -c $CFLAGS -mcpu=power9 $test.c; then
+        HAVE_POWER9_INTRIN=1
+    fi
+    # Report the outcome on stdout and append it to configure.log
+    case $HAVE_POWER9_INTRIN in
+        1) echo "Check whether POWER9 instructions are available ... Yes." | tee -a configure.log ;;
+        *) echo "Check whether POWER9 instructions are available ... No." | tee -a configure.log ;;
+    esac
+}
+
 check_sse2_intrinsics() {
     # Check whether compiler supports SSE2 intrinsics
     cat > $test.c << EOF
@@ -1824,6 +1840,7 @@ EOF
 
             check_ppc_intrinsics
             check_power8_intrinsics
+            check_power9_intrinsics
 
             if test $HAVE_VMX -eq 1; then
                 CFLAGS="${CFLAGS} -DPPC_FEATURES"
@@ -1855,6 +1872,13 @@ EOF
                         ;;
                 esac
             fi
+            if test $HAVE_POWER9_INTRIN -eq 1; then
+                CFLAGS="${CFLAGS} -DPOWER9 -DPOWER_FEATURES"
+                SFLAGS="${SFLAGS} -DPOWER9 -DPOWER_FEATURES"
+
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare256_power9.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare256_power9.lo"
+            fi
         fi
     ;;
     s390x)
index 504c6a93e7ede0c9b7ae7472566f3a3cfb61bbe1..861ae0c4d80644085cea4df9d93c21c4de3029f0 100644 (file)
@@ -130,6 +130,9 @@ extern uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
 #if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
 extern uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
 #endif
+#ifdef POWER9
+extern uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1);
+#endif
 
 #ifdef DEFLATE_H_
 /* insert_string */
@@ -160,6 +163,9 @@ extern uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match);
 #if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
 extern uint32_t longest_match_neon(deflate_state *const s, Pos cur_match);
 #endif
+#ifdef POWER9
+extern uint32_t longest_match_power9(deflate_state *const s, Pos cur_match);
+#endif
 
 /* longest_match_slow */
 extern uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
@@ -179,6 +185,9 @@ extern uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match);
 #if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
 extern uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match);
 #endif
+#ifdef POWER9
+extern uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match);
+#endif
 
 /* quick_insert_string */
 extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
index 74381e1589015eda11bf44424d6f3cd39d57e42f..64992bc7be9a2ea82e7ada6efab977b8c2cfc3c2 100644 (file)
@@ -121,6 +121,10 @@ Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) {
     if (arm_cpu_has_neon)
         functable.longest_match = &longest_match_neon;
 #endif
+#ifdef POWER9
+    if (power_cpu_has_arch_3_00)
+        functable.longest_match = &longest_match_power9;
+#endif
 
     return functable.longest_match(s, cur_match);
 }
@@ -150,6 +154,10 @@ Z_INTERNAL uint32_t longest_match_slow_stub(deflate_state *const s, Pos cur_matc
     if (arm_cpu_has_neon)
         functable.longest_match_slow = &longest_match_slow_neon;
 #endif
+#ifdef POWER9
+    if (power_cpu_has_arch_3_00)
+        functable.longest_match_slow = &longest_match_slow_power9;
+#endif
 
     return functable.longest_match_slow(s, cur_match);
 }
@@ -410,6 +418,10 @@ Z_INTERNAL uint32_t compare256_stub(const uint8_t *src0, const uint8_t *src1) {
     if (x86_cpu_has_avx2)
         functable.compare256 = &compare256_avx2;
 #endif
+#ifdef POWER9
+    if (power_cpu_has_arch_3_00)
+        functable.compare256 = &compare256_power9;
+#endif
 
     return functable.compare256(src0, src1);
 }
index c579d9ac81a3814f4c3ebb5b6b151381b0f35cd8..54459dad06b05351606a0df5429208daff543c3e 100644 (file)
@@ -79,3 +79,6 @@ BENCHMARK_COMPARE256(avx2, compare256_avx2, x86_cpu_has_avx2);
 #if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
 BENCHMARK_COMPARE256(neon, compare256_neon, arm_cpu_has_neon);
 #endif
+#ifdef POWER9
+BENCHMARK_COMPARE256(power9, compare256_power9, power_cpu_has_arch_3_00);
+#endif
index 61c6e19bcc41ed6601d4fc4b96103e490e2d2fd8..7c4dab98991864632727b5b7668e50dfb1c1efe4 100644 (file)
@@ -75,3 +75,6 @@ TEST_COMPARE256(avx2, compare256_avx2, x86_cpu_has_avx2)
 #if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
 TEST_COMPARE256(neon, compare256_neon, arm_cpu_has_neon)
 #endif
+#ifdef POWER9
+TEST_COMPARE256(power9, compare256_power9, power_cpu_has_arch_3_00)
+#endif