elseif(BASEARCH_PPC_FOUND)
option(WITH_ALTIVEC "Build with AltiVec (VMX) optimisations for PowerPC" ON)
option(WITH_POWER8 "Build with optimisations for POWER8" ON)
+ option(WITH_POWER9 "Build with optimisations for POWER9" ON)
elseif(BASEARCH_S360_FOUND)
option(WITH_DFLTCC_DEFLATE "Build with DFLTCC intrinsics for compression on IBM Z" OFF)
option(WITH_DFLTCC_INFLATE "Build with DFLTCC intrinsics for decompression on IBM Z" OFF)
WITH_PCLMULQDQ
WITH_ALTIVEC
WITH_POWER8
+ WITH_POWER9
WITH_INFLATE_STRICT
WITH_INFLATE_ALLOW_INVALID_DIST
WITH_UNALIGNED
if(WITH_POWER8)
check_power8_intrinsics()
endif()
- if(HAVE_VMX OR HAVE_POWER8_INTRIN)
+ if(WITH_POWER9)
+ check_power9_intrinsics()
+ endif()
+ if(HAVE_VMX OR HAVE_POWER8_INTRIN OR HAVE_POWER9_INTRIN)
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power_features.h)
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/power_features.c)
endif()
set(WITH_POWER8 OFF)
endif()
endif()
+ # Power9 specific options and files
+ if(WITH_POWER9)
+ if(HAVE_POWER9_INTRIN)
+ add_definitions(-DPOWER9)
+ set(POWER9_SRCS ${ARCHDIR}/compare256_power9.c)
+ list(APPEND ZLIB_ARCH_SRCS ${POWER9_SRCS})
+ set_property(SOURCE ${POWER9_SRCS} PROPERTY COMPILE_FLAGS "${POWER9FLAG} ${NOLTOFLAG}")
+ else()
+ set(WITH_POWER9 OFF)
+ endif()
+ endif()
elseif(BASEARCH_S360_FOUND)
check_s390_intrinsics()
if(HAVE_S390_INTRIN)
elseif(BASEARCH_PPC_FOUND)
add_feature_info(WITH_ALTIVEC WITH_ALTIVEC "Build with AltiVec optimisations")
add_feature_info(WITH_POWER8 WITH_POWER8 "Build with optimisations for POWER8")
+ add_feature_info(WITH_POWER9 WITH_POWER9 "Build with optimisations for POWER9")
elseif(BASEARCH_S360_FOUND)
add_feature_info(WITH_DFLTCC_DEFLATE WITH_DFLTCC_DEFLATE "Build with DFLTCC intrinsics for compression on IBM Z")
add_feature_info(WITH_DFLTCC_INFLATE WITH_DFLTCC_INFLATE "Build with DFLTCC intrinsics for decompression on IBM Z")
* CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
* Hash table implementation using CRC32-C intrinsics on x86 and ARM
* Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX
- * Compare256 implementations using SSE2, AVX2, & Neon
+ * Compare256 implementations using SSE2, AVX2, Neon, & POWER9
* Inflate chunk copying using SSE2, AVX, Neon & VSX
* Support for hardware-accelerated deflate using IBM Z DFLTCC
* Unaligned memory read/writes and large bit buffer improvements
SUFFIX=
P8FLAGS=-mcpu=power8
+P9FLAGS=-mcpu=power9
PPCFLAGS=-maltivec
NOLTOFLAG=
adler32_vmx.lo \
chunkset_power8.o \
chunkset_power8.lo \
+ compare256_power9.o \
+ compare256_power9.lo \
crc32_power8.o \
crc32_power8.lo \
slide_hash_power8.o \
chunkset_power8.lo:
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
+compare256_power9.o:
+ $(CC) $(CFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
+
+compare256_power9.lo:
+ $(CC) $(SFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
+
crc32_power8.o:
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
--- /dev/null
+/* compare256_power9.c - Power9 version of compare256
+ * Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef POWER9
+#include <altivec.h>
+#include "../../zbuild.h"
+#include "../../zendian.h"
+
+/* Older versions of GCC misimplemented semantics for these bit counting builtins.
+ * https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac
+ * On affected GCC (< 12) we emit the vctzlsbb/vclzlsbb instructions directly
+ * via inline asm instead of going through the builtins. */
+#if defined(__GNUC__) && (__GNUC__ < 12)
+# define zng_vec_vctzlsbb(vc, len) __asm__ volatile("vctzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc))
+# define zng_vec_vclzlsbb(vc, len) __asm__ volatile("vclzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc))
+#else
+# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vctzlsbb(vc)
+# define zng_vec_vclzlsbb(vc, len) len = __builtin_vec_vclzlsbb(vc)
+#endif
+
+/* Return the length (0..256) of the initial run of bytes at which src0 and
+ * src1 are equal, examining 16 bytes per iteration with vector compares. */
+static inline uint32_t compare256_power9_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0, cmplen;
+
+    do {
+        vector unsigned char vsrc0, vsrc1, vc;
+
+        /* NOTE(review): dereferencing the cast pointers assumes the compiler
+         * emits unaligned VSX loads here -- confirm for all supported compilers. */
+        vsrc0 = *((vector unsigned char *)src0);
+        vsrc1 = *((vector unsigned char *)src1);
+
+        /* Compare 16 bytes at a time. Each byte of vc will be either
+         * all ones or all zeroes, depending on the result of the comparison. */
+        vc = (vector unsigned char)vec_cmpne(vsrc0, vsrc1);
+
+        /* Since the index of matching bytes will contain only zeroes
+         * on vc (since we used cmpne), counting the number of consecutive
+         * bytes where LSB == 0 is the same as counting the length of the match.
+         * Count from the low end on little-endian, from the high end on
+         * big-endian, so "first mismatch" matches memory order. */
+#if BYTE_ORDER == LITTLE_ENDIAN
+        zng_vec_vctzlsbb(vc, cmplen);
+#else
+        zng_vec_vclzlsbb(vc, cmplen);
+#endif
+        if (cmplen != 16)
+            return len + cmplen;
+
+        src0 += 16, src1 += 16, len += 16;
+    } while (len < 256);
+
+    return 256;
+}
+
+/* Externally visible wrapper used by the functable dispatcher. */
+Z_INTERNAL uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_power9_static(src0, src1);
+}
+
+/* Instantiate the longest-match template with the comparator above;
+ * presumably match_tpl.h consumes LONGEST_MATCH/COMPARE256 and undefines
+ * them after each inclusion -- verify against match_tpl.h. */
+#define LONGEST_MATCH longest_match_power9
+#define COMPARE256 compare256_power9_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_power9
+#define COMPARE256 compare256_power9_static
+
+#include "match_tpl.h"
+
+#endif /* POWER9 */
Z_INTERNAL int power_cpu_has_altivec = 0;
Z_INTERNAL int power_cpu_has_arch_2_07 = 0;
+Z_INTERNAL int power_cpu_has_arch_3_00 = 0;
void Z_INTERNAL power_check_features(void) {
#ifdef PPC_FEATURES
if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
power_cpu_has_arch_2_07 = 1;
+ if (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ power_cpu_has_arch_3_00 = 1;
#endif
}
extern int power_cpu_has_altivec;
extern int power_cpu_has_arch_2_07;
+extern int power_cpu_has_arch_3_00;
void Z_INTERNAL power_check_features(void);
)
endmacro()
+# Detect POWER9 support: sets POWER9FLAG to the target-CPU flag (unless a
+# native-arch flag is already in use) and HAVE_POWER9_INTRIN on success.
+# NOTE(review): the test program only checks that the compiler accepts the
+# flag, not that POWER9 intrinsics actually work -- mirrors the configure
+# script's check_power9_intrinsics.
+macro(check_power9_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            set(POWER9FLAG "-mcpu=power9")
+        endif()
+    endif()
+    # Check if we have what we need for POWER9 optimizations
+    set(CMAKE_REQUIRED_FLAGS "${POWER9FLAG} ${NATIVEFLAG}")
+    check_c_source_compiles(
+        "int main() {
+            return 0;
+        }"
+        HAVE_POWER9_INTRIN
+    )
+    # Reset so later checks are not compiled with the POWER9 flag.
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
macro(check_sse2_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
buildacle=1
buildaltivec=1
buildpower8=1
+buildpower9=1
buildneon=1
builddfltccdeflate=0
builddfltccinflate=0
--without-neon) buildneon=0; shift ;;
--without-altivec) buildaltivec=0 ; shift ;;
--without-power8) buildpower8=0 ; shift ;;
+ --without-power9) buildpower9=0 ; shift ;;
--with-dfltcc-deflate) builddfltccdeflate=1; shift ;;
--with-dfltcc-inflate) builddfltccinflate=1; shift ;;
--without-crc32-vx) buildcrc32vx=0; shift ;;
}
check_power8_intrinsics() {
- # Check whether features needed by POWER optimisations are available
+ # Check whether features needed by POWER8 optimisations are available
cat > $test.c << EOF
#include <sys/auxv.h>
int main() { return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07); }
fi
}
+# Set HAVE_POWER9_INTRIN to 1 when POWER9 builds are enabled (buildpower9=1)
+# and the compiler accepts -mcpu=power9; otherwise set it to 0.
+check_power9_intrinsics() {
+    # Check whether features needed by POWER9 optimisations are available
+    # (an empty program is enough: we only probe flag acceptance here).
+    cat > $test.c << EOF
+int main() { return 0; }
+EOF
+    if test $buildpower9 -eq 1 && try $CC -c $CFLAGS -mcpu=power9 $test.c; then
+        HAVE_POWER9_INTRIN=1
+        echo "Check whether POWER9 instructions are available ... Yes." | tee -a configure.log
+    else
+        HAVE_POWER9_INTRIN=0
+        echo "Check whether POWER9 instructions are available ... No." | tee -a configure.log
+    fi
+}
+
check_sse2_intrinsics() {
# Check whether compiler supports SSE2 intrinsics
cat > $test.c << EOF
check_ppc_intrinsics
check_power8_intrinsics
+ check_power9_intrinsics
if test $HAVE_VMX -eq 1; then
CFLAGS="${CFLAGS} -DPPC_FEATURES"
;;
esac
fi
+ if test $HAVE_POWER9_INTRIN -eq 1; then
+ CFLAGS="${CFLAGS} -DPOWER9 -DPOWER_FEATURES"
+ SFLAGS="${SFLAGS} -DPOWER9 -DPOWER_FEATURES"
+
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare256_power9.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare256_power9.lo"
+ fi
fi
;;
s390x)
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
extern uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
#endif
+#ifdef POWER9
+extern uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1);
+#endif
#ifdef DEFLATE_H_
/* insert_string */
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
extern uint32_t longest_match_neon(deflate_state *const s, Pos cur_match);
#endif
+#ifdef POWER9
+extern uint32_t longest_match_power9(deflate_state *const s, Pos cur_match);
+#endif
/* longest_match_slow */
extern uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
extern uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match);
#endif
+#ifdef POWER9
+extern uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match);
+#endif
/* quick_insert_string */
extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
if (arm_cpu_has_neon)
functable.longest_match = &longest_match_neon;
#endif
+#ifdef POWER9
+ if (power_cpu_has_arch_3_00)
+ functable.longest_match = &longest_match_power9;
+#endif
return functable.longest_match(s, cur_match);
}
if (arm_cpu_has_neon)
functable.longest_match_slow = &longest_match_slow_neon;
#endif
+#ifdef POWER9
+ if (power_cpu_has_arch_3_00)
+ functable.longest_match_slow = &longest_match_slow_power9;
+#endif
return functable.longest_match_slow(s, cur_match);
}
if (x86_cpu_has_avx2)
functable.compare256 = &compare256_avx2;
#endif
+#ifdef POWER9
+ if (power_cpu_has_arch_3_00)
+ functable.compare256 = &compare256_power9;
+#endif
return functable.compare256(src0, src1);
}
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
BENCHMARK_COMPARE256(neon, compare256_neon, arm_cpu_has_neon);
#endif
+#ifdef POWER9
+BENCHMARK_COMPARE256(power9, compare256_power9, power_cpu_has_arch_3_00);
+#endif
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
TEST_COMPARE256(neon, compare256_neon, arm_cpu_has_neon)
#endif
+#ifdef POWER9
+TEST_COMPARE256(power9, compare256_power9, power_cpu_has_arch_3_00)
+#endif