option(WITH_ACLE "Build with ACLE" ON)
option(WITH_NEON "Build with NEON intrinsics" ON)
elseif(BASEARCH_PPC_FOUND)
+ option(WITH_ALTIVEC "Build with AltiVec (VMX) optimisations for PowerPC" ON)
option(WITH_POWER8 "Build with optimisations for POWER8" ON)
elseif(BASEARCH_S360_FOUND)
option(WITH_DFLTCC_DEFLATE "Build with DFLTCC intrinsics for compression on IBM Z" OFF)
WITH_AVX2 WITH_SSE2
WITH_SSSE3 WITH_SSE4
WITH_PCLMULQDQ
+ WITH_ALTIVEC
WITH_POWER8
WITH_INFLATE_STRICT
WITH_INFLATE_ALLOW_INVALID_DIST
endif()
endif()
elseif(BASEARCH_PPC_FOUND)
+ # Common arch detection code
+ if(WITH_ALTIVEC)
+ check_ppc_intrinsics()
+ endif()
if(WITH_POWER8)
check_power8_intrinsics()
+ endif()
+ if(HAVE_VMX OR HAVE_POWER8_INTRIN)
+ list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power.h)
+ list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/power.c)
+ endif()
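+    # power.c implements the runtime CPU feature detection shared by the
+    # VMX and POWER8 code paths, so it is built whenever either is supported.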
+ # VMX specific options and files
+ if(WITH_ALTIVEC)
+ if(HAVE_VMX)
+ add_definitions(-DPPC_FEATURES)
+ if(HAVE_ALTIVEC)
+ add_definitions(-DPPC_VMX_ADLER32)
+ add_definitions(-DPPC_VMX_SLIDEHASH)
+ set(PPC_SRCS ${ARCHDIR}/adler32_vmx.c ${ARCHDIR}/slide_hash_vmx.c)
+ list(APPEND ZLIB_ARCH_SRCS ${PPC_SRCS})
+ add_feature_info(ALTIVEC 1 "Support the AltiVec instruction set, using \"-maltivec\"")
+ set_property(SOURCE ${PPC_SRCS} PROPERTY COMPILE_FLAGS "${PPCFLAGS}")
+ else()
+ set(WITH_ALTIVEC OFF)
+ endif()
+ endif()
+ endif()
+ # Power8 specific options and files
+ if(WITH_POWER8)
if(HAVE_POWER8_INTRIN)
add_definitions(-DPOWER8)
add_definitions(-DPOWER_FEATURES)
add_definitions(-DPOWER8_VSX_ADLER32)
add_definitions(-DPOWER8_VSX_CHUNKSET)
add_definitions(-DPOWER8_VSX_SLIDEHASH)
- list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power.h)
- list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/power.c)
set(POWER8_SRCS ${ARCHDIR}/adler32_power8.c ${ARCHDIR}/chunkset_power8.c ${ARCHDIR}/slide_hash_power8.c)
list(APPEND ZLIB_ARCH_SRCS ${POWER8_SRCS})
set_property(SOURCE ${POWER8_SRCS} PROPERTY COMPILE_FLAGS "${POWER8FLAG} ${NOLTOFLAG}")
add_feature_info(WITH_ACLE WITH_ACLE "Build with ACLE")
add_feature_info(WITH_NEON WITH_NEON "Build with NEON intrinsics")
elseif(BASEARCH_PPC_FOUND)
+ add_feature_info(WITH_ALTIVEC WITH_ALTIVEC "Build with AltiVec optimisations")
add_feature_info(WITH_POWER8 WITH_POWER8 "Build with optimisations for POWER8")
elseif(BASEARCH_S360_FOUND)
add_feature_info(WITH_DFLTCC_DEFLATE WITH_DFLTCC_DEFLATE "Build with DFLTCC intrinsics for compression on IBM Z")
# Makefile for POWER-specific files
# Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+# Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
# For conditions of distribution and use, see copyright notice in zlib.h
CC=
SUFFIX=
P8FLAGS=-mcpu=power8
+PPCFLAGS=-maltivec
NOLTOFLAG=
SRCDIR=.
power.lo \
adler32_power8.o \
adler32_power8.lo \
+ adler32_vmx.o \
+ adler32_vmx.lo \
chunkset_power8.o \
chunkset_power8.lo \
slide_hash_power8.o \
- slide_hash_power8.lo
+ slide_hash_power8.lo \
+ slide_hash_vmx.o \
+ slide_hash_vmx.lo
power.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power.c
adler32_power8.lo:
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
+adler32_vmx.o:
+ $(CC) $(CFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
+
+adler32_vmx.lo:
+ $(CC) $(SFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
+
chunkset_power8.o:
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
slide_hash_power8.lo:
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
+slide_hash_vmx.o:
+	$(CC) $(CFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
+
+slide_hash_vmx.lo:
+	$(CC) $(SFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
+
mostlyclean: clean
clean:
rm -f *.o *.lo *~
--- /dev/null
+/* adler32_vmx.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef PPC_VMX_ADLER32
+#include <altivec.h>
+#include "zutil.h"
+#include "adler32_p.h"
+
+#define vmx_zero() (vec_splat_u32(0))
+
+/* Pairwise widening horizontal adds: multiplying by a vector of ones with
+ * vec_mulo/vec_mule picks the odd/even lanes already widened, so adding the
+ * two results sums each adjacent pair into the next wider element type. */
+static inline vector unsigned short vec_hadduh(vector unsigned char a) {
+    vector unsigned char vmx_one = vec_splat_u8(1);
+    return vec_add(vec_mulo(a, vmx_one), vec_mule(a, vmx_one));
+}
+
+static inline vector unsigned int vec_hadduw(vector unsigned short a) {
+    vector unsigned short vmx_one = vec_splat_u16(1);
+    return vec_add(vec_mulo(a, vmx_one), vec_mule(a, vmx_one));
+}
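+/* For example, for bytes {1,2,...,16}, vec_hadduh yields the pair sums
+ * {3,7,11,15,19,23,27,31} and vec_hadduw folds those to {10,26,42,58};
+ * whatever the lane order, the lanes total 136, the sum of the input bytes. */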
+
+static inline void vmx_handle_head_or_tail(uint32_t *pair, const unsigned char *buf, size_t len) {
+    size_t i;
+ for (i = 0; i < len; ++i) {
+ pair[0] += buf[i];
+ pair[1] += pair[0];
+ }
+}
+
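+/* Accumulate the split sums over len 16-byte blocks from a 16-byte-aligned
+ * buf: s[0] gathers the byte sum (adler), s[1] the weighted sum (sum2). */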
+static void vmx_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
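+    /* Weights for the sum2 term: within each 16-byte block the first byte
+     * is counted 16 times, the last byte once. */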
+ static const uint8_t tc0[16] ALIGNED_(16) = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
+
+ vector unsigned char t0 = vec_ld(0, tc0);
+ vector unsigned int adacc, s2acc;
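+    /* Seed lane 0 of each accumulator with the incoming scalar sums. */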
+ adacc = vec_insert(s[0], vmx_zero(), 0);
+ s2acc = vec_insert(s[1], vmx_zero(), 0);
+
+ while (len > 0) {
+ vector unsigned char d0 = vec_ld(0, buf);
+ vector unsigned short sum2;
+ sum2 = vec_add(vec_mulo(t0, d0), vec_mule(t0, d0));
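+        /* Every byte seen in earlier blocks counts 16 more times in sum2. */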
+ s2acc = vec_add(s2acc, vec_sl(adacc, vec_splat_u32(4)));
+ s2acc = vec_add(s2acc, vec_hadduw(sum2));
+ adacc = vec_add(adacc, vec_hadduw(vec_hadduh(d0)));
+ buf += 16;
+ len--;
+ }
+
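+    /* Each lane now holds an independent partial sum; folding the four
+     * lanes with scalar adds yields the final scalar sums. */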
+ s[0] = vec_extract(adacc, 0) + vec_extract(adacc, 1) + vec_extract(adacc, 2) + vec_extract(adacc, 3); /* Horizontal add */
+ s[1] = vec_extract(s2acc, 0) + vec_extract(s2acc, 1) + vec_extract(s2acc, 2) + vec_extract(s2acc, 3); /* Horizontal add */
+}
+
+uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len) {
+ uint32_t sum2;
+ uint32_t pair[2];
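+    /* NMAX is the largest number of bytes that can be summed before the
+     * 32-bit component sums must be reduced modulo BASE to avoid overflow. */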
+ int n = NMAX;
+    size_t done = 0, i;
+
+    /* Split the Adler-32 into its component sums; a non-default initial
+     * value may be supplied by the caller (e.g. when checksumming the
+     * pieces of a PNG file incrementally).
+     */
+ sum2 = (adler >> 16) & 0xffff;
+ adler &= 0xffff;
+ pair[0] = adler;
+ pair[1] = sum2;
+
+ /* in case user likes doing a byte at a time, keep it fast */
+ if (UNLIKELY(len == 1))
+ return adler32_len_1(adler, buf, sum2);
+
+ /* initial Adler-32 value (deferred check for len == 1 speed) */
+ if (UNLIKELY(buf == NULL))
+ return 1L;
+
+ /* in case short lengths are provided, keep it somewhat fast */
+ if (UNLIKELY(len < 16))
+ return adler32_len_16(adler, buf, len, sum2);
+
+    /* Align buf to a 16-byte boundary, handling any head bytes scalar-wise
+     * so that the vector loop below can use aligned loads. */
+    unsigned int al = 0;
+    if ((uintptr_t)buf & 0xf) {
+        al = 16 - ((uintptr_t)buf & 0xf);
+        if (al > len) {
+            al = len;
+        }
+        vmx_handle_head_or_tail(pair, buf, al);
+        pair[0] %= BASE;
+        pair[1] %= BASE;
+
+        done += al;
+    }
+ for (i = al; i < len; i += n) {
+ if ((i + n) > len)
+ n = (int)(len - i);
+
+ if (n < 16)
+ break;
+
+ vmx_accum32(pair, buf + i, n / 16);
+ pair[0] %= BASE;
+ pair[1] %= BASE;
+
+ done += (n / 16) * 16;
+ }
+
+ /* Handle the tail elements. */
+ if (done < len) {
+ vmx_handle_head_or_tail(pair, (buf + done), len - done);
+ pair[0] %= BASE;
+ pair[1] %= BASE;
+ }
+
+ /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
+ return (pair[1] << 16) | pair[0];
+}
+#endif
/* POWER feature check
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include <sys/auxv.h>
#include "../../zutil.h"
+#include "power.h"
-Z_INTERNAL int power_cpu_has_arch_2_07;
+Z_INTERNAL int power_cpu_has_altivec = 0;
+Z_INTERNAL int power_cpu_has_arch_2_07 = 0;
void Z_INTERNAL power_check_features(void) {
+#ifdef PPC_FEATURES
+ unsigned long hwcap;
+ hwcap = getauxval(AT_HWCAP);
+
+ if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
+ power_cpu_has_altivec = 1;
+#endif
+
+#ifdef POWER_FEATURES
unsigned long hwcap2;
hwcap2 = getauxval(AT_HWCAP2);
-#ifdef POWER8
if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
- power_cpu_has_arch_2_07 = 1;
+ power_cpu_has_arch_2_07 = 1;
#endif
}
/* power.h -- check for POWER CPU features
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef POWER_H_
#define POWER_H_
+extern int power_cpu_has_altivec;
extern int power_cpu_has_arch_2_07;
void Z_INTERNAL power_check_features(void);
--- /dev/null
+/* Optimized slide_hash for PowerPC processors with VMX instructions
+ * Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifdef PPC_VMX_SLIDEHASH
+
+#include <altivec.h>
+#include "zbuild.h"
+#include "deflate.h"
+
+static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+ const vector unsigned short vmx_wsize = vec_splats(wsize);
+ Pos *p = table;
+
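+    /* Rebase eight 16-bit entries per iteration; entries is assumed to be a
+     * multiple of 8 and table 16-byte aligned, which holds for the
+     * power-of-two head and prev tables. vec_subs saturates, so positions
+     * older than the slid window clamp to zero instead of wrapping. */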
+ do {
+ vector unsigned short value, result;
+
+ value = vec_ld(0, p);
+ result = vec_subs(value, vmx_wsize);
+ vec_st(result, 0, p);
+
+ p += 8;
+ entries -= 8;
+ } while (entries > 0);
+}
+
+void Z_INTERNAL slide_hash_vmx(deflate_state *s) {
+ uint16_t wsize = s->w_size;
+
+ slide_hash_chain(s->head, HASH_SIZE, wsize);
+ slide_hash_chain(s->prev, wsize, wsize);
+}
+
+#endif /* PPC_VMX_SLIDEHASH */
endif()
endmacro()
+macro(check_ppc_intrinsics)
+ # Check if compiler supports AltiVec
+ set(CMAKE_REQUIRED_FLAGS "-maltivec")
+ check_c_source_compiles(
+ "#include <altivec.h>
+ int main(void)
+ {
+ vector int a = vec_splats(0);
+ vector int b = vec_splats(0);
+ a = vec_add(a, b);
+ return 0;
+ }"
+ HAVE_ALTIVEC
+ )
+ set(CMAKE_REQUIRED_FLAGS)
+
+ if(HAVE_ALTIVEC)
+ set(PPCFLAGS "-maltivec")
+ endif()
+
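+    # Check for -mno-vsx support; disabling VSX keeps the compiler from
+    # emitting VSX instructions in the AltiVec paths, so they also run on
+    # older pre-VSX CPUs.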
+ set(CMAKE_REQUIRED_FLAGS "-maltivec -mno-vsx")
+ check_c_source_compiles(
+ "#include <altivec.h>
+ int main(void)
+ {
+ vector int a = vec_splats(0);
+ vector int b = vec_splats(0);
+ a = vec_add(a, b);
+ return 0;
+ }"
+ HAVE_NOVSX
+ )
+ set(CMAKE_REQUIRED_FLAGS)
+
+ if(HAVE_NOVSX)
+ set(PPCFLAGS "${PPCFLAGS} -mno-vsx")
+ endif()
+
+    # Check if we have what we need for AltiVec optimisations
+    set(CMAKE_REQUIRED_FLAGS "${PPCFLAGS}")
+    check_c_source_compiles(
+        "#include <sys/auxv.h>
+        int main(void)
+        {
+            return (getauxval(AT_HWCAP) & PPC_FEATURE_HAS_ALTIVEC);
+        }"
+        HAVE_VMX
+    )
+ set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
macro(check_power8_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
build32=0
build64=0
buildacle=1
+buildaltivec=1
buildneon=1
builddfltccdeflate=0
builddfltccinflate=0
neonflag=
noltoflag="-fno-lto"
vgfmaflag="-march=z13"
+vmxflag="-maltivec"
without_optimizations=0
without_new_strategies=0
reducedmem=0
echo ' [--without-new-strategies] Compiles without using new additional deflate strategies' | tee -a configure.log
echo ' [--without-acle] Compiles without ARM C Language Extensions' | tee -a configure.log
echo ' [--without-neon] Compiles without ARM Neon SIMD instruction set' | tee -a configure.log
+ echo ' [--without-altivec] Compiles without PPC AltiVec support' | tee -a configure.log
echo ' [--with-dfltcc-deflate] Use DEFLATE CONVERSION CALL instruction for compression on IBM Z' | tee -a configure.log
echo ' [--with-dfltcc-inflate] Use DEFLATE CONVERSION CALL instruction for decompression on IBM Z' | tee -a configure.log
echo ' [--without-crc32-vx] Build without vectorized CRC32 on IBM Z' | tee -a configure.log
-6* | --64) build64=1; shift ;;
--without-acle) buildacle=0; shift ;;
--without-neon) buildneon=0; shift ;;
+    --without-altivec) buildaltivec=0; shift ;;
--with-dfltcc-deflate) builddfltccdeflate=1; shift ;;
--with-dfltcc-inflate) builddfltccinflate=1; shift ;;
--without-crc32-vx) buildcrc32vx=0; shift ;;
fi
}
+check_ppc_intrinsics() {
+ cat > $test.c << EOF
+#include <altivec.h>
+int main(void)
+{
+ vector int a = vec_splats(0);
+ vector int b = vec_splats(0);
+ a = vec_add(a, b);
+ return 0;
+}
+EOF
+ if test $buildaltivec -eq 1 && try ${CC} ${CFLAGS} -maltivec $test.c; then
+ echo "Checking for AltiVec intrinsics ... Yes." | tee -a configure.log
+ HAVE_ALTIVEC_INTRIN=1
+ else
+ echo "Checking for AltiVec intrinsics ... No." | tee -a configure.log
+ HAVE_ALTIVEC_INTRIN=0
+ fi
+ if test $buildaltivec -eq 1 && try ${CC} ${CFLAGS} -maltivec -mno-vsx $test.c; then
+ echo "Checking if -mno-vsx is supported ... Yes." | tee -a configure.log
+ vmxflag="$vmxflag -mno-vsx"
+ else
+ echo "Checking if -mno-vsx is supported ... No." | tee -a configure.log
+ fi
+ cat > $test.c << EOF
+#include <sys/auxv.h>
+int main() { return (getauxval(AT_HWCAP) & PPC_FEATURE_HAS_ALTIVEC); }
+EOF
+    if test $buildaltivec -eq 1 && try ${CC} -c ${CFLAGS} -maltivec $test.c; then
+ HAVE_VMX=1
+ echo "Check whether VMX instructions are available ... Yes." | tee -a configure.log
+ else
+ HAVE_VMX=0
+ echo "Check whether VMX instructions are available ... No." | tee -a configure.log
+ fi
+}
+
check_power8_intrinsics() {
# Check whether features needed by POWER optimisations are available
cat > $test.c << EOF
if test $without_optimizations -eq 0; then
+ check_ppc_intrinsics
check_power8_intrinsics
+ if test $HAVE_VMX -eq 1; then
+ CFLAGS="${CFLAGS} -DPPC_FEATURES"
+ SFLAGS="${SFLAGS} -DPPC_FEATURES"
+ fi
+ if test $HAVE_VMX -eq 1 -o $HAVE_POWER8_INTRIN -eq 1; then
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} power.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} power.lo"
+ fi
+ if test $HAVE_VMX -eq 1 -a $HAVE_ALTIVEC_INTRIN -eq 1; then
+ CFLAGS="${CFLAGS} -DPPC_VMX_ADLER32 -DPPC_VMX_SLIDEHASH"
+ SFLAGS="${SFLAGS} -DPPC_VMX_ADLER32 -DPPC_VMX_SLIDEHASH"
+
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_vmx.o slide_hash_vmx.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_vmx.lo slide_hash_vmx.lo"
+ fi
if test $HAVE_POWER8_INTRIN -eq 1; then
CFLAGS="${CFLAGS} -DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32 -DPOWER8_VSX_CHUNKSET -DPOWER8_VSX_SLIDEHASH"
SFLAGS="${SFLAGS} -DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32 -DPOWER8_VSX_CHUNKSET -DPOWER8_VSX_SLIDEHASH"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} power.o adler32_power8.o chunkset_power8.o slide_hash_power8.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} power.lo adler32_power8.lo chunkset_power8.lo slide_hash_power8.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_power8.o chunkset_power8.o slide_hash_power8.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_power8.lo chunkset_power8.lo slide_hash_power8.lo"
fi
fi
;;
/^NEONFLAG *=/s#=.*#=$neonflag#
/^NOLTOFLAG *=/s#=.*#=$noltoflag#
/^VGFMAFLAG *=/s#=.*#=$vgfmaflag#
+/^PPCFLAGS *=/s#=.*#=$vmxflag#
" > $ARCHDIR/Makefile
# Append header file dependencies.
void slide_hash_sse2(deflate_state *s);
#elif defined(ARM_NEON_SLIDEHASH)
void slide_hash_neon(deflate_state *s);
-#elif defined(POWER8_VSX_SLIDEHASH)
+#endif
+#if defined(PPC_VMX_SLIDEHASH)
+void slide_hash_vmx(deflate_state *s);
+#endif
+#if defined(POWER8_VSX_SLIDEHASH)
void slide_hash_power8(deflate_state *s);
#endif
#ifdef X86_AVX2
#ifdef ARM_NEON_ADLER32
extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len);
#endif
+#ifdef PPC_VMX_ADLER32
+extern uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
#ifdef X86_SSSE3_ADLER32
extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
#endif
x86_check_features();
#elif defined(ARM_FEATURES)
arm_check_features();
-#elif defined(POWER_FEATURES)
+#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
power_check_features();
#elif defined(S390_FEATURES)
s390_check_features();
if (x86_cpu_has_avx2)
functable.slide_hash = &slide_hash_avx2;
#endif
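+    /* Note: the POWER8 branch below is checked after this one, so on CPUs
+     * supporting both, the POWER8 implementation takes precedence. */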
+#ifdef PPC_VMX_SLIDEHASH
+ if (power_cpu_has_altivec)
+ functable.slide_hash = &slide_hash_vmx;
+#endif
#ifdef POWER8_VSX_SLIDEHASH
if (power_cpu_has_arch_2_07)
functable.slide_hash = &slide_hash_power8;
if (x86_cpu_has_avx2)
functable.adler32 = &adler32_avx2;
#endif
+#ifdef PPC_VMX_ADLER32
+ if (power_cpu_has_altivec)
+ functable.adler32 = &adler32_vmx;
+#endif
#ifdef POWER8_VSX_ADLER32
if (power_cpu_has_arch_2_07)
functable.adler32 = &adler32_power8;
# include "arch/x86/x86.h"
#elif defined(ARM_FEATURES)
# include "arch/arm/arm.h"
-#elif defined(POWER_FEATURES)
+#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
# include "arch/power/power.h"
#elif defined(S390_FEATURES)
# include "arch/s390/s390.h"