From: Mika Lindqvist
Date: Sat, 19 Jun 2021 05:58:09 +0000 (+0300)
Subject: [Power8] Add chunk*_power8.
X-Git-Tag: 2.1.0-beta1~541
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=564d473c6d86a001c4dd37c0cec84894d5ab47ae;p=thirdparty%2Fzlib-ng.git

[Power8] Add chunk*_power8.
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3ea40b502..98dbec9db 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -606,10 +606,11 @@ if(WITH_OPTIM)
             add_definitions(-DPOWER8)
             add_definitions(-DPOWER_FEATURES)
             add_definitions(-DPOWER8_VSX_ADLER32)
+            add_definitions(-DPOWER8_VSX_CHUNKSET)
             add_definitions(-DPOWER8_VSX_SLIDEHASH)
             list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power.h)
             list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/power.c)
-            set(POWER8_SRCS ${ARCHDIR}/adler32_power8.c ${ARCHDIR}/slide_power8.c)
+            set(POWER8_SRCS ${ARCHDIR}/adler32_power8.c ${ARCHDIR}/chunkset_power8.c ${ARCHDIR}/slide_power8.c)
             list(APPEND ZLIB_ARCH_SRCS ${POWER8_SRCS})
             set_property(SOURCE ${POWER8_SRCS} PROPERTY COMPILE_FLAGS "${POWER8FLAG} ${NOLTOFLAG}")
         else()
diff --git a/README.md b/README.md
index 22b1e4164..6efbda5dc 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ Features
  * Hash table implementation using CRC32-C intrinsics on x86 and ARM
  * Slide hash implementations using SSE2, AVX2, Neon & VSX
  * Compare256/258 implementations using SSE4.2 & AVX2
- * Inflate chunk copying using SSE2, AVX2 & Neon
+ * Inflate chunk copying using SSE2, AVX2, Neon & VSX
  * Support for hardware-accelerated deflate using IBM Z DFLTCC
  * Unaligned memory read/writes and large bit buffer improvements
  * Includes improvements from Cloudflare and Intel forks
diff --git a/arch/power/Makefile.in b/arch/power/Makefile.in
index e7a2473a3..f58c49e81 100644
--- a/arch/power/Makefile.in
+++ b/arch/power/Makefile.in
@@ -19,6 +19,8 @@ all: power.o \
      power.lo \
      adler32_power8.o \
      adler32_power8.lo \
+     chunkset_power8.o \
+     chunkset_power8.lo \
      slide_power8.o \
      slide_power8.lo
 
@@ -34,6 +36,12 @@ adler32_power8.o:
 adler32_power8.lo:
 	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
 
+chunkset_power8.o:
+	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
+
+chunkset_power8.lo:
+	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
+
 slide_power8.o:
 	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_power8.c
 
diff --git a/arch/power/chunkset_power8.c b/arch/power/chunkset_power8.c
new file mode 100644
index 000000000..a76f66334
--- /dev/null
+++ b/arch/power/chunkset_power8.c
@@ -0,0 +1,58 @@
+/* chunkset_power8.c -- VSX inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef POWER8_VSX_CHUNKSET
+#include <altivec.h>
+#include "zbuild.h"
+#include "zutil.h"
+
+typedef vector unsigned char chunk_t;
+
+#define CHUNK_SIZE 16
+
+#define HAVE_CHUNKMEMSET_1
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
+    *chunk = vec_splats(*from);
+}
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    uint16_t tmp;
+    memcpy(&tmp, from, 2);
+    *chunk = (vector unsigned char)vec_splats(tmp);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    uint32_t tmp;
+    memcpy(&tmp, from, 4);
+    *chunk = (vector unsigned char)vec_splats(tmp);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    uint64_t tmp;
+    memcpy(&tmp, from, 8);
+    *chunk = (vector unsigned char)vec_splats(tmp);
+}
+
+#define CHUNKSIZE chunksize_power8
+#define CHUNKCOPY chunkcopy_power8
+#define CHUNKCOPY_SAFE chunkcopy_safe_power8
+#define CHUNKUNROLL chunkunroll_power8
+#define CHUNKMEMSET chunkmemset_power8
+#define CHUNKMEMSET_SAFE chunkmemset_safe_power8
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = vec_xl(0, s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    vec_xst(*chunk, 0, out);
+}
+
+#include "chunkset_tpl.h"
+
+#endif
diff --git a/configure b/configure
index 911269254..e4738dca1 100755
--- a/configure
+++ b/configure
@@ -1525,11 +1525,11 @@ EOF
         check_power8_intrinsics
 
         if test $HAVE_POWER8_INTRIN -eq 1; then
-            CFLAGS="${CFLAGS} -DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32 -DPOWER8_VSX_SLIDEHASH"
-            SFLAGS="${SFLAGS} -DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32 -DPOWER8_VSX_SLIDEHASH"
+            CFLAGS="${CFLAGS} -DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32 -DPOWER8_VSX_CHUNKSET -DPOWER8_VSX_SLIDEHASH"
+            SFLAGS="${SFLAGS} -DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32 -DPOWER8_VSX_CHUNKSET -DPOWER8_VSX_SLIDEHASH"
 
-            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} power.o adler32_power8.o slide_power8.o"
-            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} power.lo adler32_power8.lo slide_power8.lo"
+            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} power.o adler32_power8.o chunkset_power8.o slide_power8.o"
+            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} power.lo adler32_power8.lo chunkset_power8.lo slide_power8.lo"
         fi
     fi
     ;;
diff --git a/functable.c b/functable.c
index 5ed930c10..12feedfb3 100644
--- a/functable.c
+++ b/functable.c
@@ -96,6 +96,14 @@ extern uint8_t* chunkunroll_neon(uint8_t *out, unsigned *dist, unsigned *len);
 extern uint8_t* chunkmemset_neon(uint8_t *out, unsigned dist, unsigned len);
 extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
 #endif
+#ifdef POWER8_VSX_CHUNKSET
+extern uint32_t chunksize_power8(void);
+extern uint8_t* chunkcopy_power8(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkcopy_safe_power8(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+extern uint8_t* chunkunroll_power8(uint8_t *out, unsigned *dist, unsigned *len);
+extern uint8_t* chunkmemset_power8(uint8_t *out, unsigned dist, unsigned len);
+extern uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#endif
 
 /* CRC32 */
 Z_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t);
@@ -298,6 +306,10 @@ Z_INTERNAL uint32_t chunksize_stub(void) {
     if (arm_cpu_has_neon)
         functable.chunksize = &chunksize_neon;
 #endif
+#ifdef POWER8_VSX_CHUNKSET
+    if (power_cpu_has_arch_2_07)
+        functable.chunksize = &chunksize_power8;
+#endif
 
     return functable.chunksize();
 }
@@ -320,6 +332,10 @@ Z_INTERNAL uint8_t* chunkcopy_stub(uint8_t *out, uint8_t const *from, unsigned l
     if (arm_cpu_has_neon)
         functable.chunkcopy = &chunkcopy_neon;
 #endif
+#ifdef POWER8_VSX_CHUNKSET
+    if (power_cpu_has_arch_2_07)
+        functable.chunkcopy = &chunkcopy_power8;
+#endif
 
     return functable.chunkcopy(out, from, len);
 }
@@ -342,6 +358,10 @@ Z_INTERNAL uint8_t* chunkcopy_safe_stub(uint8_t *out, uint8_t const *from, unsig
     if (arm_cpu_has_neon)
         functable.chunkcopy_safe = &chunkcopy_safe_neon;
 #endif
+#ifdef POWER8_VSX_CHUNKSET
+    if (power_cpu_has_arch_2_07)
+        functable.chunkcopy_safe = &chunkcopy_safe_power8;
+#endif
 
     return functable.chunkcopy_safe(out, from, len, safe);
 }
@@ -364,6 +384,10 @@ Z_INTERNAL uint8_t* chunkunroll_stub(uint8_t *out, unsigned *dist, unsigned *len
     if (arm_cpu_has_neon)
         functable.chunkunroll = &chunkunroll_neon;
 #endif
+#ifdef POWER8_VSX_CHUNKSET
+    if (power_cpu_has_arch_2_07)
+        functable.chunkunroll = &chunkunroll_power8;
+#endif
 
     return functable.chunkunroll(out, dist, len);
 }
@@ -386,6 +410,11 @@ Z_INTERNAL uint8_t* chunkmemset_stub(uint8_t *out, unsigned dist, unsigned len)
     if (arm_cpu_has_neon)
         functable.chunkmemset = &chunkmemset_neon;
 #endif
+#ifdef POWER8_VSX_CHUNKSET
+    if (power_cpu_has_arch_2_07)
+        functable.chunkmemset = &chunkmemset_power8;
+#endif
+
 
     return functable.chunkmemset(out, dist, len);
 }
@@ -408,6 +437,10 @@ Z_INTERNAL uint8_t* chunkmemset_safe_stub(uint8_t *out, unsigned dist, unsigned
     if (arm_cpu_has_neon)
         functable.chunkmemset_safe = &chunkmemset_safe_neon;
 #endif
+#ifdef POWER8_VSX_CHUNKSET
+    if (power_cpu_has_arch_2_07)
+        functable.chunkmemset_safe = &chunkmemset_safe_power8;
+#endif
 
     return functable.chunkmemset_safe(out, dist, len, left);
 }
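
The idea behind the chunkmemset_* helpers added above: when inflate copies a back-reference whose distance is 1, 2, 4 or 8 bytes, the repeating pattern divides evenly into a 16-byte VSX register, so it can be splatted once and then written with full-width vec_xst stores instead of byte-at-a-time copies. The following standalone sketch (not part of this commit; the file name and build line are illustrative, e.g. gcc -mcpu=power8 demo.c on a POWER8 machine) shows the same splat-then-store sequence that chunkmemset_2 and storechunk perform:

#include <altivec.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    uint8_t pattern[2] = { 0xAB, 0xCD };   /* a dist == 2 back-reference */
    uint8_t out[64];

    /* As in chunkmemset_2: load the 2-byte pattern once and splat it
     * across all eight halfwords of a 16-byte vector register. */
    uint16_t tmp;
    memcpy(&tmp, pattern, 2);
    vector unsigned char chunk = (vector unsigned char)vec_splats(tmp);

    /* As in storechunk: unaligned 16-byte stores fill the output in
     * CHUNK_SIZE steps; 16 is a multiple of the 2-byte distance, so the
     * pattern stays contiguous across chunk boundaries. */
    for (size_t i = 0; i < sizeof(out); i += 16)
        vec_xst(chunk, 0, out + i);

    printf("%02x %02x %02x %02x\n", out[0], out[1], out[2], out[3]);
    return 0;
}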
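
The functable.c hunks follow zlib-ng's lazy-dispatch pattern: each table entry initially points at a *_stub function that runs once, replaces the slot with the best implementation for the detected CPU (here gated on power_cpu_has_arch_2_07), and forwards the call, so later calls pay only an indirect jump. A minimal sketch of that mechanism, with the feature flag stubbed out as a plain variable (in zlib-ng it is filled in by the arch/power CPU detection) and only one table entry:

#include <stdint.h>
#include <stdio.h>

static int power_cpu_has_arch_2_07 = 1;      /* stand-in for real detection */

static uint32_t chunksize_c(void)      { return 8; }   /* portable fallback */
static uint32_t chunksize_power8(void) { return 16; }  /* VSX chunk_t width */

static uint32_t chunksize_stub(void);

/* The table starts out pointing at the stub. */
static struct { uint32_t (*chunksize)(void); } functable = { chunksize_stub };

/* First call: pick an implementation, patch the slot, forward the call. */
static uint32_t chunksize_stub(void) {
    functable.chunksize = &chunksize_c;
    if (power_cpu_has_arch_2_07)
        functable.chunksize = &chunksize_power8;
    return functable.chunksize();
}

int main(void) {
    printf("%u\n", (unsigned)functable.chunksize());  /* stub patches the table */
    printf("%u\n", (unsigned)functable.chunksize());  /* direct call from now on */
    return 0;
}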