From: Matheus Castanho
Date: Wed, 27 May 2020 13:06:09 +0000 (-0300)
Subject: Add optimized slide_hash for POWER processors
X-Git-Tag: 1.9.9-b1~246
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=3fbfd99cf665e23e879c3d9c2d9bd8e0a9cae87a;p=thirdparty%2Fzlib-ng.git

Add optimized slide_hash for POWER processors

This commit introduces a new slide_hash function that uses VSX vector
instructions to slide 8 hash elements at a time, instead of just one as
the standard code does.
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 08422632..cdb0ab87 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -644,7 +644,8 @@ if(WITH_OPTIM)
         if(WITH_POWER8 AND HAVE_POWER8)
             add_definitions(-DPOWER_FEATURES)
             add_definitions(-DPOWER8)
-            set(ZLIB_POWER8_SRCS )
+            set(ZLIB_POWER8_SRCS
+                ${ARCHDIR}/slide_hash_power8.c)
             set_source_files_properties(
                 ${ZLIB_POWER8_SRCS}
                 PROPERTIES COMPILE_FLAGS ${POWER8FLAG})
diff --git a/README.md b/README.md
index b9e3bc1d..f206618e 100644
--- a/README.md
+++ b/README.md
@@ -62,7 +62,7 @@ Features
   * Intel CRC32-B implementation using PCLMULQDQ
   * Intel CRC32-C intrinics for hash tables
   * ARM CRC32-B implementation using ACLE
-  * Slide hash implementations using AVX2, SSE2, & ARM Neon
+  * Slide hash implementations using AVX2, SSE2, ARM Neon, & VSX
   * Inflate fast using SSE2, ARM Neon
   * Deflate hooks for IBM Z DFLTCC
   * Code sanitizers, fuzzing, and coverage
diff --git a/arch/power/Makefile.in b/arch/power/Makefile.in
index a438fa5a..6deb690a 100644
--- a/arch/power/Makefile.in
+++ b/arch/power/Makefile.in
@@ -15,7 +15,9 @@ TOPDIR=$(SRCTOP)
 P8FLAGS=-mcpu=power8
 
 all: power.o \
-     power.lo
+     power.lo \
+     slide_hash_power8.o \
+     slide_hash_power8.lo
 
 power.o:
 	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power.c
@@ -23,6 +25,12 @@ power.o:
 power.lo:
 	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power.c
 
+slide_hash_power8.o:
+	$(CC) $(CFLAGS) $(P8FLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
+
+slide_hash_power8.lo:
+	$(CC) $(SFLAGS) $(P8FLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
+
 mostlyclean: clean
 clean:
 	rm -f *.o *.lo *~
diff --git a/arch/power/slide_hash_power8.c b/arch/power/slide_hash_power8.c
new file mode 100644
index 00000000..c277c15d
--- /dev/null
+++ b/arch/power/slide_hash_power8.c
@@ -0,0 +1,55 @@
+/* Optimized slide_hash for POWER processors
+ * Copyright (C) 2019-2020 Matheus Castanho, IBM
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <altivec.h>
+#include "zbuild.h"
+#include "deflate.h"
+
+static inline void slide_hash_power8_loop(deflate_state *s, unsigned n_elems, Pos *table_end) {
+    vector unsigned short vw, vm, *vp;
+    unsigned chunks;
+
+    /* Each vector register (chunk) corresponds to 128 bits == 8 Posf,
+     * so instead of processing each of the n_elems in the hash table
+     * individually, we can do it in chunks of 8 with vector instructions.
+     *
+     * This function is only called from slide_hash_power8(), and both calls
+     * pass n_elems as a power of 2 higher than 2^7, as defined by
+     * deflateInit2_(), so n_elems will always be a multiple of 8. */
+    chunks = n_elems >> 3;
+    Assert(n_elems % 8 == 0, "Weird hash table size!");
+
+    /* This type casting is safe since s->w_size is always <= 64KB
+     * as defined by deflateInit2_() and Posf == unsigned short */
+    vw[0] = (Pos) s->w_size;
+    vw = vec_splat(vw, 0);
+
+    vp = (vector unsigned short *) table_end;
+
+    do {
+        /* Processing 8 elements at a time */
+        vp--;
+        vm = *vp;
+
+        /* This is equivalent to: m >= w_size ? m - w_size : 0
+         * Since we are using a saturated unsigned subtraction, any
+         * values that are < w_size will be set to 0, while the others
+         * will have w_size subtracted. */
+        *vp = vec_subs(vm, vw);
+    } while (--chunks);
+}
+
+void ZLIB_INTERNAL slide_hash_power8(deflate_state *s) {
+    unsigned int n;
+    Pos *p;
+
+    n = s->hash_size;
+    p = &s->head[n];
+    slide_hash_power8_loop(s, n, p);
+
+    n = s->w_size;
+    p = &s->prev[n];
+    slide_hash_power8_loop(s, n, p);
+}
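
[Editor's note] For readers unfamiliar with vec_subs: on unsigned element
types it is a saturating subtraction, which is exactly the
m >= w_size ? m - w_size : 0 update the comment in the new file refers to.
The following standalone scalar sketch shows that per-lane behavior; it is
not part of the patch, and the subs16 helper, the table values, and the
w_size value are invented for illustration only.

/* Standalone sketch (not part of the patch): per-lane semantics of
 * vec_subs() on unsigned shorts is a saturating subtraction, i.e.
 * m >= w_size ? m - w_size : 0.  Values below are made up. */
#include <stdio.h>

typedef unsigned short Pos;   /* Posf == unsigned short, as in the patch */

/* What vec_subs(vm, vw) computes in each 16-bit lane. */
static Pos subs16(Pos m, Pos w) {
    return m >= w ? (Pos)(m - w) : 0;
}

int main(void) {
    const Pos w_size = 0x8000;   /* illustrative window size */
    const Pos table[8] = { 0x0000, 0x7fff, 0x8000, 0x8001,
                           0x9000, 0xc000, 0xfffe, 0xffff };

    /* One 8-element "chunk", as the VSX loop processes per iteration:
     * entries below w_size saturate to 0, the rest slide down by w_size. */
    for (int i = 0; i < 8; i++)
        printf("0x%04x -> 0x%04x\n", (unsigned)table[i],
               (unsigned)subs16(table[i], w_size));
    return 0;
}

Because the subtraction saturates, no per-element branch is needed, which is
what lets the vector loop replace eight compare-and-subtract operations with
a single instruction.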
diff --git a/configure b/configure
index 5004c540..4928c346 100755
--- a/configure
+++ b/configure
@@ -1372,8 +1372,8 @@ case "${ARCH}" in
 
         if test $without_optimizations -eq 0; then
             if test $HAVE_POWER8 -eq 1; then
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} power.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} power.lo"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} power.o slide_hash_power8.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} power.lo slide_hash_power8.lo"
                 POWERFLAGS="-DPOWER_FEATURES -DPOWER8"
             fi
         fi
diff --git a/functable.c b/functable.c
index aad87660..e25da331 100644
--- a/functable.c
+++ b/functable.c
@@ -35,6 +35,8 @@ extern Pos quick_insert_string_acle(deflate_state *const s, const Pos str);
 void slide_hash_sse2(deflate_state *s);
 #elif defined(ARM_NEON_SLIDEHASH)
 void slide_hash_neon(deflate_state *s);
+#elif defined(POWER8)
+void slide_hash_power8(deflate_state *s);
 #endif
 #ifdef X86_AVX2
 void slide_hash_avx2(deflate_state *s);
@@ -174,6 +176,10 @@ ZLIB_INTERNAL void slide_hash_stub(deflate_state *s) {
     if (x86_cpu_has_avx2)
         functable.slide_hash = &slide_hash_avx2;
 #endif
+#ifdef POWER8
+    if (power_cpu_has_arch_2_07)
+        functable.slide_hash = &slide_hash_power8;
+#endif
 
     functable.slide_hash(s);
 }
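
[Editor's note] The functable.c hunk follows zlib-ng's usual dispatch
pattern: functable.slide_hash initially points at a stub that probes the
CPU once (here, power_cpu_has_arch_2_07 for POWER8), installs the best
implementation, and forwards the call. The standalone sketch below shows
that pattern; it is not part of the patch, and all names in it (my_state,
cpu_has_feature, slide_hash_fast, ...) are invented for illustration.

/* Standalone sketch (not part of the patch) of stub-based dispatch. */
#include <stdio.h>

typedef struct { int placeholder; } my_state;

static void slide_hash_generic(my_state *s) { (void)s; puts("generic slide_hash"); }
static void slide_hash_fast(my_state *s)    { (void)s; puts("vectorized slide_hash"); }

/* Stand-in for a runtime probe such as power_cpu_has_arch_2_07. */
static int cpu_has_feature(void) { return 1; }

static void slide_hash_stub(my_state *s);

static struct {
    void (*slide_hash)(my_state *);
} functable = { slide_hash_stub };

/* The first call lands here: pick the best implementation once, patch
 * the table, then forward the call so the caller's work still happens. */
static void slide_hash_stub(my_state *s) {
    functable.slide_hash = &slide_hash_generic;
    if (cpu_has_feature())
        functable.slide_hash = &slide_hash_fast;
    functable.slide_hash(s);
}

int main(void) {
    my_state s = { 0 };
    functable.slide_hash(&s);  /* resolved through the stub */
    functable.slide_hash(&s);  /* now an ordinary indirect call */
    return 0;
}

The cost of the feature check is paid only on the first call; every later
call is a plain indirect jump through the already-patched table entry.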