From: Matheus Castanho
Date: Wed, 27 May 2020 13:06:09 +0000 (-0300)
Subject: Add optimized slide_hash for POWER processors
X-Git-Tag: 1.9.9-b1~246
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=3fbfd99cf665e23e879c3d9c2d9bd8e0a9cae87a;p=thirdparty%2Fzlib-ng.git

Add optimized slide_hash for POWER processors

This commit introduces a new slide_hash function that uses VSX vector
instructions to slide 8 hash elements at a time, instead of just one as
the standard code does.
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 08422632..cdb0ab87 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -644,7 +644,8 @@ if(WITH_OPTIM)
         if(WITH_POWER8 AND HAVE_POWER8)
             add_definitions(-DPOWER_FEATURES)
             add_definitions(-DPOWER8)
-            set(ZLIB_POWER8_SRCS )
+            set(ZLIB_POWER8_SRCS
+                ${ARCHDIR}/slide_hash_power8.c)
             set_source_files_properties(
                 ${ZLIB_POWER8_SRCS}
                 PROPERTIES COMPILE_FLAGS ${POWER8FLAG})
diff --git a/README.md b/README.md
index b9e3bc1d..f206618e 100644
--- a/README.md
+++ b/README.md
@@ -62,7 +62,7 @@ Features
   * Intel CRC32-B implementation using PCLMULQDQ
   * Intel CRC32-C intrinics for hash tables
   * ARM CRC32-B implementation using ACLE
-  * Slide hash implementations using AVX2, SSE2, & ARM Neon
+  * Slide hash implementations using AVX2, SSE2, ARM Neon, & VSX
   * Inflate fast using SSE2, ARM Neon
   * Deflate hooks for IBM Z DFLTCC
   * Code sanitizers, fuzzing, and coverage
diff --git a/arch/power/Makefile.in b/arch/power/Makefile.in
index a438fa5a..6deb690a 100644
--- a/arch/power/Makefile.in
+++ b/arch/power/Makefile.in
@@ -15,7 +15,9 @@ TOPDIR=$(SRCTOP)
 P8FLAGS=-mcpu=power8
 
 all: power.o \
-     power.lo
+     power.lo \
+     slide_hash_power8.o \
+     slide_hash_power8.lo
 
 power.o:
 	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power.c
@@ -23,6 +25,12 @@ power.o:
 power.lo:
 	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power.c
 
+slide_hash_power8.o:
+	$(CC) $(CFLAGS) $(P8FLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
+
+slide_hash_power8.lo:
+	$(CC) $(SFLAGS) $(P8FLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
+
 mostlyclean: clean
 clean:
 	rm -f *.o *.lo *~
diff --git a/arch/power/slide_hash_power8.c b/arch/power/slide_hash_power8.c
new file mode 100644
index 00000000..c277c15d
--- /dev/null
+++ b/arch/power/slide_hash_power8.c
@@ -0,0 +1,55 @@
+/* Optimized slide_hash for POWER processors
+ * Copyright (C) 2019-2020 Matheus Castanho, IBM
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <altivec.h>
+#include "zbuild.h"
+#include "deflate.h"
+
+static inline void slide_hash_power8_loop(deflate_state *s, unsigned n_elems, Pos *table_end) {
+    vector unsigned short vw, vm, *vp;
+    unsigned chunks;
+
+    /* Each vector register (chunk) corresponds to 128 bits == 8 Posf,
+     * so instead of processing each of the n_elems in the hash table
+     * individually, we can do it in chunks of 8 with vector instructions.
+     *
+     * This function is only called from slide_hash_power8(), and both calls
+     * pass n_elems as a power of 2 higher than 2^7, as defined by
+     * deflateInit2_(), so n_elems will always be a multiple of 8. */
+    chunks = n_elems >> 3;
+    Assert(n_elems % 8 == 0, "Weird hash table size!");
+
+    /* This type casting is safe since s->w_size is always <= 64KB
+     * as defined by deflateInit2_() and Posf == unsigned short */
+    vw[0] = (Pos) s->w_size;
+    vw = vec_splat(vw, 0);
+
+    vp = (vector unsigned short *) table_end;
+
+    do {
+        /* Processing 8 elements at a time */
+        vp--;
+        vm = *vp;
+
+        /* This is equivalent to: m >= w_size ? m - w_size : 0
+         * Since we are using a saturated unsigned subtraction, any
+         * values that are < w_size will be set to 0, while the others
+         * will have w_size subtracted. */
+        *vp = vec_subs(vm, vw);
+    } while (--chunks);
+}
+
+void ZLIB_INTERNAL slide_hash_power8(deflate_state *s) {
+    unsigned int n;
+    Pos *p;
+
+    n = s->hash_size;
+    p = &s->head[n];
+    slide_hash_power8_loop(s, n, p);
+
+    n = s->w_size;
+    p = &s->prev[n];
+    slide_hash_power8_loop(s, n, p);
+}
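
[Editor's note] For readers unfamiliar with vec_subs: on unsigned element
types it is a saturating subtraction, which is exactly the
m >= w_size ? m - w_size : 0 update the comment in the new file refers to.
The following standalone scalar sketch shows that per-lane behavior; it is
not part of the patch, and the subs16 helper, the table values, and the
w_size value are invented for illustration only.

/* Standalone sketch (not part of the patch): per-lane semantics of
 * vec_subs() on unsigned shorts is a saturating subtraction, i.e.
 * m >= w_size ? m - w_size : 0.  Values below are made up. */
#include <stdio.h>

typedef unsigned short Pos;   /* Posf == unsigned short, as in the patch */

/* What vec_subs(vm, vw) computes in each 16-bit lane. */
static Pos subs16(Pos m, Pos w) {
    return m >= w ? (Pos)(m - w) : 0;
}

int main(void) {
    const Pos w_size = 0x8000;   /* illustrative window size */
    const Pos table[8] = { 0x0000, 0x7fff, 0x8000, 0x8001,
                           0x9000, 0xc000, 0xfffe, 0xffff };

    /* One 8-element "chunk", as the VSX loop processes per iteration:
     * entries below w_size saturate to 0, the rest slide down by w_size. */
    for (int i = 0; i < 8; i++)
        printf("0x%04x -> 0x%04x\n", (unsigned)table[i],
               (unsigned)subs16(table[i], w_size));
    return 0;
}

Because the subtraction saturates, no per-element branch is needed, which is
what lets the vector loop replace eight compare-and-subtract operations with
a single instruction.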
diff --git a/configure b/configure
index 5004c540..4928c346 100755
--- a/configure
+++ b/configure
@@ -1372,8 +1372,8 @@ case "${ARCH}" in
 
         if test $without_optimizations -eq 0; then
             if test $HAVE_POWER8 -eq 1; then
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} power.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} power.lo"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} power.o slide_hash_power8.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} power.lo slide_hash_power8.lo"
                 POWERFLAGS="-DPOWER_FEATURES -DPOWER8"
             fi
         fi
diff --git a/functable.c b/functable.c
index aad87660..e25da331 100644
--- a/functable.c
+++ b/functable.c
@@ -35,6 +35,8 @@ extern Pos quick_insert_string_acle(deflate_state *const s, const Pos str);
 void slide_hash_sse2(deflate_state *s);
 #elif defined(ARM_NEON_SLIDEHASH)
 void slide_hash_neon(deflate_state *s);
+#elif defined(POWER8)
+void slide_hash_power8(deflate_state *s);
 #endif
 #ifdef X86_AVX2
 void slide_hash_avx2(deflate_state *s);
@@ -174,6 +176,10 @@ ZLIB_INTERNAL void slide_hash_stub(deflate_state *s) {
     if (x86_cpu_has_avx2)
         functable.slide_hash = &slide_hash_avx2;
 #endif
+#ifdef POWER8
+    if (power_cpu_has_arch_2_07)
+        functable.slide_hash = &slide_hash_power8;
+#endif
 
     functable.slide_hash(s);
 }
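
[Editor's note] The functable.c hunk follows zlib-ng's usual dispatch
pattern: functable.slide_hash initially points at a stub that probes the
CPU once (here, power_cpu_has_arch_2_07 for POWER8), installs the best
implementation, and forwards the call. The standalone sketch below shows
that pattern; it is not part of the patch, and all names in it (my_state,
cpu_has_feature, slide_hash_fast, ...) are invented for illustration.

/* Standalone sketch (not part of the patch) of stub-based dispatch. */
#include <stdio.h>

typedef struct { int placeholder; } my_state;

static void slide_hash_generic(my_state *s) { (void)s; puts("generic slide_hash"); }
static void slide_hash_fast(my_state *s)    { (void)s; puts("vectorized slide_hash"); }

/* Stand-in for a runtime probe such as power_cpu_has_arch_2_07. */
static int cpu_has_feature(void) { return 1; }

static void slide_hash_stub(my_state *s);

static struct {
    void (*slide_hash)(my_state *);
} functable = { slide_hash_stub };

/* The first call lands here: pick the best implementation once, patch
 * the table, then forward the call so the caller's work still happens. */
static void slide_hash_stub(my_state *s) {
    functable.slide_hash = &slide_hash_generic;
    if (cpu_has_feature())
        functable.slide_hash = &slide_hash_fast;
    functable.slide_hash(s);
}

int main(void) {
    my_state s = { 0 };
    functable.slide_hash(&s);  /* resolved through the stub */
    functable.slide_hash(&s);  /* now an ordinary indirect call */
    return 0;
}

The cost of the feature check is paid only on the first call; every later
call is a plain indirect jump through the already-patched table entry.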