Add optimized slide_hash for POWER processors

author Matheus Castanho <msc@linux.ibm.com>

Wed, 27 May 2020 13:06:09 +0000 (10:06 -0300)

committer Hans Kristian Rosbach <hk-github@circlestorm.org>

Mon, 8 Jun 2020 12:47:17 +0000 (14:47 +0200)
author Matheus Castanho <msc@linux.ibm.com>
Wed, 27 May 2020 13:06:09 +0000 (10:06 -0300)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Mon, 8 Jun 2020 12:47:17 +0000 (14:47 +0200)
diff --git a/CMakeLists.txt b/CMakeLists.txt

index 0842263285143e254fae41b3374654a3f4ce7f12..cdb0ab87ed8c043e501b3ab21131ae580d4141e8 100644 (file)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -644,7 +644,8 @@ if(WITH_OPTIM)
          if(WITH_POWER8 AND HAVE_POWER8)
              add_definitions(-DPOWER_FEATURES)
              add_definitions(-DPOWER8)
-            set(ZLIB_POWER8_SRCS )
+            set(ZLIB_POWER8_SRCS
+                ${ARCHDIR}/slide_hash_power8.c)
              set_source_files_properties(
                  ${ZLIB_POWER8_SRCS}
                  PROPERTIES COMPILE_FLAGS ${POWER8FLAG})
diff --git a/README.md b/README.md

index b9e3bc1dcef236b140ddbf6f668bda3f76fcac95..f206618e63ccacb433caa6a0bcf1a7dffe7524ea 100644 (file)
--- a/README.md
+++ b/README.md
@@ -62,7 +62,7 @@ Features
    * Intel CRC32-B implementation using PCLMULQDQ
    * Intel CRC32-C intrinics for hash tables
    * ARM CRC32-B implementation using ACLE
-  * Slide hash implementations using AVX2, SSE2, & ARM Neon
+  * Slide hash implementations using AVX2, SSE2, ARM Neon, & VSX
    * Inflate fast using SSE2, ARM Neon
    * Deflate hooks for IBM Z DFLTCC
  * Code sanitizers, fuzzing, and coverage
diff --git a/arch/power/Makefile.in b/arch/power/Makefile.in

index a438fa5a2c3e99bea03404049e954ce0b7043f46..6deb690a973f0a4ec9e7860748193b3cc4c8a841 100644 (file)
--- a/arch/power/Makefile.in
+++ b/arch/power/Makefile.in
@@ -15,7 +15,9 @@ TOPDIR=$(SRCTOP)
  P8FLAGS=-mcpu=power8
  
  all: power.o \
-     power.lo
+     power.lo \
+     slide_hash_power8.o \
+     slide_hash_power8.lo
  
  power.o:
         $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power.c
@@ -23,6 +25,12 @@ power.o:
  power.lo:
         $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power.c
  
+slide_hash_power8.o:
+       $(CC) $(CFLAGS) $(P8FLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
+
+slide_hash_power8.lo:
+       $(CC) $(SFLAGS) $(P8FLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
+
  mostlyclean: clean
  clean:
         rm -f *.o *.lo *~
diff --git a/arch/power/slide_hash_power8.c b/arch/power/slide_hash_power8.c

new file mode 100644 (file)

index 0000000..c277c15
--- /dev/null
+++ b/arch/power/slide_hash_power8.c
@@ -0,0 +1,55 @@
+/* Optimized slide_hash for POWER processors
+ * Copyright (C) 2019-2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <altivec.h>
+#include "zbuild.h"
+#include "deflate.h"
+
+static inline void slide_hash_power8_loop(deflate_state *s, unsigned n_elems, Pos *table_end) {
+    vector unsigned short vw, vm, *vp;
+    unsigned chunks;
+
+    /* Subtract w_size (saturating at 0) from the n_elems entries that end at
+     * table_end. Each vector register (chunk) corresponds to 128 bits == 8 Pos,
+     * so the table is processed in chunks of 8 with vector instructions.
+     *
+     * This function is only called from slide_hash_power8(), and both calls
+     * pass n_elems as a power of 2 higher than 2^7, as defined by
+     * deflateInit2_(), so n_elems will always be a multiple of 8. */
+    chunks = n_elems >> 3;
+    Assert(n_elems % 8 == 0, "Weird hash table size!");
+
+    /* Put w_size in lane 0 of vw, then splat it to every lane. The cast is safe since
+     * s->w_size is always <= 64KB as defined by deflateInit2_() and Pos == unsigned short */
+    vw[0] = (Pos) s->w_size;
+    vw = vec_splat(vw,0);
+
+    vp = (vector unsigned short *) table_end;  /* one past the last chunk; the loop walks backwards */
+
+    do {
+        /* Processing 8 elements at a time, from the end of the table towards its start */
+        vp--;
+        vm = *vp;  /* NOTE(review): presumably requires the table to be 16-byte aligned - confirm */
+
+        /* This is equivalent to: m >= w_size ? m - w_size : 0
+         * Since we are using a saturated unsigned subtraction, any
+         * values that are < w_size will be set to 0, while the others
+         * will have w_size subtracted from them. */
+        *vp = vec_subs(vm,vw);
+    } while (--chunks);  /* do/while: relies on chunks >= 1, since an unsigned 0 would wrap around */
+}
+
+void ZLIB_INTERNAL slide_hash_power8(deflate_state *s) {
+    unsigned int n;
+    Pos *p;
+    /* Run the vectorized saturating w_size subtraction over s->head, then over s->prev. */
+    n = s->hash_size;               /* s->head is processed as hash_size entries */
+    p = &s->head[n];                /* one-past-the-end pointer expected by the loop helper */
+    slide_hash_power8_loop(s,n,p);
+
+    n = s->w_size;                  /* s->prev is processed as w_size entries */
+    p = &s->prev[n];                /* one-past-the-end pointer expected by the loop helper */
+    slide_hash_power8_loop(s,n,p);
+}
diff --git a/configure b/configure

index 5004c540a8c9aa797e3060a54d36c13355f01123..4928c3468645c61e8cc2aff71ed47c6f772ff8f9 100755 (executable)
--- a/configure
+++ b/configure
@@ -1372,8 +1372,8 @@ case "${ARCH}" in
  
          if test $without_optimizations -eq 0; then
              if test $HAVE_POWER8 -eq 1; then
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} power.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} power.lo"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} power.o slide_hash_power8.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} power.lo slide_hash_power8.lo"
                  POWERFLAGS="-DPOWER_FEATURES -DPOWER8"
              fi
          fi
diff --git a/functable.c b/functable.c

index aad876605fc5f4db4e87da0bdcf9f2978db2225c..e25da33161354ec8c41d42f63e40896bfb190524 100644 (file)
--- a/functable.c
+++ b/functable.c
@@ -35,6 +35,8 @@ extern Pos quick_insert_string_acle(deflate_state *const s, const Pos str);
  void slide_hash_sse2(deflate_state *s);
  #elif defined(ARM_NEON_SLIDEHASH)
  void slide_hash_neon(deflate_state *s);
+#elif defined(POWER8)
+void slide_hash_power8(deflate_state *s);
  #endif
  #ifdef X86_AVX2
  void slide_hash_avx2(deflate_state *s);
@@ -174,6 +176,10 @@ ZLIB_INTERNAL void slide_hash_stub(deflate_state *s) {
      if (x86_cpu_has_avx2)
          functable.slide_hash = &slide_hash_avx2;
  #endif
+#ifdef POWER8
+    if (power_cpu_has_arch_2_07)
+        functable.slide_hash = &slide_hash_power8;
+#endif
  
      functable.slide_hash(s);
  }
author	Matheus Castanho <msc@linux.ibm.com>
	Wed, 27 May 2020 13:06:09 +0000 (10:06 -0300)
committer	Hans Kristian Rosbach <hk-github@circlestorm.org>
	Mon, 8 Jun 2020 12:47:17 +0000 (14:47 +0200)
CMakeLists.txt		patch \| blob \| blame \| history
README.md		patch \| blob \| blame \| history
arch/power/Makefile.in		patch \| blob \| blame \| history
arch/power/slide_hash_power8.c	[new file with mode: 0644]	patch \| blob
configure		patch \| blob \| blame \| history
functable.c		patch \| blob \| blame \| history