From: Mika Lindqvist
Date: Sat, 19 Jun 2021 05:58:09 +0000 (+0300)
Subject: [Power8] Add chunk*_power8.
X-Git-Tag: 2.1.0-beta1~541
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=564d473c6d86a001c4dd37c0cec84894d5ab47ae;p=thirdparty%2Fzlib-ng.git

[Power8] Add chunk*_power8.
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3ea40b502..98dbec9db 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -606,10 +606,11 @@ if(WITH_OPTIM)
             add_definitions(-DPOWER8)
             add_definitions(-DPOWER_FEATURES)
             add_definitions(-DPOWER8_VSX_ADLER32)
+            add_definitions(-DPOWER8_VSX_CHUNKSET)
             add_definitions(-DPOWER8_VSX_SLIDEHASH)
             list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power.h)
             list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/power.c)
-            set(POWER8_SRCS ${ARCHDIR}/adler32_power8.c ${ARCHDIR}/slide_power8.c)
+            set(POWER8_SRCS ${ARCHDIR}/adler32_power8.c ${ARCHDIR}/chunkset_power8.c ${ARCHDIR}/slide_power8.c)
             list(APPEND ZLIB_ARCH_SRCS ${POWER8_SRCS})
             set_property(SOURCE ${POWER8_SRCS} PROPERTY COMPILE_FLAGS "${POWER8FLAG} ${NOLTOFLAG}")
         else()
diff --git a/README.md b/README.md
index 22b1e4164..6efbda5dc 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ Features
  * Hash table implementation using CRC32-C intrinsics on x86 and ARM
  * Slide hash implementations using SSE2, AVX2, Neon & VSX
  * Compare256/258 implementations using SSE4.2 & AVX2
- * Inflate chunk copying using SSE2, AVX2 & Neon
+ * Inflate chunk copying using SSE2, AVX2, Neon & VSX
  * Support for hardware-accelerated deflate using IBM Z DFLTCC
  * Unaligned memory read/writes and large bit buffer improvements
  * Includes improvements from Cloudflare and Intel forks
diff --git a/arch/power/Makefile.in b/arch/power/Makefile.in
index e7a2473a3..f58c49e81 100644
--- a/arch/power/Makefile.in
+++ b/arch/power/Makefile.in
@@ -19,6 +19,8 @@ all: power.o \
      power.lo \
      adler32_power8.o \
      adler32_power8.lo \
+     chunkset_power8.o \
+     chunkset_power8.lo \
      slide_power8.o \
      slide_power8.lo
 
@@ -34,6 +36,12 @@ adler32_power8.o:
 adler32_power8.lo:
 	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
 
+chunkset_power8.o:
+	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
+
+chunkset_power8.lo:
+	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
+
 slide_power8.o:
 	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_power8.c
 
diff --git a/arch/power/chunkset_power8.c b/arch/power/chunkset_power8.c
new file mode 100644
index 000000000..a76f66334
--- /dev/null
+++ b/arch/power/chunkset_power8.c
@@ -0,0 +1,58 @@
+/* chunkset_power8.c -- VSX inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef POWER8_VSX_CHUNKSET
+#include <altivec.h>
+#include "zbuild.h"
+#include "zutil.h"
+
+typedef vector unsigned char chunk_t;
+
+#define CHUNK_SIZE 16
+
+#define HAVE_CHUNKMEMSET_1
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
+    *chunk = vec_splats(*from);
+}
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    uint16_t tmp;
+    memcpy(&tmp, from, 2);
+    *chunk = (vector unsigned char)vec_splats(tmp);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    uint32_t tmp;
+    memcpy(&tmp, from, 4);
+    *chunk = (vector unsigned char)vec_splats(tmp);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    uint64_t tmp;
+    memcpy(&tmp, from, 8);
+    *chunk = (vector unsigned char)vec_splats(tmp);
+}
+
+#define CHUNKSIZE chunksize_power8
+#define CHUNKCOPY chunkcopy_power8
+#define CHUNKCOPY_SAFE chunkcopy_safe_power8
+#define CHUNKUNROLL chunkunroll_power8
+#define CHUNKMEMSET chunkmemset_power8
+#define CHUNKMEMSET_SAFE chunkmemset_safe_power8
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = vec_xl(0, s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    vec_xst(*chunk, 0, out);
+}
+
+#include "chunkset_tpl.h"
+
+#endif
diff --git a/configure b/configure
index 911269254..e4738dca1 100755
--- a/configure
+++ b/configure
@@ -1525,11 +1525,11 @@ EOF
         check_power8_intrinsics
 
         if test $HAVE_POWER8_INTRIN -eq 1; then
-            CFLAGS="${CFLAGS} -DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32 -DPOWER8_VSX_SLIDEHASH"
-            SFLAGS="${SFLAGS} -DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32 -DPOWER8_VSX_SLIDEHASH"
+            CFLAGS="${CFLAGS} -DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32 -DPOWER8_VSX_CHUNKSET -DPOWER8_VSX_SLIDEHASH"
+            SFLAGS="${SFLAGS} -DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32 -DPOWER8_VSX_CHUNKSET -DPOWER8_VSX_SLIDEHASH"
 
-            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} power.o adler32_power8.o slide_power8.o"
-            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} power.lo adler32_power8.lo slide_power8.lo"
+            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} power.o adler32_power8.o chunkset_power8.o slide_power8.o"
+            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} power.lo adler32_power8.lo chunkset_power8.lo slide_power8.lo"
         fi
     fi
     ;;
diff --git a/functable.c b/functable.c
index 5ed930c10..12feedfb3 100644
--- a/functable.c
+++ b/functable.c
@@ -96,6 +96,14 @@ extern uint8_t* chunkunroll_neon(uint8_t *out, unsigned *dist, unsigned *len);
 extern uint8_t* chunkmemset_neon(uint8_t *out, unsigned dist, unsigned len);
 extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
 #endif
+#ifdef POWER8_VSX_CHUNKSET
+extern uint32_t chunksize_power8(void);
+extern uint8_t* chunkcopy_power8(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkcopy_safe_power8(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+extern uint8_t* chunkunroll_power8(uint8_t *out, unsigned *dist, unsigned *len);
+extern uint8_t* chunkmemset_power8(uint8_t *out, unsigned dist, unsigned len);
+extern uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#endif
 
 /* CRC32 */
 Z_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t);
@@ -298,6 +306,10 @@ Z_INTERNAL uint32_t chunksize_stub(void) {
     if (arm_cpu_has_neon)
         functable.chunksize = &chunksize_neon;
 #endif
+#ifdef POWER8_VSX_CHUNKSET
+    if (power_cpu_has_arch_2_07)
+        functable.chunksize = &chunksize_power8;
+#endif
 
     return functable.chunksize();
 }
@@ -320,6 +332,10 @@ Z_INTERNAL uint8_t* chunkcopy_stub(uint8_t *out, uint8_t const *from, unsigned l
     if (arm_cpu_has_neon)
         functable.chunkcopy = &chunkcopy_neon;
 #endif
+#ifdef POWER8_VSX_CHUNKSET
+    if (power_cpu_has_arch_2_07)
+        functable.chunkcopy = &chunkcopy_power8;
+#endif
 
     return functable.chunkcopy(out, from, len);
 }
@@ -342,6 +358,10 @@ Z_INTERNAL uint8_t* chunkcopy_safe_stub(uint8_t *out, uint8_t const *from, unsig
     if (arm_cpu_has_neon)
         functable.chunkcopy_safe = &chunkcopy_safe_neon;
 #endif
+#ifdef POWER8_VSX_CHUNKSET
+    if (power_cpu_has_arch_2_07)
+        functable.chunkcopy_safe = &chunkcopy_safe_power8;
+#endif
 
     return functable.chunkcopy_safe(out, from, len, safe);
 }
@@ -364,6 +384,10 @@ Z_INTERNAL uint8_t* chunkunroll_stub(uint8_t *out, unsigned *dist, unsigned *len
     if (arm_cpu_has_neon)
         functable.chunkunroll = &chunkunroll_neon;
 #endif
+#ifdef POWER8_VSX_CHUNKSET
+    if (power_cpu_has_arch_2_07)
+        functable.chunkunroll = &chunkunroll_power8;
+#endif
 
     return functable.chunkunroll(out, dist, len);
 }
@@ -386,6 +410,11 @@ Z_INTERNAL uint8_t* chunkmemset_stub(uint8_t *out, unsigned dist, unsigned len)
     if (arm_cpu_has_neon)
         functable.chunkmemset = &chunkmemset_neon;
 #endif
+#ifdef POWER8_VSX_CHUNKSET
+    if (power_cpu_has_arch_2_07)
+        functable.chunkmemset = &chunkmemset_power8;
+#endif
+
 
     return functable.chunkmemset(out, dist, len);
 }
@@ -408,6 +437,10 @@ Z_INTERNAL uint8_t* chunkmemset_safe_stub(uint8_t *out, unsigned dist, unsigned
     if (arm_cpu_has_neon)
         functable.chunkmemset_safe = &chunkmemset_safe_neon;
 #endif
+#ifdef POWER8_VSX_CHUNKSET
+    if (power_cpu_has_arch_2_07)
+        functable.chunkmemset_safe = &chunkmemset_safe_power8;
+#endif
 
     return functable.chunkmemset_safe(out, dist, len, left);
 }
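
The idea behind the chunkmemset_* helpers added above: when inflate copies a back-reference whose distance is 1, 2, 4 or 8 bytes, the repeating pattern divides evenly into a 16-byte VSX register, so it can be splatted once and then written with full-width vec_xst stores instead of byte-at-a-time copies. The following standalone sketch (not part of this commit; the file name and build line are illustrative, e.g. gcc -mcpu=power8 demo.c on a POWER8 machine) shows the same splat-then-store sequence that chunkmemset_2 and storechunk perform:

#include <altivec.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    uint8_t pattern[2] = { 0xAB, 0xCD };   /* a dist == 2 back-reference */
    uint8_t out[64];

    /* As in chunkmemset_2: load the 2-byte pattern once and splat it
     * across all eight halfwords of a 16-byte vector register. */
    uint16_t tmp;
    memcpy(&tmp, pattern, 2);
    vector unsigned char chunk = (vector unsigned char)vec_splats(tmp);

    /* As in storechunk: unaligned 16-byte stores fill the output in
     * CHUNK_SIZE steps; 16 is a multiple of the 2-byte distance, so the
     * pattern stays contiguous across chunk boundaries. */
    for (size_t i = 0; i < sizeof(out); i += 16)
        vec_xst(chunk, 0, out + i);

    printf("%02x %02x %02x %02x\n", out[0], out[1], out[2], out[3]);
    return 0;
}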
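
The functable.c hunks follow zlib-ng's lazy-dispatch pattern: each table entry initially points at a *_stub function that runs once, replaces the slot with the best implementation for the detected CPU (here gated on power_cpu_has_arch_2_07), and forwards the call, so later calls pay only an indirect jump. A minimal sketch of that mechanism, with the feature flag stubbed out as a plain variable (in zlib-ng it is filled in by the arch/power CPU detection) and only one table entry:

#include <stdint.h>
#include <stdio.h>

static int power_cpu_has_arch_2_07 = 1;      /* stand-in for real detection */

static uint32_t chunksize_c(void)      { return 8; }   /* portable fallback */
static uint32_t chunksize_power8(void) { return 16; }  /* VSX chunk_t width */

static uint32_t chunksize_stub(void);

/* The table starts out pointing at the stub. */
static struct { uint32_t (*chunksize)(void); } functable = { chunksize_stub };

/* First call: pick an implementation, patch the slot, forward the call. */
static uint32_t chunksize_stub(void) {
    functable.chunksize = &chunksize_c;
    if (power_cpu_has_arch_2_07)
        functable.chunksize = &chunksize_power8;
    return functable.chunksize();
}

int main(void) {
    printf("%u\n", (unsigned)functable.chunksize());  /* stub patches the table */
    printf("%u\n", (unsigned)functable.chunksize());  /* direct call from now on */
    return 0;
}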