git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
[Power8] Add chunk*_power8.
author Mika Lindqvist <postmaster@raasu.org>
Sat, 19 Jun 2021 05:58:09 +0000 (08:58 +0300)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Fri, 25 Jun 2021 18:38:14 +0000 (20:38 +0200)
CMakeLists.txt
README.md
arch/power/Makefile.in
arch/power/chunkset_power8.c [new file with mode: 0644]
configure
functable.c

index 3ea40b502b4ea8ee967f5dabd8747a8e4329b50a..98dbec9db4e95e42cd823c29e7e7bcbe146135a9 100644 (file)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -606,10 +606,11 @@ if(WITH_OPTIM)
                 add_definitions(-DPOWER8)
                 add_definitions(-DPOWER_FEATURES)
                 add_definitions(-DPOWER8_VSX_ADLER32)
+                add_definitions(-DPOWER8_VSX_CHUNKSET)
                 add_definitions(-DPOWER8_VSX_SLIDEHASH)
                 list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power.h)
                 list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/power.c)
-                set(POWER8_SRCS ${ARCHDIR}/adler32_power8.c ${ARCHDIR}/slide_power8.c)
+                set(POWER8_SRCS ${ARCHDIR}/adler32_power8.c ${ARCHDIR}/chunkset_power8.c ${ARCHDIR}/slide_power8.c)
                 list(APPEND ZLIB_ARCH_SRCS ${POWER8_SRCS})
                 set_property(SOURCE ${POWER8_SRCS} PROPERTY COMPILE_FLAGS "${POWER8FLAG} ${NOLTOFLAG}")
             else()
index 22b1e41647bd287937f28cd792f03fd5be1b20bf..6efbda5dc613be5fde8a7759d90d51dda395dd3f 100644 (file)
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ Features
   * Hash table implementation using CRC32-C intrinsics on x86 and ARM
   * Slide hash implementations using SSE2, AVX2, Neon & VSX
   * Compare256/258 implementations using SSE4.2 & AVX2
-  * Inflate chunk copying using SSE2, AVX2 & Neon
+  * Inflate chunk copying using SSE2, AVX2, Neon & VSX
   * Support for hardware-accelerated deflate using IBM Z DFLTCC
 * Unaligned memory read/writes and large bit buffer improvements
 * Includes improvements from Cloudflare and Intel forks
index e7a2473a3975e5f9fc242e66c1292936cddc7301..f58c49e81e7ffffff7b2932b13319d311f737261 100644 (file)
--- a/arch/power/Makefile.in
+++ b/arch/power/Makefile.in
@@ -19,6 +19,8 @@ all: power.o \
      power.lo \
      adler32_power8.o \
      adler32_power8.lo \
+     chunkset_power8.o \
+     chunkset_power8.lo \
      slide_power8.o \
      slide_power8.lo
 
@@ -34,6 +36,12 @@ adler32_power8.o:
 adler32_power8.lo:
        $(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
 
+chunkset_power8.o:
+       $(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
+
+chunkset_power8.lo:
+       $(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
+
 slide_power8.o:
        $(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_power8.c
 
diff --git a/arch/power/chunkset_power8.c b/arch/power/chunkset_power8.c
new file mode 100644 (file)
index 0000000..a76f663
--- /dev/null
@@ -0,0 +1,58 @@
+/* chunkset_power8.c -- VSX inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef POWER8_VSX_CHUNKSET
+#include <altivec.h>
+#include "zbuild.h"
+#include "zutil.h"
+
+typedef vector unsigned char chunk_t;
+
+#define CHUNK_SIZE 16
+
+#define HAVE_CHUNKMEMSET_1
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) { /* Fill *chunk with the single byte read from *from. */
+    *chunk = vec_splats(*from); /* vec_splats replicates one scalar into every element of the vector */
+}
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { /* Fill *chunk with the 2-byte pattern starting at from. */
+    uint16_t tmp;
+    memcpy(&tmp, from, 2); /* memcpy instead of a pointer cast, so from need not be 2-byte aligned */
+    *chunk = (vector unsigned char)vec_splats(tmp); /* replicate the 16-bit value, then view the result as bytes */
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { /* Fill *chunk with the 4-byte pattern starting at from. */
+    uint32_t tmp;
+    memcpy(&tmp, from, 4); /* memcpy instead of a pointer cast, so from need not be 4-byte aligned */
+    *chunk = (vector unsigned char)vec_splats(tmp); /* replicate the 32-bit value, then view the result as bytes */
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { /* Fill *chunk with the 8-byte pattern starting at from. */
+    uint64_t tmp;
+    memcpy(&tmp, from, 8); /* memcpy instead of a pointer cast, so from need not be 8-byte aligned */
+    *chunk = (vector unsigned char)vec_splats(tmp); /* NOTE(review): confirm every targeted compiler resolves vec_splats for uint64_t without an explicit (unsigned long long) cast */
+}
+
+#define CHUNKSIZE        chunksize_power8
+#define CHUNKCOPY        chunkcopy_power8
+#define CHUNKCOPY_SAFE   chunkcopy_safe_power8
+#define CHUNKUNROLL      chunkunroll_power8
+#define CHUNKMEMSET      chunkmemset_power8
+#define CHUNKMEMSET_SAFE chunkmemset_safe_power8
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { /* Read one chunk_t worth of bytes from s into *chunk. */
+    *chunk = vec_xl(0, s); /* vector load from s at byte offset 0 */
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) { /* Write the contents of *chunk to memory at out. */
+    vec_xst(*chunk, 0, out); /* vector store to out at byte offset 0 */
+}
+
+#include "chunkset_tpl.h"
+
+#endif
index 911269254d4f3a6c90cfecf01cd7b567c9ffdfcc..e4738dca16af6aac0bc867afd979e18ebae62b80 100755 (executable)
--- a/configure
+++ b/configure
@@ -1525,11 +1525,11 @@ EOF
             check_power8_intrinsics
 
             if test $HAVE_POWER8_INTRIN -eq 1; then
-                CFLAGS="${CFLAGS} -DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32 -DPOWER8_VSX_SLIDEHASH"
-                SFLAGS="${SFLAGS} -DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32 -DPOWER8_VSX_SLIDEHASH"
+                CFLAGS="${CFLAGS} -DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32 -DPOWER8_VSX_CHUNKSET -DPOWER8_VSX_SLIDEHASH"
+                SFLAGS="${SFLAGS} -DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32 -DPOWER8_VSX_CHUNKSET -DPOWER8_VSX_SLIDEHASH"
 
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} power.o adler32_power8.o slide_power8.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} power.lo adler32_power8.lo slide_power8.lo"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} power.o adler32_power8.o chunkset_power8.o slide_power8.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} power.lo adler32_power8.lo chunkset_power8.lo slide_power8.lo"
             fi
         fi
     ;;
index 5ed930c1083c40b1c86685e3d85e4d7a30f614c6..12feedfb31eaf55c2984ee800b127fdd13fc8830 100644 (file)
--- a/functable.c
+++ b/functable.c
@@ -96,6 +96,14 @@ extern uint8_t* chunkunroll_neon(uint8_t *out, unsigned *dist, unsigned *len);
 extern uint8_t* chunkmemset_neon(uint8_t *out, unsigned dist, unsigned len);
 extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
 #endif
+#ifdef POWER8_VSX_CHUNKSET
+extern uint32_t chunksize_power8(void);
+extern uint8_t* chunkcopy_power8(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkcopy_safe_power8(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+extern uint8_t* chunkunroll_power8(uint8_t *out, unsigned *dist, unsigned *len);
+extern uint8_t* chunkmemset_power8(uint8_t *out, unsigned dist, unsigned len);
+extern uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#endif
 
 /* CRC32 */
 Z_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t);
@@ -298,6 +306,10 @@ Z_INTERNAL uint32_t chunksize_stub(void) {
     if (arm_cpu_has_neon)
         functable.chunksize = &chunksize_neon;
 #endif
+#ifdef POWER8_VSX_CHUNKSET
+    if (power_cpu_has_arch_2_07)
+        functable.chunksize = &chunksize_power8;
+#endif
 
     return functable.chunksize();
 }
@@ -320,6 +332,10 @@ Z_INTERNAL uint8_t* chunkcopy_stub(uint8_t *out, uint8_t const *from, unsigned l
     if (arm_cpu_has_neon)
         functable.chunkcopy = &chunkcopy_neon;
 #endif
+#ifdef POWER8_VSX_CHUNKSET
+    if (power_cpu_has_arch_2_07)
+        functable.chunkcopy = &chunkcopy_power8;
+#endif
 
     return functable.chunkcopy(out, from, len);
 }
@@ -342,6 +358,10 @@ Z_INTERNAL uint8_t* chunkcopy_safe_stub(uint8_t *out, uint8_t const *from, unsig
     if (arm_cpu_has_neon)
         functable.chunkcopy_safe = &chunkcopy_safe_neon;
 #endif
+#ifdef POWER8_VSX_CHUNKSET
+    if (power_cpu_has_arch_2_07)
+        functable.chunkcopy_safe = &chunkcopy_safe_power8;
+#endif
 
     return functable.chunkcopy_safe(out, from, len, safe);
 }
@@ -364,6 +384,10 @@ Z_INTERNAL uint8_t* chunkunroll_stub(uint8_t *out, unsigned *dist, unsigned *len
     if (arm_cpu_has_neon)
         functable.chunkunroll = &chunkunroll_neon;
 #endif
+#ifdef POWER8_VSX_CHUNKSET
+    if (power_cpu_has_arch_2_07)
+        functable.chunkunroll = &chunkunroll_power8;
+#endif
 
     return functable.chunkunroll(out, dist, len);
 }
@@ -386,6 +410,11 @@ Z_INTERNAL uint8_t* chunkmemset_stub(uint8_t *out, unsigned dist, unsigned len)
     if (arm_cpu_has_neon)
         functable.chunkmemset = &chunkmemset_neon;
 #endif
+#ifdef POWER8_VSX_CHUNKSET
+    if (power_cpu_has_arch_2_07)
+        functable.chunkmemset = &chunkmemset_power8;
+#endif
+
 
     return functable.chunkmemset(out, dist, len);
 }
@@ -408,6 +437,10 @@ Z_INTERNAL uint8_t* chunkmemset_safe_stub(uint8_t *out, unsigned dist, unsigned
     if (arm_cpu_has_neon)
         functable.chunkmemset_safe = &chunkmemset_safe_neon;
 #endif
+#ifdef POWER8_VSX_CHUNKSET
+    if (power_cpu_has_arch_2_07)
+        functable.chunkmemset_safe = &chunkmemset_safe_power8;
+#endif
 
     return functable.chunkmemset_safe(out, dist, len, left);
 }