add_definitions(-DPOWER8)
add_definitions(-DPOWER_FEATURES)
add_definitions(-DPOWER8_VSX_ADLER32)
+ add_definitions(-DPOWER8_VSX_CHUNKSET)
add_definitions(-DPOWER8_VSX_SLIDEHASH)
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power.h)
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/power.c)
- set(POWER8_SRCS ${ARCHDIR}/adler32_power8.c ${ARCHDIR}/slide_power8.c)
+ set(POWER8_SRCS ${ARCHDIR}/adler32_power8.c ${ARCHDIR}/chunkset_power8.c ${ARCHDIR}/slide_power8.c)
list(APPEND ZLIB_ARCH_SRCS ${POWER8_SRCS})
set_property(SOURCE ${POWER8_SRCS} PROPERTY COMPILE_FLAGS "${POWER8FLAG} ${NOLTOFLAG}")
else()
* Hash table implementation using CRC32-C intrinsics on x86 and ARM
* Slide hash implementations using SSE2, AVX2, Neon & VSX
* Compare256/258 implementations using SSE4.2 & AVX2
- * Inflate chunk copying using SSE2, AVX2 & Neon
+ * Inflate chunk copying using SSE2, AVX2, Neon & VSX
* Support for hardware-accelerated deflate using IBM Z DFLTCC
* Unaligned memory read/writes and large bit buffer improvements
* Includes improvements from Cloudflare and Intel forks
power.lo \
adler32_power8.o \
adler32_power8.lo \
+ chunkset_power8.o \
+ chunkset_power8.lo \
slide_power8.o \
slide_power8.lo
adler32_power8.lo:
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
+chunkset_power8.o:
+ $(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
+
+chunkset_power8.lo:
+ $(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
+
slide_power8.o:
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_power8.c
--- /dev/null
+/* chunkset_power8.c -- VSX inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef POWER8_VSX_CHUNKSET
+#include <altivec.h>
+#include "zbuild.h"
+#include "zutil.h"
+
+typedef vector unsigned char chunk_t;
+
+#define CHUNK_SIZE 16
+
+#define HAVE_CHUNKMEMSET_1
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+/* Fill the 16-byte chunk by replicating the single byte at "from" into
+ * every vector lane. */
+static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
+ *chunk = vec_splats(*from);
+}
+
+/* Fill the chunk by replicating the 16-bit pattern at "from".
+ * memcpy gives a well-defined unaligned read of the 2-byte pattern. */
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ uint16_t tmp;
+ memcpy(&tmp, from, 2);
+ *chunk = (vector unsigned char)vec_splats(tmp);
+}
+
+/* Fill the chunk by replicating the 32-bit pattern at "from".
+ * memcpy gives a well-defined unaligned read of the 4-byte pattern. */
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ uint32_t tmp;
+ memcpy(&tmp, from, 4);
+ *chunk = (vector unsigned char)vec_splats(tmp);
+}
+
+/* Fill the chunk by replicating the 64-bit pattern at "from".
+ * memcpy gives a well-defined unaligned read of the 8-byte pattern. */
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ uint64_t tmp;
+ memcpy(&tmp, from, 8);
+ *chunk = (vector unsigned char)vec_splats(tmp);
+}
+
+#define CHUNKSIZE chunksize_power8
+#define CHUNKCOPY chunkcopy_power8
+#define CHUNKCOPY_SAFE chunkcopy_safe_power8
+#define CHUNKUNROLL chunkunroll_power8
+#define CHUNKMEMSET chunkmemset_power8
+#define CHUNKMEMSET_SAFE chunkmemset_safe_power8
+
+/* Load CHUNK_SIZE (16) bytes from "s" into *chunk. vec_xl is the VSX
+ * unaligned vector load, so "s" need not be 16-byte aligned. */
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = vec_xl(0, s);
+}
+
+/* Store CHUNK_SIZE (16) bytes from *chunk to "out". vec_xst is the VSX
+ * unaligned vector store, so "out" need not be 16-byte aligned. */
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ vec_xst(*chunk, 0, out);
+}
+
+#include "chunkset_tpl.h"
+
+#endif
check_power8_intrinsics
if test $HAVE_POWER8_INTRIN -eq 1; then
- CFLAGS="${CFLAGS} -DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32 -DPOWER8_VSX_SLIDEHASH"
- SFLAGS="${SFLAGS} -DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32 -DPOWER8_VSX_SLIDEHASH"
+ CFLAGS="${CFLAGS} -DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32 -DPOWER8_VSX_CHUNKSET -DPOWER8_VSX_SLIDEHASH"
+ SFLAGS="${SFLAGS} -DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32 -DPOWER8_VSX_CHUNKSET -DPOWER8_VSX_SLIDEHASH"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} power.o adler32_power8.o slide_power8.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} power.lo adler32_power8.lo slide_power8.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} power.o adler32_power8.o chunkset_power8.o slide_power8.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} power.lo adler32_power8.lo chunkset_power8.lo slide_power8.lo"
fi
fi
;;
extern uint8_t* chunkmemset_neon(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
+#ifdef POWER8_VSX_CHUNKSET
+extern uint32_t chunksize_power8(void);
+extern uint8_t* chunkcopy_power8(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkcopy_safe_power8(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+extern uint8_t* chunkunroll_power8(uint8_t *out, unsigned *dist, unsigned *len);
+extern uint8_t* chunkmemset_power8(uint8_t *out, unsigned dist, unsigned len);
+extern uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#endif
/* CRC32 */
Z_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t);
if (arm_cpu_has_neon)
functable.chunksize = &chunksize_neon;
#endif
+#ifdef POWER8_VSX_CHUNKSET
+ if (power_cpu_has_arch_2_07)
+ functable.chunksize = &chunksize_power8;
+#endif
return functable.chunksize();
}
if (arm_cpu_has_neon)
functable.chunkcopy = &chunkcopy_neon;
#endif
+#ifdef POWER8_VSX_CHUNKSET
+ if (power_cpu_has_arch_2_07)
+ functable.chunkcopy = &chunkcopy_power8;
+#endif
return functable.chunkcopy(out, from, len);
}
if (arm_cpu_has_neon)
functable.chunkcopy_safe = &chunkcopy_safe_neon;
#endif
+#ifdef POWER8_VSX_CHUNKSET
+ if (power_cpu_has_arch_2_07)
+ functable.chunkcopy_safe = &chunkcopy_safe_power8;
+#endif
return functable.chunkcopy_safe(out, from, len, safe);
}
if (arm_cpu_has_neon)
functable.chunkunroll = &chunkunroll_neon;
#endif
+#ifdef POWER8_VSX_CHUNKSET
+ if (power_cpu_has_arch_2_07)
+ functable.chunkunroll = &chunkunroll_power8;
+#endif
return functable.chunkunroll(out, dist, len);
}
if (arm_cpu_has_neon)
functable.chunkmemset = &chunkmemset_neon;
#endif
+#ifdef POWER8_VSX_CHUNKSET
+ if (power_cpu_has_arch_2_07)
+ functable.chunkmemset = &chunkmemset_power8;
+#endif
+
return functable.chunkmemset(out, dist, len);
}
if (arm_cpu_has_neon)
functable.chunkmemset_safe = &chunkmemset_safe_neon;
#endif
+#ifdef POWER8_VSX_CHUNKSET
+ if (power_cpu_has_arch_2_07)
+ functable.chunkmemset_safe = &chunkmemset_safe_power8;
+#endif
return functable.chunkmemset_safe(out, dist, len, left);
}