if(WITH_POWER8 AND HAVE_POWER8)
add_definitions(-DPOWER_FEATURES)
add_definitions(-DPOWER8)
- set(ZLIB_POWER8_SRCS )
+ set(ZLIB_POWER8_SRCS
+ ${ARCHDIR}/slide_hash_power8.c)
set_source_files_properties(
${ZLIB_POWER8_SRCS}
PROPERTIES COMPILE_FLAGS ${POWER8FLAG})
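Scoping POWER8FLAG (-mcpu=power8) to just these sources via set_source_files_properties keeps the rest of the library free of POWER8-only instructions, so the resulting binary still runs on older POWER CPUs; the Makefile below applies P8FLAGS the same way, and the optimized routine is only entered after the runtime check in the functable selects it.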
* Intel CRC32-B implementation using PCLMULQDQ
* Intel CRC32-C intrinsics for hash tables
* ARM CRC32-B implementation using ACLE
- * Slide hash implementations using AVX2, SSE2, & ARM Neon
+ * Slide hash implementations using AVX2, SSE2, ARM Neon & VSX
* Inflate fast using SSE2, ARM Neon
* Deflate hooks for IBM Z DFLTCC
* Code sanitizers, fuzzing, and coverage
P8FLAGS=-mcpu=power8
all: power.o \
- power.lo
+ power.lo \
+ slide_hash_power8.o \
+ slide_hash_power8.lo
power.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power.c
power.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power.c
+slide_hash_power8.o:
+ $(CC) $(CFLAGS) $(P8FLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
+
+slide_hash_power8.lo:
+ $(CC) $(SFLAGS) $(P8FLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
+
mostlyclean: clean
clean:
rm -f *.o *.lo *~
--- /dev/null
+/* Optimized slide_hash for POWER processors
+ * Copyright (C) 2019-2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <altivec.h>
+#include "zbuild.h"
+#include "deflate.h"
+
+static inline void slide_hash_power8_loop(deflate_state *s, unsigned n_elems, Pos *table_end) {
+ vector unsigned short vw, vm, *vp;
+ unsigned chunks;
+
+ /* Each vector register (chunk) corresponds to 128 bits == 8 Pos values,
+ * so instead of processing each of the n_elems in the hash table
+ * individually, we can do it in chunks of 8 with vector instructions.
+ *
+ * This function is only called from slide_hash_power8(), and both calls
+ * pass n_elems as a power of 2 greater than 2^7, as defined by
+ * deflateInit2_(), so n_elems will always be a multiple of 8. */
+ chunks = n_elems >> 3;
+ Assert(n_elems % 8 == 0, "Weird hash table size!");
+
+ /* This type casting is safe since s->w_size is always <= 64KB
+ * as defined by deflateInit2_() and Pos == unsigned short */
+ vw[0] = (Pos) s->w_size;
+ vw = vec_splat(vw, 0);
+
+ vp = (vector unsigned short *) table_end;
+
+ do {
+ /* Processing 8 elements at a time */
+ vp--;
+ vm = *vp;
+
+ /* This is equivalent to: m >= w_size ? m - w_size : 0
+ * Because the subtraction is saturated and unsigned, any
+ * values that are < w_size saturate to 0, while the others
+ * have w_size subtracted from them. */
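+ /* E.g. (illustrative values): with w_size = 0x1000, m = 0x1800
+ * yields 0x0800, while m = 0x0800 saturates to 0. */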
+ *vp = vec_subs(vm, vw);
+ } while (--chunks);
+}
+
+void ZLIB_INTERNAL slide_hash_power8(deflate_state *s) {
+ unsigned int n;
+ Pos *p;
+
+ n = s->hash_size;
+ p = &s->head[n];
+ slide_hash_power8_loop(s, n, p);
+
+ n = s->w_size;
+ p = &s->prev[n];
+ slide_hash_power8_loop(s, n, p);
+}
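For reference, a scalar sketch of what the vectorized loop above computes (illustrative only, not part of the patch; slide_hash_scalar_loop is a hypothetical name):

    /* Hypothetical scalar equivalent of slide_hash_power8_loop(), for
     * reference only: walk n_elems entries backwards from table_end and
     * apply m >= w_size ? m - w_size : 0 to each Pos. */
    static void slide_hash_scalar_loop(deflate_state *s, unsigned n_elems, Pos *table_end) {
        Pos *p = table_end;
        unsigned w = s->w_size;

        do {
            --p;
            *p = (*p >= w) ? (Pos)(*p - w) : 0;
        } while (--n_elems);
    }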
if test $without_optimizations -eq 0; then
if test $HAVE_POWER8 -eq 1; then
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} power.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} power.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} power.o slide_hash_power8.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} power.lo slide_hash_power8.lo"
POWERFLAGS="-DPOWER_FEATURES -DPOWER8"
fi
fi
void slide_hash_sse2(deflate_state *s);
#elif defined(ARM_NEON_SLIDEHASH)
void slide_hash_neon(deflate_state *s);
+#elif defined(POWER8)
+void slide_hash_power8(deflate_state *s);
#endif
#ifdef X86_AVX2
void slide_hash_avx2(deflate_state *s);
if (x86_cpu_has_avx2)
functable.slide_hash = &slide_hash_avx2;
#endif
+#ifdef POWER8
+ if (power_cpu_has_arch_2_07)
+ functable.slide_hash = &slide_hash_power8;
+#endif
functable.slide_hash(s);
}
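Worth noting: this is zlib-ng's usual stub pattern. The first call through functable.slide_hash runs these feature checks, installs the best implementation (slide_hash_power8 when power_cpu_has_arch_2_07 is set, presumably by the runtime detection code in power.c above), and then forwards the call through the updated pointer, so subsequent calls dispatch directly.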