From: Rogerio Alves
Date: Mon, 9 Dec 2019 17:40:53 +0000 (-0300)
Subject: Adler32 vector optimization for Power.
X-Git-Tag: 1.9.9-b1~191
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a13b6039b327fb127685a99191d0053a62346e03;p=thirdparty%2Fzlib-ng.git

Adler32 vector optimization for Power.

This commit implements a Power (POWER8+) vector optimization for the Adler32
checksum using VSX (vector) instructions. The VSX adler32 checksum is up to
10x faster than the adler32 baseline code.

Author: Rogerio Alves
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 16a94364..ee9bb64b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -692,9 +692,11 @@ if(WITH_OPTIM)
         endif()
     elseif(BASEARCH_PPC_FOUND)
         if(WITH_POWER8 AND HAVE_POWER8)
-            add_definitions(-DPOWER_FEATURES)
             add_definitions(-DPOWER8)
+            add_definitions(-DPOWER_FEATURES)
+            add_definitions(-DPOWER8_VSX_ADLER32)
             set(ZLIB_POWER8_SRCS
+                ${ARCHDIR}/adler32_power8.c
                 ${ARCHDIR}/slide_hash_power8.c)
             set_source_files_properties(
                 ${ZLIB_POWER8_SRCS}
diff --git a/README.md b/README.md
index 1c07a0dd..1873b373 100644
--- a/README.md
+++ b/README.md
@@ -58,7 +58,7 @@ Features
 * Modernized native API based on zlib API for ease of porting
 * Intel deflate medium and quick algorithms
 * Support for CPU intrinsics when available
-  * Adler32 implementation using SSSE3, AVX2, & Neon
+  * Adler32 implementation using SSSE3, AVX2, Neon, & VSX
   * Intel CRC32-B implementation using PCLMULQDQ
   * Intel CRC32-C intrinics for hash tables
   * ARM CRC32-B implementation using ACLE
diff --git a/arch/power/Makefile.in b/arch/power/Makefile.in
index 6deb690a..25ebc9d1 100644
--- a/arch/power/Makefile.in
+++ b/arch/power/Makefile.in
@@ -16,6 +16,8 @@ P8FLAGS=-mcpu=power8

 all: power.o \
      power.lo \
+     adler32_power8.o \
+     adler32_power8.lo \
      slide_hash_power8.o \
      slide_hash_power8.lo

@@ -25,6 +27,12 @@ power.o:
 power.lo:
 	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power.c

+adler32_power8.o:
+	$(CC) $(CFLAGS) $(P8FLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
+
+adler32_power8.lo:
+	$(CC) $(SFLAGS) $(P8FLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
+
 slide_hash_power8.o:
 	$(CC) $(CFLAGS) $(P8FLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
diff --git a/arch/power/adler32_power8.c b/arch/power/adler32_power8.c
new file mode 100644
index 00000000..3f74f0d1
--- /dev/null
+++ b/arch/power/adler32_power8.c
@@ -0,0 +1,165 @@
+/* Adler32 for POWER8 using VSX instructions.
+ * Copyright (C) 2020 IBM Corporation
+ * Author: Rogerio Alves
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Calculate the adler32 checksum for 16 bytes at once using POWER8+ VSX
+ * (vector) instructions.
+ *
+ * If adler32 is computed one byte at a time, let s1_n be the value of s1
+ * after iteration n (s1_0 is the initial value of s1, which is 1 unless a
+ * different initial adler value is supplied) and let c[n] be the n-th input
+ * byte. Then s1_1 = s1_0 + c[1] after the first iteration,
+ * s1_2 = s1_1 + c[2] after the second, and in general
+ * s1_N = s1_(N-1) + c[N] is the value of s1 after iteration N.
+ *
+ * Likewise, s2_N = s2_0 + N*s1_0 + N*c[1] + (N-1)*c[2] + ... + c[N].
+ *
+ * In a more general way:
+ *
+ * s1_N = s1_0 + sum(i=1 to N)c[i]
+ * s2_N = s2_0 + N*s1_0 + sum(i=1 to N)(N-i+1)*c[i]
+ *
+ * where s1_N, s2_N are the values of s1, s2 after N iterations. So if we
+ * can process N bytes at a time, both sums can be updated at once.
+ *
+ * Since VSX provides 16-byte vector registers and instructions, we can
+ * process 16 bytes at a time. Using N = 16 we have:
+ *
+ * s1 = s1_16 = s1_0 + sum(i=1 to 16)c[i]
+ * s2 = s2_16 = s2_0 + 16*s1_0 + sum(i=1 to 16)(16-i+1)*c[i]
+ *
+ * so every iteration advances the adler32 checksum by 16 bytes.
+ *
+ * For more background about adler32 please check the RFC:
+ * https://www.ietf.org/rfc/rfc1950.txt
+ */
+
+#ifdef POWER8_VSX_ADLER32
+
+#include <altivec.h>
+#include "zbuild.h"
+#include "zutil.h"
+#include "adler32_p.h"
+
+#define DO1(s1,s2,buf,i)  {(s1) += buf[(i)]; (s2) += (s1);}
+#define DO2(s1,s2,buf,i)  {DO1(s1,s2,buf,i); DO1(s1,s2,buf,i+1);}
+#define DO4(s1,s2,buf,i)  {DO2(s1,s2,buf,i); DO2(s1,s2,buf,i+2);}
+#define DO8(s1,s2,buf,i)  {DO4(s1,s2,buf,i); DO4(s1,s2,buf,i+4);}
+#define DO16(s1,s2,buf)   {DO8(s1,s2,buf,0); DO8(s1,s2,buf,8);}
+
+/* Horizontal add of the four unsigned int lanes of a vector. */
+inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsigned int __b) {
+    __b = vec_sld(__a, __a, 8);
+    __b = vec_add(__b, __a);
+    __a = vec_sld(__b, __b, 4);
+    __a = vec_add(__a, __b);
+
+    return __a;
+}
+
+uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len) {
+    uint32_t s1 = adler & 0xffff;
+    uint32_t s2 = (adler >> 16) & 0xffff;
+
+    /* In case the user likes doing a byte at a time, keep it fast. */
+    if (UNLIKELY(len == 1))
+        return adler32_len_1(s1, buf, s2);
+
+    /* If the buffer is NULL, return the initial adler value. */
+    if (UNLIKELY(buf == NULL))
+        return 1;
+
+    /* This is faster than the VSX code for len < 64. */
+    if (len < 64) {
+        while (len >= 16) {
+            len -= 16;
+            DO16(s1,s2,buf);
+            buf += 16;
+        }
+    } else {
+        /* Use POWER VSX instructions for len >= 64. */
+        const vector unsigned int v_zeros = { 0 };
+        const vector unsigned char v_mul = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
+                                            6, 5, 4, 3, 2, 1};
+        const vector unsigned char vsh = vec_splat_u8(4);
+        const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0};
+        vector unsigned int vs1 = { 0 };
+        vector unsigned int vs2 = { 0 };
+        vector unsigned int vs1_save = { 0 };
+        vector unsigned int vsum1, vsum2;
+        vector unsigned char vbuf;
+        int n;
+
+        vs1[0] = s1;
+        vs2[0] = s2;
+
+        /* Process lengths larger than NMAX in blocks of NMAX size. */
+        while (len >= NMAX) {
+            len -= NMAX;
+            n = NMAX / 16;
+            do {
+                vbuf = vec_xl(0, (unsigned char *) buf);
+                vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
+                /* sum(i=1 to 16) buf[i]*(16-i+1). */
+                vsum2 = vec_msum(vbuf, v_mul, v_zeros);
+                /* Save vs1. */
+                vs1_save = vec_add(vs1_save, vs1);
+                /* Accumulate the sums. */
+                vs1 = vec_add(vsum1, vs1);
+                vs2 = vec_add(vsum2, vs2);
+
+                buf += 16;
+            } while (--n);
+            /* Once per block of NMAX size. */
+            vs1 = vec_sumsu(vs1, vsum1);
+            vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
+            vs2 = vec_add(vs1_save, vs2);
+            vs2 = vec_sumsu(vs2, vsum2);
+
+            /* vs1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521. */
+            vs1[0] = vs1[0] % BASE;
+            /* vs2[0] = (s2_i + 16*s1_save +
+               sum(i=1 to 16)(16-i+1)*buf[i]) mod 65521. */
+            vs2[0] = vs2[0] % BASE;
+
+            vs1 = vec_and(vs1, vmask);
+            vs2 = vec_and(vs2, vmask);
+            vs1_save = v_zeros;
+        }
+
+        /* len is now less than NMAX, so a single modulo is enough. */
+        if (len >= 16) {
+            while (len >= 16) {
+                len -= 16;
+
+                vbuf = vec_xl(0, (unsigned char *) buf);
+
+                vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
+                /* sum(i=1 to 16) buf[i]*(16-i+1). */
+                vsum2 = vec_msum(vbuf, v_mul, v_zeros);
+                /* Save vs1. */
+                vs1_save = vec_add(vs1_save, vs1);
+                /* Accumulate the sums. */
+                vs1 = vec_add(vsum1, vs1);
+                vs2 = vec_add(vsum2, vs2);
+
+                buf += 16;
+            }
+            /* Since the remaining length is always less than NMAX, do this once. */
+            vs1 = vec_sumsu(vs1, vsum1);
+            vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
+            vs2 = vec_add(vs1_save, vs2);
+            vs2 = vec_sumsu(vs2, vsum2);
+        }
+        /* Copy the result back to s1, s2 (mod 65521). */
+        s1 = vs1[0] % BASE;
+        s2 = vs2[0] % BASE;
+    }
+
+    /* Process the tail (len < 16) and return. */
+    return adler32_len_16(s1, buf, len, s2);
+}
+
+#endif /* POWER8_VSX_ADLER32 */
diff --git a/configure b/configure
index ec1f157b..25dd20af 100755
--- a/configure
+++ b/configure
@@ -1392,9 +1392,9 @@ case "${ARCH}" in

     if test $without_optimizations -eq 0; then
         if test $HAVE_POWER8 -eq 1; then
-            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} power.o slide_hash_power8.o"
-            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} power.lo slide_hash_power8.lo"
-            POWERFLAGS="-DPOWER_FEATURES -DPOWER8"
+            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_power8.o power.o slide_hash_power8.o"
+            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_power8.lo power.lo slide_hash_power8.lo"
+            POWERFLAGS="-DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32"
         fi
     fi
diff --git a/functable.c b/functable.c
index 81249656..6679a9d0 100644
--- a/functable.c
+++ b/functable.c
@@ -53,6 +53,9 @@ extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t l
 #ifdef X86_AVX2_ADLER32
 extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
 #endif
+#ifdef POWER8_VSX_ADLER32
+extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len);
+#endif

 /* CRC32 */
 ZLIB_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t);
@@ -213,6 +216,10 @@ ZLIB_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, si
     if (x86_cpu_has_avx2)
         functable.adler32 = &adler32_avx2;
 #endif
+#ifdef POWER8_VSX_ADLER32
+    if (power_cpu_has_arch_2_07)
+        functable.adler32 = &adler32_power8;
+#endif

     return functable.adler32(adler, buf, len);
 }
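
Note: the block recurrence described in the header comment of adler32_power8.c can be
checked on any machine, without VSX hardware. The following standalone C sketch is
illustrative only and is not part of the patch; the helper names adler32_byte and
adler32_block16 are made up here. It applies the N = 16 formulas one block at a time and
compares the result with a byte-at-a-time reference.

/* Scalar model of the 16-byte block update used by adler32_power8. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define BASE 65521U  /* largest prime smaller than 65536 */

/* Reference: one byte per iteration. */
static uint32_t adler32_byte(uint32_t adler, const unsigned char *buf, size_t len) {
    uint32_t s1 = adler & 0xffff, s2 = (adler >> 16) & 0xffff;
    for (size_t i = 0; i < len; i++) {
        s1 = (s1 + buf[i]) % BASE;
        s2 = (s2 + s1) % BASE;
    }
    return (s2 << 16) | s1;
}

/* Block update: s1 += sum c[i]; s2 += 16*s1_old + sum (16-i+1)*c[i]. */
static uint32_t adler32_block16(uint32_t adler, const unsigned char *buf, size_t len) {
    uint32_t s1 = adler & 0xffff, s2 = (adler >> 16) & 0xffff;
    while (len >= 16) {
        uint32_t sum1 = 0, sum2 = 0;
        for (int i = 0; i < 16; i++) {
            sum1 += buf[i];            /* sum(i=1 to 16) c[i] */
            sum2 += (16 - i) * buf[i]; /* weights 16..1, matching v_mul */
        }
        s2 = (s2 + 16 * s1 + sum2) % BASE; /* uses the old s1, as in the comment */
        s1 = (s1 + sum1) % BASE;
        buf += 16;
        len -= 16;
    }
    while (len--) { /* tail, one byte at a time */
        s1 = (s1 + *buf++) % BASE;
        s2 = (s2 + s1) % BASE;
    }
    return (s2 << 16) | s1;
}

int main(void) {
    unsigned char data[1000];
    for (size_t i = 0; i < sizeof(data); i++)
        data[i] = (unsigned char)(rand() & 0xff);
    uint32_t a = adler32_byte(1, data, sizeof(data));
    uint32_t b = adler32_block16(1, data, sizeof(data));
    printf("byte-at-a-time: %08x  16-byte blocks: %08x  %s\n",
           (unsigned)a, (unsigned)b, a == b ? "match" : "MISMATCH");
    return a == b ? 0 : 1;
}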
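
The functable hook above keys off zlib-ng's power_cpu_has_arch_2_07 flag. As a rough
sketch of how such a flag can be derived on Linux (an assumption for illustration; this
is not necessarily how zlib-ng implements it), the kernel reports ISA 2.07 (POWER8)
support through the auxiliary vector:

/* Hypothetical sketch: detect ISA 2.07 (POWER8) support on Linux. */
#include <stdio.h>
#ifdef __linux__
#include <sys/auxv.h>  /* getauxval, AT_HWCAP2 */
#endif

#ifndef PPC_FEATURE2_ARCH_2_07
#define PPC_FEATURE2_ARCH_2_07 0x80000000  /* from <asm/cputable.h> */
#endif

static int cpu_has_arch_2_07(void) {
#if defined(__linux__) && defined(AT_HWCAP2)
    return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07) != 0;
#else
    return 0;  /* unknown platform: fall back to the generic adler32 */
#endif
}

int main(void) {
    printf("ISA 2.07 (POWER8) VSX path usable: %s\n",
           cpu_has_arch_2_07() ? "yes" : "no");
    return 0;
}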
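
Nothing changes for callers: the dispatch happens once inside adler32_stub, so ordinary
zlib API code picks up the VSX path automatically when it is available. A minimal usage
sketch against the zlib-compat API, assuming zlib-ng was built in compat mode and linked
as usual:

/* Compute an adler32 checksum through the regular zlib API. */
#include <stdio.h>
#include <string.h>
#include <zlib.h>

int main(void) {
    const unsigned char msg[] = "The quick brown fox jumps over the lazy dog";
    uLong adler = adler32(0L, Z_NULL, 0);  /* initial value (1) */
    adler = adler32(adler, msg, (uInt)strlen((const char *)msg));
    printf("adler32 = 0x%08lx\n", adler);
    return 0;
}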