From: Adam Stylinski
Date: Fri, 28 Jan 2022 15:00:07 +0000 (-0500)
Subject: More than double adler32 performance with altivec
X-Git-Tag: 2.1.0-beta1~396
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=403a9aee6751c9eb7ada6090c530f1ba8991da97;p=thirdparty%2Fzlib-ng.git

More than double adler32 performance with altivec

This round of optimization picks up both low-hanging and high-hanging fruit.
AltiVec has an instruction that sums groups of characters into 4 integer lanes
(the vec_sum4s intrinsic) that seems basically made for this algorithm. There
is also a similar multiply-accumulate instruction (vec_msum) that takes two
character vectors as input and produces a vector of 4 ints holding the sums of
the adjacent products. These two alone account for a good amount of the
performance gain.

Additionally, the shift by 4 was still being done inside the loop, when it was
easy to hoist it outside the loop and do it only once. This removes some
latency spent waiting for a dependent operand to be ready. We also unrolled
the loop with independent sums, though this only seems to help for much larger
input sizes.

We also made feeding the two 16-bit halves of the sum cheaper by packing them
next to each other in an aligned allocation on the stack. When loaded, the
values are permuted and shifted into two separate vector registers from the
same input register. Separating these scalars probably could have been done in
vector registers with some tricks, but we need them in scalar GPRs every time
they leave the loop anyway, so it was naturally better to keep them separate
before hitting the vectorized code.

For the horizontal addition, the code now uses a sequence of shifts and adds
to reduce each vector to its sum, after which the much cheaper vec_ste stores
the result back through memory instead of extracting it into a general purpose
register with vec_extract.

Lastly, instead of doing the relatively expensive modulus in GPRs after the
scalar operations that align the loads for the loop, we can simply reduce "n"
for the first round to n minus the alignment offset.
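
For reference, stripped of the two-way unrolling, the permuted pair load, and
the head/tail and NMAX bookkeeping, the kernel in the diff below boils down to
roughly the following sketch (the function name is made up for illustration;
it seeds the accumulators with vec_insert the way the old code did, and it
assumes a 16-byte-aligned buf and len counted in whole 16-byte blocks):

    #include <altivec.h>
    #include <stdint.h>

    /* Simplified single-accumulator version: s[0] holds the running s1,
     * s[1] the running s2, len is a count of 16-byte blocks. */
    static void accum32_sketch(uint32_t *s, const unsigned char *buf, size_t len) {
        /* s2 weights: the first byte of a block counts 16 times, the last once. */
        const vector unsigned char t0 = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
        vector unsigned int adacc = vec_insert(s[0], vec_splat_u32(0), 0); /* s1 lane sums   */
        vector unsigned int s2acc = vec_insert(s[1], vec_splat_u32(0), 0); /* weighted sums  */
        vector unsigned int s3acc = vec_splat_u32(0);                      /* deferred 16*s1 */

        for (size_t i = 0; i < len; ++i) {
            vector unsigned char d0 = vec_ld(0, buf);
            s3acc = vec_add(s3acc, adacc);   /* remember s1 before this block, *16 later   */
            s2acc = vec_msum(t0, d0, s2acc); /* 16*b0 + 15*b1 + ... + 1*b15, accumulated   */
            adacc = vec_sum4s(d0, adacc);    /* sum the 16 bytes into the 4 s1 lanes       */
            buf += 16;
        }

        /* Apply the *16 once, outside the loop, instead of shifting every iteration. */
        s2acc = vec_add(s2acc, vec_sl(s3acc, vec_splat_u32(4)));

        /* Horizontal add: after two rotate-and-add steps every lane holds the full
         * sum, so the cheap vec_ste can write it back through memory. */
        adacc = vec_add(adacc, vec_sld(adacc, adacc, 8));
        s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 8));
        adacc = vec_add(adacc, vec_sld(adacc, adacc, 4));
        s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 4));
        vec_ste(adacc, 0, s);
        vec_ste(s2acc, 0, s + 1);
    }

The committed version additionally interleaves a second set of accumulators
(adacc_0, s2acc_0) over two 16-byte loads per iteration so the vec_sum4s and
vec_msum dependency chains can execute independently.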
---

diff --git a/arch/power/adler32_vmx.c b/arch/power/adler32_vmx.c
index 5d7dc43e2..491d8d5df 100644
--- a/arch/power/adler32_vmx.c
+++ b/arch/power/adler32_vmx.c
@@ -1,6 +1,7 @@
 /* adler32_vmx.c -- compute the Adler-32 checksum of a data stream
  * Copyright (C) 1995-2011 Mark Adler
  * Copyright (C) 2017-2021 Mika T. Lindqvist
+ * Copyright (C) 2021 Adam Stylinski
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
@@ -11,16 +12,6 @@
 
 #define vmx_zero() (vec_splat_u32(0))
 
-vector unsigned short vec_hadduh(vector unsigned char a) {
-    vector unsigned char vmx_one = vec_splat_u8(1);
-    return vec_add(vec_mulo(a, vmx_one), vec_mule(a, vmx_one));
-}
-
-vector unsigned int vec_hadduw(vector unsigned short a) {
-    vector unsigned short vmx_one = vec_splat_u16(1);
-    return vec_add(vec_mulo(a, vmx_one), vec_mule(a, vmx_one));
-}
-
 static inline void vmx_handle_head_or_tail(uint32_t *pair, const unsigned char *buf, size_t len) {
     unsigned int i;
     for (i = 0; i < len; ++i) {
@@ -30,31 +21,64 @@ static inline void vmx_handle_head_or_tail(uint32_t *pair, const unsigned char *
 }
 
 static void vmx_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
-    static const uint8_t tc0[16] ALIGNED_(16) = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
-
-    vector unsigned char t0 = vec_ld(0, tc0);
+    const vector unsigned char t0 = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
+    /* As silly and inefficient as it seems, creating 1 permutation vector to permute
+     * a 2 element vector from a single load + a subsequent shift is just barely faster
+     * than doing 2 indexed insertions into zero initialized vectors from unaligned memory. */
+    const vector unsigned char s0_perm = {0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
+    const vector unsigned char shift_vec = vec_sl(vec_splat_u8(8), vec_splat_u8(2));
 
     vector unsigned int adacc, s2acc;
-    adacc = vec_insert(s[0], vmx_zero(), 0);
-    s2acc = vec_insert(s[1], vmx_zero(), 0);
+    vector unsigned int pair_vec = vec_ld(0, s);
+    adacc = vec_perm(pair_vec, pair_vec, s0_perm);
+    s2acc = vec_slo(pair_vec, shift_vec);
+
+    vector unsigned int s3acc = vmx_zero();
+    vector unsigned int s2acc_0 = s3acc;
+    vector unsigned int adacc_0 = adacc;
+
+    int num_iter = len / 2;
+    int rem = len & 1;
+
+    for (int i = 0; i < num_iter; ++i) {
+        vector unsigned char d0 = vec_ld(0, buf);
+        vector unsigned char d1 = vec_ld(16, buf);
+
+        adacc = vec_sum4s(d0, adacc);
+        s3acc = vec_add(s3acc, adacc_0);
+        s2acc = vec_msum(t0, d0, s2acc);
 
-    while (len > 0) {
+        s3acc = vec_add(s3acc, adacc);
+        adacc = vec_sum4s(d1, adacc);
+        s2acc_0 = vec_msum(t0, d1, s2acc_0);
+        adacc_0 = adacc;
+
+        buf += 32;
+    }
+
+    if (rem) {
         vector unsigned char d0 = vec_ld(0, buf);
-        vector unsigned short sum2;
-        sum2 = vec_add(vec_mulo(t0, d0), vec_mule(t0, d0));
-        s2acc = vec_add(s2acc, vec_sl(adacc, vec_splat_u32(4)));
-        s2acc = vec_add(s2acc, vec_hadduw(sum2));
-        adacc = vec_add(adacc, vec_hadduw(vec_hadduh(d0)));
-        buf += 16;
-        len--;
+        s3acc = vec_add(s3acc, adacc);
+        s2acc = vec_msum(t0, d0, s2acc);
+        adacc = vec_sum4s(d0, adacc);
     }
 
-    s[0] = vec_extract(adacc, 0) + vec_extract(adacc, 1) + vec_extract(adacc, 2) + vec_extract(adacc, 3); /* Horizontal add */
-    s[1] = vec_extract(s2acc, 0) + vec_extract(s2acc, 1) + vec_extract(s2acc, 2) + vec_extract(s2acc, 3); /* Horizontal add */
+    s2acc = vec_add(s2acc, s2acc_0);
+    s3acc = vec_sl(s3acc, vec_splat_u32(4));
+    s2acc = vec_add(s2acc, s3acc);
+
+    adacc = vec_add(adacc, vec_sld(adacc, adacc, 8));
+    s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 8));
+    adacc = vec_add(adacc, vec_sld(adacc, adacc, 4));
+    s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 4));
+
+    vec_ste(adacc, 0, s);
+    vec_ste(s2acc, 0, s+1);
 }
 
 uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len) {
     uint32_t sum2;
-    uint32_t pair[2];
+    uint32_t pair[16] ALIGNED_(16);
+    memset(&pair[2], 0, 14);
     int n = NMAX;
     unsigned int done = 0, i;
@@ -86,14 +110,15 @@ uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len) {
             al=len;
         }
         vmx_handle_head_or_tail(pair, buf, al);
-        pair[0] %= BASE;
-        pair[1] %= BASE;
 
         done += al;
+        /* Rather than rebasing, we can reduce the max sums for the
+         * first round only */
+        n -= al;
     }
     for (i = al; i < len; i += n) {
-        if ((i + n) > len)
-            n = (int)(len - i);
+        int remaining = (int)(len-i);
+        n = MIN(remaining, (i == al) ? n : NMAX);
 
         if (n < 16)
             break;
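
The "n -= al" in the last hunk leans on the usual Adler-32 bound: NMAX (5552)
is the largest n for which 255*n*(n+1)/2 + (n+1)*(BASE-1) still fits in 32
bits, so up to NMAX bytes can be summed between modulus reductions without
overflowing. The scalar head already consumed al bytes of that budget, so the
first vectorized round simply runs over NMAX - al bytes instead of paying for
a modulus of its own. A standalone check of the bound (the helper name is
illustrative only, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    static void check_nmax_budget(void) {
        const uint64_t BASE = 65521, NMAX = 5552;
        /* Largest n whose worst case (all 0xff bytes on top of maximal
         * starting sums) still fits in an unsigned 32-bit accumulator. */
        assert(255 * NMAX * (NMAX + 1) / 2 + (NMAX + 1) * (BASE - 1) <= UINT32_MAX);
        assert(255 * (NMAX + 1) * (NMAX + 2) / 2 + (NMAX + 2) * (BASE - 1) > UINT32_MAX);
    }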