From: Adam Stylinski
Date: Fri, 28 Jan 2022 15:00:07 +0000 (-0500)
Subject: More than double adler32 performance with altivec
X-Git-Tag: 2.1.0-beta1~396
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=403a9aee6751c9eb7ada6090c530f1ba8991da97;p=thirdparty%2Fzlib-ng.git

More than double adler32 performance with altivec

This round of optimization picks up both low-hanging and high-hanging fruit.
AltiVec has an instruction that sums groups of characters into 4 integer lanes
(the vec_sum4s intrinsic) that seems basically made for this algorithm. There
is also a similar multiply-accumulate instruction (vec_msum) that takes two
character vectors as input and produces a vector of 4 ints holding the sums of
the adjacent products. These two alone account for a good amount of the
performance gain.

Additionally, the shift by 4 was still being done inside the loop, when it was
easy to hoist it outside the loop and do it only once. This removes some
latency spent waiting for a dependent operand to be ready. We also unrolled
the loop with independent sums, though this only seems to help for much larger
input sizes.

We also made feeding the two 16-bit halves of the sum cheaper by packing them
next to each other in an aligned allocation on the stack. When loaded, the
values are permuted and shifted into two separate vector registers from the
same input register. Separating these scalars probably could have been done in
vector registers with some tricks, but we need them in scalar GPRs every time
they leave the loop anyway, so it was naturally better to keep them separate
before hitting the vectorized code.

For the horizontal addition, the code now uses a sequence of shifts and adds
to reduce each vector to its sum, after which the much cheaper vec_ste stores
the result back through memory instead of extracting it into a general purpose
register with vec_extract.

Lastly, instead of doing the relatively expensive modulus in GPRs after the
scalar operations that align the loads for the loop, we can simply reduce "n"
for the first round to n minus the alignment offset.
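
For reference, stripped of the two-way unrolling, the permuted pair load, and
the head/tail and NMAX bookkeeping, the kernel in the diff below boils down to
roughly the following sketch (the function name is made up for illustration;
it seeds the accumulators with vec_insert the way the old code did, and it
assumes a 16-byte-aligned buf and len counted in whole 16-byte blocks):

    #include <altivec.h>
    #include <stdint.h>

    /* Simplified single-accumulator version: s[0] holds the running s1,
     * s[1] the running s2, len is a count of 16-byte blocks. */
    static void accum32_sketch(uint32_t *s, const unsigned char *buf, size_t len) {
        /* s2 weights: the first byte of a block counts 16 times, the last once. */
        const vector unsigned char t0 = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
        vector unsigned int adacc = vec_insert(s[0], vec_splat_u32(0), 0); /* s1 lane sums   */
        vector unsigned int s2acc = vec_insert(s[1], vec_splat_u32(0), 0); /* weighted sums  */
        vector unsigned int s3acc = vec_splat_u32(0);                      /* deferred 16*s1 */

        for (size_t i = 0; i < len; ++i) {
            vector unsigned char d0 = vec_ld(0, buf);
            s3acc = vec_add(s3acc, adacc);   /* remember s1 before this block, *16 later   */
            s2acc = vec_msum(t0, d0, s2acc); /* 16*b0 + 15*b1 + ... + 1*b15, accumulated   */
            adacc = vec_sum4s(d0, adacc);    /* sum the 16 bytes into the 4 s1 lanes       */
            buf += 16;
        }

        /* Apply the *16 once, outside the loop, instead of shifting every iteration. */
        s2acc = vec_add(s2acc, vec_sl(s3acc, vec_splat_u32(4)));

        /* Horizontal add: after two rotate-and-add steps every lane holds the full
         * sum, so the cheap vec_ste can write it back through memory. */
        adacc = vec_add(adacc, vec_sld(adacc, adacc, 8));
        s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 8));
        adacc = vec_add(adacc, vec_sld(adacc, adacc, 4));
        s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 4));
        vec_ste(adacc, 0, s);
        vec_ste(s2acc, 0, s + 1);
    }

The committed version additionally interleaves a second set of accumulators
(adacc_0, s2acc_0) over two 16-byte loads per iteration so the vec_sum4s and
vec_msum dependency chains can execute independently.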
---

diff --git a/arch/power/adler32_vmx.c b/arch/power/adler32_vmx.c
index 5d7dc43e2..491d8d5df 100644
--- a/arch/power/adler32_vmx.c
+++ b/arch/power/adler32_vmx.c
@@ -1,6 +1,7 @@
 /* adler32_vmx.c -- compute the Adler-32 checksum of a data stream
  * Copyright (C) 1995-2011 Mark Adler
  * Copyright (C) 2017-2021 Mika T. Lindqvist
+ * Copyright (C) 2021 Adam Stylinski
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
@@ -11,16 +12,6 @@
 
 #define vmx_zero() (vec_splat_u32(0))
 
-vector unsigned short vec_hadduh(vector unsigned char a) {
-    vector unsigned char vmx_one = vec_splat_u8(1);
-    return vec_add(vec_mulo(a, vmx_one), vec_mule(a, vmx_one));
-}
-
-vector unsigned int vec_hadduw(vector unsigned short a) {
-    vector unsigned short vmx_one = vec_splat_u16(1);
-    return vec_add(vec_mulo(a, vmx_one), vec_mule(a, vmx_one));
-}
-
 static inline void vmx_handle_head_or_tail(uint32_t *pair, const unsigned char *buf, size_t len) {
     unsigned int i;
     for (i = 0; i < len; ++i) {
@@ -30,31 +21,64 @@ static inline void vmx_handle_head_or_tail(uint32_t *pair, const unsigned char *
 }
 
 static void vmx_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
-    static const uint8_t tc0[16] ALIGNED_(16) = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
-
-    vector unsigned char t0 = vec_ld(0, tc0);
+    const vector unsigned char t0 = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
+    /* As silly and inefficient as it seems, creating 1 permutation vector to permute
+     * a 2 element vector from a single load + a subsequent shift is just barely faster
+     * than doing 2 indexed insertions into zero initialized vectors from unaligned memory. */
+    const vector unsigned char s0_perm = {0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
+    const vector unsigned char shift_vec = vec_sl(vec_splat_u8(8), vec_splat_u8(2));
 
     vector unsigned int adacc, s2acc;
-    adacc = vec_insert(s[0], vmx_zero(), 0);
-    s2acc = vec_insert(s[1], vmx_zero(), 0);
+    vector unsigned int pair_vec = vec_ld(0, s);
+    adacc = vec_perm(pair_vec, pair_vec, s0_perm);
+    s2acc = vec_slo(pair_vec, shift_vec);
+
+    vector unsigned int s3acc = vmx_zero();
+    vector unsigned int s2acc_0 = s3acc;
+    vector unsigned int adacc_0 = adacc;
+
+    int num_iter = len / 2;
+    int rem = len & 1;
+
+    for (int i = 0; i < num_iter; ++i) {
+        vector unsigned char d0 = vec_ld(0, buf);
+        vector unsigned char d1 = vec_ld(16, buf);
+
+        adacc = vec_sum4s(d0, adacc);
+        s3acc = vec_add(s3acc, adacc_0);
+        s2acc = vec_msum(t0, d0, s2acc);
 
-    while (len > 0) {
+        s3acc = vec_add(s3acc, adacc);
+        adacc = vec_sum4s(d1, adacc);
+        s2acc_0 = vec_msum(t0, d1, s2acc_0);
+        adacc_0 = adacc;
+
+        buf += 32;
+    }
+
+    if (rem) {
         vector unsigned char d0 = vec_ld(0, buf);
-        vector unsigned short sum2;
-        sum2 = vec_add(vec_mulo(t0, d0), vec_mule(t0, d0));
-        s2acc = vec_add(s2acc, vec_sl(adacc, vec_splat_u32(4)));
-        s2acc = vec_add(s2acc, vec_hadduw(sum2));
-        adacc = vec_add(adacc, vec_hadduw(vec_hadduh(d0)));
-        buf += 16;
-        len--;
+        s3acc = vec_add(s3acc, adacc);
+        s2acc = vec_msum(t0, d0, s2acc);
+        adacc = vec_sum4s(d0, adacc);
     }
 
-    s[0] = vec_extract(adacc, 0) + vec_extract(adacc, 1) + vec_extract(adacc, 2) + vec_extract(adacc, 3); /* Horizontal add */
-    s[1] = vec_extract(s2acc, 0) + vec_extract(s2acc, 1) + vec_extract(s2acc, 2) + vec_extract(s2acc, 3); /* Horizontal add */
+    s2acc = vec_add(s2acc, s2acc_0);
+    s3acc = vec_sl(s3acc, vec_splat_u32(4));
+    s2acc = vec_add(s2acc, s3acc);
+
+    adacc = vec_add(adacc, vec_sld(adacc, adacc, 8));
+    s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 8));
+    adacc = vec_add(adacc, vec_sld(adacc, adacc, 4));
+    s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 4));
+
+    vec_ste(adacc, 0, s);
+    vec_ste(s2acc, 0, s+1);
 }
 
 uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len) {
     uint32_t sum2;
-    uint32_t pair[2];
+    uint32_t pair[16] ALIGNED_(16);
+    memset(&pair[2], 0, 14);
     int n = NMAX;
     unsigned int done = 0, i;
@@ -86,14 +110,15 @@ uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len) {
             al=len;
         }
         vmx_handle_head_or_tail(pair, buf, al);
-        pair[0] %= BASE;
-        pair[1] %= BASE;
 
         done += al;
+        /* Rather than rebasing, we can reduce the max sums for the
+         * first round only */
+        n -= al;
     }
     for (i = al; i < len; i += n) {
-        if ((i + n) > len)
-            n = (int)(len - i);
+        int remaining = (int)(len-i);
+        n = MIN(remaining, (i == al) ? n : NMAX);
 
         if (n < 16)
             break;
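
The "n -= al" in the last hunk leans on the usual Adler-32 bound: NMAX (5552)
is the largest n for which 255*n*(n+1)/2 + (n+1)*(BASE-1) still fits in 32
bits, so up to NMAX bytes can be summed between modulus reductions without
overflowing. The scalar head already consumed al bytes of that budget, so the
first vectorized round simply runs over NMAX - al bytes instead of paying for
a modulus of its own. A standalone check of the bound (the helper name is
illustrative only, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    static void check_nmax_budget(void) {
        const uint64_t BASE = 65521, NMAX = 5552;
        /* Largest n whose worst case (all 0xff bytes on top of maximal
         * starting sums) still fits in an unsigned 32-bit accumulator. */
        assert(255 * NMAX * (NMAX + 1) / 2 + (NMAX + 1) * (BASE - 1) <= UINT32_MAX);
        assert(255 * (NMAX + 1) * (NMAX + 2) / 2 + (NMAX + 2) * (BASE - 1) > UINT32_MAX);
    }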