From: Adam Stylinski
Date: Sat, 12 Feb 2022 15:26:50 +0000 (-0500)
Subject: Improved adler32 NEON performance by 30-47%
X-Git-Tag: 2.1.0-beta1~363
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=43dbfd6709fb3a8028430ea30f3da88fbeb3ced9;p=thirdparty%2Fzlib-ng.git

Improved adler32 NEON performance by 30-47%

We unlocked some ILP by allowing for independent sums in the loop and
reducing these sums outside of the loop. Additionally, the multiplication
by 32 (now 64) is moved outside of this loop.

Similar to the chromium implementation, this code does straight 8 bit ->
16 bit additions and defers the fused multiply accumulates until after the
loop. However, by unrolling by another factor of 2, the code is measurably
faster. The code does fused multiply accumulates back to as many scratch
registers as we have room for in order to maximize ILP for the 16 integer
FMAs that need to occur. The compiler seems to order them such that the
destination register is the same register as the previous instruction, so
perhaps it's not actually able to overlap them, or maybe the A73's pipeline
is reordering these instructions anyway.

On the Odroid-N2, the Cortex-A73 cores are ~30-44% faster on the adler32
benchmark, and the Cortex-A53 cores are anywhere from 34-47% faster.
---
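
A scalar sketch of the restructuring described above may help when reading
the NEON code; it is illustrative only (the helper name is made up) and is
not part of the patch. Each 64-byte chunk contributes chunk_start_adler * 64
plus a taps-weighted sum of its bytes to sum2, so the per-column byte sums
can be accumulated inside the loop and multiplied by the taps (and by 64)
just once at the end:

    #include <stddef.h>
    #include <stdint.h>

    /* Illustrative scalar equivalent of the deferred-multiply scheme.
     * The caller is assumed to keep the block under NMAX bytes so the
     * unreduced sums cannot overflow, just as NEON_accum32 requires. */
    static void accum64_deferred(uint32_t *s, const unsigned char *buf, size_t chunks) {
        uint32_t adler = s[0], sum2 = s[1];
        uint32_t col[64] = {0};   /* per-column byte sums, weighted after the loop */
        uint32_t s3 = 0;          /* sum of the chunk-start adler values */

        for (size_t i = 0; i < chunks; ++i) {
            s3 += adler;                 /* the *64 is deferred until after the loop */
            for (int j = 0; j < 64; ++j) {
                adler += buf[j];
                col[j] += buf[j];        /* weight (64 - j) applied later */
            }
            buf += 64;
        }

        sum2 += s3 * 64;                 /* the multiplication moved out of the loop */
        for (int j = 0; j < 64; ++j)
            sum2 += col[j] * (uint32_t)(64 - j);

        s[0] = adler;   /* like NEON_accum32, the caller reduces modulo BASE */
        s[1] = sum2;
    }
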
diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c
index 8f493444..dfbd54de 100644
--- a/arch/arm/adler32_neon.c
+++ b/arch/arm/adler32_neon.c
@@ -1,7 +1,8 @@
 /* Copyright (C) 1995-2011, 2016 Mark Adler
  * Copyright (C) 2017 ARM Holdings Inc.
- * Author: Adenilson Cavalcanti
- *
+ * Authors:
+ *   Adenilson Cavalcanti
+ *   Adam Stylinski
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 #ifdef ARM_NEON_ADLER32
@@ -12,50 +13,124 @@
 #endif
 #include "../../zbuild.h"
 #include "../../adler32_p.h"
+#include "../../fallback_builtins.h"
 
 static void NEON_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
-    static const uint8_t taps[32] = {
+    static const uint16_t ALIGNED_(16) taps[64] = {
+        64, 63, 62, 61, 60, 59, 58, 57,
+        56, 55, 54, 53, 52, 51, 50, 49,
+        48, 47, 46, 45, 44, 43, 42, 41,
+        40, 39, 38, 37, 36, 35, 34, 33,
         32, 31, 30, 29, 28, 27, 26, 25,
         24, 23, 22, 21, 20, 19, 18, 17,
-        16, 15, 14, 13, 12, 11, 10, 9,
+        16, 15, 14, 13, 12, 11, 10, 9,
         8, 7, 6, 5, 4, 3, 2, 1 };
 
-    uint32x2_t adacc2, s2acc2, as;
-    uint8x16_t t0 = vld1q_u8(taps), t1 = vld1q_u8(taps + 16);
+    uint32x4_t adacc = vdupq_n_u32(0);
+    uint32x4_t s2acc = vdupq_n_u32(0);
+    uint32x4_t s2acc_0 = vdupq_n_u32(0);
+    uint32x4_t s2acc_1 = vdupq_n_u32(0);
+    uint32x4_t s2acc_2 = vdupq_n_u32(0);
 
-    uint32x4_t adacc = vdupq_n_u32(0), s2acc = vdupq_n_u32(0);
     adacc = vsetq_lane_u32(s[0], adacc, 0);
     s2acc = vsetq_lane_u32(s[1], s2acc, 0);
 
-    while (len >= 2) {
-        uint8x16_t d0 = vld1q_u8(buf), d1 = vld1q_u8(buf + 16);
-        uint16x8_t adler, sum2;
-        s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 5));
-        adler = vpaddlq_u8(d0);
-        adler = vpadalq_u8(adler, d1);
-        sum2 = vmull_u8(vget_low_u8(t0), vget_low_u8(d0));
-        sum2 = vmlal_u8(sum2, vget_high_u8(t0), vget_high_u8(d0));
-        sum2 = vmlal_u8(sum2, vget_low_u8(t1), vget_low_u8(d1));
-        sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d1));
-        adacc = vpadalq_u16(adacc, adler);
-        s2acc = vpadalq_u16(s2acc, sum2);
-        len -= 2;
-        buf += 32;
+    uint32x4_t s3acc = vdupq_n_u32(0);
+    uint32x4_t adacc_prev = adacc;
+
+    uint16x8_t s2_0, s2_1, s2_2, s2_3;
+    s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0);
+
+    uint16x8_t s2_4, s2_5, s2_6, s2_7;
+    s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);
+
+    int num_iter = len >> 2;
+    int rem = len & 3;
+
+    for (int i = 0; i < num_iter; ++i) {
+        uint8x16x4_t d0_d3 = vld1q_u8_x4(buf);
+
+        /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
+         * bit instruction, we'll have to make do with summing to 16 bits first */
+        uint16x8x2_t hsum, hsum_fold;
+        hsum.val[0] = vpaddlq_u8(d0_d3.val[0]);
+        hsum.val[1] = vpaddlq_u8(d0_d3.val[1]);
+
+        hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d0_d3.val[2]);
+        hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d0_d3.val[3]);
+
+        adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
+        s3acc = vaddq_u32(s3acc, adacc_prev);
+        adacc = vpadalq_u16(adacc, hsum_fold.val[1]);
+
+        /* If we do straight widening additions to the 16 bit values, we don't incur
+         * the usual penalties of a pairwise add. We can defer the multiplications
+         * until the very end. These will not overflow because we are incurring at
+         * most 408 loop iterations (NMAX / 64), and a given lane is only going to be
+         * summed into once. This means for the maximum input size, the largest value
+         * we will see is 255 * 102 = 26010, safely under uint16 max */
+        s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0]));
+        s2_1 = vaddw_high_u8(s2_1, d0_d3.val[0]);
+        s2_2 = vaddw_u8(s2_2, vget_low_u8(d0_d3.val[1]));
+        s2_3 = vaddw_high_u8(s2_3, d0_d3.val[1]);
+        s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2]));
+        s2_5 = vaddw_high_u8(s2_5, d0_d3.val[2]);
+        s2_6 = vaddw_u8(s2_6, vget_low_u8(d0_d3.val[3]));
+        s2_7 = vaddw_high_u8(s2_7, d0_d3.val[3]);
+
+        adacc_prev = adacc;
+        buf += 64;
     }
 
-    while (len > 0) {
-        uint8x16_t d0 = vld1q_u8(buf);
-        uint16x8_t adler, sum2;
-        s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 4));
-        adler = vpaddlq_u8(d0);
-        sum2 = vmull_u8(vget_low_u8(t1), vget_low_u8(d0));
-        sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d0));
-        adacc = vpadalq_u16(adacc, adler);
-        s2acc = vpadalq_u16(s2acc, sum2);
-        buf += 16;
-        len--;
+    s3acc = vshlq_n_u32(s3acc, 6);
+
+    if (rem) {
+        uint32x4_t s3acc_0 = vdupq_n_u32(0);
+        while (rem--) {
+            uint8x16_t d0 = vld1q_u8(buf);
+            uint16x8_t adler;
+            adler = vpaddlq_u8(d0);
+            s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
+            s2_7 = vaddw_high_u8(s2_7, d0);
+            adacc = vpadalq_u16(adacc, adler);
+            s3acc_0 = vaddq_u32(s3acc_0, adacc_prev);
+            adacc_prev = adacc;
+            buf += 16;
+        }
+
+        s3acc_0 = vshlq_n_u32(s3acc_0, 4);
+        s3acc = vaddq_u32(s3acc_0, s3acc);
     }
 
+    uint16x8x4_t t0_t3 = vld1q_u16_x4(taps);
+    uint16x8x4_t t4_t7 = vld1q_u16_x4(taps + 32);
+
+    s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1));
+
+    s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3));
+
+    s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5));
+
+    s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7));
+
+    s2acc = vaddq_u32(s2acc_0, s2acc);
+    s2acc_2 = vaddq_u32(s2acc_1, s2acc_2);
+    s2acc = vaddq_u32(s2acc, s2acc_2);
+
+    uint32x2_t adacc2, s2acc2, as;
+    s2acc = vaddq_u32(s2acc, s3acc);
     adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
     s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
     as = vpadd_u32(adacc2, s2acc2);
@@ -91,7 +166,6 @@ uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {
     uint32_t pair[2];
     int n = NMAX;
     unsigned int done = 0;
-    unsigned int i;
 
     /* Split Adler-32 into component sums, it can be supplied by
      * the caller sites (e.g. in a PNG file).
@@ -99,18 +173,37 @@ uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {
     pair[0] = adler;
     pair[1] = sum2;
 
-    for (i = 0; i < len; i += n) {
-        if ((i + n) > len)
-            n = (int)(len - i);
+    /* If memory is not SIMD aligned, do scalar sums to an aligned
+     * offset, provided that doing so doesn't completely eliminate
+     * SIMD operation. Aligned loads are still faster on ARM, even
+     * though there's no explicit aligned load instruction */
+    unsigned int align_offset = ((uintptr_t)buf & 15);
+    unsigned int align_adj = (align_offset) ? 16 - align_offset : 0;
+
+    if (align_offset && len >= (16 + align_adj)) {
+        NEON_handle_tail(pair, buf, align_adj);
+        n -= align_adj;
+        done += align_adj;
+
+    } else {
+        /* If here, we failed the len criteria test, it wouldn't be
+         * worthwhile to do scalar aligning sums */
+        align_adj = 0;
+    }
+
+    while (done < len) {
+        int remaining = (int)(len - done);
+        n = MIN(remaining, (done == align_adj) ? n : NMAX);
 
         if (n < 16)
             break;
 
-        NEON_accum32(pair, buf + i, n / 16);
+        NEON_accum32(pair, buf + done, n >> 4);
         pair[0] %= BASE;
         pair[1] %= BASE;
 
-        done += (n / 16) * 16;
+        int actual_nsums = (n >> 4) << 4;
+        done += actual_nsums;
     }
 
     /* Handle the tail elements. */
@@ -123,4 +216,5 @@ uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {
     /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
     return (pair[1] << 16) | pair[0];
 }
+
 #endif
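
A note on the block accounting above (illustrative only; the snippet below is
not part of the patch): the driver now passes n >> 4, so NEON_accum32's len
argument counts 16-byte blocks, the main loop consumes them four at a time
(64 bytes), and rem covers the leftover 16-byte blocks. For the largest chunk
the driver hands over, n = NMAX = 5552 bytes:

    #include <stdio.h>

    int main(void) {
        size_t n = 5552;             /* NMAX, the largest chunk passed between reductions */
        size_t len = n >> 4;         /* 347: 16-byte blocks given to NEON_accum32 */
        size_t num_iter = len >> 2;  /* 86: 64-byte main-loop iterations */
        size_t rem = len & 3;        /* 3: leftover 16-byte blocks */
        printf("%zu blocks -> %zu x 64B + %zu x 16B = %zu bytes\n",
               len, num_iter, rem, num_iter * 64 + rem * 16);   /* 5552 bytes consumed */
        return 0;
    }
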
diff --git a/fallback_builtins.h b/fallback_builtins.h
index 3554b6cc..8abec2fa 100644
--- a/fallback_builtins.h
+++ b/fallback_builtins.h
@@ -64,6 +64,14 @@ static inline __m512i _mm512_zextsi128_si512(__m128i a) {
 
 #endif // __AVX2__
 
+#if defined(ARM_NEON_ADLER32) && !defined(__aarch64__)
+/* Compatibility shim for the _high family of functions */
+#define vmull_high_u8(a, b) vmull_u8(vget_high_u8(a), vget_high_u8(b))
+#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c))
+#define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c))
+#define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b))
+#endif
+
 #ifdef ARM_NEON_SLIDEHASH
 
 #define vqsubq_u16_x4_x1(out, a, b) do { \
@@ -93,6 +101,15 @@ static inline uint16x8x4_t vld1q_u16_x4(uint16_t *a) {
     return ret;
 }
 
+static inline uint8x16x4_t vld1q_u8_x4(uint8_t *a) {
+    uint8x16x4_t ret = (uint8x16x4_t) {{
+        vld1q_u8(a),
+        vld1q_u8(a+16),
+        vld1q_u8(a+32),
+        vld1q_u8(a+48)}};
+    return ret;
+}
+
 static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) {
     vst1q_u16(p, a.val[0]);
     vst1q_u16(p + 8, a.val[1]);
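
For reference, the compatibility shims above only matter on AArch32, where
the _high forms of the widening intrinsics are not available as single
instructions. A scalar model of what vmlal_high_u16(a, b, c) computes, and
what the macro reproduces via vmlal_u16 and vget_high_u16, is sketched
below; the helper name is made up and the snippet is not part of the patch:

    #include <stdint.h>

    /* Accumulate the widened products of the upper four u16 lanes of b and c
     * into the four u32 lanes of a, mirroring vmlal_high_u16 (UMLAL2). */
    static void vmlal_high_u16_ref(uint32_t a[4], const uint16_t b[8], const uint16_t c[8]) {
        for (int i = 0; i < 4; ++i)
            a[i] += (uint32_t)b[4 + i] * (uint32_t)c[4 + i];
    }
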