Improved adler32 NEON performance by 30-47%

author Adam Stylinski <kungfujesus06@gmail.com>

Sat, 12 Feb 2022 15:26:50 +0000 (10:26 -0500)

committer Hans Kristian Rosbach <hk-github@circlestorm.org>

Thu, 24 Feb 2022 15:00:51 +0000 (16:00 +0100)
author Adam Stylinski <kungfujesus06@gmail.com>
Sat, 12 Feb 2022 15:26:50 +0000 (10:26 -0500)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Thu, 24 Feb 2022 15:00:51 +0000 (16:00 +0100)
diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c

index 8f4934440439f69f76ad40f05187085d7df71524..dfbd54def9375db39c5bfe5c5905be81ae13e50a 100644 (file)
--- a/arch/arm/adler32_neon.c
+++ b/arch/arm/adler32_neon.c
@@ -1,7 +1,8 @@
  /* Copyright (C) 1995-2011, 2016 Mark Adler
   * Copyright (C) 2017 ARM Holdings Inc.
- * Author: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
- *
+ * Authors: 
+ *   Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
+ *   Adam Stylinski <kungfujesus06@gmail.com>
   * For conditions of distribution and use, see copyright notice in zlib.h
   */
  #ifdef ARM_NEON_ADLER32
@@ -12,50 +13,124 @@
  #endif
  #include "../../zbuild.h"
  #include "../../adler32_p.h"
+#include "../../fallback_builtins.h"
  
  static void NEON_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
-    static const uint8_t taps[32] = {
+    static const uint16_t ALIGNED_(16) taps[64] = {
+        64, 63, 62, 61, 60, 59, 58, 57,
+        56, 55, 54, 53, 52, 51, 50, 49,
+        48, 47, 46, 45, 44, 43, 42, 41,
+        40, 39, 38, 37, 36, 35, 34, 33,
          32, 31, 30, 29, 28, 27, 26, 25,
          24, 23, 22, 21, 20, 19, 18, 17,
-        16, 15, 14, 13, 12, 11, 10, 9,
+        16, 15, 14, 13, 12, 11, 10, 9, 
          8, 7, 6, 5, 4, 3, 2, 1 };
  
-    uint32x2_t adacc2, s2acc2, as;
-    uint8x16_t t0 = vld1q_u8(taps), t1 = vld1q_u8(taps + 16);
+    uint32x4_t adacc = vdupq_n_u32(0);
+    uint32x4_t s2acc = vdupq_n_u32(0);
+    uint32x4_t s2acc_0 = vdupq_n_u32(0);
+    uint32x4_t s2acc_1 = vdupq_n_u32(0);
+    uint32x4_t s2acc_2 = vdupq_n_u32(0);
  
-    uint32x4_t adacc = vdupq_n_u32(0), s2acc = vdupq_n_u32(0);
      adacc = vsetq_lane_u32(s[0], adacc, 0);
      s2acc = vsetq_lane_u32(s[1], s2acc, 0);
  
-    while (len >= 2) {
-        uint8x16_t d0 = vld1q_u8(buf), d1 = vld1q_u8(buf + 16);
-        uint16x8_t adler, sum2;
-        s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 5));
-        adler = vpaddlq_u8(       d0);
-        adler = vpadalq_u8(adler, d1);
-        sum2 = vmull_u8(      vget_low_u8(t0), vget_low_u8(d0));
-        sum2 = vmlal_u8(sum2, vget_high_u8(t0), vget_high_u8(d0));
-        sum2 = vmlal_u8(sum2, vget_low_u8(t1), vget_low_u8(d1));
-        sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d1));
-        adacc = vpadalq_u16(adacc, adler);
-        s2acc = vpadalq_u16(s2acc, sum2);
-        len -= 2;
-        buf += 32;
+    uint32x4_t s3acc = vdupq_n_u32(0);
+    uint32x4_t adacc_prev = adacc;
+
+    uint16x8_t s2_0, s2_1, s2_2, s2_3;
+    s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0);
+
+    uint16x8_t s2_4, s2_5, s2_6, s2_7;
+    s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);
+
+    int num_iter = len >> 2;
+    int rem = len & 3;
+
+    for (int i = 0; i < num_iter; ++i) {
+        uint8x16x4_t d0_d3 = vld1q_u8_x4(buf);
+
+        /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
+         * bit instruction, we'll have to make due summing to 16 bits first */
+        uint16x8x2_t hsum, hsum_fold;
+        hsum.val[0] = vpaddlq_u8(d0_d3.val[0]);
+        hsum.val[1] = vpaddlq_u8(d0_d3.val[1]);
+
+        hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d0_d3.val[2]);
+        hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d0_d3.val[3]);
+
+        adacc = vpadalq_u16(adacc, hsum_fold.val[0]); 
+        s3acc = vaddq_u32(s3acc, adacc_prev);
+        adacc = vpadalq_u16(adacc, hsum_fold.val[1]); 
+        
+        /* If we do straight widening additions to the 16 bit values, we don't incur
+         * the usual penalties of a pairwise add. We can defer the multiplications
+         * until the very end. These will not overflow because we are incurring at
+         * most 408 loop iterations (NMAX / 64), and a given lane is only going to be
+         * summed into once. This means for the maximum input size, the largest value
+         * we will see is 255 * 102 = 26010, safely under uint16 max */
+        s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0])); 
+        s2_1 = vaddw_high_u8(s2_1, d0_d3.val[0]);
+        s2_2 = vaddw_u8(s2_2, vget_low_u8(d0_d3.val[1]));
+        s2_3 = vaddw_high_u8(s2_3, d0_d3.val[1]);
+        s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2])); 
+        s2_5 = vaddw_high_u8(s2_5, d0_d3.val[2]);
+        s2_6 = vaddw_u8(s2_6, vget_low_u8(d0_d3.val[3]));
+        s2_7 = vaddw_high_u8(s2_7, d0_d3.val[3]);
+
+        adacc_prev = adacc;
+        buf += 64;
      }
  
-    while (len > 0) {
-        uint8x16_t d0 = vld1q_u8(buf);
-        uint16x8_t adler, sum2;
-        s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 4));
-        adler = vpaddlq_u8(d0);
-        sum2 = vmull_u8(      vget_low_u8(t1), vget_low_u8(d0));
-        sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d0));
-        adacc = vpadalq_u16(adacc, adler);
-        s2acc = vpadalq_u16(s2acc, sum2);
-        buf += 16;
-        len--;
+    s3acc = vshlq_n_u32(s3acc, 6);
+
+    if (rem) {
+        uint32x4_t s3acc_0 = vdupq_n_u32(0);
+        while (rem--) {
+            uint8x16_t d0 = vld1q_u8(buf);
+            uint16x8_t adler;
+            adler = vpaddlq_u8(d0);
+            s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
+            s2_7 = vaddw_high_u8(s2_7, d0);
+            adacc = vpadalq_u16(adacc, adler);
+            s3acc_0 = vaddq_u32(s3acc_0, adacc_prev);
+            adacc_prev = adacc;
+            buf += 16;
+        }
+
+        s3acc_0 = vshlq_n_u32(s3acc_0, 4);
+        s3acc = vaddq_u32(s3acc_0, s3acc);
      }
  
+    uint16x8x4_t t0_t3 = vld1q_u16_x4(taps);
+    uint16x8x4_t t4_t7 = vld1q_u16_x4(taps + 32);
+
+    s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1));
+
+    s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3));
+
+    s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5));
+
+    s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7));
+
+    s2acc = vaddq_u32(s2acc_0, s2acc);
+    s2acc_2 = vaddq_u32(s2acc_1, s2acc_2);
+    s2acc = vaddq_u32(s2acc, s2acc_2);
+
+    uint32x2_t adacc2, s2acc2, as;
+    s2acc = vaddq_u32(s2acc, s3acc);
      adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
      s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
      as = vpadd_u32(adacc2, s2acc2);
@@ -91,7 +166,6 @@ uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {
      uint32_t pair[2];
      int n = NMAX;
      unsigned int done = 0;
-    unsigned int i;
  
      /* Split Adler-32 into component sums, it can be supplied by
       * the caller sites (e.g. in a PNG file).
@@ -99,18 +173,37 @@ uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {
      pair[0] = adler;
      pair[1] = sum2;
  
-    for (i = 0; i < len; i += n) {
-        if ((i + n) > len)
-            n = (int)(len - i);
+    /* If memory is not SIMD aligned, do scalar sums to an aligned
+     * offset, provided that doing so doesn't completely eliminate
+     * SIMD operation. Aligned loads are still faster on ARM, even
+     * though there's no explicit aligned load instruction */
+    unsigned int align_offset = ((uintptr_t)buf & 15);
+    unsigned int align_adj = (align_offset) ? 16 - align_offset : 0;
+
+    if (align_offset && len >= (16 + align_adj)) {
+        NEON_handle_tail(pair, buf, align_adj);
+        n -= align_adj;
+        done += align_adj;
+
+    } else {
+        /* If here, we failed the len criteria test, it wouldn't be
+         * worthwhile to do scalar aligning sums */
+        align_adj = 0;
+    }
+
+    while (done < len) {
+        int remaining = (int)(len - done);
+        n = MIN(remaining, (done == align_adj) ? n : NMAX);
  
          if (n < 16)
              break;
  
-        NEON_accum32(pair, buf + i, n / 16);
+        NEON_accum32(pair, buf + done, n >> 4);
          pair[0] %= BASE;
          pair[1] %= BASE;
  
-        done += (n / 16) * 16;
+        int actual_nsums = (n >> 4) << 4;
+        done += actual_nsums;
      }
  
      /* Handle the tail elements. */
@@ -123,4 +216,5 @@ uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {
      /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
      return (pair[1] << 16) | pair[0];
  }
+
  #endif
diff --git a/fallback_builtins.h b/fallback_builtins.h

index 3554b6cc3d70284551096cbea3325ab8b2a09c27..8abec2fad72cd74d6e34c9f25cb936ade02064ae 100644 (file)
--- a/fallback_builtins.h
+++ b/fallback_builtins.h
@@ -64,6 +64,14 @@ static inline __m512i _mm512_zextsi128_si512(__m128i a) {
  
  #endif // __AVX2__
  
+#if defined(ARM_NEON_ADLER32) && !defined(__aarch64__)
+/* Compatibility shim for the _high family of functions */
+#define vmull_high_u8(a, b) vmull_u8(vget_high_u8(a), vget_high_u8(b))
+#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c))
+#define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c))
+#define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b))
+#endif
+
  #ifdef ARM_NEON_SLIDEHASH
  
  #define vqsubq_u16_x4_x1(out, a, b) do { \
@@ -93,6 +101,15 @@ static inline uint16x8x4_t vld1q_u16_x4(uint16_t *a) {
      return ret;
  }
  
+static inline uint8x16x4_t vld1q_u8_x4(uint8_t *a) {
+    uint8x16x4_t ret = (uint8x16x4_t) {{
+                          vld1q_u8(a),
+                          vld1q_u8(a+16),
+                          vld1q_u8(a+32),
+                          vld1q_u8(a+48)}};
+    return ret;
+}
+
  static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) {
      vst1q_u16(p, a.val[0]);
      vst1q_u16(p + 8, a.val[1]);
author	Adam Stylinski <kungfujesus06@gmail.com>
	Sat, 12 Feb 2022 15:26:50 +0000 (10:26 -0500)
committer	Hans Kristian Rosbach <hk-github@circlestorm.org>
	Thu, 24 Feb 2022 15:00:51 +0000 (16:00 +0100)
arch/arm/adler32_neon.c		patch \| blob \| blame \| history
fallback_builtins.h		patch \| blob \| blame \| history