From: Nathan Moin Vaziri <nathan@nathanm.com>
Date: Tue, 31 Mar 2026 20:12:33 +0000 (-0700)
Subject: Use vaddvq_u32 for adler32 NEON horizontal reduction
X-Git-Url: http://git.ipfire.org/index.cgi?a=commitdiff_plain;p=thirdparty%2Fzlib-ng.git

Use vaddvq_u32 for adler32 NEON horizontal reduction

Replace interleaved pairwise reduction with vaddvq_u32 to break the
dependency chain between s1 and s2 modulo computations. The original
code merged both accumulators through a shared addp, serializing the
subsequent umull/lsr/msub chains. Independent reductions allow them
to execute in parallel.

On AArch64 this maps to the ADDV instruction. A compatibility shim
in neon_intrins.h emulates this on 32-bit ARM using vadd and vpadd.
---

diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c
index 95d5b6610..43698735a 100644
--- a/arch/arm/adler32_neon.c
+++ b/arch/arm/adler32_neon.c
@@ -199,13 +199,9 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co
         s2acc_2 = vaddq_u32(s2acc_1, s2acc_2);
         s2acc = vaddq_u32(s2acc, s2acc_2);
 
-        uint32x2_t adacc2, s2acc2, as;
         s2acc = vaddq_u32(s2acc, s3acc);
-        adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
-        s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
-        as = vpadd_u32(adacc2, s2acc2);
-        pair[0] = vget_lane_u32(as, 0);
-        pair[1] = vget_lane_u32(as, 1);
+        pair[0] = vaddvq_u32(adacc);
+        pair[1] = vaddvq_u32(s2acc);
 
         pair[0] %= BASE;
         pair[1] %= BASE;
diff --git a/arch/arm/neon_intrins.h b/arch/arm/neon_intrins.h
index 97072ba55..6138db75f 100644
--- a/arch/arm/neon_intrins.h
+++ b/arch/arm/neon_intrins.h
@@ -14,6 +14,14 @@
 #define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c))
 #define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c))
 #define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b))
+
+/* Shim for AArch64's vaddvq_u32 (ADDV) on 32-bit ARM: returns the sum of all four 32-bit lanes of a, without widening */
+static inline uint32_t vaddvq_u32(uint32x4_t a) {
+    /* Fold the vector in half: add the high and low 64-bit halves lane-wise, giving t = {a0+a2, a1+a3} */
+    uint32x2_t t = vadd_u32(vget_low_u32(a), vget_high_u32(a));
+    /* Pairwise-add t with itself; lane 0 of the result is t0+t1 = a0+a1+a2+a3, returned as the scalar */
+    return vget_lane_u32(vpadd_u32(t, t), 0);
+}
 #endif
 
 #ifdef ARM_NEON