From: Nathan Moin Vaziri Date: Tue, 31 Mar 2026 20:12:33 +0000 (-0700) Subject: Use vaddvq_u32 for adler32 NEON horizontal reduction X-Git-Url: http://git.ipfire.org/index.cgi?a=commitdiff_plain;p=thirdparty%2Fzlib-ng.git Use vaddvq_u32 for adler32 NEON horizontal reduction Replace interleaved pairwise reduction with vaddvq_u32 to break the dependency chain between s1 and s2 modulo computations. The original code merged both accumulators through a shared addp, serializing the subsequent umull/lsr/msub chains. Independent reductions allow them to execute in parallel. On AArch64 this maps to the ADDV instruction. A compatibility shim in neon_intrins.h emulates this on 32-bit ARM using vadd and vpadd. --- diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c index 95d5b6610..43698735a 100644 --- a/arch/arm/adler32_neon.c +++ b/arch/arm/adler32_neon.c @@ -199,13 +199,9 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co s2acc_2 = vaddq_u32(s2acc_1, s2acc_2); s2acc = vaddq_u32(s2acc, s2acc_2); - uint32x2_t adacc2, s2acc2, as; s2acc = vaddq_u32(s2acc, s3acc); - adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc)); - s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc)); - as = vpadd_u32(adacc2, s2acc2); - pair[0] = vget_lane_u32(as, 0); - pair[1] = vget_lane_u32(as, 1); + pair[0] = vaddvq_u32(adacc); + pair[1] = vaddvq_u32(s2acc); pair[0] %= BASE; pair[1] %= BASE; diff --git a/arch/arm/neon_intrins.h b/arch/arm/neon_intrins.h index 97072ba55..6138db75f 100644 --- a/arch/arm/neon_intrins.h +++ b/arch/arm/neon_intrins.h @@ -14,6 +14,14 @@ #define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c)) #define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c)) #define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b)) + +/* Emulate vaddvq_u32 where the native intrinsic is unavailable (32-bit ARM): returns the sum of all four 32-bit lanes of a as a uint32_t; no widening, so the total stays a 32-bit value */ +static inline uint32_t vaddvq_u32(uint32x4_t a) { + /* Fold the vector in half: add the low pair (a0, a1) to the high pair (a2, a3), giving {t0=a0+a2, t1=a1+a3} */ 
+ uint32x2_t t = vadd_u32(vget_low_u32(a), vget_high_u32(a)); + /* Pairwise-add t with itself so lane 0 holds t0+t1, then extract that lane as the scalar result */ + return vget_lane_u32(vpadd_u32(t, t), 0); +} #endif #ifdef ARM_NEON