Replace interleaved pairwise reduction with vaddvq_u32 to break the
dependency chain between s1 and s2 modulo computations. The original
code merged both accumulators through a shared addp, serializing the
subsequent umull/lsr/msub chains. Independent reductions allow them
to execute in parallel.
On AArch64, vaddvq_u32 maps to the ADDV instruction. A compatibility shim
in neon_intrins.h emulates it on 32-bit ARM using vadd_u32 and vpadd_u32.
s2acc_2 = vaddq_u32(s2acc_1, s2acc_2);
s2acc = vaddq_u32(s2acc, s2acc_2);
s2acc_2 = vaddq_u32(s2acc_1, s2acc_2);
s2acc = vaddq_u32(s2acc, s2acc_2);
- uint32x2_t adacc2, s2acc2, as;
s2acc = vaddq_u32(s2acc, s3acc);
s2acc = vaddq_u32(s2acc, s3acc);
- adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
- s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
- as = vpadd_u32(adacc2, s2acc2);
- pair[0] = vget_lane_u32(as, 0);
- pair[1] = vget_lane_u32(as, 1);
+ pair[0] = vaddvq_u32(adacc);
+ pair[1] = vaddvq_u32(s2acc);
pair[0] %= BASE;
pair[1] %= BASE;
pair[0] %= BASE;
pair[1] %= BASE;
#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c))
#define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c))
#define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b))
#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c))
#define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c))
#define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b))
+
+/* Emulate vaddvq_u32 as a horizontal add without widening.
+ *
+ * Takes a vector of four 32-bit lanes {a0, a1, a2, a3} and returns the sum
+ * a0 + a1 + a2 + a3 as a single uint32_t. Every addition stays 32 bits wide,
+ * so the result wraps modulo 2^32 instead of widening.
+ */
+static inline uint32_t vaddvq_u32(uint32x4_t a) {
+ /* Add high and low halves {t0=a0+a2, t1=a1+a3} */
+ uint32x2_t t = vadd_u32(vget_low_u32(a), vget_high_u32(a));
+ /* Pairwise add to scalar (t0+t1); both result lanes hold the same sum, so lane 0 is returned */
+ return vget_lane_u32(vpadd_u32(t, t), 0);
+}