From e45af510bc816e860c8e2e1d4a652b4fe15c4b34 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 6 Nov 2025 18:29:33 +0000 Subject: [PATCH] AArch64: Fix instability in AdvSIMD sinh Previously presence of special-cases in one lane could affect the results in other lanes due to unconditional scalar fallback. The old WANT_SIMD_EXCEPT option (which has never been enabled in libmvec) has been removed from AOR, making it easier to spot and fix this. No measured change in performance. This patch applies cleanly as far back as 2.41, however there are conflicts with 2.40 where sinh was first introduced. Reviewed-by: Wilco Dijkstra --- sysdeps/aarch64/fpu/sinh_advsimd.c | 33 ++++++++---------------------- 1 file changed, 9 insertions(+), 24 deletions(-) diff --git a/sysdeps/aarch64/fpu/sinh_advsimd.c b/sysdeps/aarch64/fpu/sinh_advsimd.c index 0d6a4856f8..b6b60262c6 100644 --- a/sysdeps/aarch64/fpu/sinh_advsimd.c +++ b/sysdeps/aarch64/fpu/sinh_advsimd.c @@ -24,36 +24,26 @@ static const struct data { struct v_expm1_data d; uint64x2_t halff; -#if WANT_SIMD_EXCEPT - uint64x2_t tiny_bound, thresh; -#else float64x2_t large_bound; -#endif } data = { .d = V_EXPM1_DATA, .halff = V2 (0x3fe0000000000000), -#if WANT_SIMD_EXCEPT - /* 2^-26, below which sinh(x) rounds to x. */ - .tiny_bound = V2 (0x3e50000000000000), - /* asuint(large_bound) - asuint(tiny_bound). */ - .thresh = V2 (0x0230000000000000), -#else /* 2^9. expm1 helper overflows for large input. */ .large_bound = V2 (0x1p+9), -#endif }; static float64x2_t NOINLINE VPCS_ATTR -special_case (float64x2_t x) +special_case (float64x2_t x, float64x2_t t, float64x2_t halfsign, + uint64x2_t special) { - return v_call_f64 (sinh, x, x, v_u64 (-1)); + return v_call_f64 (sinh, x, vmulq_f64 (t, halfsign), special); } /* Approximation for vector double-precision sinh(x) using expm1. sinh(x) = (exp(x) - exp(-x)) / 2. The greatest observed error is 2.52 ULP: - _ZGVnN2v_sinh(-0x1.a098a2177a2b9p-2) got -0x1.ac2f05bb66fccp-2 - want -0x1.ac2f05bb66fc9p-2. */ + _ZGVnN2v_sinh(0x1.9f6ff2ab6fb19p-2) got 0x1.aaed83a3153ccp-2 + want 0x1.aaed83a3153c9p-2. */ float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x) { const struct data *d = ptr_barrier (&data); @@ -63,21 +53,16 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x) float64x2_t halfsign = vreinterpretq_f64_u64 ( vbslq_u64 (v_u64 (0x8000000000000000), ix, d->halff)); -#if WANT_SIMD_EXCEPT - uint64x2_t special = vcgeq_u64 ( - vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh); -#else uint64x2_t special = vcageq_f64 (x, d->large_bound); -#endif - - /* Fall back to scalar variant for all lanes if any of them are special. */ - if (__glibc_unlikely (v_any_u64 (special))) - return special_case (x); /* Up to the point that expm1 overflows, we can use it to calculate sinh using a slight rearrangement of the definition of sinh. This allows us to retain acceptable accuracy for very small inputs. */ float64x2_t t = expm1_inline (ax, &d->d); t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0)))); + + if (__glibc_unlikely (v_any_u64 (special))) + return special_case (x, t, halfsign, special); + return vmulq_f64 (t, halfsign); } -- 2.47.3