From 6c22823da57aa5218f717f569c04c9573c0448c5 Mon Sep 17 00:00:00 2001
From: Joe Ramsay
Date: Thu, 6 Nov 2025 18:26:54 +0000
Subject: [PATCH] AArch64: Fix instability in AdvSIMD tan

Previously, the presence of a special case in one lane could affect the
results in other lanes, because the routine unconditionally fell back to
the scalar implementation for the whole vector.  The old
WANT_SIMD_EXCEPT option (which has never been enabled in libmvec) has
been removed from AOR, making this easier to spot and fix.  Throughput
improves by 4% with GCC 14 on Neoverse V1.

The bug has been present as far back as glibc 2.39, where vector tan was
first introduced.

Reviewed-by: Wilco Dijkstra
---
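Note (illustration, not part of the commit): the fix threads the
per-lane `special' mask into the scalar helper instead of forcing the
whole vector down the scalar path with v_u64 (-1).  As a minimal sketch
of the mechanism, assuming v_call_f64 (declared in
sysdeps/aarch64/fpu/v_math.h) has roughly this shape:

    /* Recompute lanes whose mask bit is set with the scalar routine;
       all other lanes keep the vector result y.  */
    static inline float64x2_t
    v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y,
                uint64x2_t cmp)
    {
      return (float64x2_t){ cmp[0] ? f (x[0]) : y[0],
                            cmp[1] ? f (x[1]) : y[1] };
    }

With the old all-ones mask, a special case in one lane sent both lanes
through scalar tan, so the neighbouring lane's result could differ from
what the vector path produces; passing the real mask keeps the vector
result in non-special lanes.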
 sysdeps/aarch64/fpu/tan_advsimd.c | 31 +++++++++----------------------
 1 file changed, 9 insertions(+), 22 deletions(-)

diff --git a/sysdeps/aarch64/fpu/tan_advsimd.c b/sysdeps/aarch64/fpu/tan_advsimd.c
index 825c9754b3..d391a003d8 100644
--- a/sysdeps/aarch64/fpu/tan_advsimd.c
+++ b/sysdeps/aarch64/fpu/tan_advsimd.c
@@ -25,9 +25,7 @@ static const struct data
   float64x2_t poly[9];
   double half_pi[2];
   float64x2_t two_over_pi, shift;
-#if !WANT_SIMD_EXCEPT
   float64x2_t range_val;
-#endif
 } data = {
   /* Coefficients generated using FPMinimax.  */
   .poly = { V2 (0x1.5555555555556p-2), V2 (0x1.1111111110a63p-3),
@@ -38,20 +36,17 @@ static const struct data
   .half_pi = { 0x1.921fb54442d18p0, 0x1.1a62633145c07p-54 },
   .two_over_pi = V2 (0x1.45f306dc9c883p-1),
   .shift = V2 (0x1.8p52),
-#if !WANT_SIMD_EXCEPT
   .range_val = V2 (0x1p23),
-#endif
 };
 
 #define RangeVal 0x4160000000000000  /* asuint64(0x1p23).  */
 #define TinyBound 0x3e50000000000000 /* asuint64(2^-26).  */
-#define Thresh 0x310000000000000     /* RangeVal - TinyBound.  */
 
 /* Special cases (fall back to scalar calls).  */
 static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x)
+special_case (float64x2_t x, float64x2_t n, float64x2_t d, uint64x2_t special)
 {
-  return v_call_f64 (tan, x, x, v_u64 (-1));
+  return v_call_f64 (tan, x, vdivq_f64 (n, d), special);
 }
 
 /* Vector approximation for double-precision tan.
@@ -65,14 +60,6 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
      very large inputs.  Fall back to scalar routine for all lanes if any are
      too large, or Inf/NaN.  If fenv exceptions are expected, also fall back for
      tiny input to avoid underflow.  */
-#if WANT_SIMD_EXCEPT
-  uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
-  /* iax - tiny_bound > range_val - tiny_bound.  */
-  uint64x2_t special
-      = vcgtq_u64 (vsubq_u64 (iax, v_u64 (TinyBound)), v_u64 (Thresh));
-  if (__glibc_unlikely (v_any_u64 (special)))
-    return special_case (x);
-#endif
   /* q = nearest integer to 2 * x / pi.  */
   float64x2_t q
       = vsubq_f64 (vfmaq_f64 (dat->shift, x, dat->two_over_pi), dat->shift);
@@ -81,9 +68,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
 
   /* Use q to reduce x to r in [-pi/4, pi/4], by:
      r = x - q * pi/2, in extended precision.  */
-  float64x2_t r = x;
   float64x2_t half_pi = vld1q_f64 (dat->half_pi);
-  r = vfmsq_laneq_f64 (r, q, half_pi, 0);
+  float64x2_t r = vfmsq_laneq_f64 (x, q, half_pi, 0);
   r = vfmsq_laneq_f64 (r, q, half_pi, 1);
   /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
      formula.  */
@@ -114,12 +100,13 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
 
   uint64x2_t no_recip = vtstq_u64 (vreinterpretq_u64_s64 (qi), v_u64 (1));
 
-#if !WANT_SIMD_EXCEPT
   uint64x2_t special = vcageq_f64 (x, dat->range_val);
+  float64x2_t swap = vbslq_f64 (no_recip, n, vnegq_f64 (d));
+  d = vbslq_f64 (no_recip, d, n);
+  n = swap;
+
   if (__glibc_unlikely (v_any_u64 (special)))
-    return special_case (x);
-#endif
+    return special_case (x, n, d, special);
 
-  return vdivq_f64 (vbslq_f64 (no_recip, n, vnegq_f64 (d)),
-                    vbslq_f64 (no_recip, d, n));
+  return vdivq_f64 (n, d);
 }
-- 
2.47.3
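
Post-patch note (illustrative sketch, not from the commit): the
instability is observable from userspace.  Assuming an AArch64 glibc
with libmvec, which exports the AdvSIMD variant of tan under the vector
ABI name _ZGVnN2v_tan, a hypothetical test like the following may
return nonzero on affected versions; with this fix applied it must
always return zero:

    /* tan_lanes.c: build with `gcc tan_lanes.c -lmvec -lm' on AArch64.
       Lane 1 holds 0.5 in both inputs; lane 0 is special (>= 0x1p23)
       only in the first.  Lane 1's result must not depend on lane 0.  */
    #include <arm_neon.h>
    #include <stdio.h>

    float64x2_t _ZGVnN2v_tan (float64x2_t);

    int
    main (void)
    {
      float64x2_t with_special = { 0x1p40, 0.5 };
      float64x2_t plain = { 0.5, 0.5 };
      double a = vgetq_lane_f64 (_ZGVnN2v_tan (with_special), 1);
      double b = vgetq_lane_f64 (_ZGVnN2v_tan (plain), 1);
      printf ("lane 1: %a vs %a (%s)\n", a, b,
              a == b ? "stable" : "unstable");
      return a != b;
    }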