From 6c22823da57aa5218f717f569c04c9573c0448c5 Mon Sep 17 00:00:00 2001
From: Joe Ramsay
Date: Thu, 6 Nov 2025 18:26:54 +0000
Subject: [PATCH] AArch64: Fix instability in AdvSIMD tan

Previously, the presence of a special case in one lane could affect the
results in other lanes, because the routine unconditionally fell back to
the scalar implementation for the whole vector.  The old
WANT_SIMD_EXCEPT option (which has never been enabled in libmvec) has
been removed from AOR, making this easier to spot and fix.  Throughput
improves by 4% with GCC 14 on Neoverse V1.

The bug has been present as far back as glibc 2.39, where vector tan was
first introduced.

Reviewed-by: Wilco Dijkstra
---
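Note (illustration, not part of the commit): the fix threads the
per-lane `special' mask into the scalar helper instead of forcing the
whole vector down the scalar path with v_u64 (-1).  As a minimal sketch
of the mechanism, assuming v_call_f64 (declared in
sysdeps/aarch64/fpu/v_math.h) has roughly this shape:

    /* Recompute lanes whose mask bit is set with the scalar routine;
       all other lanes keep the vector result y.  */
    static inline float64x2_t
    v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y,
                uint64x2_t cmp)
    {
      return (float64x2_t){ cmp[0] ? f (x[0]) : y[0],
                            cmp[1] ? f (x[1]) : y[1] };
    }

With the old all-ones mask, a special case in one lane sent both lanes
through scalar tan, so the neighbouring lane's result could differ from
what the vector path produces; passing the real mask keeps the vector
result in non-special lanes.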
 sysdeps/aarch64/fpu/tan_advsimd.c | 31 +++++++++----------------------
 1 file changed, 9 insertions(+), 22 deletions(-)

diff --git a/sysdeps/aarch64/fpu/tan_advsimd.c b/sysdeps/aarch64/fpu/tan_advsimd.c
index 825c9754b3..d391a003d8 100644
--- a/sysdeps/aarch64/fpu/tan_advsimd.c
+++ b/sysdeps/aarch64/fpu/tan_advsimd.c
@@ -25,9 +25,7 @@ static const struct data
   float64x2_t poly[9];
   double half_pi[2];
   float64x2_t two_over_pi, shift;
-#if !WANT_SIMD_EXCEPT
   float64x2_t range_val;
-#endif
 } data = {
   /* Coefficients generated using FPMinimax.  */
   .poly = { V2 (0x1.5555555555556p-2), V2 (0x1.1111111110a63p-3),
@@ -38,20 +36,17 @@ static const struct data
   .half_pi = { 0x1.921fb54442d18p0, 0x1.1a62633145c07p-54 },
   .two_over_pi = V2 (0x1.45f306dc9c883p-1),
   .shift = V2 (0x1.8p52),
-#if !WANT_SIMD_EXCEPT
   .range_val = V2 (0x1p23),
-#endif
 };
 
 #define RangeVal 0x4160000000000000  /* asuint64(0x1p23).  */
 #define TinyBound 0x3e50000000000000 /* asuint64(2^-26).  */
-#define Thresh 0x310000000000000     /* RangeVal - TinyBound.  */
 
 /* Special cases (fall back to scalar calls).  */
 static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x)
+special_case (float64x2_t x, float64x2_t n, float64x2_t d, uint64x2_t special)
 {
-  return v_call_f64 (tan, x, x, v_u64 (-1));
+  return v_call_f64 (tan, x, vdivq_f64 (n, d), special);
 }
 
 /* Vector approximation for double-precision tan.
@@ -65,14 +60,6 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
      very large inputs.  Fall back to scalar routine for all lanes if any are
      too large, or Inf/NaN.  If fenv exceptions are expected, also fall back for
      tiny input to avoid underflow.  */
-#if WANT_SIMD_EXCEPT
-  uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
-  /* iax - tiny_bound > range_val - tiny_bound.  */
-  uint64x2_t special
-      = vcgtq_u64 (vsubq_u64 (iax, v_u64 (TinyBound)), v_u64 (Thresh));
-  if (__glibc_unlikely (v_any_u64 (special)))
-    return special_case (x);
-#endif
   /* q = nearest integer to 2 * x / pi.  */
   float64x2_t q
       = vsubq_f64 (vfmaq_f64 (dat->shift, x, dat->two_over_pi), dat->shift);
@@ -81,9 +68,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
 
   /* Use q to reduce x to r in [-pi/4, pi/4], by:
      r = x - q * pi/2, in extended precision.  */
-  float64x2_t r = x;
   float64x2_t half_pi = vld1q_f64 (dat->half_pi);
-  r = vfmsq_laneq_f64 (r, q, half_pi, 0);
+  float64x2_t r = vfmsq_laneq_f64 (x, q, half_pi, 0);
   r = vfmsq_laneq_f64 (r, q, half_pi, 1);
   /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
      formula.  */
@@ -114,12 +100,13 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
 
   uint64x2_t no_recip = vtstq_u64 (vreinterpretq_u64_s64 (qi), v_u64 (1));
 
-#if !WANT_SIMD_EXCEPT
   uint64x2_t special = vcageq_f64 (x, dat->range_val);
+  float64x2_t swap = vbslq_f64 (no_recip, n, vnegq_f64 (d));
+  d = vbslq_f64 (no_recip, d, n);
+  n = swap;
+
   if (__glibc_unlikely (v_any_u64 (special)))
-    return special_case (x);
-#endif
+    return special_case (x, n, d, special);
 
-  return vdivq_f64 (vbslq_f64 (no_recip, n, vnegq_f64 (d)),
-                    vbslq_f64 (no_recip, d, n));
+  return vdivq_f64 (n, d);
 }
-- 
2.47.3
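
Post-patch note (illustrative sketch, not from the commit): the
instability is observable from userspace.  Assuming an AArch64 glibc
with libmvec, which exports the AdvSIMD variant of tan under the vector
ABI name _ZGVnN2v_tan, a hypothetical test like the following may
return nonzero on affected versions; with this fix applied it must
always return zero:

    /* tan_lanes.c: build with `gcc tan_lanes.c -lmvec -lm' on AArch64.
       Lane 1 holds 0.5 in both inputs; lane 0 is special (>= 0x1p23)
       only in the first.  Lane 1's result must not depend on lane 0.  */
    #include <arm_neon.h>
    #include <stdio.h>

    float64x2_t _ZGVnN2v_tan (float64x2_t);

    int
    main (void)
    {
      float64x2_t with_special = { 0x1p40, 0.5 };
      float64x2_t plain = { 0.5, 0.5 };
      double a = vgetq_lane_f64 (_ZGVnN2v_tan (with_special), 1);
      double b = vgetq_lane_f64 (_ZGVnN2v_tan (plain), 1);
      printf ("lane 1: %a vs %a (%s)\n", a, b,
              a == b ? "stable" : "unstable");
      return a != b;
    }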