From e45af510bc816e860c8e2e1d4a652b4fe15c4b34 Mon Sep 17 00:00:00 2001
From: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Thu, 6 Nov 2025 18:29:33 +0000
Subject: [PATCH] AArch64: Fix instability in AdvSIMD sinh

Previously presence of special-cases in one lane could affect the
results in other lanes due to unconditional scalar fallback. The old
WANT_SIMD_EXCEPT option (which has never been enabled in libmvec) has
been removed from AOR, making it easier to spot and fix
this. No measured change in performance. This patch applies cleanly as
far back as 2.41, however there are conflicts with 2.40 where sinh was
first introduced.

Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
---
 sysdeps/aarch64/fpu/sinh_advsimd.c | 33 ++++++++----------------------
 1 file changed, 9 insertions(+), 24 deletions(-)

diff --git a/sysdeps/aarch64/fpu/sinh_advsimd.c b/sysdeps/aarch64/fpu/sinh_advsimd.c
index 0d6a4856f8..b6b60262c6 100644
--- a/sysdeps/aarch64/fpu/sinh_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinh_advsimd.c
@@ -24,36 +24,26 @@ static const struct data
 {
   struct v_expm1_data d;
   uint64x2_t halff;
-#if WANT_SIMD_EXCEPT
-  uint64x2_t tiny_bound, thresh;
-#else
   float64x2_t large_bound;
-#endif
 } data = {
   .d = V_EXPM1_DATA,
   .halff = V2 (0x3fe0000000000000),
-#if WANT_SIMD_EXCEPT
-  /* 2^-26, below which sinh(x) rounds to x.  */
-  .tiny_bound = V2 (0x3e50000000000000),
-  /* asuint(large_bound) - asuint(tiny_bound).  */
-  .thresh = V2 (0x0230000000000000),
-#else
   /* 2^9. expm1 helper overflows for large input.  */
   .large_bound = V2 (0x1p+9),
-#endif
 };
 
 static float64x2_t NOINLINE VPCS_ATTR
-special_case (float64x2_t x)
+special_case (float64x2_t x, float64x2_t t, float64x2_t halfsign,
+	      uint64x2_t special)
 {
-  return v_call_f64 (sinh, x, x, v_u64 (-1));
+  return v_call_f64 (sinh, x, vmulq_f64 (t, halfsign), special);
 }
 
 /* Approximation for vector double-precision sinh(x) using expm1.
    sinh(x) = (exp(x) - exp(-x)) / 2.
    The greatest observed error is 2.52 ULP:
-   _ZGVnN2v_sinh(-0x1.a098a2177a2b9p-2) got -0x1.ac2f05bb66fccp-2
-				       want -0x1.ac2f05bb66fc9p-2.  */
+   _ZGVnN2v_sinh(0x1.9f6ff2ab6fb19p-2) got 0x1.aaed83a3153ccp-2
+				      want 0x1.aaed83a3153c9p-2.  */
 float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
 {
   const struct data *d = ptr_barrier (&data);
@@ -63,21 +53,16 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
   float64x2_t halfsign = vreinterpretq_f64_u64 (
       vbslq_u64 (v_u64 (0x8000000000000000), ix, d->halff));
 
-#if WANT_SIMD_EXCEPT
-  uint64x2_t special = vcgeq_u64 (
-      vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh);
-#else
   uint64x2_t special = vcageq_f64 (x, d->large_bound);
-#endif
-
-  /* Fall back to scalar variant for all lanes if any of them are special.  */
-  if (__glibc_unlikely (v_any_u64 (special)))
-    return special_case (x);
 
   /* Up to the point that expm1 overflows, we can use it to calculate sinh
      using a slight rearrangement of the definition of sinh. This allows us to
      retain acceptable accuracy for very small inputs.  */
   float64x2_t t = expm1_inline (ax, &d->d);
   t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
+
+  if (__glibc_unlikely (v_any_u64 (special)))
+    return special_case (x, t, halfsign, special);
+
   return vmulq_f64 (t, halfsign);
 }
-- 
2.47.3