aarch64: Optimise AdvSIMD asinhf

author James Chesterman <James.Chesterman@arm.com>

Fri, 28 Nov 2025 11:18:52 +0000 (11:18 +0000)

committer Adhemerval Zanella <adhemerval.zanella@linaro.org>

Thu, 4 Dec 2025 13:54:49 +0000 (10:54 -0300)
author James Chesterman <James.Chesterman@arm.com>
Fri, 28 Nov 2025 11:18:52 +0000 (11:18 +0000)
committer Adhemerval Zanella <adhemerval.zanella@linaro.org>
Thu, 4 Dec 2025 13:54:49 +0000 (10:54 -0300)
diff --git a/sysdeps/aarch64/fpu/asinhf_advsimd.c b/sysdeps/aarch64/fpu/asinhf_advsimd.c

index 90b2d78b5effdb64ae801e12d4ef3019b30b306b..e953c720770d9d66a31da4215cf0cf91ca583975 100644 (file)
--- a/sysdeps/aarch64/fpu/asinhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/asinhf_advsimd.c
@@ -24,46 +24,80 @@ const static struct data
  {
    struct v_log1pf_data log1pf_consts;
    float32x4_t one;
-  uint32x4_t big_bound;
+  uint32x4_t square_lim;
+  float32x4_t pinf, nan;
  } data = {
    .one = V4 (1),
    .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
-  .big_bound = V4 (0x5f800000), /* asuint(0x1p64).  */
+  .square_lim = V4 (0x5f800000), /* asuint(sqrt(FLT_MAX)).  */
+  .pinf = V4 (INFINITY),
+  .nan = V4 (NAN),
  };
  
-static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, uint32x4_t sign, float32x4_t y,
-             uint32x4_t special, const struct data *d)
+static inline float32x4_t VPCS_ATTR
+inline_asinhf (float32x4_t ax, uint32x4_t sign, const struct data *d)
  {
-  return v_call_f32 (
-      asinhf, x,
-      vreinterpretq_f32_u32 (veorq_u32 (
-         sign, vreinterpretq_u32_f32 (log1pf_inline (y, &d->log1pf_consts)))),
-      special);
+  /* Consider the identity asinh(x) = log(x + sqrt(x^2 + 1)).
+    Then, for x>0, asinh(x) = log1p(x + x^2 / (1 + sqrt(x^2 + 1))).  */
+  float32x4_t t
+      = vaddq_f32 (v_f32 (1.0f), vsqrtq_f32 (vfmaq_f32 (d->one, ax, ax)));
+  float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), t));
+
+  return vreinterpretq_f32_u32 (veorq_u32 (
+      sign, vreinterpretq_u32_f32 (log1pf_inline (y, &d->log1pf_consts))));
+}
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t ax, uint32x4_t sign, uint32x4_t special,
+             const struct data *d)
+{
+  /* To avoid overflow in x^2 (so the x < sqrt(FLT_MAX) constraint), we
+    reduce the input of asinh to a narrower interval by relying on the
+    identity: 2asinh(t) = +-acosh(2t^2 + 1)
+    If we set t=sqrt((x-1)/2), then
+    2asinh(sqrt((x-1)/2)) = acosh(x).
+    Found that, for a high input x, asinh(x) very closely approximates
+    acosh(x), so implemented it with this function instead.  */
+  float32x4_t r = vsubq_f32 (ax, d->one);
+  r = vmulq_f32 (r, v_f32 (0.5f));
+  r = vbslq_f32 (special, vsqrtq_f32 (r), ax);
+
+  float32x4_t y = inline_asinhf (r, sign, d);
+
+  y = vbslq_f32 (special, vmulq_f32 (y, v_f32 (2.0f)), y);
+
+  /* Check whether x is inf or nan.  */
+  uint32x4_t ret_inf = vceqq_f32 (ax, d->pinf);
+  uint32x4_t ret_nan = vmvnq_u32 (vcleq_f32 (ax, d->pinf));
+  y = vbslq_f32 (ret_inf, d->pinf, y);
+  y = vbslq_f32 (ret_nan, d->nan, y);
+  /* Put sign back in for minf, as it doesn't happen in log1pf_inline call.  */
+  y = vbslq_f32 (
+      ret_inf,
+      vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), sign)), y);
+  return y;
  }
  
  /* Single-precision implementation of vector asinh(x), using vector log1p.
     Worst-case error is 2.59 ULP:
     _ZGVnN4v_asinhf(0x1.d86124p-3) got 0x1.d449bep-3
                                  want 0x1.d449c4p-3.  */
-VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
  {
-  const struct data *dat = ptr_barrier (&data);
+  const struct data *d = ptr_barrier (&data);
    float32x4_t ax = vabsq_f32 (x);
    uint32x4_t iax = vreinterpretq_u32_f32 (ax);
-  uint32x4_t special = vcgeq_u32 (iax, dat->big_bound);
    uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax);
  
-  /* asinh(x) = log(x + sqrt(x * x + 1)).
-     For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))).  */
-  float32x4_t d
-      = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (dat->one, ax, ax)));
-  float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d));
+  /* Inputs greater than or equal to square_lim will cause the output to
+    overflow. This is because there is a square operation in the log1pf_inline
+    call. Also captures inf and nan. Does not capture negative numbers as we
+    separate the sign bit from the rest of the input.  */
+  uint32x4_t special = vcgeq_u32 (iax, d->square_lim);
  
    if (__glibc_unlikely (v_any_u32 (special)))
-    return special_case (x, sign, y, special, dat);
-  return vreinterpretq_f32_u32 (veorq_u32 (
-      sign, vreinterpretq_u32_f32 (log1pf_inline (y, &dat->log1pf_consts))));
+    return special_case (ax, sign, special, d);
+  return inline_asinhf (ax, sign, d);
  }
  libmvec_hidden_def (V_NAME_F1 (asinh))
  HALF_WIDTH_ALIAS_F1 (asinh)
author	James Chesterman <James.Chesterman@arm.com>
	Fri, 28 Nov 2025 11:18:52 +0000 (11:18 +0000)
committer	Adhemerval Zanella <adhemerval.zanella@linaro.org>
	Thu, 4 Dec 2025 13:54:49 +0000 (10:54 -0300)