AArch64: Single and Double precision hyperbolics, SVE and AdvSIMD optimisations

author Richard.Wild@arm.com <Richard.Wild@arm.com>

Thu, 26 Feb 2026 10:22:35 +0000 (10:22 +0000)

committer Adhemerval Zanella <adhemerval.zanella@linaro.org>

Thu, 26 Feb 2026 16:43:30 +0000 (13:43 -0300)
author Richard.Wild@arm.com <Richard.Wild@arm.com>
Thu, 26 Feb 2026 10:22:35 +0000 (10:22 +0000)
committer Adhemerval Zanella <adhemerval.zanella@linaro.org>
Thu, 26 Feb 2026 16:43:30 +0000 (13:43 -0300)
diff --git a/sysdeps/aarch64/fpu/cosh_advsimd.c b/sysdeps/aarch64/fpu/cosh_advsimd.c

index 9894305360b8b4bd5a0cab88f7f44e62a97e880b..12e6698f9521cc4b03a75903106f4756230a95a0 100644 (file)
--- a/sysdeps/aarch64/fpu/cosh_advsimd.c
+++ b/sysdeps/aarch64/fpu/cosh_advsimd.c
@@ -21,32 +21,28 @@
  
  static const struct data
  {
-  float64x2_t poly[3];
-  float64x2_t inv_ln2;
-  double ln2[2];
-  float64x2_t shift, thres;
-  uint64x2_t index_mask, special_bound;
+  double c2, inv_ln2;
+  float64x2_t c0, c1;
+  float64x2_t shift, inf_bound, cosh_9, special_bound;
+  uint64x2_t index_mask;
+  double ln2_hi_lo[2];
  } data = {
-  .poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3),
-           V2 (0x1.5555576a59599p-5), },
-
-  .inv_ln2 = V2 (0x1.71547652b82fep8), /* N/ln2.  */
+  .c0 = V2 (0x1.fffffffffffd4p-2),
+  .c1 = V2 (0x1.5555571d6b68cp-3),
+  .c2 = 0x1.5555576a59599p-5,
+  .inv_ln2 = 0x1.71547652b82fep8, /* N/ln2.  */
    /* -ln2/N.  */
-  .ln2 = {-0x1.62e42fefa39efp-9, -0x1.abc9e3b39803f3p-64},
+  .ln2_hi_lo = { -0x1.62e42fefa39efp-9, -0x1.abc9e3b39803f3p-64 },
    .shift = V2 (0x1.8p+52),
-  .thres = V2 (704.0),
-
    .index_mask = V2 (0xff),
-  /* 0x1.6p9, above which exp overflows.  */
-  .special_bound = V2 (0x4086000000000000),
+  /* ln(2^1023). expm1 helper overflows for large input.  */
+  .special_bound = V2 (0x1.6232e147ae148p+9), /* 708.40.  */
+  /* Bound past which function returns inf.  */
+  .inf_bound = V2 (0x1.634p+9), /* 710.5.  */
+  /* Cosh(9) slightly shifted for accuracy.  */
+  .cosh_9 = V2 (0x1.fa7157c470f82p+11),
  };
  
-static float64x2_t NOINLINE VPCS_ATTR
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
-{
-  return v_call_f64 (cosh, x, y, special);
-}
-
  /* Helper for approximating exp(x). Copied from v_exp_tail, with no
     special-case handling or tail.  */
  static inline float64x2_t
@@ -54,58 +50,87 @@ exp_inline (float64x2_t x)
  {
    const struct data *d = ptr_barrier (&data);
  
+  float64x2_t c2_inv_ln2 = vld1q_f64 (&d->c2);
    /* n = round(x/(ln2/N)).  */
-  float64x2_t z = vfmaq_f64 (d->shift, x, d->inv_ln2);
+  float64x2_t z = vfmaq_laneq_f64 (d->shift, x, c2_inv_ln2, 1);
    uint64x2_t u = vreinterpretq_u64_f64 (z);
    float64x2_t n = vsubq_f64 (z, d->shift);
  
    /* r = x - n*ln2/N.  */
-  float64x2_t ln2 = vld1q_f64 (d->ln2);
-  float64x2_t r = vfmaq_laneq_f64 (x, n, ln2, 0);
-  r = vfmaq_laneq_f64 (r, n, ln2, 1);
+  float64x2_t ln2_hi_lo = vld1q_f64 (d->ln2_hi_lo);
+  float64x2_t r = vfmaq_laneq_f64 (x, n, ln2_hi_lo, 0);
+  r = vfmaq_laneq_f64 (r, n, ln2_hi_lo, 1);
  
    uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS);
    uint64x2_t i = vandq_u64 (u, d->index_mask);
  
    /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4.  */
-  float64x2_t y = vfmaq_f64 (d->poly[1], d->poly[2], r);
-  y = vfmaq_f64 (d->poly[0], y, r);
-  y = vmulq_f64 (vfmaq_f64 (v_f64 (1), y, r), r);
+  float64x2_t poly = vfmaq_laneq_f64 (d->c1, r, c2_inv_ln2, 0);
+  poly = vfmaq_f64 (d->c0, poly, r);
+  poly = vmulq_f64 (vfmaq_f64 (v_f64 (1), poly, r), r);
  
    /* s = 2^(n/N).  */
    u = v_lookup_u64 (__v_exp_tail_data, i);
-  float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+  float64x2_t scale = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+  return vfmaq_f64 (scale, poly, scale);
+}
+
+/* Uses the compound angle formula to adjust x back into an approximable range:
+   cosh (A + B) = cosh(A)cosh(B) + sinh(A)sinh(B)
+   By choosing sufficiently large values whereby after rounding cosh == sinh,
+   this can be simplified into: cosh (A + B) = cosh(A) * e^B.  */
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t t, uint64x2_t special)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  /* Complete fast path computation.  */
+  float64x2_t half_t = vmulq_n_f64 (t, 0.5);
+  float64x2_t half_over_t = vdivq_f64 (v_f64 (0.5), t);
+  float64x2_t y = vaddq_f64 (half_t, half_over_t);
+
+  /* Absolute x so we can subtract 9.0.  */
+  float64x2_t ax = vabsq_f64 (x);
+  /* Subtract 9.0 from x as a reduction to prevent early overflow.  */
+  float64x2_t sx = vsubq_f64 (ax, v_f64 (9.0));
+  float64x2_t s = exp_inline (sx);
+
+  /* Multiply the result by cosh(9) slightly shifted for accuracy.  */
+  float64x2_t r = vmulq_f64 (s, d->cosh_9);
+
+  /* Check for overflowing lanes and return inf.  */
+  uint64x2_t cmp = vcagtq_f64 (ax, d->inf_bound);
+
+  /* Set overflowing lines to inf and set none over flowing to result.  */
+  r = vbslq_f64 (cmp, v_f64 (INFINITY), r);
  
-  return vfmaq_f64 (s, y, s);
+  /* Return r for special lanes and y for none special lanes.  */
+  return vbslq_f64 (special, r, y);
  }
  
  /* Approximation for vector double-precision cosh(x) using exp_inline.
     cosh(x) = (exp(x) + exp(-x)) / 2.
-   The greatest observed error is in the scalar fall-back region, so is the
-   same as the scalar routine, 1.93 ULP:
-   _ZGVnN2v_cosh (0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021
-                                      want 0x1.fdf28623ef923p+1021.
-
-   The greatest observed error in the non-special region is 1.54 ULP:
-   _ZGVnN2v_cosh (0x1.8e205b6ecacf7p+2) got 0x1.f711dcb0c77afp+7
-                                      want 0x1.f711dcb0c77b1p+7.  */
+   The greatest observed error in the non-special region is 2.12 + 0.5 ULP:
+   _ZGVnN2v_cosh (-0x1.6241387f982f3p+1) got 0x1.ff784e05bad75p+2
+                                       want 0x1.ff784e05bad72p+2.  */
  float64x2_t VPCS_ATTR V_NAME_D1 (cosh) (float64x2_t x)
  {
    const struct data *d = ptr_barrier (&data);
  
-  float64x2_t ax = vabsq_f64 (x);
-  uint64x2_t special
-      = vcgtq_u64 (vreinterpretq_u64_f64 (ax), d->special_bound);
-
    /* Up to the point that exp overflows, we can use it to calculate cosh by
       exp(|x|) / 2 + 1 / (2 * exp(|x|)).  */
-  float64x2_t t = exp_inline (ax);
-  float64x2_t half_t = vmulq_n_f64 (t, 0.5);
-  float64x2_t half_over_t = vdivq_f64 (v_f64 (0.5), t);
+  float64x2_t t = exp_inline (x);
  
-  /* Fall back to scalar for any special cases.  */
+  /* Check for special cases.  */
+  uint64x2_t special = vcagtq_f64 (x, d->special_bound);
+  /* Fall back to vectorised special case for any lanes which would cause
+     exp to overflow.  */
    if (__glibc_unlikely (v_any_u64 (special)))
-    return special_case (x, vaddq_f64 (half_t, half_over_t), special);
+    return special_case (x, t, special);
  
+  /* Complete fast path if no special lanes.  */
+  float64x2_t half_t = vmulq_n_f64 (t, 0.5);
+  float64x2_t half_over_t = vdivq_f64 (v_f64 (0.5), t);
    return vaddq_f64 (half_t, half_over_t);
  }
diff --git a/sysdeps/aarch64/fpu/cosh_sve.c b/sysdeps/aarch64/fpu/cosh_sve.c

index 2b7d066e9fac765e2d1770d26e0e059853b492ae..39754ac01ade91458412ae58ef3e0f10106c8b02 100644 (file)
--- a/sysdeps/aarch64/fpu/cosh_sve.c
+++ b/sysdeps/aarch64/fpu/cosh_sve.c
@@ -23,8 +23,9 @@ static const struct data
  {
    double c0, c2;
    double c1, c3;
+  double special_bound;
    float64_t inv_ln2, ln2_hi, ln2_lo, shift;
-  uint64_t special_bound;
+  float64_t exp_9;
  } data = {
    /* Generated using Remez, in [-log(2)/128, log(2)/128].  */
    .c0 = 0x1.fffffffffdbcdp-2,
@@ -36,9 +37,9 @@ static const struct data
    /* 1/ln2.  */
    .inv_ln2 = 0x1.71547652b82fep+0,
    .shift = 0x1.800000000ff80p+46, /* 1.5*2^46+1022.  */
-
-  /* asuint(ln(2^(1024 - 1/128))), the value above which exp overflows.  */
-  .special_bound = 0x40862e37e7d8ba72,
+  /* (ln(2^(1021 + 1/128))), above which exp overflows.  */
+  .special_bound = 0x1.61dab63dc7dc1p+9, /* ~ 707.71.  */
+  .exp_9 = 0x1.fa7157c470f82p+12,       /* exp(9) ~ 8103.08.  */
  };
  
  /* Helper for approximating exp(x)/2.
@@ -71,63 +72,55 @@ exp_over_two_inline (const svbool_t pg, svfloat64_t x, const struct data *d)
    return svmla_x (pg, scale, scale, p);
  }
  
-/* Vectorised special case to handle values past where exp_inline overflows.
-   Halves the input value and uses the identity exp(x) = exp(x/2)^2 to double
-   the valid range of inputs, and returns inf for anything past that.  */
+/* Uses the compound angle formula to adjust x back into an approximable range:
+   cosh (A + B) = cosh(A)cosh(B) + sinh(A)sinh(B)
+   By choosing sufficiently large values whereby after rounding cosh == sinh,
+   this can be simplified into: cosh (A + B) = cosh(A) * e^B.  */
  static svfloat64_t NOINLINE
-special_case (svbool_t pg, svbool_t special, svfloat64_t ax, svfloat64_t t,
+special_case (svfloat64_t x, svbool_t pg, svbool_t special, svfloat64_t t,
               const struct data *d)
  {
    /* Finish fast path to compute values for non-special cases.  */
    svfloat64_t inv_twoexp = svdivr_x (pg, t, 0.25);
    svfloat64_t y = svadd_x (pg, t, inv_twoexp);
  
-  /* Halves input value, and then check if any cases
-     are still going to overflow.  */
-  ax = svmul_x (special, ax, 0.5);
-  svbool_t is_safe
-      = svcmplt (special, svreinterpret_u64 (ax), d->special_bound);
+  /* Absolute x so we can subtract 9.0 without worrying about signing.  */
+  svfloat64_t ax = svabs_x (svptrue_b64 (), x);
+  /* The input `x` is reduced by an offset of 9.0 to allow for accurate
+     approximation on the interval x > SpecialBound ~ 710.47.  */
+  ax = svsub_x (svptrue_b64 (), ax, 9.0);
  
-  /* Computes exp(x/2), and sets any overflowing lanes to inf.  */
-  svfloat64_t half_exp = exp_over_two_inline (special, ax, d);
-  half_exp = svsel (is_safe, half_exp, sv_f64 (INFINITY));
+  svfloat64_t half_exp = exp_over_two_inline (svptrue_b64 (), ax, d);
  
-  /* Construct special case cosh(x) = (exp(x/2)^2)/2.  */
-  svfloat64_t exp = svmul_x (svptrue_b64 (), half_exp, 2);
-  svfloat64_t special_y = svmul_x (special, exp, half_exp);
+  /* Multiply the result by exp(9) for special lanes only.  */
+  svfloat64_t cosh_sum = svmul_x (svptrue_b64 (), half_exp, d->exp_9);
  
-  /* Select correct return values for special and non-special cases.  */
-  special_y = svsel (special, special_y, y);
+  /* Check for overflowing special lanes and return inf for these lanes.  */
+  svbool_t is_inf = svcmpgt (special, ax, d->special_bound);
+  /* Return inf for overflowing lanes.  */
+  svfloat64_t special_y = svsel (is_inf, sv_f64 (INFINITY), cosh_sum);
  
-  /* Ensure an input of nan is correctly propagated.  */
-  svbool_t is_nan
-      = svcmpgt (special, svreinterpret_u64 (ax), sv_u64 (0x7ff0000000000000));
-  return svsel (is_nan, ax, svsel (special, special_y, y));
+  return svsel (special, special_y, y);
  }
  
  /* Approximation for SVE double-precision cosh(x) using exp_inline.
     cosh(x) = (exp(x) + exp(-x)) / 2.
-   The greatest observed error in special case region is 2.66 + 0.5 ULP:
-   _ZGVsMxv_cosh (0x1.633b532ffbc1ap+9) got 0x1.f9b2d3d22399ep+1023
-                                      want 0x1.f9b2d3d22399bp+1023
-
-  The greatest observed error in the non-special region is 1.01 + 0.5 ULP:
-  _ZGVsMxv_cosh (0x1.998ecbb3c1f81p+1) got 0x1.890b225657f84p+3
-                                     want 0x1.890b225657f82p+3.  */
+   The greatest observed error is 2.10 + 0.5 ULP:
+   _ZGVsMxv_cosh (-0x1.2acb2978bd15ep+4) got 0x1.ebbd8806ea342p+25
+                                       want 0x1.ebbd8806ea33fp+25.  */
  svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
  {
    const struct data *d = ptr_barrier (&data);
  
-  svfloat64_t ax = svabs_x (pg, x);
-  svbool_t special = svcmpgt (pg, svreinterpret_u64 (ax), d->special_bound);
-
    /* Up to the point that exp overflows, we can use it to calculate cosh by
       (exp(|x|)/2 + 1) / (2 * exp(|x|)).  */
-  svfloat64_t half_exp = exp_over_two_inline (pg, ax, d);
+  svfloat64_t half_exp = exp_over_two_inline (pg, x, d);
  
-  /* Falls back to entirely standalone vectorized special case.  */
+  /* Fall back to vectorised special case for any lanes which would cause
+     exp to overflow.  */
+  svbool_t special = svacge (pg, x, d->special_bound);
    if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (pg, special, ax, half_exp, d);
+    return special_case (x, pg, special, half_exp, d);
  
    svfloat64_t inv_twoexp = svdivr_x (pg, half_exp, 0.25);
    return svadd_x (pg, half_exp, inv_twoexp);
diff --git a/sysdeps/aarch64/fpu/coshf_advsimd.c b/sysdeps/aarch64/fpu/coshf_advsimd.c

index 1ba3888c17c31963e0773227b145ae3b2ebb733a..950ee247d8a2edbedcfc34fdcf47fa293c0429ea 100644 (file)
--- a/sysdeps/aarch64/fpu/coshf_advsimd.c
+++ b/sysdeps/aarch64/fpu/coshf_advsimd.c
@@ -23,18 +23,49 @@
  static const struct data
  {
    struct v_expf_data expf_consts;
-  float32x4_t bound;
+  float32x4_t special_bound, inf_bound, cosh_9, nine;
  } data = {
    .expf_consts = V_EXPF_DATA,
-  /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case.  */
-  .bound = V4 (0x1.5a92d8p+6),
+  /* 86.64: expf overflows above this, so have to use special case.  */
+  .special_bound = V4 (0x1.5a92d8p+6),
+  /* Value above which inf is returned.  */
+  .inf_bound = V4 (0x1.65a9fap+6), /* ~ 89.42.  */
+  .cosh_9 = V4 (0x1.fa715845p+11), /* cosh(9).  */
+  .nine = V4 (0x1.2p+3),          /* 9.0.  */
  };
  
+/* Uses the compound angle formula to adjust x back into an approximable range:
+   cosh (A + B) = cosh(A)cosh(B) + sinh(A)sinh(B)
+   By choosing sufficiently large values whereby after rounding cosh == sinh,
+   this can be simplified into: cosh (A + B) = cosh(A) * e^B.  */
  static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t half_t, float32x4_t half_over_t,
-             uint32x4_t special)
+special_case (float32x4_t x, float32x4_t t, uint32x4_t special)
  {
-  return v_call_f32 (coshf, x, vaddq_f32 (half_t, half_over_t), special);
+  const struct data *d = ptr_barrier (&data);
+
+  /* Complete fast path computation.  */
+  /* Calculate cosh by exp(x) / 2 + exp(-x) / 2.  */
+  float32x4_t half_t = vmulq_n_f32 (t, 0.5);
+  float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t);
+  float32x4_t y = vaddq_f32 (half_t, half_over_t);
+
+  /* Absolute x so we can subtract 9.0 without worrying about signing.  */
+  float32x4_t ax = vabsq_f32 (x);
+  /* Subtract 9.0 from x as a reduction to prevent early overflow.  */
+  float32x4_t sx = vsubq_f32 (ax, d->nine);
+  float32x4_t s = v_expf_inline (sx, &d->expf_consts);
+
+  /* Multiply the result by cosh(9) slightly shifted for accuracy.  */
+  float32x4_t r = vmulq_f32 (s, d->cosh_9);
+
+  /* Check for overflowing lanes and return inf.  */
+  uint32x4_t cmp = vcagtq_f32 (ax, d->inf_bound);
+
+  /* Set overflowing lines to inf and set none over flowing to result.  */
+  r = vbslq_f32 (cmp, v_f32 (INFINITY), r);
+
+  /* Return r for special lanes and y for none special lanes.  */
+  return vbslq_f32 (special, r, y);
  }
  
  /* Single-precision vector cosh, using vector expf.
@@ -45,16 +76,19 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cosh) (float32x4_t x)
  {
    const struct data *d = ptr_barrier (&data);
  
-  uint32x4_t special = vcageq_f32 (x, d->bound);
    float32x4_t t = v_expf_inline (x, &d->expf_consts);
  
+  /* Check for special cases.  */
+  uint32x4_t special = vcageq_f32 (x, d->special_bound);
+  /* Fall back to vectorised special case for any lanes which would cause
+     expm1 to overflow.  */
+  if (__glibc_unlikely (v_any_u32 (special)))
+    return special_case (x, t, special);
+
+  /* Complete fast path if no special lanes.  */
    /* Calculate cosh by exp(x) / 2 + exp(-x) / 2.  */
    float32x4_t half_t = vmulq_n_f32 (t, 0.5);
    float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t);
-
-  if (__glibc_unlikely (v_any_u32 (special)))
-    return special_case (x, half_t, half_over_t, special);
-
    return vaddq_f32 (half_t, half_over_t);
  }
  libmvec_hidden_def (V_NAME_F1 (cosh))
diff --git a/sysdeps/aarch64/fpu/coshf_sve.c b/sysdeps/aarch64/fpu/coshf_sve.c

index cde242659df6a9d997c556e55495edb10ea92619..a37456e5b72ec93994fb553900c2eef989d97a2a 100644 (file)
--- a/sysdeps/aarch64/fpu/coshf_sve.c
+++ b/sysdeps/aarch64/fpu/coshf_sve.c
@@ -20,34 +20,60 @@
  #include "sv_math.h"
  #include "sv_expf_inline.h"
  
+/* For x < SpecialBound, the result of exp is subnormal and not handled
+   correctly by FEXPA.  */
+#define SpecialBound 0x1.5d5e2ap+6f /* ~ 87.34.  */
+
  static const struct data
  {
    struct sv_expf_data expf_consts;
-  float special_bound;
+  float32_t special_bound, cosh_9;
  } data = {
    .expf_consts = SV_EXPF_DATA,
-  /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case.  */
-  .special_bound = 0x1.5a92d8p+6,
+  .special_bound = SpecialBound,
+  .cosh_9 = 0x1.fa715845p+11, /* cosh(9).  */
  };
  
-static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t half_e, svfloat32_t half_over_e,
-             svbool_t pg)
+/* Uses the compound angle formula to adjust x back into an approximable range:
+   cosh (A + B) = cosh(A)cosh(B) + sinh(A)sinh(B)
+   By choosing sufficiently large values whereby after rounding cosh == sinh,
+   this can be simplified into: cosh (A + B) = cosh(A) * e^B.  */
+static inline svfloat32_t
+special_case (svfloat32_t x, svbool_t special, svfloat32_t half_e,
+             svfloat32_t half_over_e, const struct data *d)
  {
-  return sv_call_f32 (coshf, x, svadd_x (svptrue_b32 (), half_e, half_over_e),
-                     pg);
+  /* Finish fastpass to compute values for non-special cases.  */
+  svfloat32_t y = svadd_x (svptrue_b32 (), half_e, half_over_e);
+
+  /* Make special values positive.  */
+  svfloat32_t ax = svabs_x (svptrue_b32 (), x);
+
+  /* The input `x` is reduced by an offset of 9.0 to allow for accurate
+     approximation on the interval `x > SpecialBound ~ 87.34`.  */
+  ax = svsub_x (svptrue_b32 (), ax, 9.0);
+  svfloat32_t r = expf_inline (ax, svptrue_b32 (), &d->expf_consts);
+
+  /* Multiply the result e by cosh(9) = exp(9)/2 for special lanes only.  */
+  svfloat32_t coshf_sum = svmul_x (svptrue_b32 (), r, d->cosh_9);
+
+  /* Check for overflow in exponential for special case lanes.  */
+  svbool_t is_inf = svcmpge (special, ax, d->special_bound);
+
+  /* Set overflowing lines to inf and set none over flowing to result.  */
+  svfloat32_t special_y = svsel (is_inf, sv_f32 (INFINITY), coshf_sum);
+
+  /* Return special_y for special lanes and y for none special lanes.  */
+  return svsel (special, special_y, y);
  }
  
  /* Single-precision vector cosh, using vector expf.
-   Maximum error is 2.56 +0.5 ULP:
+   Maximum error is 2.55 +0.5 ULP:
     _ZGVsMxv_coshf(-0x1.5b40f4p+1) got 0x1.e47748p+2
                                  want 0x1.e4774ep+2.  */
  svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg)
  {
    const struct data *d = ptr_barrier (&data);
  
-  svbool_t special = svacge (pg, x, d->special_bound);
-
    /* Calculate cosh by exp(x) / 2 + exp(-x) / 2.
       Note that x is passed to exp here, rather than |x|. This is to avoid using
       destructive unary ABS for better register usage. However it means the
@@ -57,8 +83,12 @@ svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg)
    svfloat32_t half_e = svmul_x (svptrue_b32 (), e, 0.5);
    svfloat32_t half_over_e = svdivr_x (pg, e, 0.5);
  
+  /* Check for special cases and fall back to vectorised special case for any
+  lanes which would cause expf to overflow.  */
+  svbool_t special = svacgt (pg, x, d->special_bound);
    if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (x, half_e, half_over_e, special);
+    return special_case (x, special, half_e, half_over_e, d);
  
+  /* Complete fast path if no special lanes.  */
    return svadd_x (svptrue_b32 (), half_e, half_over_e);
  }
diff --git a/sysdeps/aarch64/fpu/sinh_advsimd.c b/sysdeps/aarch64/fpu/sinh_advsimd.c

index 67001a804abfbf90a643f9f4bfdcae14300ac3fb..2d8827c83d314c7cb36846fb4e34bf0f7b2793f1 100644 (file)
--- a/sysdeps/aarch64/fpu/sinh_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinh_advsimd.c
@@ -24,19 +24,57 @@ static const struct data
  {
    struct v_expm1_data d;
    uint64x2_t halff;
-  float64x2_t large_bound;
+  float64x2_t special_bound;
+  float64x2_t inf_bound, cosh_9;
  } data = {
    .d = V_EXPM1_DATA,
    .halff = V2 (0x3fe0000000000000),
-  /* 2^9. expm1 helper overflows for large input.  */
-  .large_bound = V2 (0x1p+9),
+  /* ln(2^1023). expm1 helper overflows for large input.  */
+  .special_bound = V2 (0x1.628b76e3a7b61p+9), /* 709.09.  */
+  /* Bound past which function returns inf.  */
+  .inf_bound = V2 (0x1.634p+9), /* 710.5.  */
+  /* Cosh(9) slightly shifted for accuracy.  */
+  .cosh_9 = V2 (0x1.fa7157c470f82p+11),
  };
  
+/* Uses the compound angle formula to adjust x back into an approximable range:
+   sinh (A + B) = cosh(A)cosh(B) + sinh(A)sinh(B)
+   By choosing sufficiently large values whereby after rounding sinh == cosh,
+   this can be simplified into: sinh (A + B) = sinh(A) * e^B.  */
  static float64x2_t NOINLINE VPCS_ATTR
  special_case (float64x2_t x, float64x2_t t, float64x2_t halfsign,
               uint64x2_t special)
  {
-  return v_call_f64 (sinh, x, vmulq_f64 (t, halfsign), special);
+  const struct data *d = ptr_barrier (&data);
+
+  /* Complete fast path.  */
+  t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
+  float64x2_t y = vmulq_f64 (t, halfsign);
+
+  float64x2_t ax = vabsq_f64 (x);
+
+  /* Preserve sign for later use.  */
+  uint64x2_t sign
+      = veorq_u64 (vreinterpretq_u64_f64 (x), vreinterpretq_u64_f64 (ax));
+
+  /* Subtract 9.0 from x as a reduction to prevent early overflow.  */
+  float64x2_t sx = vsubq_f64 (ax, v_f64 (9.0));
+  float64x2_t s = expm1_inline (sx, &d->d);
+
+  /* Multiply the result by cosh(9) slightly shifted for accuracy.  */
+  float64x2_t r = vmulq_f64 (s, d->cosh_9);
+
+  /* Check for overflowing lanes and set to inf.  */
+  uint64x2_t cmp = vcagtq_f64 (ax, d->inf_bound);
+
+  /* Set overflowing lines to inf and set none over flowing to result.  */
+  r = vbslq_f64 (cmp, v_f64 (INFINITY), r);
+
+  /* Change sign back to original and return.  */
+  r = vreinterpretq_f64_u64 (vorrq_u64 (sign, vreinterpretq_u64_f64 (r)));
+
+  /* Return r for special lanes and y for none special lanes.  */
+  return vbslq_f64 (special, r, y);
  }
  
  /* Approximation for vector double-precision sinh(x) using expm1.
@@ -53,16 +91,19 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
    float64x2_t halfsign = vreinterpretq_f64_u64 (
        vbslq_u64 (v_u64 (0x8000000000000000), ix, d->halff));
  
-  uint64x2_t special = vcageq_f64 (x, d->large_bound);
-
    /* Up to the point that expm1 overflows, we can use it to calculate sinh
       using a slight rearrangement of the definition of sinh. This allows us to
       retain acceptable accuracy for very small inputs.  */
    float64x2_t t = expm1_inline (ax, &d->d);
-  t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
  
+  /* Check for special cases.  */
+  uint64x2_t special = vcageq_f64 (x, d->special_bound);
+  /* Fall back to vectorised special case for any lanes which would cause
+     expm1 to overflow.  */
    if (__glibc_unlikely (v_any_u64 (special)))
      return special_case (x, t, halfsign, special);
  
+  /* Complete fast path if no special lanes.  */
+  t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
    return vmulq_f64 (t, halfsign);
  }
diff --git a/sysdeps/aarch64/fpu/sinh_sve.c b/sysdeps/aarch64/fpu/sinh_sve.c

index e12c49c68d05fa3c5041712a068561f58f014f22..2b8d04833c2572d97114c42be36538f472cd4937 100644 (file)
--- a/sysdeps/aarch64/fpu/sinh_sve.c
+++ b/sysdeps/aarch64/fpu/sinh_sve.c
@@ -21,13 +21,14 @@
  
  static const struct data
  {
+  uint64_t expm1_data[20];
    uint64_t halff;
    double c2, c4;
    double inv_ln2;
    double ln2_hi, ln2_lo;
    double c0, c1, c3;
-  double shift, special_bound, bound;
-  uint64_t expm1_data[20];
+  double shift, small_bound;
+  double special_bound, cosh_9;
  } data = {
    /* Table lookup of 2^(i/64) - 1, for values of i from 0..19.  */
    .expm1_data = {
@@ -50,7 +51,9 @@ static const struct data
    .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023.  */
    .halff = 0x3fe0000000000000,
    .special_bound = 0x1.62e37e7d8ba72p+9,       /* ln(2^(1024 - 1/128)).  */
-  .bound = 0x1.a56ef8ec924ccp-3 /* 19*ln2/64.  */
+  .small_bound = 0x1.a56ef8ec924ccp-3, /* 19*ln2/64.  */
+  /* cosh(9) 4051.541963787692 slightly shifted for accuracy.  */
+  .cosh_9 = 0x1.fa7157c470f82p+11,
  };
  
  /* A specialised FEXPA expm1 that is only valid for positive inputs and
@@ -89,7 +92,7 @@ expm1_inline (svbool_t pg, svfloat64_t x)
  
       This can be circumvented by using a small lookup for scale-1
       when our input is below a certain bound, otherwise we can use FEXPA.  */
-  svbool_t is_small = svaclt (pg, x, d->bound);
+  svbool_t is_small = svaclt (pg, x, d->small_bound);
  
    /* Index via the input of FEXPA, but we only care about the lower 5 bits.  */
    svuint64_t base_idx = svand_x (pg, u, 0x1f);
@@ -107,32 +110,37 @@ expm1_inline (svbool_t pg, svfloat64_t x)
    return svmla_x (pg, scalem1, scale, p);
  }
  
-/* Vectorised special case to handle values past where exp_inline overflows.
-   Halves the input value and uses the identity exp(x) = exp(x/2)^2 to double
-   the valid range of inputs, and returns inf for anything past that.  */
+/* Uses the compound angle formula to adjust x back into an approximable range:
+   sinh (A + B) = cosh(A)cosh(B) + sinh(A)sinh(B)
+   By choosing sufficiently large values whereby after rounding sinh == cosh,
+   this can be simplified into: sinh (A + B) = sinh(A) * e^B.  */
  static svfloat64_t NOINLINE
-special_case (svbool_t pg, svbool_t special, svfloat64_t ax,
-             svfloat64_t halfsign, const struct data *d)
+special_case (svuint64_t sign, svbool_t pg, svbool_t special, svfloat64_t ax,
+             svfloat64_t halfsign)
  {
-  /* Halves input value, and then check if any cases
-     are still going to overflow.  */
-  ax = svmul_x (special, ax, 0.5);
-  svbool_t is_safe = svaclt (special, ax, d->special_bound);
+  const struct data *d = ptr_barrier (&data);
  
-  svfloat64_t t = expm1_inline (pg, ax);
+  /* The input `x` is reduced by an offset of 9.0 to allow for accurate
+     approximation on the interval x > SpecialBound ~ 709.78.  */
+  ax = svsub_m (special, ax, 9.0);
  
+  svfloat64_t t = expm1_inline (pg, ax);
    /* Finish fastpass to compute values for non-special cases.  */
    svfloat64_t y = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
    y = svmul_x (pg, y, halfsign);
  
-  /* Computes special lane, and set remaining overflow lanes to inf.  */
-  svfloat64_t half_special_y = svmul_x (svptrue_b64 (), t, halfsign);
-  svfloat64_t special_y = svmul_x (svptrue_b64 (), half_special_y, t);
+  /* Multiply the result by cosh(9) with a slight tweek for accuracy for
+     special lanes only.  */
+  svfloat64_t cosh_sum = svmul_x (svptrue_b64 (), t, d->cosh_9);
+
+  /* Check for overflowing special lanes.  */
+  svbool_t is_inf = svcmpgt (special, ax, d->special_bound);
+  /* Return inf for overflowing lanes.  */
+  svfloat64_t special_y = svsel (is_inf, sv_f64 (INFINITY), cosh_sum);
  
-  svuint64_t signed_inf
-      = svorr_x (svptrue_b64 (), svreinterpret_u64 (halfsign),
-                sv_u64 (0x7ff0000000000000));
-  special_y = svsel (is_safe, special_y, svreinterpret_f64 (signed_inf));
+  /* Change sign back to original and return.  */
+  special_y = svreinterpret_f64 (
+      svorr_x (svptrue_b64 (), sign, svreinterpret_u64 (special_y)));
  
    /* Join resulting vectors together and return.  */
    return svsel (special, special_y, y);
@@ -140,13 +148,9 @@ special_case (svbool_t pg, svbool_t special, svfloat64_t ax,
  
  /* Approximation for SVE double-precision sinh(x) using FEXPA expm1.
     Uses sinh(x) = e^2x - 1 / 2e^x, rewritten for accuracy.
-   The greatest observed error in the non-special region is 2.63 + 0.5 ULP:
+   The greatest observed error is 2.62 + 0.5 ULP:
     _ZGVsMxv_sinh (0x1.b5e0e13ba88aep-2) got 0x1.c3587faf97b0cp-2
-                                      want 0x1.c3587faf97b09p-2
-
-   The greatest observed error in the special region is 2.65 + 0.5 ULP:
-   _ZGVsMxv_sinh (0x1.633ce847dab1ap+9) got 0x1.fffd30eea0066p+1023
-                                      want 0x1.fffd30eea0063p+1023.  */
+                                      want 0x1.c3587faf97b09p-2.  */
  svfloat64_t SV_NAME_D1 (sinh) (svfloat64_t x, svbool_t pg)
  {
    const struct data *d = ptr_barrier (&data);
@@ -157,9 +161,10 @@ svfloat64_t SV_NAME_D1 (sinh) (svfloat64_t x, svbool_t pg)
        = sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax));
    svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, d->halff));
  
-  /* Fall back to scalar variant for all lanes if any are special.  */
+  /* Fall back to vectorised special case for any lanes which would cause
+     expm1f to overflow.  */
    if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (pg, special, ax, halfsign, d);
+    return special_case (sign, pg, special, ax, halfsign);
  
    /* Up to the point that expm1 overflows, we can use it to calculate sinh
       using a slight rearrangement of the definition of sinh. This allows us to
diff --git a/sysdeps/aarch64/fpu/sinhf_advsimd.c b/sysdeps/aarch64/fpu/sinhf_advsimd.c

index b11214c7a53e2c6d5158108c456b7f339b84ac57..27b419b140a28d97bd79e7602e79579f01ac5bfc 100644 (file)
--- a/sysdeps/aarch64/fpu/sinhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinhf_advsimd.c
@@ -23,18 +23,55 @@
  static const struct data
  {
    struct v_expm1f_data expm1f_consts;
-  float32x4_t oflow_bound;
+  float32x4_t special_bound, inf_bound, cosh_9, nine;
  } data = {
    .expm1f_consts = V_EXPM1F_DATA,
-  /* 0x1.61814ep+6, above which expm1f helper overflows.  */
-  .oflow_bound = V4 (0x1.61814ep+6),
+  /* 88.38, above which expm1f helper overflows.  */
+  .special_bound = V4 (0x1.61814ap+6),
+  /* Value above which inf is returned.  */
+  .inf_bound = V4 (0x1.65a9fap+6), /* ~ 89.42.  */
+  .cosh_9 = V4 (0x1.fa715845p+11), /* cosh(9).  */
+  .nine = V4 (0x1.2p+3),          /* 9.0.  */
  };
  
+/* Uses the compound angle formula to adjust x back into an approximable range:
+   sinh (A + B) = cosh(A)cosh(B) + sinh(A)sinh(B)
+   By choosing sufficiently large values whereby after rounding sinh == cosh,
+   this can be simplified into: sinh (A + B) = sinh(A) * e^B.  */
  static float32x4_t NOINLINE VPCS_ATTR
  special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign,
               uint32x4_t special)
  {
-  return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special);
+  const struct data *d = ptr_barrier (&data);
+
+  /* Complete fast path.  */
+  t = vaddq_f32 (t, vdivq_f32 (t, vaddq_f32 (t, v_f32 (1.0))));
+  float32x4_t y = vmulq_f32 (t, halfsign);
+
+  float32x4_t ax = vabsq_f32 (x);
+  uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+
+  /* Preserve sign for later use.  */
+  uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax);
+
+  /* Subtract 9.0 from x as a reduction to prevent early overflow.  */
+  float32x4_t sx = vsubq_f32 (ax, d->nine);
+  float32x4_t s = expm1f_inline (sx, &d->expm1f_consts);
+
+  /* Multiply the result by cosh(9) slightly shifted for accuracy.  */
+  float32x4_t r = vmulq_f32 (s, d->cosh_9);
+
+  /* Check for overflowing lanes and return inf.  */
+  uint32x4_t cmp = vcagtq_f32 (ax, d->inf_bound);
+
+  /* Set overflowing lines to inf and set none over flowing to result.  */
+  r = vbslq_f32 (cmp, v_f32 (INFINITY), r);
+
+  /* Change sign back to original and return.  */
+  r = vreinterpretq_f32_u32 (vorrq_u32 (sign, vreinterpretq_u32_f32 (r)));
+
+  /* Return r for special lanes and y for none special lanes.  */
+  return vbslq_f32 (special, r, y);
  }
  
  /* Approximation for vector single-precision sinh(x) using expm1.
@@ -51,19 +88,21 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
    float32x4_t halfsign = vreinterpretq_f32_u32 (
        vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5))));
  
-  uint32x4_t special = vcageq_f32 (x, d->oflow_bound);
-
    /* Up to the point that expm1f overflows, we can use it to calculate sinhf
-       using a slight rearrangement of the definition of asinh. This allows us
+     using a slight rearrangement of the definition of asinh. This allows us
       to retain acceptable accuracy for very small inputs.  */
    float32x4_t t = expm1f_inline (ax, &d->expm1f_consts);
-  t = vaddq_f32 (t, vdivq_f32 (t, vaddq_f32 (t, v_f32 (1.0))));
  
-  /* Fall back to the scalar variant for any lanes that should trigger an
-     exception.  */
+  /* Check for special cases.  */
+  uint32x4_t special = vcageq_f32 (x, d->special_bound);
+
+  /* Fall back to vectorised special case for any lanes which would cause
+     expm1 to overflow.  */
    if (__glibc_unlikely (v_any_u32 (special)))
      return special_case (x, t, halfsign, special);
  
+  /* Complete fast path if no special lanes.  */
+  t = vaddq_f32 (t, vdivq_f32 (t, vaddq_f32 (t, v_f32 (1.0))));
    return vmulq_f32 (t, halfsign);
  }
  libmvec_hidden_def (V_NAME_F1 (sinh))
diff --git a/sysdeps/aarch64/fpu/sinhf_sve.c b/sysdeps/aarch64/fpu/sinhf_sve.c

index 2f4e63ba739fbaf2188c9c8b9be7938fa39df36a..e52712438aee1fe5794b62f8a9175eebab92895d 100644 (file)
--- a/sysdeps/aarch64/fpu/sinhf_sve.c
+++ b/sysdeps/aarch64/fpu/sinhf_sve.c
@@ -18,50 +18,91 @@
     <https://www.gnu.org/licenses/>.  */
  
  #include "sv_expm1f_inline.h"
+#include "sv_expf_inline.h"
  #include "sv_math.h"
  
  static const struct data
  {
    struct sv_expm1f_data expm1f_consts;
-  uint32_t halff, large_bound;
+  struct sv_expf_data expf_consts;
+  float32_t special_bound, cosh_9;
+  uint32_t halff;
  } data = {
    .expm1f_consts = SV_EXPM1F_DATA,
+  .expf_consts = SV_EXPF_DATA,
    .halff = 0x3f000000,
-  /* 0x1.61814ep+6, above which expm1f helper overflows.  */
-  .large_bound = 0x42b0c0a7,
+  /* ~ 88.37 above which expm1f helper overflows.  */
+  .special_bound = 0x1.61814ap+6,
+  .cosh_9 = 0x1.fa715845p+11, /* cosh(9) ~ 4051.54.  */
  };
  
-static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t pg)
+/* Uses the compound angle formula to adjust x back into an approximable range:
+   sinh (A + B) = cosh(A)cosh(B) + sinh(A)sinh(B)
+   By choosing sufficiently large values whereby after rounding sinh == cosh,
+   this can be simplified into: sinh (A + B) = sinh(A) * e^B.  */
+static inline svfloat32_t
+special_case (const svbool_t pg, svbool_t special, svfloat32_t ax,
+             svfloat32_t x, svfloat32_t t, const struct data *d)
  {
-  return sv_call_f32 (sinhf, x, y, pg);
+  /* Preserve the sign bit to return final calcualtion to correct sign.  */
+  svuint32_t sign
+      = sveor_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (ax));
+
+  /* Finish fastpass to compute values for non-special cases.  */
+  svfloat32_t halfsign = svreinterpret_f32 (svorr_x (pg, sign, d->halff));
+  t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
+  svfloat32_t y = svmul_x (svptrue_b32 (), t, halfsign);
+
+  /* The input `x` is reduced by an offset of 9.0 to allow for accurate
+     approximation on the interval x > SpecialBound ~ 88.37.  */
+  ax = svsub_x (svptrue_b32 (), ax, 9.0);
+  svfloat32_t e = expf_inline (ax, svptrue_b32 (), &d->expf_consts);
+
+  /* Multiply the result e by cosh(9) = exp(9)/2 for special lanes only.  */
+  svfloat32_t sinhf_sum = svmul_x (svptrue_b32 (), e, d->cosh_9);
+
+  /* Check for overflow in exponential for special case lanes.  */
+  svbool_t is_inf = svcmpge (special, ax, d->special_bound);
+
+  /* Set overflowing lines to inf and set none over flowing to result.  */
+  svfloat32_t special_y = svsel (is_inf, sv_f32 (INFINITY), sinhf_sum);
+
+  /* Change sign back to original and return.  */
+  special_y = svreinterpret_f32 (
+      svorr_x (svptrue_b32 (), sign, svreinterpret_u32 (special_y)));
+
+  /* Return special_y for special lanes and y for none special lanes.  */
+  return svsel (special, special_y, y);
  }
  
  /* Approximation for SVE single-precision sinh(x) using expm1.
     sinh(x) = (exp(x) - exp(-x)) / 2.
-   The maximum error is 2.26 ULP:
-   _ZGVsMxv_sinhf (0x1.e34a9ep-4) got 0x1.e469ep-4
-                                want 0x1.e469e4p-4.  */
+   Maximum error is 2.76 +0.5 ULP:
+   _ZGVsMxv_sinhf (0x1.6587e8p+6) got 0x1.ef3f98p+127
+                                want 0x1.ef3f92p+127.  */
  svfloat32_t SV_NAME_F1 (sinh) (svfloat32_t x, const svbool_t pg)
  {
    const struct data *d = ptr_barrier (&data);
-  svfloat32_t ax = svabs_x (pg, x);
-  svuint32_t sign
-      = sveor_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (ax));
-  svfloat32_t halfsign = svreinterpret_f32 (svorr_x (pg, sign, d->halff));
  
-  svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->large_bound);
+  /* Use absolute number for calculations for accuracy.  */
+  svfloat32_t ax = svabs_x (pg, x);
  
    /* Up to the point that expm1f overflows, we can use it to calculate sinhf
-   using a slight rearrangement of the definition of asinh. This allows us to
-   retain acceptable accuracy for very small inputs.  */
+     using a slight rearrangement of the definition of asinh. This allows us to
+     retain acceptable accuracy for very small inputs.  */
    svfloat32_t t = expm1f_inline (ax, pg, &d->expm1f_consts);
-  t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
  
-  /* Fall back to the scalar variant for any lanes which would cause
-     expm1f to overflow.  */
+  /* Check for special cases and fall back to vectorised special case for any
+     lanes which would cause expm1f to overflow.  */
+  svbool_t special = svacge (pg, x, d->special_bound);
    if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (x, svmul_x (pg, t, halfsign), special);
+    return special_case (pg, special, ax, x, t, d);
  
+  /* Preserve the sign bit to return final calcualtion to correct sign.  */
+  svuint32_t sign
+      = sveor_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (ax));
+  svfloat32_t halfsign = svreinterpret_f32 (svorr_x (pg, sign, d->halff));
+  /* Complete fast path if no special lanes.  */
+  t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
    return svmul_x (svptrue_b32 (), t, halfsign);
  }
diff --git a/sysdeps/aarch64/fpu/tanh_advsimd.c b/sysdeps/aarch64/fpu/tanh_advsimd.c

index 954844a8f0ae927f86247122bd84ef50aa780bb2..864d6747a253c0564b8b1e0a0b6367c2fc58d2ad 100644 (file)
--- a/sysdeps/aarch64/fpu/tanh_advsimd.c
+++ b/sysdeps/aarch64/fpu/tanh_advsimd.c
@@ -18,24 +18,106 @@
     <https://www.gnu.org/licenses/>.  */
  
  #include "v_math.h"
-#include "v_expm1_inline.h"
  
  static const struct data
  {
-  struct v_expm1_data d;
-  uint64x2_t thresh, tiny_bound;
+  float64x2_t c2, c4, c6, c8;
+  float64x2_t two_over_ln2;
+  int64x2_t exponent_bias;
+  double c1, c3, c5, c7, c9, c10;
+  double ln2_hi_lo[2];
+  float64x2_t special_bound;
  } data = {
-  .d = V_EXPM1_DATA,
-  .tiny_bound = V2 (0x3e40000000000000), /* asuint64 (0x1p-27).  */
-  /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound).  */
-  .thresh = V2 (0x01f241bf835f9d5f),
+  .c1 = 0x1.5555555555559p-3,
+  .c2 = V2 (0x1.555555555554bp-5),
+  .c3 = 0x1.111111110f663p-7,
+  .c4 = V2 (0x1.6c16c16c1b5f3p-10),
+  .c5 = 0x1.a01a01affa35dp-13,
+  .c6 = V2 (0x1.a01a018b4ecbbp-16),
+  .c7 = 0x1.71ddf82db5bb4p-19,
+  .c8 = V2 (0x1.27e517fc0d54bp-22),
+  .c9 = 0x1.af5eedae67435p-26,
+  .c10 = 0x1.1f143d060a28ap-29,
+  .ln2_hi_lo = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 },
+  .two_over_ln2 = V2 (0x1.71547652b82fep1),
+  .exponent_bias = V2 (0x3ff0000000000000),
+  /* Bound past which function returns signed 1 as the result.  */
+  .special_bound = V2 (0x1.2cccccccccccdp+4), /* 18.80.  */
  };
  
+/* e^2x - 1 inline helper.  */
+static inline float64x2_t
+e2xm1_inline (float64x2_t x, const struct data *d)
+{
+  float64x2_t ln2_hi_lo = vld1q_f64 (&d->ln2_hi_lo[0]);
+
+  /* Reduce argument to smaller range:
+     Let i = round(x / ln2)
+     and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+     exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+     where 2^i is exact because i is an integer.  */
+  float64x2_t n = vrndaq_f64 (vmulq_f64 (x, d->two_over_ln2));
+  int64x2_t i = vcvtq_s64_f64 (n);
+  float64x2_t f = vaddq_f64 (x, x);
+  f = vfmsq_laneq_f64 (f, n, ln2_hi_lo, 0);
+  f = vfmsq_laneq_f64 (f, n, ln2_hi_lo, 1);
+
+  /* Approximate expm1(f) using polynomial.
+     Taylor expansion for expm1(x) has the form:
+        x + ax^2 + bx^3 + cx^4 ....
+     So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+     and assemble the approximation expm1(f) ~= f + f^2 * P(f).  */
+  float64x2_t f2 = vmulq_f64 (f, f);
+  float64x2_t f4 = vmulq_f64 (f2, f2);
+  float64x2_t lane_consts_13 = vld1q_f64 (&d->c1);
+  float64x2_t lane_consts_57 = vld1q_f64 (&d->c5);
+  float64x2_t lane_consts_910 = vld1q_f64 (&d->c9);
+  float64x2_t p01 = vfmaq_laneq_f64 (v_f64 (0.5), f, lane_consts_13, 0);
+  float64x2_t p23 = vfmaq_laneq_f64 (d->c2, f, lane_consts_13, 1);
+  float64x2_t p45 = vfmaq_laneq_f64 (d->c4, f, lane_consts_57, 0);
+  float64x2_t p67 = vfmaq_laneq_f64 (d->c6, f, lane_consts_57, 1);
+  float64x2_t p03 = vfmaq_f64 (p01, f2, p23);
+  float64x2_t p47 = vfmaq_f64 (p45, f2, p67);
+  float64x2_t p89 = vfmaq_laneq_f64 (d->c8, f, lane_consts_910, 0);
+  float64x2_t p = vfmaq_laneq_f64 (p89, f2, lane_consts_910, 1);
+  p = vfmaq_f64 (p47, f4, p);
+  p = vfmaq_f64 (p03, f4, p);
+
+  p = vfmaq_f64 (f, f2, p);
+
+  /* Assemble the result.
+     expm1(x) ~= 2^i * (p + 1) - 1
+     Let t = 2^i.  */
+  int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias);
+  float64x2_t t = vreinterpretq_f64_s64 (u);
+
+  /* expm1(x) ~= p * t + (t - 1).  */
+  return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
+}
+
  static float64x2_t NOINLINE VPCS_ATTR
-special_case (float64x2_t x, float64x2_t q, float64x2_t qp2,
-             uint64x2_t special)
+special_case (float64x2_t x, float64x2_t q, uint64x2_t special)
  {
-  return v_call_f64 (tanh, x, vdivq_f64 (q, qp2), special);
+  const struct data *d = ptr_barrier (&data);
+  /* Complete fast path.  */
+  float64x2_t y = vdivq_f64 (q, (vaddq_f64 (q, v_f64 (2.0))));
+
+  uint64x2_t ix = vreinterpretq_u64_f64 (x);
+
+  /* expm1 exponent bias is +1.0f.  */
+  uint64x2_t one_bits = vreinterpretq_u64_s64 (d->exponent_bias);
+
+  /* Mask selecting only the sign bit in each lane.  */
+  uint64x2_t sign_mask = vdupq_n_u64 (0x8000000000000000ULL);
+
+  /* Produce signed 1 for return of special cases:
+    sign bit taken from ix
+    all other bits taken from +1.0f (one_bits).  */
+  uint64x2_t special_bits = vbslq_u64 (sign_mask, ix, one_bits);
+  float64x2_t special_y = vreinterpretq_f64_u64 (special_bits);
+
+  /* Select between special case or regular case and return value.  */
+  return vbslq_f64 (special, special_y, y);
  }
  
  /* Vector approximation for double-precision tanh(x), using a simplified
@@ -46,17 +128,18 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x)
  {
    const struct data *d = ptr_barrier (&data);
  
-  uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
-
-  /* Trigger special-cases for tiny, boring and infinity/NaN.  */
-  uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia, d->tiny_bound), d->thresh);
-
    /* tanh(x) = (e^2x - 1) / (e^2x + 1).  */
-  float64x2_t twox = vaddq_f64 (x, x);
-  float64x2_t q = expm1_inline (twox, &d->d);
-  float64x2_t qp2 = vaddq_f64 (q, v_f64 (2.0));
+  float64x2_t q = e2xm1_inline (x, d);
  
+  /* Check for special cases.  */
+  uint64x2_t special = vcagtq_f64 (x, d->special_bound);
+  /* For sufficiently high inputs, the result of tanh(|x|) is 1 when correctly
+     rounded, at this point we can return 1 directly, with sign correction.
+     This will also act as a guard against our approximation overflowing.
+     Kept as a special case to avoid slow down in fast path.  */
    if (__glibc_unlikely (v_any_u64 (special)))
-    return special_case (x, q, qp2, special);
-  return vdivq_f64 (q, qp2);
+    return special_case (x, q, special);
+
+  /* Complete fast path if no special lanes.  */
+  return vdivq_f64 (q, (vaddq_f64 (q, v_f64 (2.0))));
  }
diff --git a/sysdeps/aarch64/fpu/tanhf_advsimd.c b/sysdeps/aarch64/fpu/tanhf_advsimd.c

index 03e8621d26e6cf050a385c7c22c043e748aedb9a..7de476fbd30f09a1337482aaee930f030ca1073d 100644 (file)
--- a/sysdeps/aarch64/fpu/tanhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/tanhf_advsimd.c
@@ -17,57 +17,109 @@
     License along with the GNU C Library; if not, see
     <https://www.gnu.org/licenses/>.  */
  
-#include "v_expm1f_inline.h"
+#include "v_math.h"
  
  static const struct data
  {
-  struct v_expm1f_data expm1f_consts;
-  uint32x4_t boring_bound, large_bound, onef;
+  float32x4_t special_bound, two;
+  float32x4_t c0, c2;
+  int32x4_t exponent_bias;
+  float c1, c3, two_over_ln2, c4;
+  float ln2_hi, ln2_lo;
  } data = {
-  .expm1f_consts = V_EXPM1F_DATA,
-  /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for  negative).  */
-  .boring_bound = V4 (0x41102cb3),
-  .large_bound = V4 (0x7f800000),
+  /* 9.01, above which tanhf rounds to 1 (or -1 for  negative).  */
+  .special_bound = V4 (0x1.205966p+3),
+  .two = V4 (0x1.0p+1), /* 2.0.  */
+  /* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
+    log(2)/2]. Exponent bias is asuint(1.0f).  */
+  .c0 = V4 (0x1.fffffep-2),
+  .c1 = 0x1.5554aep-3,
+  .c2 = V4 (0x1.555736p-5),
+  .c3 = 0x1.12287cp-7,
+  .c4 = 0x1.6b55a2p-10,
+  .exponent_bias = V4 (0x3f800000),
+  .two_over_ln2 = 0x1.715476p+1f,
+  .ln2_hi = 0x1.62e4p-1f,
+  .ln2_lo = 0x1.7f7d1cp-20f,
  };
  
+/* e^2x - 1 inline helper.  */
+static inline float32x4_t
+e2xm1f_inline (float32x4_t x, const struct data *d)
+{
+  float32x2_t ln2 = vld1_f32 (&d->ln2_hi);
+  float32x4_t lane_consts = vld1q_f32 (&d->c1);
+
+  /* Reduce argument: f in [-ln2/2, ln2/2], i is exact.  */
+  float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2));
+  int32x4_t i = vcvtq_s32_f32 (j);
+  float32x4_t f = vaddq_f32 (x, x);
+  f = vfmsq_lane_f32 (f, j, ln2, 0);
+  f = vfmsq_lane_f32 (f, j, ln2, 1);
+
+  /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).  */
+  float32x4_t f2 = vmulq_f32 (f, f);
+  float32x4_t f4 = vmulq_f32 (f2, f2);
+  float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0);
+  float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1);
+  float32x4_t poly = vfmaq_f32 (p01, f2, p23);
+  poly = vfmaq_laneq_f32 (poly, f4, lane_consts, 3);
+  poly = vfmaq_f32 (f, f2, poly);
+
+  /* scale = 2^i.  */
+  int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
+  float32x4_t scale = vreinterpretq_f32_s32 (u);
+  /* expm1(x) ~= poly * scale + (scale - 1).  */
+  return vfmaq_f32 (vsubq_f32 (scale, v_f32 (1.0f)), poly, scale);
+}
+
  static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring,
-             float32x4_t q, uint32x4_t special)
+special_case (float32x4_t x, float32x4_t q, uint32x4_t special)
  {
-  return v_call_f32 (
-      tanhf, x,
-      vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))),
-      special);
+  const struct data *d = ptr_barrier (&data);
+
+  /* Complete fast path.  */
+  float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, d->two));
+
+  uint32x4_t ix = vreinterpretq_u32_f32 (x);
+
+  /* expm1 exponent bias is +1.0f.  */
+  uint32x4_t one_bits = vreinterpretq_u32_s32 (d->exponent_bias);
+
+  /* Mask selecting only the sign bit in each lane.  */
+  uint32x4_t sign_mask = vdupq_n_u32 (0x80000000u);
+
+  /* Produce signed 1 for return of special cases:
+    sign bit taken from ix
+    all other bits taken from +1.0f (one_bits).  */
+  uint32x4_t special_bits = vbslq_u32 (sign_mask, ix, one_bits);
+  float32x4_t special_y = vreinterpretq_f32_u32 (special_bits);
+
+  /* Select between special case or regular case and return value.  */
+  return vbslq_f32 (special, special_y, y);
  }
  
  /* Approximation for single-precision vector tanh(x), using a simplified
-   version of expm1f. The maximum error is 2.58 ULP:
+   version of expm1f. The maximum error is 2.08 + 0.5 ULP:
     _ZGVnN4v_tanhf (0x1.fa5eep-5) got 0x1.f9ba02p-5
                                 want 0x1.f9ba08p-5.  */
  float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
  {
    const struct data *d = ptr_barrier (&data);
  
-  uint32x4_t ix = vreinterpretq_u32_f32 (x);
-  float32x4_t ax = vabsq_f32 (x);
-  uint32x4_t iax = vreinterpretq_u32_f32 (ax);
-  uint32x4_t sign = veorq_u32 (ix, iax);
-  uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
-  /* expm1 exponent bias is 1.0f reinterpreted to int.  */
-  float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (
-      sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias)));
-
-  uint32x4_t special = vcgtq_u32 (iax, d->large_bound);
-
    /* tanh(x) = (e^2x - 1) / (e^2x + 1).  */
-  float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
+  float32x4_t q = e2xm1f_inline (x, d);
+
+  /* Check for special cases.  */
+  uint32x4_t special = vcagtq_f32 (x, d->special_bound);
  
+  /* Fall back to vectorised special case for any lanes which would cause
+     expm1 to overflow.  */
    if (__glibc_unlikely (v_any_u32 (special)))
-    return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q,
-                        special);
+    return special_case (x, q, special);
  
-  float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
-  return vbslq_f32 (is_boring, boring, y);
+  /* Complete fast path if no special lanes.  */
+  return vdivq_f32 (q, vaddq_f32 (q, d->two));
  }
  libmvec_hidden_def (V_NAME_F1 (tanh))
  HALF_WIDTH_ALIAS_F1 (tanh)
diff --git a/sysdeps/aarch64/fpu/tanhf_sve.c b/sysdeps/aarch64/fpu/tanhf_sve.c

index 8444078acb98b9c2d6ef7ce07ff656ff8178f503..6fd2879c277bc486943fe6106a7c3f7f26809ad5 100644 (file)
--- a/sysdeps/aarch64/fpu/tanhf_sve.c
+++ b/sysdeps/aarch64/fpu/tanhf_sve.c
@@ -17,53 +17,107 @@
     License along with the GNU C Library; if not, see
     <https://www.gnu.org/licenses/>.  */
  
-#include "sv_expm1f_inline.h"
+#include "sv_math.h"
  
  /* Largest value of x for which tanhf(x) rounds to 1 (or -1 for negative).  */
-#define BoringBound 0x1.205966p+3f
+#define SpecialBound 0x1.205966p+3f /* ~9.01.  */
  
  static const struct data
  {
-  struct sv_expm1f_data expm1f_consts;
-  uint32_t onef, special_bound;
-  float boring_bound;
+  /* These 4 are grouped together so they can be loaded as one quadword, then
+   used with _lane forms of svmla/svmls.  */
+  float32_t c2, c4, ln2_hi, ln2_lo;
+  float c0, two_over_ln2, c1, c3, special_bound;
  } data = {
-  .expm1f_consts = SV_EXPM1F_DATA,
-  .onef = 0x3f800000,
-  .special_bound = 0x7f800000,
-  .boring_bound = BoringBound,
+  .special_bound = SpecialBound,
+  /* Coefficients generated using fpminimax.  */
+  .c0 = 0x1.fffffep-2,
+  .c1 = 0x1.5554aep-3,
+  /* 2/ln2.  */
+  .two_over_ln2 = 0x1.715476p+1f,
+  .c2 = 0x1.555736p-5,
+  .c3 = 0x1.12287cp-7,
+  .c4 = 0x1.6b55a2p-10,
+  .ln2_lo = 0x1.7f7d1cp-20f,
+  .ln2_hi = 0x1.62e4p-1f,
  };
  
+/* An expm1 inspired helper function that returns an accurate
+   estimate for e^2x - 1.  */
+static inline svfloat32_t
+e2xm1f_inline (svfloat32_t x, svbool_t pg, const struct data *d)
+{
+  /* This vector is reliant on layout of data - it contains constants
+   that can be used with _lane forms of svmla/svmls. Values are:
+   [ coeff_2, coeff_4, ln2_hi, ln2_lo ].  */
+  svfloat32_t lane_constants = svld1rq (svptrue_b32 (), &d->c2);
+
+  /* Reduce argument to smaller range:
+     Let i = round(x / (2 * ln2))
+     and f = (x + x) - i * ln2, then f is in [-ln2/2, ln2/2].
+     exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+     where 2^i is exact because i is an integer.  */
+  svfloat32_t j = svmul_x (svptrue_b32 (), x, d->two_over_ln2);
+  j = svrinta_x (pg, j);
+  svfloat32_t f = svadd_x (pg, x, x);
+  f = svmls_lane (f, j, lane_constants, 2);
+  f = svmls_lane (f, j, lane_constants, 3);
+
+  /* Approximate expm1(f) using polynomial.
+     Taylor expansion for expm1(x) has the form:
+        x + ax^2 + bx^3 + cx^4 ....
+     So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+     and assemble the approximation expm1(f) ~= f + f^2 * P(f).  */
+  svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), f, lane_constants, 0);
+  svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), f, lane_constants, 1);
+  svfloat32_t f2 = svmul_x (svptrue_b32 (), f, f);
+  svfloat32_t p = svmla_x (pg, p12, f2, p34);
+  p = svmla_x (pg, sv_f32 (d->c0), f, p);
+  p = svmla_x (pg, f, f2, p);
+
+  /* Assemble the result.
+     expm1(x) ~= 2^i * (p + 1) - 1
+     Let t = 2^i.  */
+  svfloat32_t t = svscale_x (pg, sv_f32 (1.0f), svcvt_s32_x (pg, j));
+  return svmla_x (pg, svsub_x (pg, t, 1.0f), p, t);
+}
+
  static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svbool_t pg, svbool_t is_boring,
-             svfloat32_t boring, svfloat32_t q, svbool_t special)
+special_case (svfloat32_t x, svbool_t pg, svbool_t special, svfloat32_t q)
  {
-  svfloat32_t y
-      = svsel_f32 (is_boring, boring, svdiv_x (pg, q, svadd_x (pg, q, 2.0)));
-  return sv_call_f32 (tanhf, x, y, special);
+  /* Finish fastpass to compute values for non-special cases.  */
+  svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0));
+
+  /* Make special values positive for best accuracy.  */
+  svfloat32_t ax = svabs_x (svptrue_b32 (), x);
+  svuint32_t iax = svreinterpret_u32 (ax);
+
+  /* Preserve the sign bit to return final calcualtion to correct sign.  */
+  svuint32_t sign = sveor_x (svptrue_b32 (), svreinterpret_u32 (x), iax);
+
+  /* Set overflowing lanes to signed 1.  */
+  svfloat32_t special_y = svreinterpret_f32 (
+      svorr_x (svptrue_b32 (), sign, sv_u32 (0x3f800000)));
+
+  /* Return special_y for special lanes and y for none special lanes.  */
+  return svsel_f32 (special, special_y, y);
  }
  
  /* Approximation for single-precision SVE tanh(x), using a simplified
-   version of expm1f. The maximum error is 2.57 ULP:
+   version of expm1f.
+   Maximum error is 2.06 +0.5 ULP:
     _ZGVsMxv_tanhf (0x1.fc1832p-5) got 0x1.fb71a4p-5
                                  want 0x1.fb71aap-5.  */
  svfloat32_t SV_NAME_F1 (tanh) (svfloat32_t x, const svbool_t pg)
  {
    const struct data *d = ptr_barrier (&data);
  
-  svfloat32_t ax = svabs_x (pg, x);
-  svuint32_t iax = svreinterpret_u32 (ax);
-  svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
-  svfloat32_t boring = svreinterpret_f32 (svorr_x (pg, sign, d->onef));
-  svbool_t special = svcmpgt (pg, iax, d->special_bound);
-  svbool_t is_boring = svacgt (pg, x, d->boring_bound);
-
    /* tanh(x) = (e^2x - 1) / (e^2x + 1).  */
-  svfloat32_t q = expm1f_inline (svmul_x (svptrue_b32 (), x, 2.0), pg,
-                                &d->expm1f_consts);
+  svfloat32_t q = e2xm1f_inline (x, pg, d);
  
+  svbool_t special = svacgt (pg, x, d->special_bound);
    if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (x, pg, is_boring, boring, q, special);
-  svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0));
-  return svsel_f32 (is_boring, boring, y);
+    return special_case (x, pg, special, q);
+
+  return svdiv_x (pg, q, svadd_x (pg, q, 2.0));
  }
author	Richard.Wild@arm.com <Richard.Wild@arm.com>
	Thu, 26 Feb 2026 10:22:35 +0000 (10:22 +0000)
committer	Adhemerval Zanella <adhemerval.zanella@linaro.org>
	Thu, 26 Feb 2026 16:43:30 +0000 (13:43 -0300)
sysdeps/aarch64/fpu/cosh_advsimd.c		patch \| blob \| blame \| history
sysdeps/aarch64/fpu/cosh_sve.c		patch \| blob \| blame \| history
sysdeps/aarch64/fpu/coshf_advsimd.c		patch \| blob \| blame \| history
sysdeps/aarch64/fpu/coshf_sve.c		patch \| blob \| blame \| history
sysdeps/aarch64/fpu/sinh_advsimd.c		patch \| blob \| blame \| history
sysdeps/aarch64/fpu/sinh_sve.c		patch \| blob \| blame \| history
sysdeps/aarch64/fpu/sinhf_advsimd.c		patch \| blob \| blame \| history
sysdeps/aarch64/fpu/sinhf_sve.c		patch \| blob \| blame \| history
sysdeps/aarch64/fpu/tanh_advsimd.c		patch \| blob \| blame \| history
sysdeps/aarch64/fpu/tanhf_advsimd.c		patch \| blob \| blame \| history
sysdeps/aarch64/fpu/tanhf_sve.c		patch \| blob \| blame \| history