.abs_mask = V2 (0x7fffffffffffffff),
};
-#define AllMask v_u64 (0xffffffffffffffff)
-#define Oneu 0x3ff0000000000000
-#define Small 0x3e50000000000000 /* 2^-53. */
-
-#if WANT_SIMD_EXCEPT
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
-{
- return v_call_f64 (acos, x, y, special);
-}
-#endif
-
/* Double-precision implementation of vector acos(x).
- For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-53 for correct
- rounding.
- If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following
- approximation.
-
- For |x| in [Small, 0.5], use an order 11 polynomial P such that the final
+ For |x| in [0, 0.5], use an order 11 polynomial P such that the final
approximation of asin is an odd polynomial:
acos(x) ~ pi/2 - (x + x^3 P(x^2)).
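/* For illustration, a scalar sketch of the shape above; acos_shape and the
   coefficient array P are hypothetical stand-ins, not the library code.  */
#include <math.h>

static double
acos_shape (double x, const double *P, int n)
{
  /* Horner on z2 = x^2, then acos(x) ~ pi/2 - (x + x^3 * P(x^2)).  */
  double z2 = x * x;
  double p = P[n - 1];
  for (int i = n - 2; i >= 0; i--)
    p = p * z2 + P[i];
  return M_PI_2 - (x + x * z2 * p);
}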
const struct data *d = ptr_barrier (&data);
float64x2_t ax = vabsq_f64 (x);
-
-#if WANT_SIMD_EXCEPT
- /* A single comparison for One, Small and QNaN. */
- uint64x2_t special
- = vcgtq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (ax), v_u64 (Small)),
- v_u64 (Oneu - Small));
- if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (x, x, AllMask);
-#endif
-
uint64x2_t a_le_half = vcleq_f64 (ax, v_f64 (0.5));
/* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with
z2 = x ^ 2 and z = |x|, if |x| < 0.5
z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5.
acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
= 2 Q(|x|) , for 0.5 < x < 1.0
= pi - 2 Q(|x|) , for -1.0 < x < -0.5. */
float64x2_t y = vbslq_f64 (d->abs_mask, p, x);
-
uint64x2_t is_neg = vcltzq_f64 (x);
float64x2_t off = vreinterpretq_f64_u64 (
vandq_u64 (is_neg, vreinterpretq_u64_f64 (d->pi)));
#define AbsMask 0x7fffffff
#define Half 0x3f000000
-#define One 0x3f800000
-#define Small 0x32800000 /* 2^-26. */
-
-#if WANT_SIMD_EXCEPT
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
-{
- return v_call_f32 (acosf, x, y, special);
-}
-#endif
/* Single-precision implementation of vector acos(x).
- For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-26 for correct
- rounding.
- If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following
- approximation.
-
- For |x| in [Small, 0.5], use order 4 polynomial P such that the final
+ For |x| in [0, 0.5], use order 4 polynomial P such that the final
approximation of asin is an odd polynomial:
acos(x) ~ pi/2 - (x + x^3 P(x^2)).
The largest observed error in this region is 1.32 ulps,
_ZGVnN4v_acosf (0x1.15ba56p-1) got 0x1.feb33p-1
- want 0x1.feb32ep-1. */
+ want 0x1.feb32ep-1. */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acos) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
uint32x4_t ix = vreinterpretq_u32_f32 (x);
uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask));
-#if WANT_SIMD_EXCEPT
- /* A single comparison for One, Small and QNaN. */
- uint32x4_t special
- = vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small));
- if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (x, x, v_u32 (0xffffffff));
-#endif
-
float32x4_t ax = vreinterpretq_f32_u32 (ia);
uint32x4_t a_le_half = vcleq_u32 (ia, v_u32 (Half));
const struct data *d = ptr_barrier (&data);
uint64x2_t special
= vcgeq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (x), d->one), d->thresh);
- float64x2_t special_arg = x;
-
-#if WANT_SIMD_EXCEPT
- if (__glibc_unlikely (v_any_u64 (special)))
- x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x);
-#endif
float64x2_t xm1 = vsubq_f64 (x, v_f64 (1.0));
float64x2_t y = vaddq_f64 (x, v_f64 (1.0));
y = vaddq_f64 (xm1, y);
if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (special_arg, y, special, &d->log1p_consts);
+ return special_case (x, y, special, &d->log1p_consts);
return log1p_inline (y, &d->log1p_consts);
}
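/* The identity used above, as a scalar sketch (acosh_ref is a hypothetical
   reference, assuming x >= 1): (x - 1) * (x + 1) = x^2 - 1, so
   acosh(x) = log (x + sqrt (x^2 - 1)) = log1p (xm1 + sqrt (xm1 * (x + 1))).  */
#include <math.h>

static double
acosh_ref (double x)
{
  double xm1 = x - 1.0;
  return log1p (xm1 + sqrt (xm1 * (x + 1.0)));
}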
return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special));
}
-/* Vector approximation for single-precision acosh, based on log1p. Maximum
- error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it
- is 3.00 ULP:
- _ZGVnN4v_acoshf(0x1.01df3ap+0) got 0x1.ef0a82p-4
- want 0x1.ef0a7cp-4.
- With exceptions disabled, we can compute u with a shorter dependency chain,
- which gives maximum error of 3.22 ULP:
+/* Vector approximation for single-precision acosh, based on log1p.
+ The largest observed error is 3.22 ULP:
_ZGVnN4v_acoshf(0x1.007ef2p+0) got 0x1.fdcdccp-5
want 0x1.fdcdd2p-5. */
uint32x4_t ix = vreinterpretq_u32_f32 (x);
uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), Thresh);
-#if WANT_SIMD_EXCEPT
- /* Mask special lanes with 1 to side-step spurious invalid or overflow. Use
- only xm1 to calculate u, as operating on x will trigger invalid for NaN.
- Widening sign-extend special predicate in order to mask with it. */
- uint32x4_t p
- = vreinterpretq_u32_s32 (vmovl_s16 (vreinterpret_s16_u16 (special)));
- float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p);
- float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1);
-#else
float32x4_t xm1 = vsubq_f32 (x, vreinterpretq_f32_u32 (d->one));
float32x4_t u
= vmulq_f32 (xm1, vaddq_f32 (x, vreinterpretq_f32_u32 (d->one)));
-#endif
float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u));
.pi_over_2 = V2 (0x1.921fb54442d18p+0), .abs_mask = V2 (0x7fffffffffffffff),
};
-#define AllMask v_u64 (0xffffffffffffffff)
-#define One 0x3ff0000000000000
-#define Small 0x3e50000000000000 /* 2^-12. */
-
-#if WANT_SIMD_EXCEPT
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
-{
- return v_call_f64 (asin, x, y, special);
-}
-#endif
-
/* Double-precision implementation of vector asin(x).
-
- For |x| < Small, approximate asin(x) by x. Small = 2^-12 for correct
- rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the
- following approximation.
-
- For |x| in [Small, 0.5], use an order 11 polynomial P such that the final
+ For |x| in [0, 0.5], use an order 11 polynomial P such that the final
approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
The largest observed error in this region is 1.01 ulps,
float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
-
float64x2_t ax = vabsq_f64 (x);
-#if WANT_SIMD_EXCEPT
- /* Special values need to be computed with scalar fallbacks so
- that appropriate exceptions are raised. */
- uint64x2_t special
- = vcgtq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (ax), v_u64 (Small)),
- v_u64 (One - Small));
- if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (x, x, AllMask);
-#endif
-
uint64x2_t a_lt_half = vcaltq_f64 (x, v_f64 (0.5));
/* Evaluate polynomial Q(x) = y + y * z * P(z) with
};
#define AbsMask 0x7fffffff
-#define One 0x3f800000
-#define Small 0x39800000 /* 2^-12. */
-
-#if WANT_SIMD_EXCEPT
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
-{
- return v_call_f32 (asinf, x, y, special);
-}
-#endif
/* Single-precision implementation of vector asin(x).
-
- For |x| <0.5, use order 4 polynomial P such that the final
+ For |x| in [0, 0.5), use order 4 polynomial P such that the final
approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
The largest observed error in this region is 0.83 ulps,
uint32x4_t ix = vreinterpretq_u32_f32 (x);
uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask));
-
-#if WANT_SIMD_EXCEPT
- /* Special values need to be computed with scalar fallbacks so
- that appropriate fp exceptions are raised. */
- uint32x4_t special
- = vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small));
- if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (x, x, v_u32 (0xffffffff));
-#endif
-
float32x4_t ax = vreinterpretq_f32_u32 (ia);
uint32x4_t a_lt_half = vcaltq_f32 (x, v_f32 (0.5f));
const static struct data
{
uint64x2_t huge_bound, abs_mask, off, mask;
-#if WANT_SIMD_EXCEPT
- float64x2_t tiny_bound;
-#endif
float64x2_t lc0, lc2;
double lc1, lc3, ln2, lc4;
-
float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c17;
double c1, c3, c5, c7, c9, c11, c13, c15;
-
} data = {
-#if WANT_SIMD_EXCEPT
- .tiny_bound = V2 (0x1p-26),
-#endif
/* Even terms of polynomial s.t. asinh(x) is approximated by
asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...).
Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). */
-
- .c0 = V2 (-0x1.55555555554a7p-3),
- .c1 = 0x1.3333333326c7p-4,
- .c2 = V2 (-0x1.6db6db68332e6p-5),
- .c3 = 0x1.f1c71b26fb40dp-6,
- .c4 = V2 (-0x1.6e8b8b654a621p-6),
- .c5 = 0x1.1c4daa9e67871p-6,
- .c6 = V2 (-0x1.c9871d10885afp-7),
- .c7 = 0x1.7a16e8d9d2ecfp-7,
- .c8 = V2 (-0x1.3ddca533e9f54p-7),
- .c9 = 0x1.0becef748dafcp-7,
- .c10 = V2 (-0x1.b90c7099dd397p-8),
- .c11 = 0x1.541f2bb1ffe51p-8,
- .c12 = V2 (-0x1.d217026a669ecp-9),
- .c13 = 0x1.0b5c7977aaf7p-9,
- .c14 = V2 (-0x1.e0f37daef9127p-11),
- .c15 = 0x1.388b5fe542a6p-12,
- .c16 = V2 (-0x1.021a48685e287p-14),
- .c17 = V2 (0x1.93d4ba83d34dap-18),
-
- .lc0 = V2 (-0x1.ffffffffffff7p-2),
- .lc1 = 0x1.55555555170d4p-2,
- .lc2 = V2 (-0x1.0000000399c27p-2),
- .lc3 = 0x1.999b2e90e94cap-3,
- .lc4 = -0x1.554e550bd501ep-3,
- .ln2 = 0x1.62e42fefa39efp-1,
-
- .off = V2 (0x3fe6900900000000),
- .huge_bound = V2 (0x5fe0000000000000),
- .abs_mask = V2 (0x7fffffffffffffff),
- .mask = V2 (0xfffULL << 52),
+ .c0 = V2 (-0x1.55555555554a7p-3), .c1 = 0x1.3333333326c7p-4,
+ .c2 = V2 (-0x1.6db6db68332e6p-5), .c3 = 0x1.f1c71b26fb40dp-6,
+ .c4 = V2 (-0x1.6e8b8b654a621p-6), .c5 = 0x1.1c4daa9e67871p-6,
+ .c6 = V2 (-0x1.c9871d10885afp-7), .c7 = 0x1.7a16e8d9d2ecfp-7,
+ .c8 = V2 (-0x1.3ddca533e9f54p-7), .c9 = 0x1.0becef748dafcp-7,
+ .c10 = V2 (-0x1.b90c7099dd397p-8), .c11 = 0x1.541f2bb1ffe51p-8,
+ .c12 = V2 (-0x1.d217026a669ecp-9), .c13 = 0x1.0b5c7977aaf7p-9,
+ .c14 = V2 (-0x1.e0f37daef9127p-11), .c15 = 0x1.388b5fe542a6p-12,
+ .c16 = V2 (-0x1.021a48685e287p-14), .c17 = V2 (0x1.93d4ba83d34dap-18),
+ .lc0 = V2 (-0x1.ffffffffffff7p-2), .lc1 = 0x1.55555555170d4p-2,
+ .lc2 = V2 (-0x1.0000000399c27p-2), .lc3 = 0x1.999b2e90e94cap-3,
+ .lc4 = -0x1.554e550bd501ep-3, .ln2 = 0x1.62e42fefa39efp-1,
+ .off = V2 (0x3fe6900900000000), .huge_bound = V2 (0x5fe0000000000000),
+ .abs_mask = V2 (0x7fffffffffffffff), .mask = V2 (0xfffULL << 52),
};
static float64x2_t NOINLINE VPCS_ATTR
}
static inline float64x2_t
-log_inline (float64x2_t xm, const struct data *d)
+log_inline (float64x2_t ax, const struct data *d)
{
-
- uint64x2_t u = vreinterpretq_u64_f64 (xm);
+ uint64x2_t u = vreinterpretq_u64_f64 (ax);
uint64x2_t u_off = vsubq_u64 (u, d->off);
int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
- asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1
+ asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1)) if |x| >= 1
= sign(x) * (|x| + |x|^3 * P(x^2)) otherwise
where log(x) is an optimized log approximation, and P(x) is a polynomial
- shared with the scalar routine. The greatest observed error 2.79 ULP, in
- |x| >= 1:
- _ZGVnN2v_asinh(0x1.2cd9d73ea76a6p+0) got 0x1.ffffd003219dap-1
- want 0x1.ffffd003219ddp-1. */
+ shared with the scalar routine.
+ For |x| >= 1, the greatest observed error is 2.87 ULP.
+ _ZGVnN2v_asinh(-0x1.177c6017ce58ap+0) got -0x1.e3ba3d5cb1a46p-1
+ want -0x1.e3ba3d5cb1a49p-1. */
VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1));
-#if WANT_SIMD_EXCEPT
- uint64x2_t iax = vreinterpretq_u64_f64 (ax);
- uint64x2_t special = vcgeq_u64 (iax, (d->huge_bound));
- uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound);
- special = vorrq_u64 (special, tiny);
-#else
uint64x2_t special = vcgeq_f64 (ax, vreinterpretq_f64_u64 (d->huge_bound));
-#endif
/* Option 1: |x| >= 1.
- Compute asinh(x) according by asinh(x) = log(x + sqrt(x^2 + 1)).
- If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will
- overflow, by setting special lanes to 1. These will be fixed later. */
+ Compute asinh(x) according to asinh(x) = log(x + sqrt(x^2 + 1)). */
float64x2_t option_1 = v_f64 (0);
if (__glibc_likely (v_any_u64 (gt1)))
{
-#if WANT_SIMD_EXCEPT
- float64x2_t xm = v_zerofy_f64 (ax, special);
-#else
- float64x2_t xm = ax;
-#endif
option_1 = log_inline (
- vaddq_f64 (xm, vsqrtq_f64 (vfmaq_f64 (v_f64 (1), xm, xm))), d);
+ vaddq_f64 (ax, vsqrtq_f64 (vfmaq_f64 (v_f64 (1), ax, ax))), d);
}
/* Option 2: |x| < 1.
Compute asinh(x) using a polynomial.
- If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will
- overflow, and tiny lanes, which will underflow, by setting them to 0. They
- will be fixed later, either by selecting x or falling back to the scalar
- special-case. The largest observed error in this region is 1.47 ULPs:
- _ZGVnN2v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
- want 0x1.c1d6bf874019cp-1. */
+ The largest observed error in this region is 1.36 ULPs:
+ _ZGVnN2v_asinh(0x1.fe1e2aaa8dd54p-1) got 0x1.c1ee60bc0788ap-1
+ want 0x1.c1ee60bc0788bp-1. */
float64x2_t option_2 = v_f64 (0);
if (__glibc_likely (v_any_u64 (vceqzq_u64 (gt1))))
{
-
-#if WANT_SIMD_EXCEPT
- ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1));
-#endif
float64x2_t x2 = vmulq_f64 (ax, ax), z2 = vmulq_f64 (x2, x2);
/* Order-17 Pairwise Horner scheme. */
float64x2_t c13 = vld1q_f64 (&d->c1);
p = vfmaq_f64 (p1213, z2, p);
p = vfmaq_f64 (p1011, z2, p);
p = vfmaq_f64 (p89, z2, p);
-
p = vfmaq_f64 (p67, z2, p);
p = vfmaq_f64 (p45, z2, p);
-
p = vfmaq_f64 (p23, z2, p);
-
p = vfmaq_f64 (p01, z2, p);
option_2 = vfmaq_f64 (ax, p, vmulq_f64 (ax, x2));
-#if WANT_SIMD_EXCEPT
- option_2 = vbslq_f64 (tiny, x, option_2);
-#endif
}
/* Choose the right option for each lane. */
float64x2_t y = vbslq_f64 (gt1, option_1, option_2);
if (__glibc_unlikely (v_any_u64 (special)))
- {
- return special_case (x, y, d->abs_mask, special);
- }
+ return special_case (x, y, d->abs_mask, special);
+
/* Copy sign. */
return vbslq_f64 (d->abs_mask, y, x);
}
struct v_log1pf_data log1pf_consts;
float32x4_t one;
uint32x4_t big_bound;
-#if WANT_SIMD_EXCEPT
- uint32x4_t tiny_bound;
-#endif
} data = {
.one = V4 (1),
.log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
.big_bound = V4 (0x5f800000), /* asuint(0x1p64). */
-#if WANT_SIMD_EXCEPT
- .tiny_bound = V4 (0x30800000) /* asuint(0x1p-30). */
-#endif
};
static float32x4_t NOINLINE VPCS_ATTR
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
uint32x4_t special = vcgeq_u32 (iax, dat->big_bound);
uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax);
- float32x4_t special_arg = x;
-
-#if WANT_SIMD_EXCEPT
- /* Sidestep tiny and large values to avoid inadvertently triggering
- under/overflow. */
- special = vorrq_u32 (special, vcltq_u32 (iax, dat->tiny_bound));
- if (__glibc_unlikely (v_any_u32 (special)))
- {
- ax = v_zerofy_f32 (ax, special);
- x = v_zerofy_f32 (x, special);
- }
-#endif
/* asinh(x) = log(x + sqrt(x * x + 1)).
For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */
float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d));
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (special_arg, sign, y, special, dat);
+ return special_case (x, sign, y, special, dat);
return vreinterpretq_f32_u32 (veorq_u32 (
sign, vreinterpretq_u32_f32 (log1pf_inline (y, &dat->log1pf_consts))));
}
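/* Scalar sketch of the log1p reformulation above (asinhf_ref is a
   hypothetical reference, assuming x >= 0): sqrt (x*x + 1) - 1 equals
   x*x / (1 + sqrt (x*x + 1)), which avoids the cancellation in
   log (x + sqrt (x*x + 1)) for small x.  */
#include <math.h>

static float
asinhf_ref (float x)
{
  float s = sqrtf (x * x + 1.0f);
  return log1pf (x + x * x / (1.0f + s));
}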
uint64x2_t ix = vreinterpretq_u64_f64 (x);
uint64x2_t sign = vandq_u64 (ix, SignMask);
-#if WANT_SIMD_EXCEPT
- uint64x2_t ia12 = vandq_u64 (ix, v_u64 (0x7ff0000000000000));
- uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia12, v_u64 (TinyBound)),
- v_u64 (BigBound - TinyBound));
- /* If any lane is special, fall back to the scalar routine for all lanes. */
- if (__glibc_unlikely (v_any_u64 (special)))
- return v_call_f64 (atan, x, v_f64 (0), v_u64 (-1));
-#endif
-
/* Argument reduction:
y := arctan(x) for x < 1
y := pi/2 + arctan(-1/x) for x > 1
{
uint32x4_t sign_mask, pi_over_2;
float32x4_t neg_one;
-#if WANT_SIMD_EXCEPT
- float32x4_t poly[8];
-} data = {
- .poly = { V4 (-0x1.5554dcp-2), V4 (0x1.9978ecp-3), V4 (-0x1.230a94p-3),
- V4 (0x1.b4debp-4), V4 (-0x1.3550dap-4), V4 (0x1.61eebp-5),
- V4 (-0x1.0c17d4p-6), V4 (0x1.7ea694p-9) },
-#else
float32x4_t c0, c2, c4, c6;
float c1, c3, c5, c7;
} data = {
.c2 = V4 (-0x1.230a94p-3), .c3 = 0x1.b4debp-4,
.c4 = V4 (-0x1.3550dap-4), .c5 = 0x1.61eebp-5,
.c6 = V4 (-0x1.0c17d4p-6), .c7 = 0x1.7ea694p-9,
-#endif
- .pi_over_2 = V4 (0x3fc90fdb),
- .neg_one = V4 (-1.0f),
+ .pi_over_2 = V4 (0x3fc90fdb), .neg_one = V4 (-1.0f),
.sign_mask = V4 (0x80000000),
};
-#if WANT_SIMD_EXCEPT
-#define TinyBound 0x30800000 /* asuint(0x1p-30). */
-#define BigBound 0x4e800000 /* asuint(0x1p30). */
-
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
-{
- return v_call_f32 (atanf, x, y, special);
-}
-#endif
-
/* Fast implementation of vector atanf based on
atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1]
using z=-1/x and shift = pi/2. Maximum observed error is 2.02 ulps:
uint32x4_t ix = vreinterpretq_u32_f32 (x);
uint32x4_t sign = vandq_u32 (ix, d->sign_mask);
-#if WANT_SIMD_EXCEPT
- /* Small cases, infs and nans are supported by our approximation technique,
- but do not set fenv flags correctly. Only trigger special case if we need
- fenv. */
- uint32x4_t ia = vandq_u32 (ix, v_u32 (0x7ff00000));
- uint32x4_t special = vcgtq_u32 (vsubq_u32 (ia, v_u32 (TinyBound)),
- v_u32 (BigBound - TinyBound));
- /* If any lane is special, fall back to the scalar routine for all lanes. */
- if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (x, x, v_u32 (-1));
-#endif
/* Argument reduction:
y := arctan(x) for |x| < 1
y := arctan(-1/x) + pi/2 for x > +1
float32x4_t z2 = vmulq_f32 (z, z);
float32x4_t z3 = vmulq_f32 (z, z2);
float32x4_t z4 = vmulq_f32 (z2, z2);
-#if WANT_SIMD_EXCEPT
-
- /* Calculate the polynomial approximation.
- Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
- a standard implementation using z8 creates spurious underflow
- in the very last fma (when z^8 is small enough).
- Therefore, we split the last fma into a mul and an fma. */
- float32x4_t y = vfmaq_f32 (
- v_pairwise_poly_3_f32 (z2, z4, d->poly), z4,
- vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, d->poly + 4)));
-
-#else
float32x4_t z8 = vmulq_f32 (z4, z4);
/* Uses an Estrin scheme for polynomial approximation. */
float32x4_t p47 = vfmaq_f32 (p45, z4, p67);
float32x4_t y = vfmaq_f32 (p03, z8, p47);
-#endif
/* y = shift + z * P(z^2). */
return vfmaq_f32 (vaddq_f32 (shift, z), z3, y);
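/* Scalar sketch of the reduction above (hypothetical helper; atan_core
   stands for the polynomial approximation on [-1, 1]): for |x| > 1,
   atan(x) = copysign (pi/2, x) + atan (-1/x).  */
#include <math.h>

static float
atanf_reduced (float x, float (*atan_core) (float))
{
  if (fabsf (x) > 1.0f)
    return copysignf ((float) M_PI_2, x) + atan_core (-1.0f / x);
  return atan_core (x);
}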
uint64x2_t ia = vreinterpretq_u64_f64 (ax);
uint64x2_t special = vcgeq_u64 (ia, d->one);
-#if WANT_SIMD_EXCEPT
- ax = v_zerofy_f64 (ax, special);
-#endif
-
float64x2_t y;
y = vaddq_f64 (ax, ax);
y = vdivq_f64 (y, vsubq_f64 (vreinterpretq_f64_u64 (d->one), ax));
if (__glibc_unlikely (v_any_u64 (special)))
-#if WANT_SIMD_EXCEPT
- return special_case (x, halfsign, y, special, d);
-#else
return special_case (ax, halfsign, y, special, d);
-#endif
y = log1p_inline (y, &d->log1p_consts);
return vmulq_f64 (y, halfsign);
{
struct v_log1pf_data log1pf_consts;
uint32x4_t one;
-#if WANT_SIMD_EXCEPT
- uint32x4_t tiny_bound;
-#endif
} data = {
.log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
.one = V4 (0x3f800000),
-#if WANT_SIMD_EXCEPT
- /* 0x1p-12, below which atanhf(x) rounds to x. */
- .tiny_bound = V4 (0x39800000),
-#endif
};
#define AbsMask v_u32 (0x7fffffff)
float32x4_t ax = vabsq_f32 (x);
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
-#if WANT_SIMD_EXCEPT
- uint32x4_t special
- = vorrq_u32 (vcgeq_u32 (iax, d->one), vcltq_u32 (iax, d->tiny_bound));
- /* Side-step special cases by setting those lanes to 0, which will trigger no
- exceptions. These will be fixed up later. */
- if (__glibc_unlikely (v_any_u32 (special)))
- ax = v_zerofy_f32 (ax, special);
-#else
uint32x4_t special = vcgeq_u32 (iax, d->one);
-#endif
float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax),
vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax));
- chain. If exceptions are required ax will have been zerofied, so have to
- pass x. */
+ chain. */
if (__glibc_unlikely (v_any_u32 (special)))
-#if WANT_SIMD_EXCEPT
- return special_case (x, halfsign, y, special);
-#else
return special_case (ax, halfsign, y, special);
-#endif
return vmulq_f32 (halfsign, y);
}
libmvec_hidden_def (V_NAME_F1 (atanh))
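/* Scalar sketch of the computation above (atanhf_ref is a hypothetical
   reference, assuming |x| < 1): atanh(x) = 0.5 * log ((1 + x) / (1 - x))
   = 0.5 * log1p (2x / (1 - x)), with the sign handled separately since
   atanh is odd.  */
#include <math.h>

static float
atanhf_ref (float x)
{
  float ax = fabsf (x);
  return copysignf (0.5f * log1pf ((ax + ax) / (1.0f - ax)), x);
}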
float64x2_t n, r, r2, r3, r4, t1, t2, t3, y;
uint64x2_t odd, cmp;
-#if WANT_SIMD_EXCEPT
- r = vabsq_f64 (x);
- cmp = vcgeq_u64 (vreinterpretq_u64_f64 (r),
- vreinterpretq_u64_f64 (d->range_val));
- if (__glibc_unlikely (v_any_u64 (cmp)))
- /* If fenv exceptions are to be triggered correctly, set any special lanes
- to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
- special-case handler later. */
- r = vbslq_f64 (cmp, v_f64 (1.0), r);
-#else
cmp = vcageq_f64 (x, d->range_val);
- r = x;
-#endif
- /* n = rint((|x|+pi/2)/pi) - 0.5. */
- n = vrndaq_f64 (vfmaq_f64 (v_f64 (0.5), r, d->inv_pi));
+ /* n = rint((x+pi/2)/pi) - 0.5. */
+ n = vrndaq_f64 (vfmaq_f64 (v_f64 (0.5), x, d->inv_pi));
odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
n = vsubq_f64 (n, v_f64 (0.5f));
- /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
- r = vfmsq_f64 (r, d->pi_1, n);
+ /* r = x - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = vfmsq_f64 (x, d->pi_1, n);
r = vfmsq_f64 (r, d->pi_2, n);
r = vfmsq_f64 (r, d->pi_3, n);
float32x4_t n, r, r2, r3, y;
uint32x4_t odd, cmp;
-#if WANT_SIMD_EXCEPT
- r = vabsq_f32 (x);
- cmp = vcgeq_u32 (vreinterpretq_u32_f32 (r),
- vreinterpretq_u32_f32 (d->range_val));
- if (__glibc_unlikely (v_any_u32 (cmp)))
- /* If fenv exceptions are to be triggered correctly, set any special lanes
- to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
- special-case handler later. */
- r = vbslq_f32 (cmp, v_f32 (1.0f), r);
-#else
cmp = vcageq_f32 (x, d->range_val);
- r = x;
-#endif
- /* n = rint((|x|+pi/2)/pi) - 0.5. */
- n = vrndaq_f32 (vfmaq_f32 (v_f32 (0.5), r, d->inv_pi));
+ /* n = rint((x+pi/2)/pi) - 0.5. */
+ n = vrndaq_f32 (vfmaq_f32 (v_f32 (0.5), x, d->inv_pi));
odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
n = vsubq_f32 (n, v_f32 (0.5f));
- /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
- r = vfmsq_f32 (r, d->pi_1, n);
+ /* r = x - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = vfmsq_f32 (x, d->pi_1, n);
r = vfmsq_f32 (r, d->pi_2, n);
r = vfmsq_f32 (r, d->pi_3, n);
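/* Scalar sketch of the reduction above (reduce_cosf is hypothetical;
   pi_1 + pi_2 + pi_3 is assumed to be a three-part split of pi whose high
   part has enough trailing zero bits that n * pi_1 is exact under FMA).  */
#include <math.h>

static float
reduce_cosf (float x, float pi_1, float pi_2, float pi_3)
{
  /* Round ties away from zero, matching vrndaq/vcvtaq.  */
  float n = roundf (0.5f + x * (1.0f / (float) M_PI)) - 0.5f;
  float r = fmaf (-pi_1, n, x); /* r = x - n * pi_1.  */
  r = fmaf (-pi_2, n, r);
  r = fmaf (-pi_3, n, r);       /* r now in [-pi/2, pi/2].  */
  return r;
}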
static const struct data
{
struct v_expf_data expf_consts;
- uint32x4_t tiny_bound;
float32x4_t bound;
-#if WANT_SIMD_EXCEPT
- uint32x4_t special_bound;
-#endif
} data = {
.expf_consts = V_EXPF_DATA,
- .tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */
/* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
.bound = V4 (0x1.5a92d8p+6),
-#if WANT_SIMD_EXCEPT
- .special_bound = V4 (0x42ad496c),
-#endif
};
-#if !WANT_SIMD_EXCEPT
static float32x4_t NOINLINE VPCS_ATTR
special_case (float32x4_t x, float32x4_t half_t, float32x4_t half_over_t,
uint32x4_t special)
{
return v_call_f32 (coshf, x, vaddq_f32 (half_t, half_over_t), special);
}
-#endif
/* Single-precision vector cosh, using vector expf.
Maximum error is 2.38 ULP:
_ZGVnN4v_coshf (0x1.e8001ep+1) got 0x1.6a491ep+4
want 0x1.6a4922p+4. */
-float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cosh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
-#if WANT_SIMD_EXCEPT
- /* If fp exceptions are to be triggered correctly, fall back to the scalar
- variant for all inputs if any input is a special value or above the bound
- at which expf overflows. */
- float32x4_t ax = vabsq_f32 (x);
- uint32x4_t iax = vreinterpretq_u32_f32 (ax);
- uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
- if (__glibc_unlikely (v_any_u32 (special)))
- return v_call_f32 (coshf, x, x, v_u32 (-1));
-
- uint32x4_t tiny = vcleq_u32 (iax, d->tiny_bound);
- /* If any input is tiny, avoid underflow exception by fixing tiny lanes of
- input to 0, which will generate no exceptions. */
- if (__glibc_unlikely (v_any_u32 (tiny)))
- ax = v_zerofy_f32 (ax, tiny);
- float32x4_t t = v_expf_inline (ax, &d->expf_consts);
-#else
uint32x4_t special = vcageq_f32 (x, d->bound);
float32x4_t t = v_expf_inline (x, &d->expf_consts);
-#endif
/* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
float32x4_t half_t = vmulq_n_f32 (t, 0.5);
float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t);
-#if WANT_SIMD_EXCEPT
- if (__glibc_unlikely (v_any_u32 (tiny)))
- return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t));
-#else
if (__glibc_unlikely (v_any_u32 (special)))
return special_case (x, half_t, half_over_t, special);
-#endif
return vaddq_f32 (half_t, half_over_t);
}
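/* The expression above in scalar form (coshf_ref is a hypothetical
   reference): with t = exp(x), cosh(x) = t/2 + (1/2)/t.  */
#include <math.h>

static float
coshf_ref (float x)
{
  float t = expf (x);
  return 0.5f * t + 0.5f / t;
}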
{
const struct data *d = ptr_barrier (&data);
-#if WANT_SIMD_EXCEPT
- float64x2_t r = vabsq_f64 (x);
- uint64x2_t cmp = vcaleq_f64 (v_f64 (0x1p64), x);
-
- /* When WANT_SIMD_EXCEPT = 1, special lanes should be zero'd
- to avoid them overflowing and throwing exceptions. */
- r = v_zerofy_f64 (r, cmp);
- uint64x2_t odd = vshlq_n_u64 (vcvtnq_u64_f64 (r), 63);
-
-#else
- float64x2_t r = x;
- uint64x2_t cmp = vcageq_f64 (r, d->range_val);
+ uint64x2_t cmp = vcageq_f64 (x, d->range_val);
uint64x2_t odd
- = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (r)), 63);
-
-#endif
+ = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (x)), 63);
- r = vsubq_f64 (r, vrndaq_f64 (r));
+ float64x2_t r = vsubq_f64 (x, vrndaq_f64 (x));
/* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */
r = vsubq_f64 (v_f64 (0.5), vabsq_f64 (r));
Maximum Error: 3.17 ULP:
_ZGVnN4v_cospif(0x1.d341a8p-5) got 0x1.f7cd56p-1
want 0x1.f7cd5p-1. */
-float32x4_t VPCS_ATTR V_NAME_F1 (cospi) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cospi) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
-#if WANT_SIMD_EXCEPT
- float32x4_t r = vabsq_f32 (x);
- uint32x4_t cmp = vcaleq_f32 (v_f32 (0x1p32f), x);
-
- /* When WANT_SIMD_EXCEPT = 1, special lanes should be zero'd
- to avoid them overflowing and throwing exceptions. */
- r = v_zerofy_f32 (r, cmp);
- uint32x4_t odd = vshlq_n_u32 (vcvtnq_u32_f32 (r), 31);
-
-#else
- float32x4_t r = x;
- uint32x4_t cmp = vcageq_f32 (r, d->range_val);
-
+ uint32x4_t cmp = vcageq_f32 (x, d->range_val);
uint32x4_t odd
- = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (r)), 31);
-
-#endif
+ = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 31);
/* r = x - rint(x). */
- r = vsubq_f32 (r, vrndaq_f32 (r));
+ float32x4_t r = vsubq_f32 (x, vrndaq_f32 (x));
/* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */
r = vsubq_f32 (v_f32 (0.5f), vabsq_f32 (r));
/* Reintroduce the sign bit for inputs which round to odd. */
return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
}
-
libmvec_hidden_def (V_NAME_F1 (cospi))
HALF_WIDTH_ALIAS_F1 (cospi)
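/* Scalar sketch of the reduction above (hypothetical helper; sinpi_core
   stands for the polynomial for sin(pi * r) on [-1/2, 1/2], and |x| is
   assumed small enough for the integer cast to be exact).  */
#include <math.h>

static float
cospif_reduced (float x, float (*sinpi_core) (float))
{
  float n = roundf (x);            /* Ties away, matching vcvtaq.  */
  float r = 0.5f - fabsf (x - n);  /* cospi(x) = sinpi(0.5 - |x - n|).  */
  float y = sinpi_core (r);
  return ((long long) n & 1) ? -y : y; /* Odd n flips the sign.  */
}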
{
float64x2_t third;
float64x2_t tenth, two_over_five, two_over_nine;
- double two_over_fifteen, two_over_fortyfive;
+ double two_over_fifteen;
float64x2_t max, shift;
uint64x2_t max_idx;
-#if WANT_SIMD_EXCEPT
- float64x2_t tiny_bound, huge_bound, scale_minus_one;
-#endif
} data = {
.max_idx = V2 (768),
.third = V2 (0x1.5555555555556p-2), /* used to compute 2/3 and 1/6 too. */
.tenth = V2 (-0x1.999999999999ap-4),
.two_over_five = V2 (-0x1.999999999999ap-2),
.two_over_nine = V2 (-0x1.c71c71c71c71cp-3),
- .two_over_fortyfive = 0x1.6c16c16c16c17p-5,
.max = V2 (5.9921875), /* 6 - 1/128. */
.shift = V2 (0x1p45),
-#if WANT_SIMD_EXCEPT
- .huge_bound = V2 (0x1p205),
- .tiny_bound = V2 (0x1p-226),
- .scale_minus_one = V2 (0x1.06eba8214db69p-3), /* 2/sqrt(pi) - 1.0. */
-#endif
};
#define AbsMask 0x7fffffffffffffff
uint64x2_t a_le_max = vcaleq_f64 (x, dat->max);
uint64x2_t a_gt_max = vcagtq_f64 (x, dat->max);
-#if WANT_SIMD_EXCEPT
- /* |x| huge or tiny. */
- uint64x2_t cmp1 = vcgtq_f64 (a, dat->huge_bound);
- uint64x2_t cmp2 = vcltq_f64 (a, dat->tiny_bound);
- uint64x2_t cmp = vorrq_u64 (cmp1, cmp2);
- /* If any lanes are special, mask them with 1 for small x or 8 for large
- values and retain a copy of a to allow special case handler to fix special
- lanes later. This is only necessary if fenv exceptions are to be triggered
- correctly. */
- if (__glibc_unlikely (v_any_u64 (cmp)))
- {
- a = vbslq_f64 (cmp1, v_f64 (8.0), a);
- a = vbslq_f64 (cmp2, v_f64 (1.0), a);
- }
-#endif
-
/* Set r to multiple of 1/128 nearest to |x|. */
float64x2_t shift = dat->shift;
float64x2_t z = vaddq_f64 (a, shift);
y = vbslq_f64 (a_gt_max, v_f64 (1.0), y);
/* Copy sign. */
- y = vbslq_f64 (v_u64 (AbsMask), y, x);
-
-#if WANT_SIMD_EXCEPT
- if (__glibc_unlikely (v_any_u64 (cmp2)))
- {
- /* Neutralise huge values of x before fixing small values. */
- x = vbslq_f64 (cmp1, v_f64 (1.0), x);
- /* Fix tiny values that trigger spurious underflow. */
- return vbslq_f64 (cmp2, vfmaq_f64 (x, dat->scale_minus_one, x), y);
- }
-#endif
- return y;
+ return vbslq_f64 (v_u64 (AbsMask), y, x);
}
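/* The rounding trick above, in scalar form (round_to_128th is a
   hypothetical helper; assumes the default round-to-nearest mode and
   0 <= a < 6): adding 0x1p45 forces a double to round to a multiple of
   1/128, since ulp(0x1p45) = 2^-7, and the low bits of the sum hold the
   table index round (128 * a).  */
#include <stdint.h>
#include <string.h>

static double
round_to_128th (double a, uint64_t *i)
{
  double z = a + 0x1p45;
  uint64_t bits;
  memcpy (&bits, &z, sizeof bits); /* Bit-cast, as asuint64.  */
  *i = bits & 0x3ff;               /* round (128 * a), at most 768 here.  */
  return z - 0x1p45;               /* Nearest multiple of 1/128.  */
}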
float64x2_t p20, p40, p41, p51;
double p42, p52;
double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2];
-#if WANT_SIMD_EXCEPT
- float64x2_t uflow_bound;
-#endif
} data = {
/* Set an offset so the range of the index used for lookup is 3487, and it
can be clamped using a saturated add on an offset index.
.qr7 = { 0x1.2492492492492p0, -0x1.8e38e38e38e39p-3 },
.qr8 = { 0x1.2p0, -0x1.6c16c16c16c17p-3 },
.qr9 = { 0x1.1c71c71c71c72p0, -0x1.4f2094f2094f2p-3 },
-#if WANT_SIMD_EXCEPT
- .uflow_bound = V2 (0x1.a8b12fc6e4892p+4),
-#endif
};
-#define TinyBound 0x4000000000000000 /* 0x1p-511 << 1. */
#define Off 0xfffffffffffff260 /* 0xffffffffffffffff - 3487. */
struct entry
return e;
}
-#if WANT_SIMD_EXCEPT
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
-{
- return v_call_f64 (erfc, x, y, cmp);
-}
-#endif
-
/* Optimized double-precision vector erfc(x).
Approximation based on series expansion near x rounded to
nearest multiple of 1/128.
{
const struct data *dat = ptr_barrier (&data);
-#if WANT_SIMD_EXCEPT
- /* |x| < 2^-511. Avoid fabs by left-shifting by 1. */
- uint64x2_t ix = vreinterpretq_u64_f64 (x);
- uint64x2_t cmp = vcltq_u64 (vaddq_u64 (ix, ix), v_u64 (TinyBound));
- /* x >= ~26.54 (into subnormal case and uflow case). Comparison is done in
- integer domain to avoid raising exceptions in presence of nans. */
- uint64x2_t uflow = vcgeq_s64 (vreinterpretq_s64_f64 (x),
- vreinterpretq_s64_f64 (dat->uflow_bound));
- cmp = vorrq_u64 (cmp, uflow);
- float64x2_t xm = x;
- /* If any lanes are special, mask them with 0 and retain a copy of x to allow
- special case handler to fix special lanes later. This is only necessary if
- fenv exceptions are to be triggered correctly. */
- if (__glibc_unlikely (v_any_u64 (cmp)))
- x = v_zerofy_f64 (x, cmp);
-#endif
-
float64x2_t a = vabsq_f64 (x);
a = vminq_f64 (a, dat->max);
float64x2_t fac = vreinterpretq_f64_u64 (
vsraq_n_u64 (vshlq_n_u64 (sign, 63), dat->table_scale, 1));
-#if WANT_SIMD_EXCEPT
- if (__glibc_unlikely (v_any_u64 (cmp)))
- return special_case (xm, vfmaq_f64 (off, fac, y), cmp);
-#endif
-
return vfmaq_f64 (off, fac, y);
}
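/* First-order shape of the expansion above (erfc_near is a hypothetical
   illustration; the real routine keeps higher-order terms via the qr*
   coefficients): with r the nearest multiple of 1/128 and d = x - r,
   erfc(x) ~ erfc(r) - d * 2/sqrt(pi) * exp(-r^2) + O(d^2).  */
#include <math.h>

static double
erfc_near (double erfc_r, double r, double d)
{
  double scale = 2.0 / sqrt (M_PI) * exp (-r * r);
  return erfc_r - d * scale;
}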
float32x4_t max, shift;
float coeffs[4];
float32x4_t third, two_over_five, tenth;
-#if WANT_SIMD_EXCEPT
- float32x4_t uflow_bound;
-#endif
-
} data = {
/* Set an offset so the range of the index used for lookup is 644, and it can
be clamped using a saturated add. */
.third = V4 (0x1.555556p-2f),
.two_over_five = V4 (-0x1.99999ap-2f),
.tenth = V4 (-0x1.99999ap-4f),
-#if WANT_SIMD_EXCEPT
- .uflow_bound = V4 (0x1.2639cp+3f),
-#endif
};
-#define TinyBound 0x41000000 /* 0x1p-62f << 1. */
-#define Thres 0xbe000000 /* asuint(infinity) << 1 - TinyBound. */
#define Off 0xfffffd7b /* 0xffffffff - 644. */
struct entry
return e;
}
-#if WANT_SIMD_EXCEPT
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
-{
- return v_call_f32 (erfcf, x, y, cmp);
-}
-#endif
-
/* Optimized single-precision vector erfcf(x).
Approximation based on series expansion near x rounded to
nearest multiple of 1/64.
{
const struct data *dat = ptr_barrier (&data);
-#if WANT_SIMD_EXCEPT
- /* |x| < 2^-62. Avoid fabs by left-shifting by 1. */
- uint32x4_t ix = vreinterpretq_u32_f32 (x);
- uint32x4_t cmp = vcltq_u32 (vaddq_u32 (ix, ix), v_u32 (TinyBound));
- /* x >= ~9.19 (into subnormal case and uflow case). Comparison is done in
- integer domain to avoid raising exceptions in presence of nans. */
- uint32x4_t uflow = vcgeq_s32 (vreinterpretq_s32_f32 (x),
- vreinterpretq_s32_f32 (dat->uflow_bound));
- cmp = vorrq_u32 (cmp, uflow);
- float32x4_t xm = x;
- /* If any lanes are special, mask them with 0 and retain a copy of x to allow
- special case handler to fix special lanes later. This is only necessary if
- fenv exceptions are to be triggered correctly. */
- if (__glibc_unlikely (v_any_u32 (cmp)))
- x = v_zerofy_f32 (x, cmp);
-#endif
-
float32x4_t a = vabsq_f32 (x);
a = vminq_f32 (a, dat->max);
float32x4_t fac = vreinterpretq_f32_u32 (
vsraq_n_u32 (vshlq_n_u32 (sign, 31), dat->table_scale, 1));
-#if WANT_SIMD_EXCEPT
- if (__glibc_unlikely (v_any_u32 (cmp)))
- return special_case (xm, vfmaq_f32 (off, fac, y), cmp);
-#endif
-
return vfmaq_f32 (off, fac, y);
}
libmvec_hidden_def (V_NAME_F1 (erfc))
static const struct data
{
float32x4_t max, shift, third;
-#if WANT_SIMD_EXCEPT
- float32x4_t tiny_bound, scale_minus_one;
-#endif
} data = {
.max = V4 (3.9375), /* 4 - 8/128. */
.shift = V4 (0x1p16f),
.third = V4 (0x1.555556p-2f), /* 1/3. */
-#if WANT_SIMD_EXCEPT
- .tiny_bound = V4 (0x1p-62f),
- .scale_minus_one = V4 (0x1.06eba8p-3f), /* scale - 1.0. */
-#endif
};
#define AbsMask 0x7fffffff
{
const struct data *dat = ptr_barrier (&data);
-#if WANT_SIMD_EXCEPT
- /* |x| < 2^-62. */
- uint32x4_t cmp = vcaltq_f32 (x, dat->tiny_bound);
- float32x4_t xm = x;
- /* If any lanes are special, mask them with 1 and retain a copy of x to allow
- special case handler to fix special lanes later. This is only necessary if
- fenv exceptions are to be triggered correctly. */
- if (__glibc_unlikely (v_any_u32 (cmp)))
- x = vbslq_f32 (cmp, v_f32 (1), x);
-#endif
-
float32x4_t a = vabsq_f32 (x);
uint32x4_t a_gt_max = vcgtq_f32 (a, dat->max);
y = vbslq_f32 (a_gt_max, v_f32 (1.0f), y);
/* Copy sign. */
- y = vbslq_f32 (v_u32 (AbsMask), y, x);
-
-#if WANT_SIMD_EXCEPT
- if (__glibc_unlikely (v_any_u32 (cmp)))
- return vbslq_f32 (cmp, vfmaq_f32 (xm, dat->scale_minus_one, xm), y);
-#endif
- return y;
+ return vbslq_f32 (v_u32 (AbsMask), y, x);
}
libmvec_hidden_def (V_NAME_F1 (erf))
HALF_WIDTH_ALIAS_F1 (erf)
{
float64x2_t poly[4];
float64x2_t log10_2, log2_10_hi, log2_10_lo, shift;
-#if !WANT_SIMD_EXCEPT
float64x2_t special_bound, scale_thresh;
-#endif
} data = {
/* Coefficients generated using Remez algorithm.
rel error: 0x1.5ddf8f28p-54
.log2_10_hi = V2 (0x1.34413509f79ffp-9), /* log2(10)/N. */
.log2_10_lo = V2 (-0x1.9dc1da994fd21p-66),
.shift = V2 (0x1.8p+52),
-#if !WANT_SIMD_EXCEPT
.scale_thresh = V2 (ScaleBound),
.special_bound = V2 (SpecialBound),
-#endif
};
#define N (1 << V_EXP_TABLE_BITS)
#define IndexMask v_u64 (N - 1)
-#if WANT_SIMD_EXCEPT
-
-# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */
-# define BigBound v_u64 (0x4070000000000000) /* asuint64 (0x1p8). */
-# define Thres v_u64 (0x2070000000000000) /* BigBound - TinyBound. */
-
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
-{
- /* If fenv exceptions are to be triggered correctly, fall back to the scalar
- routine for special lanes. */
- return v_call_f64 (exp10, x, y, cmp);
-}
-
-#else
-
# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
return vbslq_f64 (cmp, r1, r0);
}
-#endif
-
/* Fast vector implementation of exp10.
Maximum measured error is 1.64 ulp.
_ZGVnN2v_exp10(0x1.ccd1c9d82cc8cp+0) got 0x1.f8dab6d7fed0cp+5
float64x2_t VPCS_ATTR V_NAME_D1 (exp10) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
- uint64x2_t cmp;
-#if WANT_SIMD_EXCEPT
- /* If any lanes are special, mask them with 1 and retain a copy of x to allow
- special_case to fix special lanes later. This is only necessary if fenv
- exceptions are to be triggered correctly. */
- float64x2_t xm = x;
- uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
- cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), Thres);
- if (__glibc_unlikely (v_any_u64 (cmp)))
- x = vbslq_f64 (cmp, v_f64 (1), x);
-#else
- cmp = vcageq_f64 (x, d->special_bound);
-#endif
+ uint64x2_t cmp = vcageq_f64 (x, d->special_bound);
/* n = round(x/(log10(2)/N)). */
float64x2_t z = vfmaq_f64 (d->shift, x, d->log10_2);
float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
if (__glibc_unlikely (v_any_u64 (cmp)))
-#if WANT_SIMD_EXCEPT
- return special_case (xm, vfmaq_f64 (s, y, s), cmp);
-#else
return special_case (s, y, n, d);
-#endif
return vfmaq_f64 (s, y, s);
}
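/* Scalar sketch of the decomposition above (exp10_shape is a hypothetical
   illustration; N stands for the table size): write x = n * log10(2)/N + r,
   so 10^x = 2^(n/N) * 10^r with 10^r close to 1; the vector code reads
   2^(n/N) from a table instead of calling exp2.  */
#include <math.h>

static double
exp10_shape (double x, int N)
{
  double per = log10 (2.0) / N;
  double n = round (x / per);
  double r = x - n * per;          /* |r| <= log10(2) / (2N).  */
  return exp2 (n / N) * pow (10.0, r);
}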
float log10_2_high, log10_2_low, c2, c4;
float32x4_t inv_log10_2, special_bound;
uint32x4_t exponent_bias, special_offset, special_bias;
-#if !WANT_SIMD_EXCEPT
float32x4_t scale_thresh;
-#endif
} data = {
/* Coefficients generated using Remez algorithm with minimisation of relative
error.
.exponent_bias = V4 (0x3f800000),
.special_offset = V4 (0x82000000),
.special_bias = V4 (0x7f000000),
-#if !WANT_SIMD_EXCEPT
.scale_thresh = V4 (ScaleBound)
-#endif
};
-#if WANT_SIMD_EXCEPT
-
-# define SpecialBound 38.0f /* rint(log10(2^127)). */
-# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
-# define BigBound v_u32 (0x42180000) /* asuint (SpecialBound). */
-# define Thres v_u32 (0x22180000) /* BigBound - TinyBound. */
-
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
-{
- /* If fenv exceptions are to be triggered correctly, fall back to the scalar
- routine to special lanes. */
- return v_call_f32 (exp10f, x, y, cmp);
-}
-
-#else
-
# define SpecialBound 126.0f
static float32x4_t VPCS_ATTR NOINLINE
return vbslq_f32 (cmp2, r2, r);
}
-#endif
-
/* Fast vector implementation of single-precision exp10.
Algorithm is accurate to 2.36 ULP.
_ZGVnN4v_exp10f(0x1.be2b36p+1) got 0x1.7e79c4p+11
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
-#if WANT_SIMD_EXCEPT
- /* asuint(x) - TinyBound >= BigBound - TinyBound. */
- uint32x4_t cmp = vcgeq_u32 (
- vsubq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (x)), TinyBound), Thres);
- float32x4_t xm = x;
- /* If any lanes are special, mask them with 1 and retain a copy of x to allow
- special case handler to fix special lanes later. This is only necessary if
- fenv exceptions are to be triggered correctly. */
- if (__glibc_unlikely (v_any_u32 (cmp)))
- x = v_zerofy_f32 (x, cmp);
-#endif
/* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
- with poly(r) in [1/sqrt(2), sqrt(2)] and
+ with 1 + poly(r) in [1/sqrt(2), sqrt(2)] and
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
-#if !WANT_SIMD_EXCEPT
uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
-#endif
float32x4_t r2 = vmulq_f32 (r, r);
float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10_2_c24, 2);
float32x4_t poly = vfmaq_f32 (vmulq_f32 (r, d->c0), p14, r2);
if (__glibc_unlikely (v_any_u32 (cmp)))
-#if WANT_SIMD_EXCEPT
- return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
-#else
return special_case (poly, n, e, cmp, scale, d);
-#endif
-
return vfmaq_f32 (scale, poly, scale);
}
libmvec_hidden_def (V_NAME_F1 (exp10))
__v_exp_data[i[1] & IndexMask] };
}
-#if WANT_SIMD_EXCEPT
-
-# define Thres 0x2080000000000000 /* asuint64(512.0) - TinyBound. */
-
-/* Call scalar exp2 as a fallback. */
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t is_special)
-{
- return v_call_f64 (exp2, x, y, is_special);
-}
-
-#else
-
# define SpecialOffset 0x6000000000000000 /* 0x1p513. */
/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
# define SpecialBias1 0x7000000000000000 /* 0x1p769. */
return vbslq_f64 (cmp, r1, r0);
}
-#endif
-
/* Fast vector implementation of exp2.
Maximum measured error is 1.65 ulp.
_ZGVnN2v_exp2(-0x1.4c264ab5b559bp-6) got 0x1.f8db0d4df721fp-1
float64x2_t V_NAME_D1 (exp2) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
- uint64x2_t cmp;
-#if WANT_SIMD_EXCEPT
- uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
- cmp = vcgeq_u64 (vsubq_u64 (ia, v_u64 (TinyBound)), v_u64 (Thres));
- /* Mask special lanes and retain a copy of x for passing to special-case
- handler. */
- float64x2_t xc = x;
- x = v_zerofy_f64 (x, cmp);
-#else
- cmp = vcagtq_f64 (x, d->scale_big_bound);
-#endif
+ uint64x2_t cmp = vcagtq_f64 (x, d->scale_big_bound);
/* n = round(x/N). */
float64x2_t z = vaddq_f64 (d->shift, x);
y = vmulq_f64 (r, y);
if (__glibc_unlikely (v_any_u64 (cmp)))
-#if !WANT_SIMD_EXCEPT
return special_case (s, y, n, d);
-#else
- return special_case (xc, vfmaq_f64 (s, s, y), cmp);
-#endif
return vfmaq_f64 (s, s, y);
}
{
float32x4_t c1, c3;
uint32x4_t exponent_bias, special_offset, special_bias;
-#if !WANT_SIMD_EXCEPT
float32x4_t scale_thresh, special_bound;
-#endif
float c0, c2, c4, zero;
} data = {
/* maxerr: 1.962 ulp. */
.exponent_bias = V4 (0x3f800000),
.special_offset = V4 (0x82000000),
.special_bias = V4 (0x7f000000),
-#if !WANT_SIMD_EXCEPT
.special_bound = V4 (126.0f),
.scale_thresh = V4 (192.0f),
-#endif
};
-#if WANT_SIMD_EXCEPT
-
-# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
-# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */
-# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */
-
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
-{
- /* If fenv exceptions are to be triggered correctly, fall back to the scalar
- routine for special lanes. */
- return v_call_f32 (exp2f, x, y, cmp);
-}
-
-#else
-
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t scale, const struct data *d)
return vbslq_f32 (cmp2, r2, r);
}
-#endif
-
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
-#if WANT_SIMD_EXCEPT
- /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */
- uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
- uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
- float32x4_t xm = x;
- /* If any lanes are special, mask them with 1 and retain a copy of x to allow
- special_case to fix special lanes later. This is only necessary if fenv
- exceptions are to be triggered correctly. */
- if (__glibc_unlikely (v_any_u32 (cmp)))
- x = vbslq_f32 (cmp, v_f32 (1), x);
-#endif
-
/* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = n + r, with r in [-1/2, 1/2]. */
float32x4_t n = vrndaq_f32 (x);
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
-#if !WANT_SIMD_EXCEPT
uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
-#endif
float32x4_t c024 = vld1q_f32 (&d->c0);
float32x4_t r2 = vmulq_f32 (r, r);
float32x4_t poly = vfmaq_f32 (p, q, r2);
if (__glibc_unlikely (v_any_u32 (cmp)))
-#if WANT_SIMD_EXCEPT
- return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
-#else
return special_case (poly, n, e, cmp, scale, d);
-#endif
return vfmaq_f32 (scale, poly, scale);
}
{
float64x2_t poly[3];
float64x2_t inv_ln2, ln2_hi, ln2_lo, shift;
-#if !WANT_SIMD_EXCEPT
float64x2_t special_bound, scale_thresh;
-#endif
} data = {
/* maxerr: 1.88 +0.5 ulp
rel error: 1.4337*2^-53
abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */
.poly = { V2 (0x1.ffffffffffd43p-2), V2 (0x1.55555c75adbb2p-3),
V2 (0x1.55555da646206p-5) },
-#if !WANT_SIMD_EXCEPT
.scale_thresh = V2 (163840.0), /* 1280.0 * N. */
.special_bound = V2 (704.0),
-#endif
.inv_ln2 = V2 (0x1.71547652b82fep7), /* N/ln2. */
.ln2_hi = V2 (0x1.62e42fefa39efp-8), /* ln2/N. */
.ln2_lo = V2 (0x1.abc9e3b39803f3p-63),
#define C(i) data.poly[i]
#define Tab __v_exp_data
-
-#if WANT_SIMD_EXCEPT
-
-# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */
-# define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9). */
-# define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound. */
-
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
-{
- /* If fenv exceptions are to be triggered correctly, fall back to the scalar
- routine to special lanes. */
- return v_call_f64 (exp, x, y, cmp);
-}
-
-#else
-
# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
return vbslq_f64 (cmp, r1, r0);
}
-#endif
-
float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x)
{
float64x2_t n, r, r2, s, y, z;
uint64x2_t cmp, u, e;
-#if WANT_SIMD_EXCEPT
- /* If any lanes are special, mask them with 1 and retain a copy of x to allow
- special_case to fix special lanes later. This is only necessary if fenv
- exceptions are to be triggered correctly. */
- float64x2_t xm = x;
- uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
- cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), SpecialBound);
- if (__glibc_unlikely (v_any_u64 (cmp)))
- x = vbslq_f64 (cmp, v_f64 (1), x);
-#else
cmp = vcagtq_f64 (x, data.special_bound);
-#endif
/* n = round(x/(ln2/N)). */
z = vfmaq_f64 (data.shift, x, data.inv_ln2);
n = vsubq_f64 (z, data.shift);
/* r = x - n*ln2/N. */
- r = x;
- r = vfmsq_f64 (r, data.ln2_hi, n);
+ r = vfmsq_f64 (x, data.ln2_hi, n);
r = vfmsq_f64 (r, data.ln2_lo, n);
e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
if (__glibc_unlikely (v_any_u64 (cmp)))
-#if WANT_SIMD_EXCEPT
- return special_case (xm, vfmaq_f64 (s, y, s), cmp);
-#else
return special_case (s, y, n);
-#endif
return vfmaq_f64 (s, y, s);
}
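/* Scalar sketch of the Cody-Waite step above (exp_reduce is a hypothetical
   helper): ln2/N is stored as a high part ln2_hi, exact when multiplied by
   small integral n, plus a low correction ln2_lo, so the reduced argument
   keeps extra precision.  */
#include <math.h>

static double
exp_reduce (double x, double n, double ln2_hi, double ln2_lo)
{
  double r = fma (-ln2_hi, n, x); /* As vfmsq: x - n * ln2_hi.  */
  return fma (-ln2_lo, n, r);     /* |r| <= ln2 / (2N) plus rounding.  */
}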
float32x4_t c1, c3, c4, inv_ln2;
float ln2_hi, ln2_lo, c0, c2;
uint32x4_t exponent_bias, special_offset, special_bias;
-#if !WANT_SIMD_EXCEPT
float32x4_t special_bound, scale_thresh;
-#endif
} data = {
/* maxerr: 1.45358 +0.5 ulp. */
.c0 = 0x1.0e4020p-7f,
.exponent_bias = V4 (0x3f800000),
.special_offset = V4 (0x82000000),
.special_bias = V4 (0x7f000000),
-#if !WANT_SIMD_EXCEPT
.special_bound = V4 (126.0f),
.scale_thresh = V4 (192.0f),
-#endif
};
#define C(i) d->poly[i]
-#if WANT_SIMD_EXCEPT
-
-# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
-# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */
-# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */
-
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
-{
- /* If fenv exceptions are to be triggered correctly, fall back to the scalar
- routine to special lanes. */
- return v_call_f32 (expf, x, y, cmp);
-}
-
-#else
-
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t scale, const struct data *d)
return vbslq_f32 (cmp2, r2, r);
}
-#endif
-
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi);
-#if WANT_SIMD_EXCEPT
- /* asuint(x) - TinyBound >= BigBound - TinyBound. */
- uint32x4_t cmp = vcgeq_u32 (
- vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
- TinyBound),
- SpecialBound);
- float32x4_t xm = x;
- /* If any lanes are special, mask them with 1 and retain a copy of x to allow
- special case handler to fix special lanes later. This is only necessary if
- fenv exceptions are to be triggered correctly. */
- if (__glibc_unlikely (v_any_u32 (cmp)))
- x = vbslq_f32 (cmp, v_f32 (1), x);
-#endif
-
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2));
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
-#if !WANT_SIMD_EXCEPT
uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
-#endif
float32x4_t r2 = vmulq_f32 (r, r);
float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
float32x4_t poly = vfmaq_f32 (p, q, r2);
if (__glibc_unlikely (v_any_u32 (cmp)))
-#if WANT_SIMD_EXCEPT
- return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
-#else
return special_case (poly, n, e, cmp, scale, d);
-#endif
return vfmaq_f32 (scale, poly, scale);
}
static const struct data
{
struct v_expm1_data d;
-#if WANT_SIMD_EXCEPT
- uint64x2_t thresh, tiny_bound;
-#else
float64x2_t oflow_bound;
-#endif
} data = {
.d = V_EXPM1_DATA,
-#if WANT_SIMD_EXCEPT
- /* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs
- compare. */
- .thresh = V2 (0x78c56fa6d34b552),
- /* asuint64(0x1p-51) << 1. */
- .tiny_bound = V2 (0x3cc0000000000000 << 1),
-#else
/* Value above which expm1(x) should overflow. Absolute value of the
underflow bound is greater than this, so it catches both cases - there is
a small window where fallbacks are triggered unnecessarily. */
.oflow_bound = V2 (0x1.62b7d369a5aa9p+9),
-#endif
};
static float64x2_t VPCS_ATTR NOINLINE
special_case (float64x2_t x, uint64x2_t special, const struct data *d)
{
- return v_call_f64 (expm1, x, expm1_inline (v_zerofy_f64 (x, special), &d->d),
- special);
+ return v_call_f64 (expm1, x, expm1_inline (x, &d->d), special);
}
/* Double-precision vector exp(x) - 1 function.
The maximum error observed error is 2.05 ULP:
- _ZGVnN2v_expm1(0x1.634902eaff3adp-2) got 0x1.a8b636e2a9388p-2
- want 0x1.a8b636e2a9386p-2. */
+ _ZGVnN2v_expm1(0x1.6329669eb8c87p-2) got 0x1.a8897eef87b34p-2
+ want 0x1.a8897eef87b32p-2. */
float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
-#if WANT_SIMD_EXCEPT
- uint64x2_t ix = vreinterpretq_u64_f64 (x);
- /* If fp exceptions are to be triggered correctly, fall back to scalar for
- |x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for
- shift-left by 1, and compare with thresh which was left-shifted offline -
- this is effectively an absolute compare. */
- uint64x2_t special
- = vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh);
-#else
/* Large input, NaNs and Infs. */
uint64x2_t special = vcageq_f64 (x, d->oflow_bound);
-#endif
-
if (__glibc_unlikely (v_any_u64 (special)))
return special_case (x, special, d);
static const struct data
{
struct v_expm1f_data d;
-#if WANT_SIMD_EXCEPT
- uint32x4_t thresh;
-#else
float32x4_t oflow_bound;
-#endif
} data = {
.d = V_EXPM1F_DATA,
-#if !WANT_SIMD_EXCEPT
/* Value above which expm1f(x) should overflow. Absolute value of the
underflow bound is greater than this, so it catches both cases - there is
a small window where fallbacks are triggered unnecessarily. */
.oflow_bound = V4 (0x1.5ebc4p+6),
-#else
- /* asuint(oflow_bound) - asuint(0x1p-23), shifted left by 1 for absolute
- compare. */
- .thresh = V4 (0x1d5ebc40),
-#endif
};
-/* asuint(0x1p-23), shifted by 1 for abs compare. */
-#define TinyBound v_u32 (0x34000000 << 1)
-
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t x, uint32x4_t special, const struct data *d)
{
- return v_call_f32 (
- expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special);
+ return v_call_f32 (expm1f, x, expm1f_inline (x, &d->d), special);
}
/* Single-precision vector exp(x) - 1 function.
{
const struct data *d = ptr_barrier (&data);
-#if WANT_SIMD_EXCEPT
- uint32x4_t ix = vreinterpretq_u32_f32 (x);
- /* If fp exceptions are to be triggered correctly, fall back to scalar for
- |x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
- shift-left by 1, and compare with thresh which was left-shifted offline -
- this is effectively an absolute compare. */
- uint32x4_t special
- = vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
-#else
/* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
-#endif
if (__glibc_unlikely (v_any_u32 (special)))
return special_case (x, special, d);
/* Note: sbits is signed scale. */
scale = asdouble (sbits);
y = scale + scale * tmp;
-#if WANT_SIMD_EXCEPT
- if (fabs (y) < 1.0)
- {
- /* Round y to the right precision before scaling it into the subnormal
- range to avoid double rounding that can cause 0.5+E/2 ulp error where
- E is the worst-case ulp error outside the subnormal range. So this
- is only useful if the goal is better than 1 ulp worst-case error. */
- double hi, lo, one = 1.0;
- if (y < 0.0)
- one = -1.0;
- lo = scale - y + scale * tmp;
- hi = one + y;
- lo = one - hi + y + lo;
- y = (hi + lo) - one;
- /* Fix the sign of 0. */
- if (y == 0.0)
- y = asdouble (sbits & 0x8000000000000000);
- /* The underflow exception needs to be signaled explicitly. */
- force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022);
- }
-#endif
y = 0x1p-1022 * y;
return y;
}
{
/* Note: inf and nan are already handled. */
/* Skip errno handling. */
-#if WANT_SIMD_EXCEPT
- return asuint64 (x) >> 63 ? __math_uflow (sign_bias)
- : __math_oflow (sign_bias);
-#else
double res_uoflow = asuint64 (x) >> 63 ? 0.0 : INFINITY;
return sign_bias ? -res_uoflow : res_uoflow;
-#endif
}
/* Large x is special cased below. */
abstop = 0;
return 1.0;
/* Note: inf and nan are already handled. */
if (abstop >= top12 (1024.0))
-#if WANT_SIMD_EXCEPT
- return asuint64 (x) >> 63 ? __math_uflow (0) : __math_oflow (0);
-#else
return asuint64 (x) >> 63 ? 0.0 : INFINITY;
-#endif
/* Large x is special cased below. */
abstop = 0;
}
x2 = -x2;
sign_bias = 1;
}
-#if WANT_SIMD_EXCEPT
- if (2 * ix == 0 && iy >> 63)
- return __math_divzero (sign_bias);
-#endif
return iy >> 63 ? 1 / x2 : x2;
}
/* Here x and y are non-zero finite. */
/* Finite x < 0. */
int yint = checkint (iy);
if (yint == 0)
-#if WANT_SIMD_EXCEPT
- return __math_invalid (x);
-#else
return __builtin_nan ("");
-#endif
if (yint == 1)
sign_bias = SignBias;
ix &= 0x7fffffffffffffff;
/* |y| < 2^-65, x^y ~= 1 + y*log(x). */
if ((topy & 0x7ff) < SmallPowY)
return 1.0;
-#if WANT_SIMD_EXCEPT
- return (ix > asuint64 (1.0)) == (topy < 0x800) ? __math_oflow (0)
- : __math_uflow (0);
-#else
return (ix > asuint64 (1.0)) == (topy < 0x800) ? INFINITY : 0;
-#endif
}
if (topx == 0)
{
#include "v_math.h"
-#if WANT_SIMD_EXCEPT
-static const struct data
-{
- uint64x2_t tiny_bound, thres;
-} data = {
- .tiny_bound = V2 (0x2000000000000000), /* asuint (0x1p-511). */
- .thres = V2 (0x3fe0000000000000), /* asuint (0x1p511) - tiny_bound. */
-};
-#else
static const struct data
{
uint64x2_t tiny_bound;
uint32x4_t thres;
} data = {
.tiny_bound = V2 (0x0360000000000000), /* asuint (0x1p-969). */
- .thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound. */
+ .thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound. */
};
-#endif
static float64x2_t VPCS_ATTR NOINLINE
special_case (float64x2_t x, float64x2_t y, float64x2_t sqsum,
_ZGVnN2vv_hypot (0x1.6a1b193ff85b5p-204, 0x1.bc50676c2a447p-222)
got 0x1.6a1b19400964ep-204
want 0x1.6a1b19400964dp-204. */
-#if WANT_SIMD_EXCEPT
-
-float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y)
-{
- const struct data *d = ptr_barrier (&data);
-
- float64x2_t ax = vabsq_f64 (x);
- float64x2_t ay = vabsq_f64 (y);
-
- uint64x2_t ix = vreinterpretq_u64_f64 (ax);
- uint64x2_t iy = vreinterpretq_u64_f64 (ay);
-
- /* Extreme values, NaNs, and infinities should be handled by the scalar
- fallback for correct flag handling. */
- uint64x2_t specialx = vcgeq_u64 (vsubq_u64 (ix, d->tiny_bound), d->thres);
- uint64x2_t specialy = vcgeq_u64 (vsubq_u64 (iy, d->tiny_bound), d->thres);
- ax = v_zerofy_f64 (ax, specialx);
- ay = v_zerofy_f64 (ay, specialy);
- uint32x2_t special = vaddhn_u64 (specialx, specialy);
-
- float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (ax, ax), ay, ay);
-
- if (__glibc_unlikely (v_any_u32h (special)))
- return special_case (x, y, sqsum, special);
-
- return vsqrtq_f64 (sqsum);
-}
-#else
-
float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y)
{
const struct data *d = ptr_barrier (&data);
return vsqrtq_f64 (sqsum);
}
-#endif
#include "v_math.h"
-#if WANT_SIMD_EXCEPT
-static const struct data
-{
- uint32x4_t tiny_bound, thres;
-} data = {
- .tiny_bound = V4 (0x20000000), /* asuint (0x1p-63). */
- .thres = V4 (0x3f000000), /* asuint (0x1p63) - tiny_bound. */
-};
-#else
static const struct data
{
uint32x4_t tiny_bound;
.tiny_bound = V4 (0x0C800000), /* asuint (0x1p-102). */
.thres = V8 (0x7300), /* asuint (inf) - tiny_bound. */
};
-#endif
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t x, float32x4_t y, float32x4_t sqsum,
Maximum error observed is 1.21 ULP:
_ZGVnN4vv_hypotf (0x1.6a419cp-13, 0x1.82a852p-22) got 0x1.6a41d2p-13
want 0x1.6a41dp-13. */
-#if WANT_SIMD_EXCEPT
-
-float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
-{
- const struct data *d = ptr_barrier (&data);
-
- float32x4_t ax = vabsq_f32 (x);
- float32x4_t ay = vabsq_f32 (y);
-
- uint32x4_t ix = vreinterpretq_u32_f32 (ax);
- uint32x4_t iy = vreinterpretq_u32_f32 (ay);
-
- /* Extreme values, NaNs, and infinities should be handled by the scalar
- fallback for correct flag handling. */
- uint32x4_t specialx = vcgeq_u32 (vsubq_u32 (ix, d->tiny_bound), d->thres);
- uint32x4_t specialy = vcgeq_u32 (vsubq_u32 (iy, d->tiny_bound), d->thres);
- ax = v_zerofy_f32 (ax, specialx);
- ay = v_zerofy_f32 (ay, specialy);
- uint16x4_t special = vaddhn_u32 (specialx, specialy);
-
- float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (ax, ax), ay, ay);
-
- if (__glibc_unlikely (v_any_u16h (special)))
- return special_case (x, y, sqsum, special);
-
- return vsqrtq_f32 (sqsum);
-}
-#else
-
-float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
{
const struct data *d = ptr_barrier (&data);
return vsqrtq_f32 (sqsum);
}
-#endif
libmvec_hidden_def (V_NAME_F2 (hypot))
HALF_WIDTH_ALIAS_F2 (hypot)
{
- /* Side-step special lanes so fenv exceptions are not triggered
- inadvertently. */
- float64x2_t x_nospecial = v_zerofy_f64 (x, cmp);
- return v_call_f64 (log1p, x, log1p_inline (x_nospecial, &d->d), cmp);
+ return v_call_f64 (log1p, x, log1p_inline (x, &d->d), cmp);
}
/* Vector log1p approximation using polynomial on reduced interval. Routine is
#include "v_math.h"
#include "v_log1pf_inline.h"
-#if WANT_SIMD_EXCEPT
-
-const static struct data
-{
- uint32x4_t minus_one, thresh;
- struct v_log1pf_data d;
-} data = {
- .d = V_LOG1PF_CONSTANTS_TABLE,
- .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - TinyBound. */
- .minus_one = V4 (0xbf800000),
-};
-
-/* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */
-# define TinyBound v_u32 (0x34000000)
-
-static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, uint32x4_t cmp, const struct data *d)
-{
- /* Side-step special lanes so fenv exceptions are not triggered
- inadvertently. */
- float32x4_t x_nospecial = v_zerofy_f32 (x, cmp);
- return v_call_f32 (log1pf, x, log1pf_inline (x_nospecial, &d->d), cmp);
-}
-
-/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
- error is 1.69 ULP:
- _ZGVnN4v_log1pf(0x1.04418ap-2) got 0x1.cfcbd8p-3
- want 0x1.cfcbdcp-3. */
-VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
-{
- const struct data *d = ptr_barrier (&data);
- uint32x4_t ix = vreinterpretq_u32_f32 (x);
- uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
-
- uint32x4_t special_cases
- = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, TinyBound), d->thresh),
- vcgeq_u32 (ix, d->minus_one));
-
- if (__glibc_unlikely (v_any_u32 (special_cases)))
- return special_case (x, special_cases, d);
-
- return log1pf_inline (x, &d->d);
-}
-
-#else
-
const static struct v_log1pf_data data = V_LOG1PF_CONSTANTS_TABLE;
static float32x4_t NOINLINE VPCS_ATTR
error is 1.63 ULP:
_ZGVnN4v_log1pf(0x1.216d12p-2) got 0x1.fdcb12p-3
want 0x1.fdcb16p-3. */
-VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log1p) (float32x4_t x)
{
uint32x4_t special_cases = vornq_u32 (vcleq_f32 (x, v_f32 (-1)),
vcaleq_f32 (x, v_f32 (0x1p127f)));
return log1pf_inline (x, ptr_barrier (&data));
}
-
-#endif
-
libmvec_hidden_def (V_NAME_F1 (log1p))
HALF_WIDTH_ALIAS_F1 (log1p)
strong_alias (V_NAME_F1 (log1p), V_NAME_F1 (logp1))
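The special_cases predicate above folds everything into two compares: vorn computes (x <= -1) | ~(|x| <= 0x1p127), and since NaN fails every ordered comparison, the negated term also flags NaN lanes. A scalar sketch of the same classification:

#include <math.h>

static int
log1pf_is_special_demo (float x)
{
  /* x <= -1: log1p is -inf or undefined. The negated absolute compare is
     true for |x| > 0x1p127 and for NaN, which fails every ordered
     comparison. */
  return x <= -1.0f || !(fabsf (x) <= 0x1p127f);
}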
uint64x2_t viy = vreinterpretq_u64_f64 (y);
uint64x2_t iay = vandq_u64 (viy, d->inf);
- /* Special cases of x or y. */
-#if WANT_SIMD_EXCEPT
- /* Small or large. */
- uint64x2_t vtopx = vshrq_n_u64 (vix, 52);
- uint64x2_t vabstopy = vshrq_n_u64 (iay, 52);
- uint64x2_t specialx
- = vcgeq_u64 (vsubq_u64 (vtopx, VecSmallPowX), VecThresPowX);
- uint64x2_t specialy
- = vcgeq_u64 (vsubq_u64 (vabstopy, VecSmallPowY), VecThresPowY);
-#else
- /* The case y==0 does not trigger a special case, since in this case it is
+ /* Special cases of x or y.
+ The case y==0 does not trigger a special case, since in this case it is
necessary to fix the result only if x is a signalling nan, which already
triggers a special case. We test y==0 directly in the scalar fallback. */
uint64x2_t iax = vandq_u64 (vix, d->inf);
uint64x2_t specialx = vcgeq_u64 (iax, d->inf);
uint64x2_t specialy = vcgeq_u64 (iay, d->inf);
-#endif
uint64x2_t special = vorrq_u64 (specialx, specialy);
/* Fallback to scalar on all lanes if any lane is inf or nan. */
if (__glibc_unlikely (v_any_u64 (special)))
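ANDing with the bit pattern of inf keeps only the exponent field, so the compare above fires exactly when the exponent is all ones, i.e. for inf or NaN; >= is used even though the masked value can never exceed the mask. A scalar sketch:

#include <stdint.h>
#include <string.h>

static int
exp_all_ones_demo (double x)
{
  const uint64_t inf_bits = 0x7ff0000000000000;
  uint64_t ix;
  memcpy (&ix, &x, sizeof ix);
  /* The mask clears sign and mantissa; the result equals inf_bits exactly
     when the exponent field is all ones, i.e. x is +/-inf or NaN. */
  return (ix & inf_bits) >= inf_bits;
}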
.pi_3 = V2 (0x1.c1cd129024e09p-106),
};
-#if WANT_SIMD_EXCEPT
-/* asuint64(0x1p-253)), below which multiply by inv_pi underflows. */
-# define TinyBound v_u64 (0x3020000000000000)
-/* RangeVal - TinyBound. */
-# define Thresh v_u64 (0x1160000000000000)
-#endif
-
#define C(i) d->poly[i]
static float64x2_t VPCS_ATTR NOINLINE
float64x2_t n, r, r2, r3, r4, y, t1, t2, t3;
uint64x2_t odd, cmp;
-#if WANT_SIMD_EXCEPT
- /* Detect |x| <= TinyBound or |x| >= RangeVal. If fenv exceptions are to be
- triggered correctly, set any special lanes to 1 (which is neutral w.r.t.
- fenv). These lanes will be fixed by special-case handler later. */
- uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
- cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
- r = vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), cmp));
-#else
- r = x;
cmp = vcageq_f64 (x, d->range_val);
-#endif
/* n = rint(|x|/pi). */
- n = vrndaq_f64 (vmulq_f64 (r, d->inv_pi));
+ n = vrndaq_f64 (vmulq_f64 (x, d->inv_pi));
odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
- r = vfmsq_f64 (r, d->pi_1, n);
+ r = vfmsq_f64 (x, d->pi_1, n);
r = vfmsq_f64 (r, d->pi_2, n);
r = vfmsq_f64 (r, d->pi_3, n);
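The three vfmsq steps subtract n*pi in extended precision: pi is split into pi_1 + pi_2 + pi_3 so that each partial product n*pi_i loses little or no precision, keeping the reduced argument accurate even for fairly large n. A scalar sketch of the same Cody-Waite reduction; pi_3 matches the table above, while pi_1 and pi_2 are the standard companion values and should be treated as illustrative:

#include <math.h>

static double
reduce_pi_demo (double x)
{
  /* Cody-Waite split of pi: high part rounded to double, then two
     correction terms. */
  const double pi_1 = 0x1.921fb54442d18p+1;
  const double pi_2 = 0x1.1a62633145c06p-53;
  const double pi_3 = 0x1.c1cd129024e09p-106;
  double n = round (x * 0x1.45f306dc9c883p-2); /* n = rint(x/pi). */
  double r = fma (-n, pi_1, x);
  r = fma (-n, pi_2, r);
  r = fma (-n, pi_3, r);
  return r; /* r = x - n*pi, reduced into -pi/2 .. pi/2. */
}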
.range_val = V4 (0x1p20f)
};
-#if WANT_SIMD_EXCEPT
-/* asuint32(0x1p-59f), below which multiply by inv_pi underflows. */
-# define TinyBound v_u32 (0x22000000)
-/* RangeVal - TinyBound. */
-# define Thresh v_u32 (0x27800000)
-#endif
-
#define C(i) d->poly[i]
static float32x4_t VPCS_ATTR NOINLINE
const struct data *d = ptr_barrier (&data);
float32x4_t n, r, r2, y;
uint32x4_t odd, cmp;
-
-#if WANT_SIMD_EXCEPT
- uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x));
- cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh);
- /* If fenv exceptions are to be triggered correctly, set any special lanes
- to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
- special-case handler later. */
- r = vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), cmp));
-#else
- r = x;
cmp = vcageq_f32 (x, d->range_val);
-#endif
/* n = rint(|x|/pi). */
- n = vrndaq_f32 (vmulq_f32 (r, d->inv_pi));
+ n = vrndaq_f32 (vmulq_f32 (x, d->inv_pi));
odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
- r = vfmsq_f32 (r, d->pi_1, n);
+ r = vfmsq_f32 (x, d->pi_1, n);
r = vfmsq_f32 (r, d->pi_2, n);
r = vfmsq_f32 (r, d->pi_3, n);
static const struct data
{
struct v_expm1f_data expm1f_consts;
-#if WANT_SIMD_EXCEPT
- uint32x4_t tiny_bound, thresh;
-#else
float32x4_t oflow_bound;
-#endif
} data = {
.expm1f_consts = V_EXPM1F_DATA,
-#if WANT_SIMD_EXCEPT
- /* 0x1.6a09e8p-32, below which expm1f underflows. */
- .tiny_bound = V4 (0x2fb504f4),
- /* asuint(oflow_bound) - asuint(tiny_bound). */
- .thresh = V4 (0x12fbbbb3),
-#else
/* 0x1.61814ep+6, above which expm1f helper overflows. */
.oflow_bound = V4 (0x1.61814ep+6),
-#endif
};
static float32x4_t NOINLINE VPCS_ATTR
float32x4_t halfsign = vreinterpretq_f32_u32 (
vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5))));
-#if WANT_SIMD_EXCEPT
- uint32x4_t special = vcgeq_u32 (
- vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh);
- ax = v_zerofy_f32 (ax, special);
-#else
uint32x4_t special = vcageq_f32 (x, d->oflow_bound);
-#endif
/* Up to the point that expm1f overflows, we can use it to calculate sinhf
using a slight rearrangement of the definition of sinh. This allows us
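The rearrangement referred to is sinh(x) = (e^x - e^-x)/2 = (t + t/(t+1))/2 with t = expm1(|x|), which avoids the catastrophic cancellation of e^x - e^-x near zero. A scalar sketch of the identity, with halfsign carrying the sign as in the vector code:

#include <math.h>

static float
sinhf_via_expm1_demo (float x)
{
  /* With t = e^|x| - 1: e^|x| - e^-|x| = (t+1) - 1/(t+1) = t + t/(t+1),
     so sinh(x) = halfsign * (t + t/(t+1)). */
  float t = expm1f (fabsf (x));
  float halfsign = copysignf (0.5f, x);
  return halfsign * (t + t / (t + 1.0f));
}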
V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) },
};
-#if WANT_SIMD_EXCEPT
-# define TinyBound v_u64 (0x3bf0000000000000) /* asuint64(0x1p-64). */
-/* asuint64(0x1p64) - TinyBound. */
-# define Thresh v_u64 (0x07f0000000000000)
-
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
-{
- /* Fall back to scalar code. */
- y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
- return v_call_f64 (sinpi, x, y, cmp);
-}
-#endif
-
/* Approximation for vector double-precision sinpi(x).
Maximum Error 3.05 ULP:
_ZGVnN2v_sinpi(0x1.d32750db30b4ap-2) got 0x1.fb295878301c7p-1
float64x2_t VPCS_ATTR V_NAME_D1 (sinpi) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
-
-#if WANT_SIMD_EXCEPT
- uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
- uint64x2_t cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
-
- /* When WANT_SIMD_EXCEPT = 1, special lanes should be set to 0
- to avoid them under/overflowing and throwing exceptions. */
- float64x2_t r = v_zerofy_f64 (x, cmp);
-#else
- float64x2_t r = x;
-#endif
-
- /* If r is odd, the sign of the result should be inverted. */
+ /* If rint(x) is odd, the sign of the result should be inverted. */
uint64x2_t odd
- = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (r)), 63);
+ = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (x)), 63);
/* r = x - rint(x). Range reduction to -1/2 .. 1/2. */
- r = vsubq_f64 (r, vrndaq_f64 (r));
+ float64x2_t r = vsubq_f64 (x, vrndaq_f64 (x));
/* y = sin(r). */
float64x2_t r2 = vmulq_f64 (r, r);
float64x2_t r4 = vmulq_f64 (r2, r2);
float64x2_t y = vmulq_f64 (v_pw_horner_9_f64 (r2, r4, d->poly), r);
-#if WANT_SIMD_EXCEPT
- if (__glibc_unlikely (v_any_u64 (cmp)))
- return special_case (x, y, odd, cmp);
-#endif
-
return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
}
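Writing x = n + r with n = rint(x) gives sinpi(x) = sin(pi*r + pi*n) = (-1)^n sin(pi*r), so the parity bit of n, shifted into the sign position and XORed in, applies the sign flip branch-free. A scalar sketch, assuming |x| is small enough for the integer conversion to be exact:

#include <math.h>
#include <stdint.h>
#include <string.h>

static double
sinpi_demo (double x)
{
  const double pi = 0x1.921fb54442d18p+1;
  double n = round (x);
  double r = x - n;          /* Range reduction into -1/2 .. 1/2. */
  double y = sin (pi * r);   /* Stands in for the polynomial above. */
  /* Move the parity bit of n into the sign position and XOR it in,
     mirroring the shift-by-63 and veor in the vector code. */
  uint64_t iy, odd = ((uint64_t) (int64_t) n) << 63;
  memcpy (&iy, &y, sizeof iy);
  iy ^= odd;
  memcpy (&y, &iy, sizeof iy);
  return y;
}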
V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) },
};
-#if WANT_SIMD_EXCEPT
-# define TinyBound v_u32 (0x30000000) /* asuint32(0x1p-31f). */
-# define Thresh v_u32 (0x1f000000) /* asuint32(0x1p31f) - TinyBound. */
-
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
-{
- /* Fall back to scalar code. */
- y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
- return v_call_f32 (sinpif, x, y, cmp);
-}
-#endif
-
/* Approximation for vector single-precision sinpi(x)
Maximum Error 3.03 ULP:
_ZGVnN4v_sinpif(0x1.c597ccp-2) got 0x1.f7cd56p-1
{
const struct data *d = ptr_barrier (&data);
-#if WANT_SIMD_EXCEPT
- uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x));
- uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh);
-
- /* When WANT_SIMD_EXCEPT = 1, special lanes should be set to 0
- to avoid them under/overflowing and throwing exceptions. */
- float32x4_t r = v_zerofy_f32 (x, cmp);
-#else
- float32x4_t r = x;
-#endif
-
- /* If r is odd, the sign of the result should be inverted. */
+ /* If rint(x) is odd, the sign of the result should be inverted. */
uint32x4_t odd
- = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (r)), 31);
+ = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 31);
/* r = x - rint(x). Range reduction to -1/2 .. 1/2. */
- r = vsubq_f32 (r, vrndaq_f32 (r));
+ float32x4_t r = vsubq_f32 (x, vrndaq_f32 (x));
/* Pairwise Horner approximation for y = sin(r * pi). */
float32x4_t r2 = vmulq_f32 (r, r);
float32x4_t r4 = vmulq_f32 (r2, r2);
float32x4_t y = vmulq_f32 (v_pw_horner_5_f32 (r2, r4, d->poly), r);
-#if WANT_SIMD_EXCEPT
- if (__glibc_unlikely (v_any_u32 (cmp)))
- return special_case (x, y, odd, cmp);
-#endif
-
return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
}
-
libmvec_hidden_def (V_NAME_F1 (sinpi))
HALF_WIDTH_ALIAS_F1 (sinpi)
float32x4_t poly[6];
float pi_consts[4];
float32x4_t shift;
-#if !WANT_SIMD_EXCEPT
float32x4_t range_val;
-#endif
} data = {
/* Coefficients generated using FPMinimax. */
.poly = { V4 (0x1.55555p-2f), V4 (0x1.11166p-3f), V4 (0x1.b88a78p-5f),
.pi_consts
= { -0x1.921fb6p+0f, 0x1.777a5cp-25f, 0x1.ee59dap-50f, 0x1.45f306p-1f },
.shift = V4 (0x1.8p+23f),
-#if !WANT_SIMD_EXCEPT
.range_val = V4 (0x1p15f),
-#endif
};
-#define RangeVal v_u32 (0x47000000) /* asuint32(0x1p15f). */
-#define TinyBound v_u32 (0x30000000) /* asuint32 (0x1p-31f). */
-#define Thresh v_u32 (0x16000000) /* asuint32(RangeVal) - TinyBound. */
-
/* Special cases (fall back to scalar calls). */
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
eval_poly (float32x4_t z, const struct data *d)
{
float32x4_t z2 = vmulq_f32 (z, z);
-#if WANT_SIMD_EXCEPT
- /* Tiny z (<= 0x1p-31) will underflow when calculating z^4.
- If fp exceptions are to be triggered correctly,
- sidestep this by fixing such lanes to 0. */
- uint32x4_t will_uflow
- = vcleq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (z)), TinyBound);
- if (__glibc_unlikely (v_any_u32 (will_uflow)))
- z2 = vbslq_f32 (will_uflow, v_f32 (0), z2);
-#endif
float32x4_t z4 = vmulq_f32 (z2, z2);
return v_estrin_5_f32 (z, z2, z4, d->poly);
}
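v_estrin_5_f32 evaluates the degree-5 polynomial as a shallow tree rather than a serial Horner chain: the pairs (c_i + c_{i+1} z) are formed independently and combined with z^2 and z^4, which shortens the dependency chain on wide cores. A scalar sketch of the scheme (the vector helper contracts these into fmas):

static float
estrin_5_demo (float z, const float c[6])
{
  float z2 = z * z;
  float z4 = z2 * z2;
  /* p(z) = (c0 + c1 z) + z2 (c2 + c3 z) + z4 (c4 + c5 z). */
  float p01 = c[0] + c[1] * z;
  float p23 = c[2] + c[3] * z;
  float p45 = c[4] + c[5] * z;
  return p01 + z2 * p23 + z4 * p45;
}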
const struct data *d = ptr_barrier (&data);
float32x4_t special_arg = x;
- /* iax >= RangeVal means x, if not inf or NaN, is too large to perform fast
- regression. */
-#if WANT_SIMD_EXCEPT
- uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x));
- /* If fp exceptions are to be triggered correctly, also special-case tiny
- input, as this will load to overflow later. Fix any special lanes to 1 to
- prevent any exceptions being triggered. */
- uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, TinyBound), Thresh);
- if (__glibc_unlikely (v_any_u32 (special)))
- x = vbslq_f32 (special, v_f32 (1.0f), x);
-#else
- /* Otherwise, special-case large and special values. */
+ /* Special-case large and special values. */
uint32x4_t special = vcageq_f32 (x, d->range_val);
-#endif
/* n = rint(x/(pi/2)). */
float32x4_t pi_consts = vld1q_f32 (d->pi_consts);
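The shift constant 0x1.8p+23f in the table above is the usual add-and-subtract rounding constant: pushing a float into the binade where the ULP is 1 makes the addition itself perform the rint, with no convert instruction. A scalar sketch, valid for |x| < 2^22 under the default rounding mode:

static float
rint_via_shift_demo (float x)
{
  const float shift = 0x1.8p+23f;
  /* volatile keeps the compiler from folding away the rounding step. */
  volatile float t = x + shift;
  return t - shift;
}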
uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
- float64x2_t u = x;
-
/* Trigger special-cases for tiny, boring and infinity/NaN. */
uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia, d->tiny_bound), d->thresh);
-#if WANT_SIMD_EXCEPT
- /* To trigger fp exceptions correctly, set special lanes to a neutral value.
- They will be fixed up later by the special-case handler. */
- if (__glibc_unlikely (v_any_u64 (special)))
- u = v_zerofy_f64 (u, special);
-#endif
-
- u = vaddq_f64 (u, u);
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
- float64x2_t q = expm1_inline (u, &d->d);
+ float64x2_t twox = vaddq_f64 (x, x);
+ float64x2_t q = expm1_inline (twox, &d->d);
float64x2_t qp2 = vaddq_f64 (q, v_f64 (2.0));
if (__glibc_unlikely (v_any_u64 (special)))
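With q = expm1(2x), the identity in the comment becomes tanh(x) = q / (q + 2), costing one expm1 and one division. A scalar sketch (large |x| is left to the special path, as above):

#include <math.h>

static double
tanh_via_expm1_demo (double x)
{
  /* tanh(x) = (e^2x - 1) / (e^2x + 1) = q / (q + 2) with q = expm1(2x). */
  double q = expm1 (2.0 * x);
  return q / (q + 2.0);
}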
float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (
sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias)));
-#if WANT_SIMD_EXCEPT
- /* If fp exceptions are to be triggered properly, set all special and boring
- lanes to 0, which will trigger no exceptions, and fix them up later. */
- uint32x4_t special = vorrq_u32 (vcgtq_u32 (iax, d->large_bound),
- vcltq_u32 (iax, v_u32 (0x34000000)));
- x = v_zerofy_f32 (x, is_boring);
- if (__glibc_unlikely (v_any_u32 (special)))
- x = v_zerofy_f32 (x, special);
-#else
uint32x4_t special = vcgtq_u32 (iax, d->large_bound);
-#endif
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
/* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
that the approximation is solely the polynomial. */
uint64x2_t k0 = vceqzq_f64 (k);
- cm = v_zerofy_f64 (cm, k0);
+ cm = vbslq_f64 (k0, v_f64 (0), cm);
f = vbslq_f64 (k0, x, f);
#endif
p[2] ? f (x1[2], x2[2]) : y[2],
p[3] ? f (x1[3], x2[3]) : y[3] };
}
-static inline float32x4_t
-v_zerofy_f32 (float32x4_t x, uint32x4_t mask)
-{
- return vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), mask));
-}
static inline float64x2_t
v_f64 (double x)
return (float64x2_t){ p[0] ? f (x1[0], x2[0]) : y[0],
p[1] ? f (x1[1], x2[1]) : y[1] };
}
-static inline float64x2_t
-v_zerofy_f64 (float64x2_t x, uint64x2_t mask)
-{
- return vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), mask));
-}
#endif
#include <math_private.h>
-/* Deprecated config option from Arm Optimized Routines which ensures
- fp exceptions are correctly triggered. This is not intended to be
- supported in GLIBC, however we keep it for ease of development. */
-#define WANT_SIMD_EXCEPT 0
-
/* Return ptr but hide its value from the compiler so accesses through it
cannot be optimized based on the contents. */
#define ptr_barrier(ptr) \