.abs_mask = V2 (0x7fffffffffffffff),
};
-#define AllMask v_u64 (0xffffffffffffffff)
-#define Oneu 0x3ff0000000000000
-#define Small 0x3e50000000000000 /* 2^-53. */
-
-#if WANT_SIMD_EXCEPT
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
-{
- return v_call_f64 (acos, x, y, special);
-}
-#endif
-
/* Double-precision implementation of vector acos(x).
- For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-53 for correct
- rounding.
- If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following
- approximation.
-
- For |x| in [Small, 0.5], use an order 11 polynomial P such that the final
+ For |x| in [0, 0.5], use an order 11 polynomial P such that the final
approximation of asin is an odd polynomial:
acos(x) ~ pi/2 - (x + x^3 P(x^2)).
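/* For illustration, a scalar sketch of the shape above; acos_shape and the
   coefficient array P are hypothetical stand-ins, not the library code.  */
#include <math.h>

static double
acos_shape (double x, const double *P, int n)
{
  /* Horner on z2 = x^2, then acos(x) ~ pi/2 - (x + x^3 * P(x^2)).  */
  double z2 = x * x;
  double p = P[n - 1];
  for (int i = n - 2; i >= 0; i--)
    p = p * z2 + P[i];
  return M_PI_2 - (x + x * z2 * p);
}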
const struct data *d = ptr_barrier (&data);
float64x2_t ax = vabsq_f64 (x);
-
-#if WANT_SIMD_EXCEPT
- /* A single comparison for One, Small and QNaN. */
- uint64x2_t special
- = vcgtq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (ax), v_u64 (Small)),
- v_u64 (Oneu - Small));
- if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (x, x, AllMask);
-#endif
-
uint64x2_t a_le_half = vcleq_f64 (ax, v_f64 (0.5));
/* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with
z2 = x ^ 2 and z = |x|, if |x| < 0.5
z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5.
acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
= 2 Q(|x|) , for 0.5 < x < 1.0
= pi - 2 Q(|x|) , for -1.0 < x < -0.5. */
float64x2_t y = vbslq_f64 (d->abs_mask, p, x);
-
uint64x2_t is_neg = vcltzq_f64 (x);
float64x2_t off = vreinterpretq_f64_u64 (
vandq_u64 (is_neg, vreinterpretq_u64_f64 (d->pi)));
#define AbsMask 0x7fffffff
#define Half 0x3f000000
-#define One 0x3f800000
-#define Small 0x32800000 /* 2^-26. */
-
-#if WANT_SIMD_EXCEPT
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
-{
- return v_call_f32 (acosf, x, y, special);
-}
-#endif
/* Single-precision implementation of vector acos(x).
- For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-26 for correct
- rounding.
- If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following
- approximation.
-
- For |x| in [Small, 0.5], use order 4 polynomial P such that the final
+ For |x| in [0, 0.5], use order 4 polynomial P such that the final
approximation of asin is an odd polynomial:
acos(x) ~ pi/2 - (x + x^3 P(x^2)).
The largest observed error in this region is 1.32 ulps,
_ZGVnN4v_acosf (0x1.15ba56p-1) got 0x1.feb33p-1
- want 0x1.feb32ep-1. */
+ want 0x1.feb32ep-1. */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acos) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
uint32x4_t ix = vreinterpretq_u32_f32 (x);
uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask));
-#if WANT_SIMD_EXCEPT
- /* A single comparison for One, Small and QNaN. */
- uint32x4_t special
- = vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small));
- if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (x, x, v_u32 (0xffffffff));
-#endif
-
float32x4_t ax = vreinterpretq_f32_u32 (ia);
uint32x4_t a_le_half = vcleq_u32 (ia, v_u32 (Half));
const struct data *d = ptr_barrier (&data);
uint64x2_t special
= vcgeq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (x), d->one), d->thresh);
- float64x2_t special_arg = x;
-
-#if WANT_SIMD_EXCEPT
- if (__glibc_unlikely (v_any_u64 (special)))
- x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x);
-#endif
float64x2_t xm1 = vsubq_f64 (x, v_f64 (1.0));
float64x2_t y = vaddq_f64 (x, v_f64 (1.0));
y = vaddq_f64 (xm1, y);
if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (special_arg, y, special, &d->log1p_consts);
+ return special_case (x, y, special, &d->log1p_consts);
return log1p_inline (y, &d->log1p_consts);
}
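/* The identity used above, as a scalar sketch (acosh_ref is a hypothetical
   reference, assuming x >= 1): (x - 1) * (x + 1) = x^2 - 1, so
   acosh(x) = log (x + sqrt (x^2 - 1)) = log1p (xm1 + sqrt (xm1 * (x + 1))).  */
#include <math.h>

static double
acosh_ref (double x)
{
  double xm1 = x - 1.0;
  return log1p (xm1 + sqrt (xm1 * (x + 1.0)));
}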
return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special));
}
-/* Vector approximation for single-precision acosh, based on log1p. Maximum
- error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it
- is 3.00 ULP:
- _ZGVnN4v_acoshf(0x1.01df3ap+0) got 0x1.ef0a82p-4
- want 0x1.ef0a7cp-4.
- With exceptions disabled, we can compute u with a shorter dependency chain,
- which gives maximum error of 3.22 ULP:
+/* Vector approximation for single-precision acosh, based on log1p.
+ The largest observed error is 3.22 ULP:
_ZGVnN4v_acoshf(0x1.007ef2p+0) got 0x1.fdcdccp-5
want 0x1.fdcdd2p-5. */
uint32x4_t ix = vreinterpretq_u32_f32 (x);
uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), Thresh);
-#if WANT_SIMD_EXCEPT
- /* Mask special lanes with 1 to side-step spurious invalid or overflow. Use
- only xm1 to calculate u, as operating on x will trigger invalid for NaN.
- Widening sign-extend special predicate in order to mask with it. */
- uint32x4_t p
- = vreinterpretq_u32_s32 (vmovl_s16 (vreinterpret_s16_u16 (special)));
- float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p);
- float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1);
-#else
float32x4_t xm1 = vsubq_f32 (x, vreinterpretq_f32_u32 (d->one));
float32x4_t u
= vmulq_f32 (xm1, vaddq_f32 (x, vreinterpretq_f32_u32 (d->one)));
-#endif
float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u));
.pi_over_2 = V2 (0x1.921fb54442d18p+0), .abs_mask = V2 (0x7fffffffffffffff),
};
-#define AllMask v_u64 (0xffffffffffffffff)
-#define One 0x3ff0000000000000
-#define Small 0x3e50000000000000 /* 2^-12. */
-
-#if WANT_SIMD_EXCEPT
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
-{
- return v_call_f64 (asin, x, y, special);
-}
-#endif
-
/* Double-precision implementation of vector asin(x).
-
- For |x| < Small, approximate asin(x) by x. Small = 2^-12 for correct
- rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the
- following approximation.
-
- For |x| in [Small, 0.5], use an order 11 polynomial P such that the final
+ For |x| in [0, 0.5], use an order 11 polynomial P such that the final
approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
The largest observed error in this region is 1.01 ulps,
float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
-
float64x2_t ax = vabsq_f64 (x);
-#if WANT_SIMD_EXCEPT
- /* Special values need to be computed with scalar fallbacks so
- that appropriate exceptions are raised. */
- uint64x2_t special
- = vcgtq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (ax), v_u64 (Small)),
- v_u64 (One - Small));
- if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (x, x, AllMask);
-#endif
-
uint64x2_t a_lt_half = vcaltq_f64 (x, v_f64 (0.5));
/* Evaluate polynomial Q(x) = y + y * z * P(z) with
};
#define AbsMask 0x7fffffff
-#define One 0x3f800000
-#define Small 0x39800000 /* 2^-12. */
-
-#if WANT_SIMD_EXCEPT
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
-{
- return v_call_f32 (asinf, x, y, special);
-}
-#endif
/* Single-precision implementation of vector asin(x).
-
- For |x| <0.5, use order 4 polynomial P such that the final
+ For |x| in [0, 0.5), use order 4 polynomial P such that the final
approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
The largest observed error in this region is 0.83 ulps,
uint32x4_t ix = vreinterpretq_u32_f32 (x);
uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask));
-
-#if WANT_SIMD_EXCEPT
- /* Special values need to be computed with scalar fallbacks so
- that appropriate fp exceptions are raised. */
- uint32x4_t special
- = vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small));
- if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (x, x, v_u32 (0xffffffff));
-#endif
-
float32x4_t ax = vreinterpretq_f32_u32 (ia);
uint32x4_t a_lt_half = vcaltq_f32 (x, v_f32 (0.5f));
const static struct data
{
uint64x2_t huge_bound, abs_mask, off, mask;
-#if WANT_SIMD_EXCEPT
- float64x2_t tiny_bound;
-#endif
float64x2_t lc0, lc2;
double lc1, lc3, ln2, lc4;
-
float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c17;
double c1, c3, c5, c7, c9, c11, c13, c15;
-
} data = {
-#if WANT_SIMD_EXCEPT
- .tiny_bound = V2 (0x1p-26),
-#endif
/* Even terms of polynomial s.t. asinh(x) is approximated by
asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...).
Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). */
-
- .c0 = V2 (-0x1.55555555554a7p-3),
- .c1 = 0x1.3333333326c7p-4,
- .c2 = V2 (-0x1.6db6db68332e6p-5),
- .c3 = 0x1.f1c71b26fb40dp-6,
- .c4 = V2 (-0x1.6e8b8b654a621p-6),
- .c5 = 0x1.1c4daa9e67871p-6,
- .c6 = V2 (-0x1.c9871d10885afp-7),
- .c7 = 0x1.7a16e8d9d2ecfp-7,
- .c8 = V2 (-0x1.3ddca533e9f54p-7),
- .c9 = 0x1.0becef748dafcp-7,
- .c10 = V2 (-0x1.b90c7099dd397p-8),
- .c11 = 0x1.541f2bb1ffe51p-8,
- .c12 = V2 (-0x1.d217026a669ecp-9),
- .c13 = 0x1.0b5c7977aaf7p-9,
- .c14 = V2 (-0x1.e0f37daef9127p-11),
- .c15 = 0x1.388b5fe542a6p-12,
- .c16 = V2 (-0x1.021a48685e287p-14),
- .c17 = V2 (0x1.93d4ba83d34dap-18),
-
- .lc0 = V2 (-0x1.ffffffffffff7p-2),
- .lc1 = 0x1.55555555170d4p-2,
- .lc2 = V2 (-0x1.0000000399c27p-2),
- .lc3 = 0x1.999b2e90e94cap-3,
- .lc4 = -0x1.554e550bd501ep-3,
- .ln2 = 0x1.62e42fefa39efp-1,
-
- .off = V2 (0x3fe6900900000000),
- .huge_bound = V2 (0x5fe0000000000000),
- .abs_mask = V2 (0x7fffffffffffffff),
- .mask = V2 (0xfffULL << 52),
+ .c0 = V2 (-0x1.55555555554a7p-3), .c1 = 0x1.3333333326c7p-4,
+ .c2 = V2 (-0x1.6db6db68332e6p-5), .c3 = 0x1.f1c71b26fb40dp-6,
+ .c4 = V2 (-0x1.6e8b8b654a621p-6), .c5 = 0x1.1c4daa9e67871p-6,
+ .c6 = V2 (-0x1.c9871d10885afp-7), .c7 = 0x1.7a16e8d9d2ecfp-7,
+ .c8 = V2 (-0x1.3ddca533e9f54p-7), .c9 = 0x1.0becef748dafcp-7,
+ .c10 = V2 (-0x1.b90c7099dd397p-8), .c11 = 0x1.541f2bb1ffe51p-8,
+ .c12 = V2 (-0x1.d217026a669ecp-9), .c13 = 0x1.0b5c7977aaf7p-9,
+ .c14 = V2 (-0x1.e0f37daef9127p-11), .c15 = 0x1.388b5fe542a6p-12,
+ .c16 = V2 (-0x1.021a48685e287p-14), .c17 = V2 (0x1.93d4ba83d34dap-18),
+ .lc0 = V2 (-0x1.ffffffffffff7p-2), .lc1 = 0x1.55555555170d4p-2,
+ .lc2 = V2 (-0x1.0000000399c27p-2), .lc3 = 0x1.999b2e90e94cap-3,
+ .lc4 = -0x1.554e550bd501ep-3, .ln2 = 0x1.62e42fefa39efp-1,
+ .off = V2 (0x3fe6900900000000), .huge_bound = V2 (0x5fe0000000000000),
+ .abs_mask = V2 (0x7fffffffffffffff), .mask = V2 (0xfffULL << 52),
};
static float64x2_t NOINLINE VPCS_ATTR
}
static inline float64x2_t
-log_inline (float64x2_t xm, const struct data *d)
+log_inline (float64x2_t ax, const struct data *d)
{
-
- uint64x2_t u = vreinterpretq_u64_f64 (xm);
+ uint64x2_t u = vreinterpretq_u64_f64 (ax);
uint64x2_t u_off = vsubq_u64 (u, d->off);
int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
- asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1
+ asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1)) if |x| >= 1
= sign(x) * (|x| + |x|^3 * P(x^2)) otherwise
where log(x) is an optimized log approximation, and P(x) is a polynomial
- shared with the scalar routine. The greatest observed error 2.79 ULP, in
- |x| >= 1:
- _ZGVnN2v_asinh(0x1.2cd9d73ea76a6p+0) got 0x1.ffffd003219dap-1
- want 0x1.ffffd003219ddp-1. */
+ shared with the scalar routine.
+ For |x| >= 1, the greatest observed error is 2.87 ULP.
+ _ZGVnN2v_asinh(-0x1.177c6017ce58ap+0) got -0x1.e3ba3d5cb1a46p-1
+ want -0x1.e3ba3d5cb1a49p-1. */
VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1));
-#if WANT_SIMD_EXCEPT
- uint64x2_t iax = vreinterpretq_u64_f64 (ax);
- uint64x2_t special = vcgeq_u64 (iax, (d->huge_bound));
- uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound);
- special = vorrq_u64 (special, tiny);
-#else
uint64x2_t special = vcgeq_f64 (ax, vreinterpretq_f64_u64 (d->huge_bound));
-#endif
/* Option 1: |x| >= 1.
- Compute asinh(x) according by asinh(x) = log(x + sqrt(x^2 + 1)).
- If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will
- overflow, by setting special lanes to 1. These will be fixed later. */
+ Compute asinh(x) according to asinh(x) = log(x + sqrt(x^2 + 1)). */
float64x2_t option_1 = v_f64 (0);
if (__glibc_likely (v_any_u64 (gt1)))
{
-#if WANT_SIMD_EXCEPT
- float64x2_t xm = v_zerofy_f64 (ax, special);
-#else
- float64x2_t xm = ax;
-#endif
option_1 = log_inline (
- vaddq_f64 (xm, vsqrtq_f64 (vfmaq_f64 (v_f64 (1), xm, xm))), d);
+ vaddq_f64 (ax, vsqrtq_f64 (vfmaq_f64 (v_f64 (1), ax, ax))), d);
}
/* Option 2: |x| < 1.
Compute asinh(x) using a polynomial.
- If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will
- overflow, and tiny lanes, which will underflow, by setting them to 0. They
- will be fixed later, either by selecting x or falling back to the scalar
- special-case. The largest observed error in this region is 1.47 ULPs:
- _ZGVnN2v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
- want 0x1.c1d6bf874019cp-1. */
+ The largest observed error in this region is 1.36 ULPs:
+ _ZGVnN2v_asinh(0x1.fe1e2aaa8dd54p-1) got 0x1.c1ee60bc0788ap-1
+ want 0x1.c1ee60bc0788bp-1. */
float64x2_t option_2 = v_f64 (0);
if (__glibc_likely (v_any_u64 (vceqzq_u64 (gt1))))
{
-
-#if WANT_SIMD_EXCEPT
- ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1));
-#endif
float64x2_t x2 = vmulq_f64 (ax, ax), z2 = vmulq_f64 (x2, x2);
/* Order-17 Pairwise Horner scheme. */
float64x2_t c13 = vld1q_f64 (&d->c1);
p = vfmaq_f64 (p1213, z2, p);
p = vfmaq_f64 (p1011, z2, p);
p = vfmaq_f64 (p89, z2, p);
-
p = vfmaq_f64 (p67, z2, p);
p = vfmaq_f64 (p45, z2, p);
-
p = vfmaq_f64 (p23, z2, p);
-
p = vfmaq_f64 (p01, z2, p);
option_2 = vfmaq_f64 (ax, p, vmulq_f64 (ax, x2));
-#if WANT_SIMD_EXCEPT
- option_2 = vbslq_f64 (tiny, x, option_2);
-#endif
}
/* Choose the right option for each lane. */
float64x2_t y = vbslq_f64 (gt1, option_1, option_2);
if (__glibc_unlikely (v_any_u64 (special)))
- {
- return special_case (x, y, d->abs_mask, special);
- }
+ return special_case (x, y, d->abs_mask, special);
+
/* Copy sign. */
return vbslq_f64 (d->abs_mask, y, x);
}
struct v_log1pf_data log1pf_consts;
float32x4_t one;
uint32x4_t big_bound;
-#if WANT_SIMD_EXCEPT
- uint32x4_t tiny_bound;
-#endif
} data = {
.one = V4 (1),
.log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
.big_bound = V4 (0x5f800000), /* asuint(0x1p64). */
-#if WANT_SIMD_EXCEPT
- .tiny_bound = V4 (0x30800000) /* asuint(0x1p-30). */
-#endif
};
static float32x4_t NOINLINE VPCS_ATTR
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
uint32x4_t special = vcgeq_u32 (iax, dat->big_bound);
uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax);
- float32x4_t special_arg = x;
-
-#if WANT_SIMD_EXCEPT
- /* Sidestep tiny and large values to avoid inadvertently triggering
- under/overflow. */
- special = vorrq_u32 (special, vcltq_u32 (iax, dat->tiny_bound));
- if (__glibc_unlikely (v_any_u32 (special)))
- {
- ax = v_zerofy_f32 (ax, special);
- x = v_zerofy_f32 (x, special);
- }
-#endif
/* asinh(x) = log(x + sqrt(x * x + 1)).
For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */
float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d));
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (special_arg, sign, y, special, dat);
+ return special_case (x, sign, y, special, dat);
return vreinterpretq_f32_u32 (veorq_u32 (
sign, vreinterpretq_u32_f32 (log1pf_inline (y, &dat->log1pf_consts))));
}
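/* Scalar sketch of the log1p reformulation above (asinhf_ref is a
   hypothetical reference, assuming x >= 0): sqrt (x*x + 1) - 1 equals
   x*x / (1 + sqrt (x*x + 1)), which avoids the cancellation in
   log (x + sqrt (x*x + 1)) for small x.  */
#include <math.h>

static float
asinhf_ref (float x)
{
  float s = sqrtf (x * x + 1.0f);
  return log1pf (x + x * x / (1.0f + s));
}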
uint64x2_t ix = vreinterpretq_u64_f64 (x);
uint64x2_t sign = vandq_u64 (ix, SignMask);
-#if WANT_SIMD_EXCEPT
- uint64x2_t ia12 = vandq_u64 (ix, v_u64 (0x7ff0000000000000));
- uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia12, v_u64 (TinyBound)),
- v_u64 (BigBound - TinyBound));
- /* If any lane is special, fall back to the scalar routine for all lanes. */
- if (__glibc_unlikely (v_any_u64 (special)))
- return v_call_f64 (atan, x, v_f64 (0), v_u64 (-1));
-#endif
-
/* Argument reduction:
y := arctan(x) for x < 1
y := pi/2 + arctan(-1/x) for x > 1
{
uint32x4_t sign_mask, pi_over_2;
float32x4_t neg_one;
-#if WANT_SIMD_EXCEPT
- float32x4_t poly[8];
-} data = {
- .poly = { V4 (-0x1.5554dcp-2), V4 (0x1.9978ecp-3), V4 (-0x1.230a94p-3),
- V4 (0x1.b4debp-4), V4 (-0x1.3550dap-4), V4 (0x1.61eebp-5),
- V4 (-0x1.0c17d4p-6), V4 (0x1.7ea694p-9) },
-#else
float32x4_t c0, c2, c4, c6;
float c1, c3, c5, c7;
} data = {
.c2 = V4 (-0x1.230a94p-3), .c3 = 0x1.b4debp-4,
.c4 = V4 (-0x1.3550dap-4), .c5 = 0x1.61eebp-5,
.c6 = V4 (-0x1.0c17d4p-6), .c7 = 0x1.7ea694p-9,
-#endif
- .pi_over_2 = V4 (0x3fc90fdb),
- .neg_one = V4 (-1.0f),
+ .pi_over_2 = V4 (0x3fc90fdb), .neg_one = V4 (-1.0f),
.sign_mask = V4 (0x80000000),
};
-#if WANT_SIMD_EXCEPT
-#define TinyBound 0x30800000 /* asuint(0x1p-30). */
-#define BigBound 0x4e800000 /* asuint(0x1p30). */
-
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
-{
- return v_call_f32 (atanf, x, y, special);
-}
-#endif
-
/* Fast implementation of vector atanf based on
atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1]
using z=-1/x and shift = pi/2. Maximum observed error is 2.02 ulps:
uint32x4_t ix = vreinterpretq_u32_f32 (x);
uint32x4_t sign = vandq_u32 (ix, d->sign_mask);
-#if WANT_SIMD_EXCEPT
- /* Small cases, infs and nans are supported by our approximation technique,
- but do not set fenv flags correctly. Only trigger special case if we need
- fenv. */
- uint32x4_t ia = vandq_u32 (ix, v_u32 (0x7ff00000));
- uint32x4_t special = vcgtq_u32 (vsubq_u32 (ia, v_u32 (TinyBound)),
- v_u32 (BigBound - TinyBound));
- /* If any lane is special, fall back to the scalar routine for all lanes. */
- if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (x, x, v_u32 (-1));
-#endif
/* Argument reduction:
y := arctan(x) for |x| < 1
y := arctan(-1/x) + pi/2 for x > +1
float32x4_t z2 = vmulq_f32 (z, z);
float32x4_t z3 = vmulq_f32 (z, z2);
float32x4_t z4 = vmulq_f32 (z2, z2);
-#if WANT_SIMD_EXCEPT
-
- /* Calculate the polynomial approximation.
- Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
- a standard implementation using z8 creates spurious underflow
- in the very last fma (when z^8 is small enough).
- Therefore, we split the last fma into a mul and an fma. */
- float32x4_t y = vfmaq_f32 (
- v_pairwise_poly_3_f32 (z2, z4, d->poly), z4,
- vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, d->poly + 4)));
-
-#else
float32x4_t z8 = vmulq_f32 (z4, z4);
/* Uses an Estrin scheme for polynomial approximation. */
float32x4_t p47 = vfmaq_f32 (p45, z4, p67);
float32x4_t y = vfmaq_f32 (p03, z8, p47);
-#endif
/* y = shift + z * P(z^2). */
return vfmaq_f32 (vaddq_f32 (shift, z), z3, y);
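/* Scalar sketch of the reduction above (hypothetical helper; atan_core
   stands for the polynomial approximation on [-1, 1]): for |x| > 1,
   atan(x) = copysign (pi/2, x) + atan (-1/x).  */
#include <math.h>

static float
atanf_reduced (float x, float (*atan_core) (float))
{
  if (fabsf (x) > 1.0f)
    return copysignf ((float) M_PI_2, x) + atan_core (-1.0f / x);
  return atan_core (x);
}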
uint64x2_t ia = vreinterpretq_u64_f64 (ax);
uint64x2_t special = vcgeq_u64 (ia, d->one);
-#if WANT_SIMD_EXCEPT
- ax = v_zerofy_f64 (ax, special);
-#endif
-
float64x2_t y;
y = vaddq_f64 (ax, ax);
y = vdivq_f64 (y, vsubq_f64 (vreinterpretq_f64_u64 (d->one), ax));
if (__glibc_unlikely (v_any_u64 (special)))
-#if WANT_SIMD_EXCEPT
- return special_case (x, halfsign, y, special, d);
-#else
return special_case (ax, halfsign, y, special, d);
-#endif
y = log1p_inline (y, &d->log1p_consts);
return vmulq_f64 (y, halfsign);
{
struct v_log1pf_data log1pf_consts;
uint32x4_t one;
-#if WANT_SIMD_EXCEPT
- uint32x4_t tiny_bound;
-#endif
} data = {
.log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
.one = V4 (0x3f800000),
-#if WANT_SIMD_EXCEPT
- /* 0x1p-12, below which atanhf(x) rounds to x. */
- .tiny_bound = V4 (0x39800000),
-#endif
};
#define AbsMask v_u32 (0x7fffffff)
float32x4_t ax = vabsq_f32 (x);
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
-#if WANT_SIMD_EXCEPT
- uint32x4_t special
- = vorrq_u32 (vcgeq_u32 (iax, d->one), vcltq_u32 (iax, d->tiny_bound));
- /* Side-step special cases by setting those lanes to 0, which will trigger no
- exceptions. These will be fixed up later. */
- if (__glibc_unlikely (v_any_u32 (special)))
- ax = v_zerofy_f32 (ax, special);
-#else
uint32x4_t special = vcgeq_u32 (iax, d->one);
-#endif
float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax),
vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax));
- chain. If exceptions are required ax will have been zerofied, so have to
- pass x. */
+ chain. */
if (__glibc_unlikely (v_any_u32 (special)))
-#if WANT_SIMD_EXCEPT
- return special_case (x, halfsign, y, special);
-#else
return special_case (ax, halfsign, y, special);
-#endif
return vmulq_f32 (halfsign, y);
}
libmvec_hidden_def (V_NAME_F1 (atanh))
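/* Scalar sketch of the computation above (atanhf_ref is a hypothetical
   reference, assuming |x| < 1): atanh(x) = 0.5 * log ((1 + x) / (1 - x))
   = 0.5 * log1p (2x / (1 - x)), with the sign handled separately since
   atanh is odd.  */
#include <math.h>

static float
atanhf_ref (float x)
{
  float ax = fabsf (x);
  return copysignf (0.5f * log1pf ((ax + ax) / (1.0f - ax)), x);
}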
float64x2_t n, r, r2, r3, r4, t1, t2, t3, y;
uint64x2_t odd, cmp;
-#if WANT_SIMD_EXCEPT
- r = vabsq_f64 (x);
- cmp = vcgeq_u64 (vreinterpretq_u64_f64 (r),
- vreinterpretq_u64_f64 (d->range_val));
- if (__glibc_unlikely (v_any_u64 (cmp)))
- /* If fenv exceptions are to be triggered correctly, set any special lanes
- to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
- special-case handler later. */
- r = vbslq_f64 (cmp, v_f64 (1.0), r);
-#else
cmp = vcageq_f64 (x, d->range_val);
- r = x;
-#endif
- /* n = rint((|x|+pi/2)/pi) - 0.5. */
- n = vrndaq_f64 (vfmaq_f64 (v_f64 (0.5), r, d->inv_pi));
+ /* n = rint((x+pi/2)/pi) - 0.5. */
+ n = vrndaq_f64 (vfmaq_f64 (v_f64 (0.5), x, d->inv_pi));
odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
n = vsubq_f64 (n, v_f64 (0.5f));
- /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
- r = vfmsq_f64 (r, d->pi_1, n);
+ /* r = x - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = vfmsq_f64 (x, d->pi_1, n);
r = vfmsq_f64 (r, d->pi_2, n);
r = vfmsq_f64 (r, d->pi_3, n);
float32x4_t n, r, r2, r3, y;
uint32x4_t odd, cmp;
-#if WANT_SIMD_EXCEPT
- r = vabsq_f32 (x);
- cmp = vcgeq_u32 (vreinterpretq_u32_f32 (r),
- vreinterpretq_u32_f32 (d->range_val));
- if (__glibc_unlikely (v_any_u32 (cmp)))
- /* If fenv exceptions are to be triggered correctly, set any special lanes
- to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
- special-case handler later. */
- r = vbslq_f32 (cmp, v_f32 (1.0f), r);
-#else
cmp = vcageq_f32 (x, d->range_val);
- r = x;
-#endif
- /* n = rint((|x|+pi/2)/pi) - 0.5. */
- n = vrndaq_f32 (vfmaq_f32 (v_f32 (0.5), r, d->inv_pi));
+ /* n = rint((x+pi/2)/pi) - 0.5. */
+ n = vrndaq_f32 (vfmaq_f32 (v_f32 (0.5), x, d->inv_pi));
odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
n = vsubq_f32 (n, v_f32 (0.5f));
- /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
- r = vfmsq_f32 (r, d->pi_1, n);
+ /* r = x - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = vfmsq_f32 (x, d->pi_1, n);
r = vfmsq_f32 (r, d->pi_2, n);
r = vfmsq_f32 (r, d->pi_3, n);
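/* Scalar sketch of the reduction above (reduce_cosf is hypothetical;
   pi_1 + pi_2 + pi_3 is assumed to be a three-part split of pi whose high
   part has enough trailing zero bits that n * pi_1 is exact under FMA).  */
#include <math.h>

static float
reduce_cosf (float x, float pi_1, float pi_2, float pi_3)
{
  /* Round ties away from zero, matching vrndaq/vcvtaq.  */
  float n = roundf (0.5f + x * (1.0f / (float) M_PI)) - 0.5f;
  float r = fmaf (-pi_1, n, x); /* r = x - n * pi_1.  */
  r = fmaf (-pi_2, n, r);
  r = fmaf (-pi_3, n, r);       /* r now in [-pi/2, pi/2].  */
  return r;
}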
static const struct data
{
struct v_expf_data expf_consts;
- uint32x4_t tiny_bound;
float32x4_t bound;
-#if WANT_SIMD_EXCEPT
- uint32x4_t special_bound;
-#endif
} data = {
.expf_consts = V_EXPF_DATA,
- .tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */
/* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
.bound = V4 (0x1.5a92d8p+6),
-#if WANT_SIMD_EXCEPT
- .special_bound = V4 (0x42ad496c),
-#endif
};
-#if !WANT_SIMD_EXCEPT
static float32x4_t NOINLINE VPCS_ATTR
special_case (float32x4_t x, float32x4_t half_t, float32x4_t half_over_t,
uint32x4_t special)
{
return v_call_f32 (coshf, x, vaddq_f32 (half_t, half_over_t), special);
}
-#endif
/* Single-precision vector cosh, using vector expf.
Maximum error is 2.38 ULP:
_ZGVnN4v_coshf (0x1.e8001ep+1) got 0x1.6a491ep+4
want 0x1.6a4922p+4. */
-float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cosh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
-#if WANT_SIMD_EXCEPT
- /* If fp exceptions are to be triggered correctly, fall back to the scalar
- variant for all inputs if any input is a special value or above the bound
- at which expf overflows. */
- float32x4_t ax = vabsq_f32 (x);
- uint32x4_t iax = vreinterpretq_u32_f32 (ax);
- uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
- if (__glibc_unlikely (v_any_u32 (special)))
- return v_call_f32 (coshf, x, x, v_u32 (-1));
-
- uint32x4_t tiny = vcleq_u32 (iax, d->tiny_bound);
- /* If any input is tiny, avoid underflow exception by fixing tiny lanes of
- input to 0, which will generate no exceptions. */
- if (__glibc_unlikely (v_any_u32 (tiny)))
- ax = v_zerofy_f32 (ax, tiny);
- float32x4_t t = v_expf_inline (ax, &d->expf_consts);
-#else
uint32x4_t special = vcageq_f32 (x, d->bound);
float32x4_t t = v_expf_inline (x, &d->expf_consts);
-#endif
/* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
float32x4_t half_t = vmulq_n_f32 (t, 0.5);
float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t);
-#if WANT_SIMD_EXCEPT
- if (__glibc_unlikely (v_any_u32 (tiny)))
- return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t));
-#else
if (__glibc_unlikely (v_any_u32 (special)))
return special_case (x, half_t, half_over_t, special);
-#endif
return vaddq_f32 (half_t, half_over_t);
}
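/* The expression above in scalar form (coshf_ref is a hypothetical
   reference): with t = exp(x), cosh(x) = t/2 + (1/2)/t.  */
#include <math.h>

static float
coshf_ref (float x)
{
  float t = expf (x);
  return 0.5f * t + 0.5f / t;
}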
{
const struct data *d = ptr_barrier (&data);
-#if WANT_SIMD_EXCEPT
- float64x2_t r = vabsq_f64 (x);
- uint64x2_t cmp = vcaleq_f64 (v_f64 (0x1p64), x);
-
- /* When WANT_SIMD_EXCEPT = 1, special lanes should be zero'd
- to avoid them overflowing and throwing exceptions. */
- r = v_zerofy_f64 (r, cmp);
- uint64x2_t odd = vshlq_n_u64 (vcvtnq_u64_f64 (r), 63);
-
-#else
- float64x2_t r = x;
- uint64x2_t cmp = vcageq_f64 (r, d->range_val);
+ uint64x2_t cmp = vcageq_f64 (x, d->range_val);
uint64x2_t odd
- = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (r)), 63);
-
-#endif
+ = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (x)), 63);
- r = vsubq_f64 (r, vrndaq_f64 (r));
+ float64x2_t r = vsubq_f64 (x, vrndaq_f64 (x));
/* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */
r = vsubq_f64 (v_f64 (0.5), vabsq_f64 (r));
Maximum Error: 3.17 ULP:
_ZGVnN4v_cospif(0x1.d341a8p-5) got 0x1.f7cd56p-1
want 0x1.f7cd5p-1. */
-float32x4_t VPCS_ATTR V_NAME_F1 (cospi) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cospi) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
-#if WANT_SIMD_EXCEPT
- float32x4_t r = vabsq_f32 (x);
- uint32x4_t cmp = vcaleq_f32 (v_f32 (0x1p32f), x);
-
- /* When WANT_SIMD_EXCEPT = 1, special lanes should be zero'd
- to avoid them overflowing and throwing exceptions. */
- r = v_zerofy_f32 (r, cmp);
- uint32x4_t odd = vshlq_n_u32 (vcvtnq_u32_f32 (r), 31);
-
-#else
- float32x4_t r = x;
- uint32x4_t cmp = vcageq_f32 (r, d->range_val);
-
+ uint32x4_t cmp = vcageq_f32 (x, d->range_val);
uint32x4_t odd
- = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (r)), 31);
-
-#endif
+ = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 31);
/* r = x - rint(x). */
- r = vsubq_f32 (r, vrndaq_f32 (r));
+ float32x4_t r = vsubq_f32 (x, vrndaq_f32 (x));
/* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */
r = vsubq_f32 (v_f32 (0.5f), vabsq_f32 (r));
/* Reintroduce the sign bit for inputs which round to odd. */
return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
}
-
libmvec_hidden_def (V_NAME_F1 (cospi))
HALF_WIDTH_ALIAS_F1 (cospi)
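/* Scalar sketch of the reduction above (hypothetical helper; sinpi_core
   stands for the polynomial for sin(pi * r) on [-1/2, 1/2], and |x| is
   assumed small enough for the integer cast to be exact).  */
#include <math.h>

static float
cospif_reduced (float x, float (*sinpi_core) (float))
{
  float n = roundf (x);            /* Ties away, matching vcvtaq.  */
  float r = 0.5f - fabsf (x - n);  /* cospi(x) = sinpi(0.5 - |x - n|).  */
  float y = sinpi_core (r);
  return ((long long) n & 1) ? -y : y; /* Odd n flips the sign.  */
}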
{
float64x2_t third;
float64x2_t tenth, two_over_five, two_over_nine;
- double two_over_fifteen, two_over_fortyfive;
+ double two_over_fifteen;
float64x2_t max, shift;
uint64x2_t max_idx;
-#if WANT_SIMD_EXCEPT
- float64x2_t tiny_bound, huge_bound, scale_minus_one;
-#endif
} data = {
.max_idx = V2 (768),
.third = V2 (0x1.5555555555556p-2), /* used to compute 2/3 and 1/6 too. */
.tenth = V2 (-0x1.999999999999ap-4),
.two_over_five = V2 (-0x1.999999999999ap-2),
.two_over_nine = V2 (-0x1.c71c71c71c71cp-3),
- .two_over_fortyfive = 0x1.6c16c16c16c17p-5,
.max = V2 (5.9921875), /* 6 - 1/128. */
.shift = V2 (0x1p45),
-#if WANT_SIMD_EXCEPT
- .huge_bound = V2 (0x1p205),
- .tiny_bound = V2 (0x1p-226),
- .scale_minus_one = V2 (0x1.06eba8214db69p-3), /* 2/sqrt(pi) - 1.0. */
-#endif
};
#define AbsMask 0x7fffffffffffffff
uint64x2_t a_le_max = vcaleq_f64 (x, dat->max);
uint64x2_t a_gt_max = vcagtq_f64 (x, dat->max);
-#if WANT_SIMD_EXCEPT
- /* |x| huge or tiny. */
- uint64x2_t cmp1 = vcgtq_f64 (a, dat->huge_bound);
- uint64x2_t cmp2 = vcltq_f64 (a, dat->tiny_bound);
- uint64x2_t cmp = vorrq_u64 (cmp1, cmp2);
- /* If any lanes are special, mask them with 1 for small x or 8 for large
- values and retain a copy of a to allow special case handler to fix special
- lanes later. This is only necessary if fenv exceptions are to be triggered
- correctly. */
- if (__glibc_unlikely (v_any_u64 (cmp)))
- {
- a = vbslq_f64 (cmp1, v_f64 (8.0), a);
- a = vbslq_f64 (cmp2, v_f64 (1.0), a);
- }
-#endif
-
/* Set r to multiple of 1/128 nearest to |x|. */
float64x2_t shift = dat->shift;
float64x2_t z = vaddq_f64 (a, shift);
y = vbslq_f64 (a_gt_max, v_f64 (1.0), y);
/* Copy sign. */
- y = vbslq_f64 (v_u64 (AbsMask), y, x);
-
-#if WANT_SIMD_EXCEPT
- if (__glibc_unlikely (v_any_u64 (cmp2)))
- {
- /* Neutralise huge values of x before fixing small values. */
- x = vbslq_f64 (cmp1, v_f64 (1.0), x);
- /* Fix tiny values that trigger spurious underflow. */
- return vbslq_f64 (cmp2, vfmaq_f64 (x, dat->scale_minus_one, x), y);
- }
-#endif
- return y;
+ return vbslq_f64 (v_u64 (AbsMask), y, x);
}
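/* The rounding trick above, in scalar form (round_to_128th is a
   hypothetical helper; assumes the default round-to-nearest mode and
   0 <= a < 6): adding 0x1p45 forces a double to round to a multiple of
   1/128, since ulp(0x1p45) = 2^-7, and the low bits of the sum hold the
   table index round (128 * a).  */
#include <stdint.h>
#include <string.h>

static double
round_to_128th (double a, uint64_t *i)
{
  double z = a + 0x1p45;
  uint64_t bits;
  memcpy (&bits, &z, sizeof bits); /* Bit-cast, as asuint64.  */
  *i = bits & 0x3ff;               /* round (128 * a), at most 768 here.  */
  return z - 0x1p45;               /* Nearest multiple of 1/128.  */
}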
float64x2_t p20, p40, p41, p51;
double p42, p52;
double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2];
-#if WANT_SIMD_EXCEPT
- float64x2_t uflow_bound;
-#endif
} data = {
/* Set an offset so the range of the index used for lookup is 3487, and it
can be clamped using a saturated add on an offset index.
.qr7 = { 0x1.2492492492492p0, -0x1.8e38e38e38e39p-3 },
.qr8 = { 0x1.2p0, -0x1.6c16c16c16c17p-3 },
.qr9 = { 0x1.1c71c71c71c72p0, -0x1.4f2094f2094f2p-3 },
-#if WANT_SIMD_EXCEPT
- .uflow_bound = V2 (0x1.a8b12fc6e4892p+4),
-#endif
};
-#define TinyBound 0x4000000000000000 /* 0x1p-511 << 1. */
#define Off 0xfffffffffffff260 /* 0xffffffffffffffff - 3487. */
struct entry
return e;
}
-#if WANT_SIMD_EXCEPT
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
-{
- return v_call_f64 (erfc, x, y, cmp);
-}
-#endif
-
/* Optimized double-precision vector erfc(x).
Approximation based on series expansion near x rounded to
nearest multiple of 1/128.
{
const struct data *dat = ptr_barrier (&data);
-#if WANT_SIMD_EXCEPT
- /* |x| < 2^-511. Avoid fabs by left-shifting by 1. */
- uint64x2_t ix = vreinterpretq_u64_f64 (x);
- uint64x2_t cmp = vcltq_u64 (vaddq_u64 (ix, ix), v_u64 (TinyBound));
- /* x >= ~26.54 (into subnormal case and uflow case). Comparison is done in
- integer domain to avoid raising exceptions in presence of nans. */
- uint64x2_t uflow = vcgeq_s64 (vreinterpretq_s64_f64 (x),
- vreinterpretq_s64_f64 (dat->uflow_bound));
- cmp = vorrq_u64 (cmp, uflow);
- float64x2_t xm = x;
- /* If any lanes are special, mask them with 0 and retain a copy of x to allow
- special case handler to fix special lanes later. This is only necessary if
- fenv exceptions are to be triggered correctly. */
- if (__glibc_unlikely (v_any_u64 (cmp)))
- x = v_zerofy_f64 (x, cmp);
-#endif
-
float64x2_t a = vabsq_f64 (x);
a = vminq_f64 (a, dat->max);
float64x2_t fac = vreinterpretq_f64_u64 (
vsraq_n_u64 (vshlq_n_u64 (sign, 63), dat->table_scale, 1));
-#if WANT_SIMD_EXCEPT
- if (__glibc_unlikely (v_any_u64 (cmp)))
- return special_case (xm, vfmaq_f64 (off, fac, y), cmp);
-#endif
-
return vfmaq_f64 (off, fac, y);
}
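/* First-order shape of the expansion above (erfc_near is a hypothetical
   illustration; the real routine keeps higher-order terms via the qr*
   coefficients): with r the nearest multiple of 1/128 and d = x - r,
   erfc(x) ~ erfc(r) - d * 2/sqrt(pi) * exp(-r^2) + O(d^2).  */
#include <math.h>

static double
erfc_near (double erfc_r, double r, double d)
{
  double scale = 2.0 / sqrt (M_PI) * exp (-r * r);
  return erfc_r - d * scale;
}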
float32x4_t max, shift;
float coeffs[4];
float32x4_t third, two_over_five, tenth;
-#if WANT_SIMD_EXCEPT
- float32x4_t uflow_bound;
-#endif
-
} data = {
/* Set an offset so the range of the index used for lookup is 644, and it can
be clamped using a saturated add. */
.third = V4 (0x1.555556p-2f),
.two_over_five = V4 (-0x1.99999ap-2f),
.tenth = V4 (-0x1.99999ap-4f),
-#if WANT_SIMD_EXCEPT
- .uflow_bound = V4 (0x1.2639cp+3f),
-#endif
};
-#define TinyBound 0x41000000 /* 0x1p-62f << 1. */
-#define Thres 0xbe000000 /* asuint(infinity) << 1 - TinyBound. */
#define Off 0xfffffd7b /* 0xffffffff - 644. */
struct entry
return e;
}
-#if WANT_SIMD_EXCEPT
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
-{
- return v_call_f32 (erfcf, x, y, cmp);
-}
-#endif
-
/* Optimized single-precision vector erfcf(x).
Approximation based on series expansion near x rounded to
nearest multiple of 1/64.
{
const struct data *dat = ptr_barrier (&data);
-#if WANT_SIMD_EXCEPT
- /* |x| < 2^-62. Avoid fabs by left-shifting by 1. */
- uint32x4_t ix = vreinterpretq_u32_f32 (x);
- uint32x4_t cmp = vcltq_u32 (vaddq_u32 (ix, ix), v_u32 (TinyBound));
- /* x >= ~9.19 (into subnormal case and uflow case). Comparison is done in
- integer domain to avoid raising exceptions in presence of nans. */
- uint32x4_t uflow = vcgeq_s32 (vreinterpretq_s32_f32 (x),
- vreinterpretq_s32_f32 (dat->uflow_bound));
- cmp = vorrq_u32 (cmp, uflow);
- float32x4_t xm = x;
- /* If any lanes are special, mask them with 0 and retain a copy of x to allow
- special case handler to fix special lanes later. This is only necessary if
- fenv exceptions are to be triggered correctly. */
- if (__glibc_unlikely (v_any_u32 (cmp)))
- x = v_zerofy_f32 (x, cmp);
-#endif
-
float32x4_t a = vabsq_f32 (x);
a = vminq_f32 (a, dat->max);
float32x4_t fac = vreinterpretq_f32_u32 (
vsraq_n_u32 (vshlq_n_u32 (sign, 31), dat->table_scale, 1));
-#if WANT_SIMD_EXCEPT
- if (__glibc_unlikely (v_any_u32 (cmp)))
- return special_case (xm, vfmaq_f32 (off, fac, y), cmp);
-#endif
-
return vfmaq_f32 (off, fac, y);
}
libmvec_hidden_def (V_NAME_F1 (erfc))
static const struct data
{
float32x4_t max, shift, third;
-#if WANT_SIMD_EXCEPT
- float32x4_t tiny_bound, scale_minus_one;
-#endif
} data = {
.max = V4 (3.9375), /* 4 - 8/128. */
.shift = V4 (0x1p16f),
.third = V4 (0x1.555556p-2f), /* 1/3. */
-#if WANT_SIMD_EXCEPT
- .tiny_bound = V4 (0x1p-62f),
- .scale_minus_one = V4 (0x1.06eba8p-3f), /* scale - 1.0. */
-#endif
};
#define AbsMask 0x7fffffff
{
const struct data *dat = ptr_barrier (&data);
-#if WANT_SIMD_EXCEPT
- /* |x| < 2^-62. */
- uint32x4_t cmp = vcaltq_f32 (x, dat->tiny_bound);
- float32x4_t xm = x;
- /* If any lanes are special, mask them with 1 and retain a copy of x to allow
- special case handler to fix special lanes later. This is only necessary if
- fenv exceptions are to be triggered correctly. */
- if (__glibc_unlikely (v_any_u32 (cmp)))
- x = vbslq_f32 (cmp, v_f32 (1), x);
-#endif
-
float32x4_t a = vabsq_f32 (x);
uint32x4_t a_gt_max = vcgtq_f32 (a, dat->max);
y = vbslq_f32 (a_gt_max, v_f32 (1.0f), y);
/* Copy sign. */
- y = vbslq_f32 (v_u32 (AbsMask), y, x);
-
-#if WANT_SIMD_EXCEPT
- if (__glibc_unlikely (v_any_u32 (cmp)))
- return vbslq_f32 (cmp, vfmaq_f32 (xm, dat->scale_minus_one, xm), y);
-#endif
- return y;
+ return vbslq_f32 (v_u32 (AbsMask), y, x);
}
libmvec_hidden_def (V_NAME_F1 (erf))
HALF_WIDTH_ALIAS_F1 (erf)
{
float64x2_t poly[4];
float64x2_t log10_2, log2_10_hi, log2_10_lo, shift;
-#if !WANT_SIMD_EXCEPT
float64x2_t special_bound, scale_thresh;
-#endif
} data = {
/* Coefficients generated using Remez algorithm.
rel error: 0x1.5ddf8f28p-54
.log2_10_hi = V2 (0x1.34413509f79ffp-9), /* log2(10)/N. */
.log2_10_lo = V2 (-0x1.9dc1da994fd21p-66),
.shift = V2 (0x1.8p+52),
-#if !WANT_SIMD_EXCEPT
.scale_thresh = V2 (ScaleBound),
.special_bound = V2 (SpecialBound),
-#endif
};
#define N (1 << V_EXP_TABLE_BITS)
#define IndexMask v_u64 (N - 1)
-#if WANT_SIMD_EXCEPT
-
-# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */
-# define BigBound v_u64 (0x4070000000000000) /* asuint64 (0x1p8). */
-# define Thres v_u64 (0x2070000000000000) /* BigBound - TinyBound. */
-
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
-{
- /* If fenv exceptions are to be triggered correctly, fall back to the scalar
- routine for special lanes. */
- return v_call_f64 (exp10, x, y, cmp);
-}
-
-#else
-
# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
return vbslq_f64 (cmp, r1, r0);
}
-#endif
-
/* Fast vector implementation of exp10.
Maximum measured error is 1.64 ulp.
_ZGVnN2v_exp10(0x1.ccd1c9d82cc8cp+0) got 0x1.f8dab6d7fed0cp+5
float64x2_t VPCS_ATTR V_NAME_D1 (exp10) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
- uint64x2_t cmp;
-#if WANT_SIMD_EXCEPT
- /* If any lanes are special, mask them with 1 and retain a copy of x to allow
- special_case to fix special lanes later. This is only necessary if fenv
- exceptions are to be triggered correctly. */
- float64x2_t xm = x;
- uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
- cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), Thres);
- if (__glibc_unlikely (v_any_u64 (cmp)))
- x = vbslq_f64 (cmp, v_f64 (1), x);
-#else
- cmp = vcageq_f64 (x, d->special_bound);
-#endif
+ uint64x2_t cmp = vcageq_f64 (x, d->special_bound);
/* n = round(x/(log10(2)/N)). */
float64x2_t z = vfmaq_f64 (d->shift, x, d->log10_2);
float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
if (__glibc_unlikely (v_any_u64 (cmp)))
-#if WANT_SIMD_EXCEPT
- return special_case (xm, vfmaq_f64 (s, y, s), cmp);
-#else
return special_case (s, y, n, d);
-#endif
return vfmaq_f64 (s, y, s);
}
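/* Scalar sketch of the decomposition above (exp10_shape is a hypothetical
   illustration; N stands for the table size): write x = n * log10(2)/N + r,
   so 10^x = 2^(n/N) * 10^r with 10^r close to 1; the vector code reads
   2^(n/N) from a table instead of calling exp2.  */
#include <math.h>

static double
exp10_shape (double x, int N)
{
  double per = log10 (2.0) / N;
  double n = round (x / per);
  double r = x - n * per;          /* |r| <= log10(2) / (2N).  */
  return exp2 (n / N) * pow (10.0, r);
}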
float log10_2_high, log10_2_low, c2, c4;
float32x4_t inv_log10_2, special_bound;
uint32x4_t exponent_bias, special_offset, special_bias;
-#if !WANT_SIMD_EXCEPT
float32x4_t scale_thresh;
-#endif
} data = {
/* Coefficients generated using Remez algorithm with minimisation of relative
error.
.exponent_bias = V4 (0x3f800000),
.special_offset = V4 (0x82000000),
.special_bias = V4 (0x7f000000),
-#if !WANT_SIMD_EXCEPT
.scale_thresh = V4 (ScaleBound)
-#endif
};
-#if WANT_SIMD_EXCEPT
-
-# define SpecialBound 38.0f /* rint(log10(2^127)). */
-# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
-# define BigBound v_u32 (0x42180000) /* asuint (SpecialBound). */
-# define Thres v_u32 (0x22180000) /* BigBound - TinyBound. */
-
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
-{
- /* If fenv exceptions are to be triggered correctly, fall back to the scalar
- routine to special lanes. */
- return v_call_f32 (exp10f, x, y, cmp);
-}
-
-#else
-
# define SpecialBound 126.0f
static float32x4_t VPCS_ATTR NOINLINE
return vbslq_f32 (cmp2, r2, r);
}
-#endif
-
/* Fast vector implementation of single-precision exp10.
Algorithm is accurate to 2.36 ULP.
_ZGVnN4v_exp10f(0x1.be2b36p+1) got 0x1.7e79c4p+11
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
-#if WANT_SIMD_EXCEPT
- /* asuint(x) - TinyBound >= BigBound - TinyBound. */
- uint32x4_t cmp = vcgeq_u32 (
- vsubq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (x)), TinyBound), Thres);
- float32x4_t xm = x;
- /* If any lanes are special, mask them with 1 and retain a copy of x to allow
- special case handler to fix special lanes later. This is only necessary if
- fenv exceptions are to be triggered correctly. */
- if (__glibc_unlikely (v_any_u32 (cmp)))
- x = v_zerofy_f32 (x, cmp);
-#endif
/* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
- with poly(r) in [1/sqrt(2), sqrt(2)] and
+ with 1 + poly(r) in [1/sqrt(2), sqrt(2)] and
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
-#if !WANT_SIMD_EXCEPT
uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
-#endif
float32x4_t r2 = vmulq_f32 (r, r);
float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10_2_c24, 2);
float32x4_t poly = vfmaq_f32 (vmulq_f32 (r, d->c0), p14, r2);
if (__glibc_unlikely (v_any_u32 (cmp)))
-#if WANT_SIMD_EXCEPT
- return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
-#else
return special_case (poly, n, e, cmp, scale, d);
-#endif
-
return vfmaq_f32 (scale, poly, scale);
}
libmvec_hidden_def (V_NAME_F1 (exp10))
__v_exp_data[i[1] & IndexMask] };
}
-#if WANT_SIMD_EXCEPT
-
-# define Thres 0x2080000000000000 /* asuint64(512.0) - TinyBound. */
-
-/* Call scalar exp2 as a fallback. */
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t is_special)
-{
- return v_call_f64 (exp2, x, y, is_special);
-}
-
-#else
-
# define SpecialOffset 0x6000000000000000 /* 0x1p513. */
/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
# define SpecialBias1 0x7000000000000000 /* 0x1p769. */
return vbslq_f64 (cmp, r1, r0);
}
-#endif
-
/* Fast vector implementation of exp2.
Maximum measured error is 1.65 ulp.
_ZGVnN2v_exp2(-0x1.4c264ab5b559bp-6) got 0x1.f8db0d4df721fp-1
float64x2_t V_NAME_D1 (exp2) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
- uint64x2_t cmp;
-#if WANT_SIMD_EXCEPT
- uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
- cmp = vcgeq_u64 (vsubq_u64 (ia, v_u64 (TinyBound)), v_u64 (Thres));
- /* Mask special lanes and retain a copy of x for passing to special-case
- handler. */
- float64x2_t xc = x;
- x = v_zerofy_f64 (x, cmp);
-#else
- cmp = vcagtq_f64 (x, d->scale_big_bound);
-#endif
+ uint64x2_t cmp = vcagtq_f64 (x, d->scale_big_bound);
/* n = round(x/N). */
float64x2_t z = vaddq_f64 (d->shift, x);
y = vmulq_f64 (r, y);
if (__glibc_unlikely (v_any_u64 (cmp)))
-#if !WANT_SIMD_EXCEPT
return special_case (s, y, n, d);
-#else
- return special_case (xc, vfmaq_f64 (s, s, y), cmp);
-#endif
return vfmaq_f64 (s, s, y);
}
{
float32x4_t c1, c3;
uint32x4_t exponent_bias, special_offset, special_bias;
-#if !WANT_SIMD_EXCEPT
float32x4_t scale_thresh, special_bound;
-#endif
float c0, c2, c4, zero;
} data = {
/* maxerr: 1.962 ulp. */
.exponent_bias = V4 (0x3f800000),
.special_offset = V4 (0x82000000),
.special_bias = V4 (0x7f000000),
-#if !WANT_SIMD_EXCEPT
.special_bound = V4 (126.0f),
.scale_thresh = V4 (192.0f),
-#endif
};
-#if WANT_SIMD_EXCEPT
-
-# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
-# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */
-# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */
-
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
-{
- /* If fenv exceptions are to be triggered correctly, fall back to the scalar
- routine for special lanes. */
- return v_call_f32 (exp2f, x, y, cmp);
-}
-
-#else
-
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t scale, const struct data *d)
return vbslq_f32 (cmp2, r2, r);
}
-#endif
-
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
-#if WANT_SIMD_EXCEPT
- /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */
- uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
- uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
- float32x4_t xm = x;
- /* If any lanes are special, mask them with 1 and retain a copy of x to allow
- special_case to fix special lanes later. This is only necessary if fenv
- exceptions are to be triggered correctly. */
- if (__glibc_unlikely (v_any_u32 (cmp)))
- x = vbslq_f32 (cmp, v_f32 (1), x);
-#endif
-
/* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = n + r, with r in [-1/2, 1/2]. */
float32x4_t n = vrndaq_f32 (x);
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
-#if !WANT_SIMD_EXCEPT
uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
-#endif
float32x4_t c024 = vld1q_f32 (&d->c0);
float32x4_t r2 = vmulq_f32 (r, r);
float32x4_t poly = vfmaq_f32 (p, q, r2);
if (__glibc_unlikely (v_any_u32 (cmp)))
-#if WANT_SIMD_EXCEPT
- return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
-#else
return special_case (poly, n, e, cmp, scale, d);
-#endif
return vfmaq_f32 (scale, poly, scale);
}
{
float64x2_t poly[3];
float64x2_t inv_ln2, ln2_hi, ln2_lo, shift;
-#if !WANT_SIMD_EXCEPT
float64x2_t special_bound, scale_thresh;
-#endif
} data = {
/* maxerr: 1.88 +0.5 ulp
rel error: 1.4337*2^-53
abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */
.poly = { V2 (0x1.ffffffffffd43p-2), V2 (0x1.55555c75adbb2p-3),
V2 (0x1.55555da646206p-5) },
-#if !WANT_SIMD_EXCEPT
.scale_thresh = V2 (163840.0), /* 1280.0 * N. */
.special_bound = V2 (704.0),
-#endif
.inv_ln2 = V2 (0x1.71547652b82fep7), /* N/ln2. */
.ln2_hi = V2 (0x1.62e42fefa39efp-8), /* ln2/N. */
.ln2_lo = V2 (0x1.abc9e3b39803f3p-63),
#define C(i) data.poly[i]
#define Tab __v_exp_data
-
-#if WANT_SIMD_EXCEPT
-
-# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */
-# define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9). */
-# define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound. */
-
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
-{
- /* If fenv exceptions are to be triggered correctly, fall back to the scalar
- routine to special lanes. */
- return v_call_f64 (exp, x, y, cmp);
-}
-
-#else
-
# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
return vbslq_f64 (cmp, r1, r0);
}
-#endif
-
float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x)
{
float64x2_t n, r, r2, s, y, z;
uint64x2_t cmp, u, e;
-#if WANT_SIMD_EXCEPT
- /* If any lanes are special, mask them with 1 and retain a copy of x to allow
- special_case to fix special lanes later. This is only necessary if fenv
- exceptions are to be triggered correctly. */
- float64x2_t xm = x;
- uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
- cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), SpecialBound);
- if (__glibc_unlikely (v_any_u64 (cmp)))
- x = vbslq_f64 (cmp, v_f64 (1), x);
-#else
cmp = vcagtq_f64 (x, data.special_bound);
-#endif
/* n = round(x/(ln2/N)). */
z = vfmaq_f64 (data.shift, x, data.inv_ln2);
n = vsubq_f64 (z, data.shift);
/* r = x - n*ln2/N. */
- r = x;
- r = vfmsq_f64 (r, data.ln2_hi, n);
+ r = vfmsq_f64 (x, data.ln2_hi, n);
r = vfmsq_f64 (r, data.ln2_lo, n);
e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
if (__glibc_unlikely (v_any_u64 (cmp)))
-#if WANT_SIMD_EXCEPT
- return special_case (xm, vfmaq_f64 (s, y, s), cmp);
-#else
return special_case (s, y, n);
-#endif
return vfmaq_f64 (s, y, s);
}
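/* Scalar sketch of the Cody-Waite step above (exp_reduce is a hypothetical
   helper): ln2/N is stored as a high part ln2_hi, exact when multiplied by
   small integral n, plus a low correction ln2_lo, so the reduced argument
   keeps extra precision.  */
#include <math.h>

static double
exp_reduce (double x, double n, double ln2_hi, double ln2_lo)
{
  double r = fma (-ln2_hi, n, x); /* As vfmsq: x - n * ln2_hi.  */
  return fma (-ln2_lo, n, r);     /* |r| <= ln2 / (2N) plus rounding.  */
}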
float32x4_t c1, c3, c4, inv_ln2;
float ln2_hi, ln2_lo, c0, c2;
uint32x4_t exponent_bias, special_offset, special_bias;
-#if !WANT_SIMD_EXCEPT
float32x4_t special_bound, scale_thresh;
-#endif
} data = {
/* maxerr: 1.45358 +0.5 ulp. */
.c0 = 0x1.0e4020p-7f,
.exponent_bias = V4 (0x3f800000),
.special_offset = V4 (0x82000000),
.special_bias = V4 (0x7f000000),
-#if !WANT_SIMD_EXCEPT
.special_bound = V4 (126.0f),
.scale_thresh = V4 (192.0f),
-#endif
};
#define C(i) d->poly[i]
-#if WANT_SIMD_EXCEPT
-
-# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
-# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */
-# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */
-
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
-{
- /* If fenv exceptions are to be triggered correctly, fall back to the scalar
- routine to special lanes. */
- return v_call_f32 (expf, x, y, cmp);
-}
-
-#else
-
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t scale, const struct data *d)
return vbslq_f32 (cmp2, r2, r);
}
-#endif
-
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi);
-#if WANT_SIMD_EXCEPT
- /* asuint(x) - TinyBound >= BigBound - TinyBound. */
- uint32x4_t cmp = vcgeq_u32 (
- vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
- TinyBound),
- SpecialBound);
- float32x4_t xm = x;
- /* If any lanes are special, mask them with 1 and retain a copy of x to allow
- special case handler to fix special lanes later. This is only necessary if
- fenv exceptions are to be triggered correctly. */
- if (__glibc_unlikely (v_any_u32 (cmp)))
- x = vbslq_f32 (cmp, v_f32 (1), x);
-#endif
-
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2));
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
-#if !WANT_SIMD_EXCEPT
uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
-#endif
float32x4_t r2 = vmulq_f32 (r, r);
float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
float32x4_t poly = vfmaq_f32 (p, q, r2);
if (__glibc_unlikely (v_any_u32 (cmp)))
-#if WANT_SIMD_EXCEPT
- return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
-#else
return special_case (poly, n, e, cmp, scale, d);
-#endif
return vfmaq_f32 (scale, poly, scale);
}
static const struct data
{
struct v_expm1_data d;
-#if WANT_SIMD_EXCEPT
- uint64x2_t thresh, tiny_bound;
-#else
float64x2_t oflow_bound;
-#endif
} data = {
.d = V_EXPM1_DATA,
-#if WANT_SIMD_EXCEPT
- /* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs
- compare. */
- .thresh = V2 (0x78c56fa6d34b552),
- /* asuint64(0x1p-51) << 1. */
- .tiny_bound = V2 (0x3cc0000000000000 << 1),
-#else
/* Value above which expm1(x) should overflow. Absolute value of the
underflow bound is greater than this, so it catches both cases - there is
a small window where fallbacks are triggered unnecessarily. */
.oflow_bound = V2 (0x1.62b7d369a5aa9p+9),
-#endif
};
static float64x2_t VPCS_ATTR NOINLINE
special_case (float64x2_t x, uint64x2_t special, const struct data *d)
{
- return v_call_f64 (expm1, x, expm1_inline (v_zerofy_f64 (x, special), &d->d),
- special);
+ return v_call_f64 (expm1, x, expm1_inline (x, &d->d), special);
}
/* Double-precision vector exp(x) - 1 function.
The maximum error observed error is 2.05 ULP:
- _ZGVnN2v_expm1(0x1.634902eaff3adp-2) got 0x1.a8b636e2a9388p-2
- want 0x1.a8b636e2a9386p-2. */
+ _ZGVnN2v_expm1(0x1.6329669eb8c87p-2) got 0x1.a8897eef87b34p-2
+ want 0x1.a8897eef87b32p-2. */
float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
-#if WANT_SIMD_EXCEPT
- uint64x2_t ix = vreinterpretq_u64_f64 (x);
- /* If fp exceptions are to be triggered correctly, fall back to scalar for
- |x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for
- shift-left by 1, and compare with thresh which was left-shifted offline -
- this is effectively an absolute compare. */
- uint64x2_t special
- = vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh);
-#else
/* Large input, NaNs and Infs. */
uint64x2_t special = vcageq_f64 (x, d->oflow_bound);
-#endif
-
if (__glibc_unlikely (v_any_u64 (special)))
return special_case (x, special, d);
static const struct data
{
struct v_expm1f_data d;
-#if WANT_SIMD_EXCEPT
- uint32x4_t thresh;
-#else
float32x4_t oflow_bound;
-#endif
} data = {
.d = V_EXPM1F_DATA,
-#if !WANT_SIMD_EXCEPT
/* Value above which expm1f(x) should overflow. Absolute value of the
underflow bound is greater than this, so it catches both cases - there is
a small window where fallbacks are triggered unnecessarily. */
.oflow_bound = V4 (0x1.5ebc4p+6),
-#else
- /* asuint(oflow_bound) - asuint(0x1p-23), shifted left by 1 for absolute
- compare. */
- .thresh = V4 (0x1d5ebc40),
-#endif
};
-/* asuint(0x1p-23), shifted by 1 for abs compare. */
-#define TinyBound v_u32 (0x34000000 << 1)
-
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t x, uint32x4_t special, const struct data *d)
{
- return v_call_f32 (
- expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special);
+ return v_call_f32 (expm1f, x, expm1f_inline (x, &d->d), special);
}
/* Single-precision vector exp(x) - 1 function.
{
const struct data *d = ptr_barrier (&data);
-#if WANT_SIMD_EXCEPT
- uint32x4_t ix = vreinterpretq_u32_f32 (x);
- /* If fp exceptions are to be triggered correctly, fall back to scalar for
- |x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
- shift-left by 1, and compare with thresh which was left-shifted offline -
- this is effectively an absolute compare. */
- uint32x4_t special
- = vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
-#else
/* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
-#endif
if (__glibc_unlikely (v_any_u32 (special)))
return special_case (x, special, d);
/* Note: sbits is signed scale. */
scale = asdouble (sbits);
y = scale + scale * tmp;
-#if WANT_SIMD_EXCEPT
- if (fabs (y) < 1.0)
- {
- /* Round y to the right precision before scaling it into the subnormal
- range to avoid double rounding that can cause 0.5+E/2 ulp error where
- E is the worst-case ulp error outside the subnormal range. So this
- is only useful if the goal is better than 1 ulp worst-case error. */
- double hi, lo, one = 1.0;
- if (y < 0.0)
- one = -1.0;
- lo = scale - y + scale * tmp;
- hi = one + y;
- lo = one - hi + y + lo;
- y = (hi + lo) - one;
- /* Fix the sign of 0. */
- if (y == 0.0)
- y = asdouble (sbits & 0x8000000000000000);
- /* The underflow exception needs to be signaled explicitly. */
- force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022);
- }
-#endif
y = 0x1p-1022 * y;
return y;
}
{
/* Note: inf and nan are already handled. */
/* Skip errno handling. */
-#if WANT_SIMD_EXCEPT
- return asuint64 (x) >> 63 ? __math_uflow (sign_bias)
- : __math_oflow (sign_bias);
-#else
double res_uoflow = asuint64 (x) >> 63 ? 0.0 : INFINITY;
return sign_bias ? -res_uoflow : res_uoflow;
-#endif
}
/* Large x is special cased below. */
abstop = 0;
return 1.0;
/* Note: inf and nan are already handled. */
if (abstop >= top12 (1024.0))
-#if WANT_SIMD_EXCEPT
- return asuint64 (x) >> 63 ? __math_uflow (0) : __math_oflow (0);
-#else
return asuint64 (x) >> 63 ? 0.0 : INFINITY;
-#endif
/* Large x is special cased below. */
abstop = 0;
}
x2 = -x2;
sign_bias = 1;
}
-#if WANT_SIMD_EXCEPT
- if (2 * ix == 0 && iy >> 63)
- return __math_divzero (sign_bias);
-#endif
return iy >> 63 ? 1 / x2 : x2;
}
/* Here x and y are non-zero finite. */
/* Finite x < 0. */
int yint = checkint (iy);
if (yint == 0)
-#if WANT_SIMD_EXCEPT
- return __math_invalid (x);
-#else
return __builtin_nan ("");
-#endif
if (yint == 1)
sign_bias = SignBias;
ix &= 0x7fffffffffffffff;
/* |y| < 2^-65, x^y ~= 1 + y*log(x). */
if ((topy & 0x7ff) < SmallPowY)
return 1.0;
-#if WANT_SIMD_EXCEPT
- return (ix > asuint64 (1.0)) == (topy < 0x800) ? __math_oflow (0)
- : __math_uflow (0);
-#else
return (ix > asuint64 (1.0)) == (topy < 0x800) ? INFINITY : 0;
-#endif
}
if (topx == 0)
{
#include "v_math.h"
-#if WANT_SIMD_EXCEPT
-static const struct data
-{
- uint64x2_t tiny_bound, thres;
-} data = {
- .tiny_bound = V2 (0x2000000000000000), /* asuint (0x1p-511). */
- .thres = V2 (0x3fe0000000000000), /* asuint (0x1p511) - tiny_bound. */
-};
-#else
static const struct data
{
uint64x2_t tiny_bound;
uint32x4_t thres;
} data = {
.tiny_bound = V2 (0x0360000000000000), /* asuint (0x1p-969). */
- .thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound. */
+ .thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound. */
};
-#endif
static float64x2_t VPCS_ATTR NOINLINE
special_case (float64x2_t x, float64x2_t y, float64x2_t sqsum,
_ZGVnN2vv_hypot (0x1.6a1b193ff85b5p-204, 0x1.bc50676c2a447p-222)
got 0x1.6a1b19400964ep-204
want 0x1.6a1b19400964dp-204. */
-#if WANT_SIMD_EXCEPT
-
-float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y)
-{
- const struct data *d = ptr_barrier (&data);
-
- float64x2_t ax = vabsq_f64 (x);
- float64x2_t ay = vabsq_f64 (y);
-
- uint64x2_t ix = vreinterpretq_u64_f64 (ax);
- uint64x2_t iy = vreinterpretq_u64_f64 (ay);
-
- /* Extreme values, NaNs, and infinities should be handled by the scalar
- fallback for correct flag handling. */
- uint64x2_t specialx = vcgeq_u64 (vsubq_u64 (ix, d->tiny_bound), d->thres);
- uint64x2_t specialy = vcgeq_u64 (vsubq_u64 (iy, d->tiny_bound), d->thres);
- ax = v_zerofy_f64 (ax, specialx);
- ay = v_zerofy_f64 (ay, specialy);
- uint32x2_t special = vaddhn_u64 (specialx, specialy);
-
- float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (ax, ax), ay, ay);
-
- if (__glibc_unlikely (v_any_u32h (special)))
- return special_case (x, y, sqsum, special);
-
- return vsqrtq_f64 (sqsum);
-}
-#else
-
float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y)
{
const struct data *d = ptr_barrier (&data);
return vsqrtq_f64 (sqsum);
}
-#endif
#include "v_math.h"
-#if WANT_SIMD_EXCEPT
-static const struct data
-{
- uint32x4_t tiny_bound, thres;
-} data = {
- .tiny_bound = V4 (0x20000000), /* asuint (0x1p-63). */
- .thres = V4 (0x3f000000), /* asuint (0x1p63) - tiny_bound. */
-};
-#else
static const struct data
{
uint32x4_t tiny_bound;
.tiny_bound = V4 (0x0C800000), /* asuint (0x1p-102). */
.thres = V8 (0x7300), /* asuint (inf) - tiny_bound. */
};
-#endif
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t x, float32x4_t y, float32x4_t sqsum,
Maximum error observed is 1.21 ULP:
_ZGVnN4vv_hypotf (0x1.6a419cp-13, 0x1.82a852p-22) got 0x1.6a41d2p-13
want 0x1.6a41dp-13. */
-#if WANT_SIMD_EXCEPT
-
-float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
-{
- const struct data *d = ptr_barrier (&data);
-
- float32x4_t ax = vabsq_f32 (x);
- float32x4_t ay = vabsq_f32 (y);
-
- uint32x4_t ix = vreinterpretq_u32_f32 (ax);
- uint32x4_t iy = vreinterpretq_u32_f32 (ay);
-
- /* Extreme values, NaNs, and infinities should be handled by the scalar
- fallback for correct flag handling. */
- uint32x4_t specialx = vcgeq_u32 (vsubq_u32 (ix, d->tiny_bound), d->thres);
- uint32x4_t specialy = vcgeq_u32 (vsubq_u32 (iy, d->tiny_bound), d->thres);
- ax = v_zerofy_f32 (ax, specialx);
- ay = v_zerofy_f32 (ay, specialy);
- uint16x4_t special = vaddhn_u32 (specialx, specialy);
-
- float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (ax, ax), ay, ay);
-
- if (__glibc_unlikely (v_any_u16h (special)))
- return special_case (x, y, sqsum, special);
-
- return vsqrtq_f32 (sqsum);
-}
-#else
-
-float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
{
const struct data *d = ptr_barrier (&data);
return vsqrtq_f32 (sqsum);
}
-#endif
libmvec_hidden_def (V_NAME_F2 (hypot))
HALF_WIDTH_ALIAS_F2 (hypot)
{
- /* Side-step special lanes so fenv exceptions are not triggered
- inadvertently. */
- float64x2_t x_nospecial = v_zerofy_f64 (x, cmp);
- return v_call_f64 (log1p, x, log1p_inline (x_nospecial, &d->d), cmp);
+ return v_call_f64 (log1p, x, log1p_inline (x, &d->d), cmp);
}
/* Vector log1p approximation using polynomial on reduced interval. Routine is
#include "v_math.h"
#include "v_log1pf_inline.h"
-#if WANT_SIMD_EXCEPT
-
-const static struct data
-{
- uint32x4_t minus_one, thresh;
- struct v_log1pf_data d;
-} data = {
- .d = V_LOG1PF_CONSTANTS_TABLE,
- .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - TinyBound. */
- .minus_one = V4 (0xbf800000),
-};
-
-/* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */
-# define TinyBound v_u32 (0x34000000)
-
-static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, uint32x4_t cmp, const struct data *d)
-{
- /* Side-step special lanes so fenv exceptions are not triggered
- inadvertently. */
- float32x4_t x_nospecial = v_zerofy_f32 (x, cmp);
- return v_call_f32 (log1pf, x, log1pf_inline (x_nospecial, &d->d), cmp);
-}
-
-/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
- error is 1.69 ULP:
- _ZGVnN4v_log1pf(0x1.04418ap-2) got 0x1.cfcbd8p-3
- want 0x1.cfcbdcp-3. */
-VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
-{
- const struct data *d = ptr_barrier (&data);
- uint32x4_t ix = vreinterpretq_u32_f32 (x);
- uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
-
- uint32x4_t special_cases
- = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, TinyBound), d->thresh),
- vcgeq_u32 (ix, d->minus_one));
-
- if (__glibc_unlikely (v_any_u32 (special_cases)))
- return special_case (x, special_cases, d);
-
- return log1pf_inline (x, &d->d);
-}
-
-#else
-
const static struct v_log1pf_data data = V_LOG1PF_CONSTANTS_TABLE;
static float32x4_t NOINLINE VPCS_ATTR
error is 1.63 ULP:
_ZGVnN4v_log1pf(0x1.216d12p-2) got 0x1.fdcb12p-3
want 0x1.fdcb16p-3. */
-VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log1p) (float32x4_t x)
{
uint32x4_t special_cases = vornq_u32 (vcleq_f32 (x, v_f32 (-1)),
vcaleq_f32 (x, v_f32 (0x1p127f)));
return log1pf_inline (x, ptr_barrier (&data));
}
-
-#endif
-
libmvec_hidden_def (V_NAME_F1 (log1p))
HALF_WIDTH_ALIAS_F1 (log1p)
strong_alias (V_NAME_F1 (log1p), V_NAME_F1 (logp1))
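The special_cases predicate above folds everything into two compares: vorn computes (x <= -1) | ~(|x| <= 0x1p127), and since NaN fails every ordered comparison, the negated term also flags NaN lanes. A scalar sketch of the same classification:

#include <math.h>

static int
log1pf_is_special_demo (float x)
{
  /* x <= -1: log1p is -inf or undefined. The negated absolute compare is
     true for |x| > 0x1p127 and for NaN, which fails every ordered
     comparison. */
  return x <= -1.0f || !(fabsf (x) <= 0x1p127f);
}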
uint64x2_t viy = vreinterpretq_u64_f64 (y);
uint64x2_t iay = vandq_u64 (viy, d->inf);
- /* Special cases of x or y. */
-#if WANT_SIMD_EXCEPT
- /* Small or large. */
- uint64x2_t vtopx = vshrq_n_u64 (vix, 52);
- uint64x2_t vabstopy = vshrq_n_u64 (iay, 52);
- uint64x2_t specialx
- = vcgeq_u64 (vsubq_u64 (vtopx, VecSmallPowX), VecThresPowX);
- uint64x2_t specialy
- = vcgeq_u64 (vsubq_u64 (vabstopy, VecSmallPowY), VecThresPowY);
-#else
- /* The case y==0 does not trigger a special case, since in this case it is
+ /* Special cases of x or y.
+ The case y==0 does not trigger a special case, since in this case it is
necessary to fix the result only if x is a signalling nan, which already
triggers a special case. We test y==0 directly in the scalar fallback. */
uint64x2_t iax = vandq_u64 (vix, d->inf);
uint64x2_t specialx = vcgeq_u64 (iax, d->inf);
uint64x2_t specialy = vcgeq_u64 (iay, d->inf);
-#endif
uint64x2_t special = vorrq_u64 (specialx, specialy);
/* Fallback to scalar on all lanes if any lane is inf or nan. */
if (__glibc_unlikely (v_any_u64 (special)))
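ANDing with the bit pattern of inf keeps only the exponent field, so the compare above fires exactly when the exponent is all ones, i.e. for inf or NaN; >= is used even though the masked value can never exceed the mask. A scalar sketch:

#include <stdint.h>
#include <string.h>

static int
exp_all_ones_demo (double x)
{
  const uint64_t inf_bits = 0x7ff0000000000000;
  uint64_t ix;
  memcpy (&ix, &x, sizeof ix);
  /* The mask clears sign and mantissa; the result equals inf_bits exactly
     when the exponent field is all ones, i.e. x is +/-inf or NaN. */
  return (ix & inf_bits) >= inf_bits;
}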
.pi_3 = V2 (0x1.c1cd129024e09p-106),
};
-#if WANT_SIMD_EXCEPT
-/* asuint64(0x1p-253)), below which multiply by inv_pi underflows. */
-# define TinyBound v_u64 (0x3020000000000000)
-/* RangeVal - TinyBound. */
-# define Thresh v_u64 (0x1160000000000000)
-#endif
-
#define C(i) d->poly[i]
static float64x2_t VPCS_ATTR NOINLINE
float64x2_t n, r, r2, r3, r4, y, t1, t2, t3;
uint64x2_t odd, cmp;
-#if WANT_SIMD_EXCEPT
- /* Detect |x| <= TinyBound or |x| >= RangeVal. If fenv exceptions are to be
- triggered correctly, set any special lanes to 1 (which is neutral w.r.t.
- fenv). These lanes will be fixed by special-case handler later. */
- uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
- cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
- r = vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), cmp));
-#else
- r = x;
cmp = vcageq_f64 (x, d->range_val);
-#endif
/* n = rint(|x|/pi). */
- n = vrndaq_f64 (vmulq_f64 (r, d->inv_pi));
+ n = vrndaq_f64 (vmulq_f64 (x, d->inv_pi));
odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
- r = vfmsq_f64 (r, d->pi_1, n);
+ r = vfmsq_f64 (x, d->pi_1, n);
r = vfmsq_f64 (r, d->pi_2, n);
r = vfmsq_f64 (r, d->pi_3, n);
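The three vfmsq steps subtract n*pi in extended precision: pi is split into pi_1 + pi_2 + pi_3 so that each partial product n*pi_i loses little or no precision, keeping the reduced argument accurate even for fairly large n. A scalar sketch of the same Cody-Waite reduction; pi_3 matches the table above, while pi_1 and pi_2 are the standard companion values and should be treated as illustrative:

#include <math.h>

static double
reduce_pi_demo (double x)
{
  /* Cody-Waite split of pi: high part rounded to double, then two
     correction terms. */
  const double pi_1 = 0x1.921fb54442d18p+1;
  const double pi_2 = 0x1.1a62633145c06p-53;
  const double pi_3 = 0x1.c1cd129024e09p-106;
  double n = round (x * 0x1.45f306dc9c883p-2); /* n = rint(x/pi). */
  double r = fma (-n, pi_1, x);
  r = fma (-n, pi_2, r);
  r = fma (-n, pi_3, r);
  return r; /* r = x - n*pi, reduced into -pi/2 .. pi/2. */
}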
.range_val = V4 (0x1p20f)
};
-#if WANT_SIMD_EXCEPT
-/* asuint32(0x1p-59f), below which multiply by inv_pi underflows. */
-# define TinyBound v_u32 (0x22000000)
-/* RangeVal - TinyBound. */
-# define Thresh v_u32 (0x27800000)
-#endif
-
#define C(i) d->poly[i]
static float32x4_t VPCS_ATTR NOINLINE
const struct data *d = ptr_barrier (&data);
float32x4_t n, r, r2, y;
uint32x4_t odd, cmp;
-
-#if WANT_SIMD_EXCEPT
- uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x));
- cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh);
- /* If fenv exceptions are to be triggered correctly, set any special lanes
- to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
- special-case handler later. */
- r = vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), cmp));
-#else
- r = x;
cmp = vcageq_f32 (x, d->range_val);
-#endif
/* n = rint(|x|/pi). */
- n = vrndaq_f32 (vmulq_f32 (r, d->inv_pi));
+ n = vrndaq_f32 (vmulq_f32 (x, d->inv_pi));
odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
- r = vfmsq_f32 (r, d->pi_1, n);
+ r = vfmsq_f32 (x, d->pi_1, n);
r = vfmsq_f32 (r, d->pi_2, n);
r = vfmsq_f32 (r, d->pi_3, n);
static const struct data
{
struct v_expm1f_data expm1f_consts;
-#if WANT_SIMD_EXCEPT
- uint32x4_t tiny_bound, thresh;
-#else
float32x4_t oflow_bound;
-#endif
} data = {
.expm1f_consts = V_EXPM1F_DATA,
-#if WANT_SIMD_EXCEPT
- /* 0x1.6a09e8p-32, below which expm1f underflows. */
- .tiny_bound = V4 (0x2fb504f4),
- /* asuint(oflow_bound) - asuint(tiny_bound). */
- .thresh = V4 (0x12fbbbb3),
-#else
/* 0x1.61814ep+6, above which expm1f helper overflows. */
.oflow_bound = V4 (0x1.61814ep+6),
-#endif
};
static float32x4_t NOINLINE VPCS_ATTR
float32x4_t halfsign = vreinterpretq_f32_u32 (
vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5))));
-#if WANT_SIMD_EXCEPT
- uint32x4_t special = vcgeq_u32 (
- vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh);
- ax = v_zerofy_f32 (ax, special);
-#else
uint32x4_t special = vcageq_f32 (x, d->oflow_bound);
-#endif
/* Up to the point that expm1f overflows, we can use it to calculate sinhf
using a slight rearrangement of the definition of sinh. This allows us
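The rearrangement referred to is sinh(x) = (e^x - e^-x)/2 = (t + t/(t+1))/2 with t = expm1(|x|), which avoids the catastrophic cancellation of e^x - e^-x near zero. A scalar sketch of the identity, with halfsign carrying the sign as in the vector code:

#include <math.h>

static float
sinhf_via_expm1_demo (float x)
{
  /* With t = e^|x| - 1: e^|x| - e^-|x| = (t+1) - 1/(t+1) = t + t/(t+1),
     so sinh(x) = halfsign * (t + t/(t+1)). */
  float t = expm1f (fabsf (x));
  float halfsign = copysignf (0.5f, x);
  return halfsign * (t + t / (t + 1.0f));
}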
V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) },
};
-#if WANT_SIMD_EXCEPT
-# define TinyBound v_u64 (0x3bf0000000000000) /* asuint64(0x1p-64). */
-/* asuint64(0x1p64) - TinyBound. */
-# define Thresh v_u64 (0x07f0000000000000)
-
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
-{
- /* Fall back to scalar code. */
- y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
- return v_call_f64 (sinpi, x, y, cmp);
-}
-#endif
-
/* Approximation for vector double-precision sinpi(x).
Maximum Error 3.05 ULP:
_ZGVnN2v_sinpi(0x1.d32750db30b4ap-2) got 0x1.fb295878301c7p-1
float64x2_t VPCS_ATTR V_NAME_D1 (sinpi) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
-
-#if WANT_SIMD_EXCEPT
- uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
- uint64x2_t cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
-
- /* When WANT_SIMD_EXCEPT = 1, special lanes should be set to 0
- to avoid them under/overflowing and throwing exceptions. */
- float64x2_t r = v_zerofy_f64 (x, cmp);
-#else
- float64x2_t r = x;
-#endif
-
- /* If r is odd, the sign of the result should be inverted. */
+ /* If rint(x) is odd, the sign of the result should be inverted. */
uint64x2_t odd
- = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (r)), 63);
+ = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (x)), 63);
/* r = x - rint(x). Range reduction to -1/2 .. 1/2. */
- r = vsubq_f64 (r, vrndaq_f64 (r));
+ float64x2_t r = vsubq_f64 (x, vrndaq_f64 (x));
/* y = sin(r). */
float64x2_t r2 = vmulq_f64 (r, r);
float64x2_t r4 = vmulq_f64 (r2, r2);
float64x2_t y = vmulq_f64 (v_pw_horner_9_f64 (r2, r4, d->poly), r);
-#if WANT_SIMD_EXCEPT
- if (__glibc_unlikely (v_any_u64 (cmp)))
- return special_case (x, y, odd, cmp);
-#endif
-
return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
}
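Writing x = n + r with n = rint(x) gives sinpi(x) = sin(pi*r + pi*n) = (-1)^n sin(pi*r), so the parity bit of n, shifted into the sign position and XORed in, applies the sign flip branch-free. A scalar sketch, assuming |x| is small enough for the integer conversion to be exact:

#include <math.h>
#include <stdint.h>
#include <string.h>

static double
sinpi_demo (double x)
{
  const double pi = 0x1.921fb54442d18p+1;
  double n = round (x);
  double r = x - n;          /* Range reduction into -1/2 .. 1/2. */
  double y = sin (pi * r);   /* Stands in for the polynomial above. */
  /* Move the parity bit of n into the sign position and XOR it in,
     mirroring the shift-by-63 and veor in the vector code. */
  uint64_t iy, odd = ((uint64_t) (int64_t) n) << 63;
  memcpy (&iy, &y, sizeof iy);
  iy ^= odd;
  memcpy (&y, &iy, sizeof iy);
  return y;
}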
V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) },
};
-#if WANT_SIMD_EXCEPT
-# define TinyBound v_u32 (0x30000000) /* asuint32(0x1p-31f). */
-# define Thresh v_u32 (0x1f000000) /* asuint32(0x1p31f) - TinyBound. */
-
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
-{
- /* Fall back to scalar code. */
- y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
- return v_call_f32 (sinpif, x, y, cmp);
-}
-#endif
-
/* Approximation for vector single-precision sinpi(x)
Maximum Error 3.03 ULP:
_ZGVnN4v_sinpif(0x1.c597ccp-2) got 0x1.f7cd56p-1
{
const struct data *d = ptr_barrier (&data);
-#if WANT_SIMD_EXCEPT
- uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x));
- uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh);
-
- /* When WANT_SIMD_EXCEPT = 1, special lanes should be set to 0
- to avoid them under/overflowing and throwing exceptions. */
- float32x4_t r = v_zerofy_f32 (x, cmp);
-#else
- float32x4_t r = x;
-#endif
-
- /* If r is odd, the sign of the result should be inverted. */
+ /* If rint(x) is odd, the sign of the result should be inverted. */
uint32x4_t odd
- = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (r)), 31);
+ = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 31);
/* r = x - rint(x). Range reduction to -1/2 .. 1/2. */
- r = vsubq_f32 (r, vrndaq_f32 (r));
+ float32x4_t r = vsubq_f32 (x, vrndaq_f32 (x));
/* Pairwise Horner approximation for y = sin(r * pi). */
float32x4_t r2 = vmulq_f32 (r, r);
float32x4_t r4 = vmulq_f32 (r2, r2);
float32x4_t y = vmulq_f32 (v_pw_horner_5_f32 (r2, r4, d->poly), r);
-#if WANT_SIMD_EXCEPT
- if (__glibc_unlikely (v_any_u32 (cmp)))
- return special_case (x, y, odd, cmp);
-#endif
-
return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
}
-
libmvec_hidden_def (V_NAME_F1 (sinpi))
HALF_WIDTH_ALIAS_F1 (sinpi)
float32x4_t poly[6];
float pi_consts[4];
float32x4_t shift;
-#if !WANT_SIMD_EXCEPT
float32x4_t range_val;
-#endif
} data = {
/* Coefficients generated using FPMinimax. */
.poly = { V4 (0x1.55555p-2f), V4 (0x1.11166p-3f), V4 (0x1.b88a78p-5f),
.pi_consts
= { -0x1.921fb6p+0f, 0x1.777a5cp-25f, 0x1.ee59dap-50f, 0x1.45f306p-1f },
.shift = V4 (0x1.8p+23f),
-#if !WANT_SIMD_EXCEPT
.range_val = V4 (0x1p15f),
-#endif
};
-#define RangeVal v_u32 (0x47000000) /* asuint32(0x1p15f). */
-#define TinyBound v_u32 (0x30000000) /* asuint32 (0x1p-31f). */
-#define Thresh v_u32 (0x16000000) /* asuint32(RangeVal) - TinyBound. */
-
/* Special cases (fall back to scalar calls). */
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
eval_poly (float32x4_t z, const struct data *d)
{
float32x4_t z2 = vmulq_f32 (z, z);
-#if WANT_SIMD_EXCEPT
- /* Tiny z (<= 0x1p-31) will underflow when calculating z^4.
- If fp exceptions are to be triggered correctly,
- sidestep this by fixing such lanes to 0. */
- uint32x4_t will_uflow
- = vcleq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (z)), TinyBound);
- if (__glibc_unlikely (v_any_u32 (will_uflow)))
- z2 = vbslq_f32 (will_uflow, v_f32 (0), z2);
-#endif
float32x4_t z4 = vmulq_f32 (z2, z2);
return v_estrin_5_f32 (z, z2, z4, d->poly);
}
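v_estrin_5_f32 evaluates the degree-5 polynomial as a shallow tree rather than a serial Horner chain: the pairs (c_i + c_{i+1} z) are formed independently and combined with z^2 and z^4, which shortens the dependency chain on wide cores. A scalar sketch of the scheme (the vector helper contracts these into fmas):

static float
estrin_5_demo (float z, const float c[6])
{
  float z2 = z * z;
  float z4 = z2 * z2;
  /* p(z) = (c0 + c1 z) + z2 (c2 + c3 z) + z4 (c4 + c5 z). */
  float p01 = c[0] + c[1] * z;
  float p23 = c[2] + c[3] * z;
  float p45 = c[4] + c[5] * z;
  return p01 + z2 * p23 + z4 * p45;
}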
const struct data *d = ptr_barrier (&data);
float32x4_t special_arg = x;
- /* iax >= RangeVal means x, if not inf or NaN, is too large to perform fast
- regression. */
-#if WANT_SIMD_EXCEPT
- uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x));
- /* If fp exceptions are to be triggered correctly, also special-case tiny
- input, as this will load to overflow later. Fix any special lanes to 1 to
- prevent any exceptions being triggered. */
- uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, TinyBound), Thresh);
- if (__glibc_unlikely (v_any_u32 (special)))
- x = vbslq_f32 (special, v_f32 (1.0f), x);
-#else
- /* Otherwise, special-case large and special values. */
+ /* Special-case large and special values. */
uint32x4_t special = vcageq_f32 (x, d->range_val);
-#endif
/* n = rint(x/(pi/2)). */
float32x4_t pi_consts = vld1q_f32 (d->pi_consts);
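The shift constant 0x1.8p+23f in the table above is the usual add-and-subtract rounding constant: pushing a float into the binade where the ULP is 1 makes the addition itself perform the rint, with no convert instruction. A scalar sketch, valid for |x| < 2^22 under the default rounding mode:

static float
rint_via_shift_demo (float x)
{
  const float shift = 0x1.8p+23f;
  /* volatile keeps the compiler from folding away the rounding step. */
  volatile float t = x + shift;
  return t - shift;
}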
uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
- float64x2_t u = x;
-
/* Trigger special-cases for tiny, boring and infinity/NaN. */
uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia, d->tiny_bound), d->thresh);
-#if WANT_SIMD_EXCEPT
- /* To trigger fp exceptions correctly, set special lanes to a neutral value.
- They will be fixed up later by the special-case handler. */
- if (__glibc_unlikely (v_any_u64 (special)))
- u = v_zerofy_f64 (u, special);
-#endif
-
- u = vaddq_f64 (u, u);
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
- float64x2_t q = expm1_inline (u, &d->d);
+ float64x2_t twox = vaddq_f64 (x, x);
+ float64x2_t q = expm1_inline (twox, &d->d);
float64x2_t qp2 = vaddq_f64 (q, v_f64 (2.0));
if (__glibc_unlikely (v_any_u64 (special)))
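With q = expm1(2x), the identity in the comment becomes tanh(x) = q / (q + 2), costing one expm1 and one division. A scalar sketch (large |x| is left to the special path, as above):

#include <math.h>

static double
tanh_via_expm1_demo (double x)
{
  /* tanh(x) = (e^2x - 1) / (e^2x + 1) = q / (q + 2) with q = expm1(2x). */
  double q = expm1 (2.0 * x);
  return q / (q + 2.0);
}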
float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (
sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias)));
-#if WANT_SIMD_EXCEPT
- /* If fp exceptions are to be triggered properly, set all special and boring
- lanes to 0, which will trigger no exceptions, and fix them up later. */
- uint32x4_t special = vorrq_u32 (vcgtq_u32 (iax, d->large_bound),
- vcltq_u32 (iax, v_u32 (0x34000000)));
- x = v_zerofy_f32 (x, is_boring);
- if (__glibc_unlikely (v_any_u32 (special)))
- x = v_zerofy_f32 (x, special);
-#else
uint32x4_t special = vcgtq_u32 (iax, d->large_bound);
-#endif
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
/* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
that the approximation is solely the polynomial. */
uint64x2_t k0 = vceqzq_f64 (k);
- cm = v_zerofy_f64 (cm, k0);
+ cm = vbslq_f64 (k0, v_f64 (0), cm);
f = vbslq_f64 (k0, x, f);
#endif
p[2] ? f (x1[2], x2[2]) : y[2],
p[3] ? f (x1[3], x2[3]) : y[3] };
}
-static inline float32x4_t
-v_zerofy_f32 (float32x4_t x, uint32x4_t mask)
-{
- return vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), mask));
-}
static inline float64x2_t
v_f64 (double x)
return (float64x2_t){ p[0] ? f (x1[0], x2[0]) : y[0],
p[1] ? f (x1[1], x2[1]) : y[1] };
}
-static inline float64x2_t
-v_zerofy_f64 (float64x2_t x, uint64x2_t mask)
-{
- return vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), mask));
-}
#endif
#include <math_private.h>
-/* Deprecated config option from Arm Optimized Routines which ensures
- fp exceptions are correctly triggered. This is not intended to be
- supported in GLIBC, however we keep it for ease of development. */
-#define WANT_SIMD_EXCEPT 0
-
/* Return ptr but hide its value from the compiler so accesses through it
cannot be optimized based on the contents. */
#define ptr_barrier(ptr) \