static const struct data
{
- float64x2_t poly[3];
- float64x2_t inv_ln2;
- double ln2[2];
- float64x2_t shift, thres;
- uint64x2_t index_mask, special_bound;
+ double c2, inv_ln2;
+ float64x2_t c0, c1;
+ float64x2_t shift, inf_bound, cosh_9, special_bound;
+ uint64x2_t index_mask;
+ double ln2_hi_lo[2];
} data = {
- .poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3),
- V2 (0x1.5555576a59599p-5), },
-
- .inv_ln2 = V2 (0x1.71547652b82fep8), /* N/ln2. */
+ .c0 = V2 (0x1.fffffffffffd4p-2),
+ .c1 = V2 (0x1.5555571d6b68cp-3),
+ .c2 = 0x1.5555576a59599p-5,
+ .inv_ln2 = 0x1.71547652b82fep8, /* N/ln2. */
/* -ln2/N. */
- .ln2 = {-0x1.62e42fefa39efp-9, -0x1.abc9e3b39803f3p-64},
+ .ln2_hi_lo = { -0x1.62e42fefa39efp-9, -0x1.abc9e3b39803f3p-64 },
.shift = V2 (0x1.8p+52),
- .thres = V2 (704.0),
-
.index_mask = V2 (0xff),
- /* 0x1.6p9, above which exp overflows. */
- .special_bound = V2 (0x4086000000000000),
+ /* ln(2^1023). expm1 helper overflows for large input. */
+ .special_bound = V2 (0x1.6232e147ae148p+9), /* 708.40. */
+ /* Bound past which function returns inf. */
+ .inf_bound = V2 (0x1.634p+9), /* 710.5. */
+ /* Cosh(9) slightly shifted for accuracy. */
+ .cosh_9 = V2 (0x1.fa7157c470f82p+11),
};
-static float64x2_t NOINLINE VPCS_ATTR
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
-{
- return v_call_f64 (cosh, x, y, special);
-}
-
/* Helper for approximating exp(x). Copied from v_exp_tail, with no
special-case handling or tail. */
static inline float64x2_t
{
const struct data *d = ptr_barrier (&data);
+ float64x2_t c2_inv_ln2 = vld1q_f64 (&d->c2);
/* n = round(x/(ln2/N)). */
- float64x2_t z = vfmaq_f64 (d->shift, x, d->inv_ln2);
+ float64x2_t z = vfmaq_laneq_f64 (d->shift, x, c2_inv_ln2, 1);
uint64x2_t u = vreinterpretq_u64_f64 (z);
float64x2_t n = vsubq_f64 (z, d->shift);
/* r = x - n*ln2/N. */
- float64x2_t ln2 = vld1q_f64 (d->ln2);
- float64x2_t r = vfmaq_laneq_f64 (x, n, ln2, 0);
- r = vfmaq_laneq_f64 (r, n, ln2, 1);
+ float64x2_t ln2_hi_lo = vld1q_f64 (d->ln2_hi_lo);
+ float64x2_t r = vfmaq_laneq_f64 (x, n, ln2_hi_lo, 0);
+ r = vfmaq_laneq_f64 (r, n, ln2_hi_lo, 1);
uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS);
uint64x2_t i = vandq_u64 (u, d->index_mask);
/* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */
- float64x2_t y = vfmaq_f64 (d->poly[1], d->poly[2], r);
- y = vfmaq_f64 (d->poly[0], y, r);
- y = vmulq_f64 (vfmaq_f64 (v_f64 (1), y, r), r);
+ float64x2_t poly = vfmaq_laneq_f64 (d->c1, r, c2_inv_ln2, 0);
+ poly = vfmaq_f64 (d->c0, poly, r);
+ poly = vmulq_f64 (vfmaq_f64 (v_f64 (1), poly, r), r);
/* s = 2^(n/N). */
u = v_lookup_u64 (__v_exp_tail_data, i);
- float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+ float64x2_t scale = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+ return vfmaq_f64 (scale, poly, scale);
+}
+
+/* Uses the compound angle formula to adjust x back into an approximable range:
+ cosh (A + B) = cosh(A)cosh(B) + sinh(A)sinh(B)
+ By choosing sufficiently large values whereby after rounding cosh == sinh,
+ this can be simplified into: cosh (A + B) = cosh(A) * e^B. */
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t t, uint64x2_t special)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* Complete fast path computation. */
+ float64x2_t half_t = vmulq_n_f64 (t, 0.5);
+ float64x2_t half_over_t = vdivq_f64 (v_f64 (0.5), t);
+ float64x2_t y = vaddq_f64 (half_t, half_over_t);
+
+ /* Absolute x so we can subtract 9.0. */
+ float64x2_t ax = vabsq_f64 (x);
+ /* Subtract 9.0 from x as a reduction to prevent early overflow. */
+ float64x2_t sx = vsubq_f64 (ax, v_f64 (9.0));
+ float64x2_t s = exp_inline (sx);
+
+ /* Multiply the result by cosh(9) slightly shifted for accuracy. */
+ float64x2_t r = vmulq_f64 (s, d->cosh_9);
+
+ /* Check for overflowing lanes and return inf. */
+ uint64x2_t cmp = vcagtq_f64 (ax, d->inf_bound);
+
+ /* Set overflowing lines to inf and set none over flowing to result. */
+ r = vbslq_f64 (cmp, v_f64 (INFINITY), r);
- return vfmaq_f64 (s, y, s);
+ /* Return r for special lanes and y for none special lanes. */
+ return vbslq_f64 (special, r, y);
}
/* Approximation for vector double-precision cosh(x) using exp_inline.
cosh(x) = (exp(x) + exp(-x)) / 2.
- The greatest observed error is in the scalar fall-back region, so is the
- same as the scalar routine, 1.93 ULP:
- _ZGVnN2v_cosh (0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021
- want 0x1.fdf28623ef923p+1021.
-
- The greatest observed error in the non-special region is 1.54 ULP:
- _ZGVnN2v_cosh (0x1.8e205b6ecacf7p+2) got 0x1.f711dcb0c77afp+7
- want 0x1.f711dcb0c77b1p+7. */
+ The greatest observed error in the non-special region is 2.12 + 0.5 ULP:
+ _ZGVnN2v_cosh (-0x1.6241387f982f3p+1) got 0x1.ff784e05bad75p+2
+ want 0x1.ff784e05bad72p+2. */
float64x2_t VPCS_ATTR V_NAME_D1 (cosh) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
- float64x2_t ax = vabsq_f64 (x);
- uint64x2_t special
- = vcgtq_u64 (vreinterpretq_u64_f64 (ax), d->special_bound);
-
/* Up to the point that exp overflows, we can use it to calculate cosh by
exp(|x|) / 2 + 1 / (2 * exp(|x|)). */
- float64x2_t t = exp_inline (ax);
- float64x2_t half_t = vmulq_n_f64 (t, 0.5);
- float64x2_t half_over_t = vdivq_f64 (v_f64 (0.5), t);
+ float64x2_t t = exp_inline (x);
- /* Fall back to scalar for any special cases. */
+ /* Check for special cases. */
+ uint64x2_t special = vcagtq_f64 (x, d->special_bound);
+ /* Fall back to vectorised special case for any lanes which would cause
+ exp to overflow. */
if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (x, vaddq_f64 (half_t, half_over_t), special);
+ return special_case (x, t, special);
+ /* Complete fast path if no special lanes. */
+ float64x2_t half_t = vmulq_n_f64 (t, 0.5);
+ float64x2_t half_over_t = vdivq_f64 (v_f64 (0.5), t);
return vaddq_f64 (half_t, half_over_t);
}
{
double c0, c2;
double c1, c3;
+ double special_bound;
float64_t inv_ln2, ln2_hi, ln2_lo, shift;
- uint64_t special_bound;
+ float64_t exp_9;
} data = {
/* Generated using Remez, in [-log(2)/128, log(2)/128]. */
.c0 = 0x1.fffffffffdbcdp-2,
/* 1/ln2. */
.inv_ln2 = 0x1.71547652b82fep+0,
.shift = 0x1.800000000ff80p+46, /* 1.5*2^46+1022. */
-
- /* asuint(ln(2^(1024 - 1/128))), the value above which exp overflows. */
- .special_bound = 0x40862e37e7d8ba72,
+ /* (ln(2^(1021 + 1/128))), above which exp overflows. */
+ .special_bound = 0x1.61dab63dc7dc1p+9, /* ~ 707.71. */
+ .exp_9 = 0x1.fa7157c470f82p+12, /* exp(9) ~ 8103.08. */
};
/* Helper for approximating exp(x)/2.
return svmla_x (pg, scale, scale, p);
}
-/* Vectorised special case to handle values past where exp_inline overflows.
- Halves the input value and uses the identity exp(x) = exp(x/2)^2 to double
- the valid range of inputs, and returns inf for anything past that. */
+/* Uses the compound angle formula to adjust x back into an approximable range:
+ cosh (A + B) = cosh(A)cosh(B) + sinh(A)sinh(B)
+ By choosing sufficiently large values whereby after rounding cosh == sinh,
+ this can be simplified into: cosh (A + B) = cosh(A) * e^B. */
static svfloat64_t NOINLINE
-special_case (svbool_t pg, svbool_t special, svfloat64_t ax, svfloat64_t t,
+special_case (svfloat64_t x, svbool_t pg, svbool_t special, svfloat64_t t,
const struct data *d)
{
/* Finish fast path to compute values for non-special cases. */
svfloat64_t inv_twoexp = svdivr_x (pg, t, 0.25);
svfloat64_t y = svadd_x (pg, t, inv_twoexp);
- /* Halves input value, and then check if any cases
- are still going to overflow. */
- ax = svmul_x (special, ax, 0.5);
- svbool_t is_safe
- = svcmplt (special, svreinterpret_u64 (ax), d->special_bound);
+ /* Absolute x so we can subtract 9.0 without worrying about signing. */
+ svfloat64_t ax = svabs_x (svptrue_b64 (), x);
+ /* The input `x` is reduced by an offset of 9.0 to allow for accurate
+ approximation on the interval x > SpecialBound ~ 710.47. */
+ ax = svsub_x (svptrue_b64 (), ax, 9.0);
- /* Computes exp(x/2), and sets any overflowing lanes to inf. */
- svfloat64_t half_exp = exp_over_two_inline (special, ax, d);
- half_exp = svsel (is_safe, half_exp, sv_f64 (INFINITY));
+ svfloat64_t half_exp = exp_over_two_inline (svptrue_b64 (), ax, d);
- /* Construct special case cosh(x) = (exp(x/2)^2)/2. */
- svfloat64_t exp = svmul_x (svptrue_b64 (), half_exp, 2);
- svfloat64_t special_y = svmul_x (special, exp, half_exp);
+ /* Multiply the result by exp(9) for special lanes only. */
+ svfloat64_t cosh_sum = svmul_x (svptrue_b64 (), half_exp, d->exp_9);
- /* Select correct return values for special and non-special cases. */
- special_y = svsel (special, special_y, y);
+ /* Check for overflowing special lanes and return inf for these lanes. */
+ svbool_t is_inf = svcmpgt (special, ax, d->special_bound);
+ /* Return inf for overflowing lanes. */
+ svfloat64_t special_y = svsel (is_inf, sv_f64 (INFINITY), cosh_sum);
- /* Ensure an input of nan is correctly propagated. */
- svbool_t is_nan
- = svcmpgt (special, svreinterpret_u64 (ax), sv_u64 (0x7ff0000000000000));
- return svsel (is_nan, ax, svsel (special, special_y, y));
+ return svsel (special, special_y, y);
}
/* Approximation for SVE double-precision cosh(x) using exp_inline.
cosh(x) = (exp(x) + exp(-x)) / 2.
- The greatest observed error in special case region is 2.66 + 0.5 ULP:
- _ZGVsMxv_cosh (0x1.633b532ffbc1ap+9) got 0x1.f9b2d3d22399ep+1023
- want 0x1.f9b2d3d22399bp+1023
-
- The greatest observed error in the non-special region is 1.01 + 0.5 ULP:
- _ZGVsMxv_cosh (0x1.998ecbb3c1f81p+1) got 0x1.890b225657f84p+3
- want 0x1.890b225657f82p+3. */
+ The greatest observed error is 2.10 + 0.5 ULP:
+ _ZGVsMxv_cosh (-0x1.2acb2978bd15ep+4) got 0x1.ebbd8806ea342p+25
+ want 0x1.ebbd8806ea33fp+25. */
svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svfloat64_t ax = svabs_x (pg, x);
- svbool_t special = svcmpgt (pg, svreinterpret_u64 (ax), d->special_bound);
-
/* Up to the point that exp overflows, we can use it to calculate cosh by
(exp(|x|)/2 + 1) / (2 * exp(|x|)). */
- svfloat64_t half_exp = exp_over_two_inline (pg, ax, d);
+ svfloat64_t half_exp = exp_over_two_inline (pg, x, d);
- /* Falls back to entirely standalone vectorized special case. */
+ /* Fall back to vectorised special case for any lanes which would cause
+ exp to overflow. */
+ svbool_t special = svacge (pg, x, d->special_bound);
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (pg, special, ax, half_exp, d);
+ return special_case (x, pg, special, half_exp, d);
svfloat64_t inv_twoexp = svdivr_x (pg, half_exp, 0.25);
return svadd_x (pg, half_exp, inv_twoexp);
static const struct data
{
struct v_expf_data expf_consts;
- float32x4_t bound;
+ float32x4_t special_bound, inf_bound, cosh_9, nine;
} data = {
.expf_consts = V_EXPF_DATA,
- /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
- .bound = V4 (0x1.5a92d8p+6),
+ /* 86.64: expf overflows above this, so have to use special case. */
+ .special_bound = V4 (0x1.5a92d8p+6),
+ /* Value above which inf is returned. */
+ .inf_bound = V4 (0x1.65a9fap+6), /* ~ 89.42. */
+ .cosh_9 = V4 (0x1.fa715845p+11), /* cosh(9). */
+ .nine = V4 (0x1.2p+3), /* 9.0. */
};
+/* Uses the compound angle formula to adjust x back into an approximable range:
+ cosh (A + B) = cosh(A)cosh(B) + sinh(A)sinh(B)
+ By choosing sufficiently large values whereby after rounding cosh == sinh,
+ this can be simplified into: cosh (A + B) = cosh(A) * e^B. */
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t half_t, float32x4_t half_over_t,
- uint32x4_t special)
+special_case (float32x4_t x, float32x4_t t, uint32x4_t special)
{
- return v_call_f32 (coshf, x, vaddq_f32 (half_t, half_over_t), special);
+ const struct data *d = ptr_barrier (&data);
+
+ /* Complete fast path computation. */
+ /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
+ float32x4_t half_t = vmulq_n_f32 (t, 0.5);
+ float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t);
+ float32x4_t y = vaddq_f32 (half_t, half_over_t);
+
+ /* Absolute x so we can subtract 9.0 without worrying about signing. */
+ float32x4_t ax = vabsq_f32 (x);
+ /* Subtract 9.0 from x as a reduction to prevent early overflow. */
+ float32x4_t sx = vsubq_f32 (ax, d->nine);
+ float32x4_t s = v_expf_inline (sx, &d->expf_consts);
+
+ /* Multiply the result by cosh(9) slightly shifted for accuracy. */
+ float32x4_t r = vmulq_f32 (s, d->cosh_9);
+
+ /* Check for overflowing lanes and return inf. */
+ uint32x4_t cmp = vcagtq_f32 (ax, d->inf_bound);
+
+ /* Set overflowing lines to inf and set none over flowing to result. */
+ r = vbslq_f32 (cmp, v_f32 (INFINITY), r);
+
+ /* Return r for special lanes and y for none special lanes. */
+ return vbslq_f32 (special, r, y);
}
/* Single-precision vector cosh, using vector expf.
{
const struct data *d = ptr_barrier (&data);
- uint32x4_t special = vcageq_f32 (x, d->bound);
float32x4_t t = v_expf_inline (x, &d->expf_consts);
+ /* Check for special cases. */
+ uint32x4_t special = vcageq_f32 (x, d->special_bound);
+ /* Fall back to vectorised special case for any lanes which would cause
+ expm1 to overflow. */
+ if (__glibc_unlikely (v_any_u32 (special)))
+ return special_case (x, t, special);
+
+ /* Complete fast path if no special lanes. */
/* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
float32x4_t half_t = vmulq_n_f32 (t, 0.5);
float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t);
-
- if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (x, half_t, half_over_t, special);
-
return vaddq_f32 (half_t, half_over_t);
}
libmvec_hidden_def (V_NAME_F1 (cosh))
#include "sv_math.h"
#include "sv_expf_inline.h"
+/* For x < SpecialBound, the result of exp is subnormal and not handled
+ correctly by FEXPA. */
+#define SpecialBound 0x1.5d5e2ap+6f /* ~ 87.34. */
+
static const struct data
{
struct sv_expf_data expf_consts;
- float special_bound;
+ float32_t special_bound, cosh_9;
} data = {
.expf_consts = SV_EXPF_DATA,
- /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
- .special_bound = 0x1.5a92d8p+6,
+ .special_bound = SpecialBound,
+ .cosh_9 = 0x1.fa715845p+11, /* cosh(9). */
};
-static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t half_e, svfloat32_t half_over_e,
- svbool_t pg)
+/* Uses the compound angle formula to adjust x back into an approximable range:
+ cosh (A + B) = cosh(A)cosh(B) + sinh(A)sinh(B)
+ By choosing sufficiently large values whereby after rounding cosh == sinh,
+ this can be simplified into: cosh (A + B) = cosh(A) * e^B. */
+static inline svfloat32_t
+special_case (svfloat32_t x, svbool_t special, svfloat32_t half_e,
+ svfloat32_t half_over_e, const struct data *d)
{
- return sv_call_f32 (coshf, x, svadd_x (svptrue_b32 (), half_e, half_over_e),
- pg);
+ /* Finish fastpass to compute values for non-special cases. */
+ svfloat32_t y = svadd_x (svptrue_b32 (), half_e, half_over_e);
+
+ /* Make special values positive. */
+ svfloat32_t ax = svabs_x (svptrue_b32 (), x);
+
+ /* The input `x` is reduced by an offset of 9.0 to allow for accurate
+ approximation on the interval `x > SpecialBound ~ 87.34`. */
+ ax = svsub_x (svptrue_b32 (), ax, 9.0);
+ svfloat32_t r = expf_inline (ax, svptrue_b32 (), &d->expf_consts);
+
+ /* Multiply the result e by cosh(9) = exp(9)/2 for special lanes only. */
+ svfloat32_t coshf_sum = svmul_x (svptrue_b32 (), r, d->cosh_9);
+
+ /* Check for overflow in exponential for special case lanes. */
+ svbool_t is_inf = svcmpge (special, ax, d->special_bound);
+
+ /* Set overflowing lines to inf and set none over flowing to result. */
+ svfloat32_t special_y = svsel (is_inf, sv_f32 (INFINITY), coshf_sum);
+
+ /* Return special_y for special lanes and y for none special lanes. */
+ return svsel (special, special_y, y);
}
/* Single-precision vector cosh, using vector expf.
- Maximum error is 2.56 +0.5 ULP:
+ Maximum error is 2.55 +0.5 ULP:
_ZGVsMxv_coshf(-0x1.5b40f4p+1) got 0x1.e47748p+2
want 0x1.e4774ep+2. */
svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svbool_t special = svacge (pg, x, d->special_bound);
-
/* Calculate cosh by exp(x) / 2 + exp(-x) / 2.
Note that x is passed to exp here, rather than |x|. This is to avoid using
destructive unary ABS for better register usage. However it means the
svfloat32_t half_e = svmul_x (svptrue_b32 (), e, 0.5);
svfloat32_t half_over_e = svdivr_x (pg, e, 0.5);
+ /* Check for special cases and fall back to vectorised special case for any
+ lanes which would cause expf to overflow. */
+ svbool_t special = svacgt (pg, x, d->special_bound);
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, half_e, half_over_e, special);
+ return special_case (x, special, half_e, half_over_e, d);
+ /* Complete fast path if no special lanes. */
return svadd_x (svptrue_b32 (), half_e, half_over_e);
}
{
struct v_expm1_data d;
uint64x2_t halff;
- float64x2_t large_bound;
+ float64x2_t special_bound;
+ float64x2_t inf_bound, cosh_9;
} data = {
.d = V_EXPM1_DATA,
.halff = V2 (0x3fe0000000000000),
- /* 2^9. expm1 helper overflows for large input. */
- .large_bound = V2 (0x1p+9),
+ /* ln(2^1023). expm1 helper overflows for large input. */
+ .special_bound = V2 (0x1.628b76e3a7b61p+9), /* 709.09. */
+ /* Bound past which function returns inf. */
+ .inf_bound = V2 (0x1.634p+9), /* 710.5. */
+ /* Cosh(9) slightly shifted for accuracy. */
+ .cosh_9 = V2 (0x1.fa7157c470f82p+11),
};
+/* Uses the compound angle formula to adjust x back into an approximable range:
+ sinh (A + B) = cosh(A)cosh(B) + sinh(A)sinh(B)
+ By choosing sufficiently large values whereby after rounding sinh == cosh,
+ this can be simplified into: sinh (A + B) = sinh(A) * e^B. */
static float64x2_t NOINLINE VPCS_ATTR
special_case (float64x2_t x, float64x2_t t, float64x2_t halfsign,
uint64x2_t special)
{
- return v_call_f64 (sinh, x, vmulq_f64 (t, halfsign), special);
+ const struct data *d = ptr_barrier (&data);
+
+ /* Complete fast path. */
+ t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
+ float64x2_t y = vmulq_f64 (t, halfsign);
+
+ float64x2_t ax = vabsq_f64 (x);
+
+ /* Preserve sign for later use. */
+ uint64x2_t sign
+ = veorq_u64 (vreinterpretq_u64_f64 (x), vreinterpretq_u64_f64 (ax));
+
+ /* Subtract 9.0 from x as a reduction to prevent early overflow. */
+ float64x2_t sx = vsubq_f64 (ax, v_f64 (9.0));
+ float64x2_t s = expm1_inline (sx, &d->d);
+
+ /* Multiply the result by cosh(9) slightly shifted for accuracy. */
+ float64x2_t r = vmulq_f64 (s, d->cosh_9);
+
+ /* Check for overflowing lanes and set to inf. */
+ uint64x2_t cmp = vcagtq_f64 (ax, d->inf_bound);
+
+ /* Set overflowing lines to inf and set none over flowing to result. */
+ r = vbslq_f64 (cmp, v_f64 (INFINITY), r);
+
+ /* Change sign back to original and return. */
+ r = vreinterpretq_f64_u64 (vorrq_u64 (sign, vreinterpretq_u64_f64 (r)));
+
+ /* Return r for special lanes and y for none special lanes. */
+ return vbslq_f64 (special, r, y);
}
/* Approximation for vector double-precision sinh(x) using expm1.
float64x2_t halfsign = vreinterpretq_f64_u64 (
vbslq_u64 (v_u64 (0x8000000000000000), ix, d->halff));
- uint64x2_t special = vcageq_f64 (x, d->large_bound);
-
/* Up to the point that expm1 overflows, we can use it to calculate sinh
using a slight rearrangement of the definition of sinh. This allows us to
retain acceptable accuracy for very small inputs. */
float64x2_t t = expm1_inline (ax, &d->d);
- t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
+ /* Check for special cases. */
+ uint64x2_t special = vcageq_f64 (x, d->special_bound);
+ /* Fall back to vectorised special case for any lanes which would cause
+ expm1 to overflow. */
if (__glibc_unlikely (v_any_u64 (special)))
return special_case (x, t, halfsign, special);
+ /* Complete fast path if no special lanes. */
+ t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
return vmulq_f64 (t, halfsign);
}
static const struct data
{
+ uint64_t expm1_data[20];
uint64_t halff;
double c2, c4;
double inv_ln2;
double ln2_hi, ln2_lo;
double c0, c1, c3;
- double shift, special_bound, bound;
- uint64_t expm1_data[20];
+ double shift, small_bound;
+ double special_bound, cosh_9;
} data = {
/* Table lookup of 2^(i/64) - 1, for values of i from 0..19. */
.expm1_data = {
.shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */
.halff = 0x3fe0000000000000,
.special_bound = 0x1.62e37e7d8ba72p+9, /* ln(2^(1024 - 1/128)). */
- .bound = 0x1.a56ef8ec924ccp-3 /* 19*ln2/64. */
+ .small_bound = 0x1.a56ef8ec924ccp-3, /* 19*ln2/64. */
+ /* cosh(9) 4051.541963787692 slightly shifted for accuracy. */
+ .cosh_9 = 0x1.fa7157c470f82p+11,
};
/* A specialised FEXPA expm1 that is only valid for positive inputs and
This can be circumvented by using a small lookup for scale-1
when our input is below a certain bound, otherwise we can use FEXPA. */
- svbool_t is_small = svaclt (pg, x, d->bound);
+ svbool_t is_small = svaclt (pg, x, d->small_bound);
/* Index via the input of FEXPA, but we only care about the lower 5 bits. */
svuint64_t base_idx = svand_x (pg, u, 0x1f);
return svmla_x (pg, scalem1, scale, p);
}
-/* Vectorised special case to handle values past where exp_inline overflows.
- Halves the input value and uses the identity exp(x) = exp(x/2)^2 to double
- the valid range of inputs, and returns inf for anything past that. */
+/* Uses the compound angle formula to adjust x back into an approximable range:
+ sinh (A + B) = cosh(A)cosh(B) + sinh(A)sinh(B)
+ By choosing sufficiently large values whereby after rounding sinh == cosh,
+ this can be simplified into: sinh (A + B) = sinh(A) * e^B. */
static svfloat64_t NOINLINE
-special_case (svbool_t pg, svbool_t special, svfloat64_t ax,
- svfloat64_t halfsign, const struct data *d)
+special_case (svuint64_t sign, svbool_t pg, svbool_t special, svfloat64_t ax,
+ svfloat64_t halfsign)
{
- /* Halves input value, and then check if any cases
- are still going to overflow. */
- ax = svmul_x (special, ax, 0.5);
- svbool_t is_safe = svaclt (special, ax, d->special_bound);
+ const struct data *d = ptr_barrier (&data);
- svfloat64_t t = expm1_inline (pg, ax);
+ /* The input `x` is reduced by an offset of 9.0 to allow for accurate
+ approximation on the interval x > SpecialBound ~ 709.78. */
+ ax = svsub_m (special, ax, 9.0);
+ svfloat64_t t = expm1_inline (pg, ax);
/* Finish fastpass to compute values for non-special cases. */
svfloat64_t y = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
y = svmul_x (pg, y, halfsign);
- /* Computes special lane, and set remaining overflow lanes to inf. */
- svfloat64_t half_special_y = svmul_x (svptrue_b64 (), t, halfsign);
- svfloat64_t special_y = svmul_x (svptrue_b64 (), half_special_y, t);
+ /* Multiply the result by cosh(9) with a slight tweek for accuracy for
+ special lanes only. */
+ svfloat64_t cosh_sum = svmul_x (svptrue_b64 (), t, d->cosh_9);
+
+ /* Check for overflowing special lanes. */
+ svbool_t is_inf = svcmpgt (special, ax, d->special_bound);
+ /* Return inf for overflowing lanes. */
+ svfloat64_t special_y = svsel (is_inf, sv_f64 (INFINITY), cosh_sum);
- svuint64_t signed_inf
- = svorr_x (svptrue_b64 (), svreinterpret_u64 (halfsign),
- sv_u64 (0x7ff0000000000000));
- special_y = svsel (is_safe, special_y, svreinterpret_f64 (signed_inf));
+ /* Change sign back to original and return. */
+ special_y = svreinterpret_f64 (
+ svorr_x (svptrue_b64 (), sign, svreinterpret_u64 (special_y)));
/* Join resulting vectors together and return. */
return svsel (special, special_y, y);
/* Approximation for SVE double-precision sinh(x) using FEXPA expm1.
Uses sinh(x) = e^2x - 1 / 2e^x, rewritten for accuracy.
- The greatest observed error in the non-special region is 2.63 + 0.5 ULP:
+ The greatest observed error is 2.62 + 0.5 ULP:
_ZGVsMxv_sinh (0x1.b5e0e13ba88aep-2) got 0x1.c3587faf97b0cp-2
- want 0x1.c3587faf97b09p-2
-
- The greatest observed error in the special region is 2.65 + 0.5 ULP:
- _ZGVsMxv_sinh (0x1.633ce847dab1ap+9) got 0x1.fffd30eea0066p+1023
- want 0x1.fffd30eea0063p+1023. */
+ want 0x1.c3587faf97b09p-2. */
svfloat64_t SV_NAME_D1 (sinh) (svfloat64_t x, svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
= sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax));
svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, d->halff));
- /* Fall back to scalar variant for all lanes if any are special. */
+ /* Fall back to vectorised special case for any lanes which would cause
+ expm1f to overflow. */
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (pg, special, ax, halfsign, d);
+ return special_case (sign, pg, special, ax, halfsign);
/* Up to the point that expm1 overflows, we can use it to calculate sinh
using a slight rearrangement of the definition of sinh. This allows us to
static const struct data
{
struct v_expm1f_data expm1f_consts;
- float32x4_t oflow_bound;
+ float32x4_t special_bound, inf_bound, cosh_9, nine;
} data = {
.expm1f_consts = V_EXPM1F_DATA,
- /* 0x1.61814ep+6, above which expm1f helper overflows. */
- .oflow_bound = V4 (0x1.61814ep+6),
+ /* 88.38, above which expm1f helper overflows. */
+ .special_bound = V4 (0x1.61814ap+6),
+ /* Value above which inf is returned. */
+ .inf_bound = V4 (0x1.65a9fap+6), /* ~ 89.42. */
+ .cosh_9 = V4 (0x1.fa715845p+11), /* cosh(9). */
+ .nine = V4 (0x1.2p+3), /* 9.0. */
};
+/* Uses the compound angle formula to adjust x back into an approximable range:
+ sinh (A + B) = cosh(A)cosh(B) + sinh(A)sinh(B)
+ By choosing sufficiently large values whereby after rounding sinh == cosh,
+ this can be simplified into: sinh (A + B) = sinh(A) * e^B. */
static float32x4_t NOINLINE VPCS_ATTR
special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign,
uint32x4_t special)
{
- return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special);
+ const struct data *d = ptr_barrier (&data);
+
+ /* Complete fast path. */
+ t = vaddq_f32 (t, vdivq_f32 (t, vaddq_f32 (t, v_f32 (1.0))));
+ float32x4_t y = vmulq_f32 (t, halfsign);
+
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+
+ /* Preserve sign for later use. */
+ uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax);
+
+ /* Subtract 9.0 from x as a reduction to prevent early overflow. */
+ float32x4_t sx = vsubq_f32 (ax, d->nine);
+ float32x4_t s = expm1f_inline (sx, &d->expm1f_consts);
+
+ /* Multiply the result by cosh(9) slightly shifted for accuracy. */
+ float32x4_t r = vmulq_f32 (s, d->cosh_9);
+
+ /* Check for overflowing lanes and return inf. */
+ uint32x4_t cmp = vcagtq_f32 (ax, d->inf_bound);
+
+ /* Set overflowing lines to inf and set none over flowing to result. */
+ r = vbslq_f32 (cmp, v_f32 (INFINITY), r);
+
+ /* Change sign back to original and return. */
+ r = vreinterpretq_f32_u32 (vorrq_u32 (sign, vreinterpretq_u32_f32 (r)));
+
+ /* Return r for special lanes and y for none special lanes. */
+ return vbslq_f32 (special, r, y);
}
/* Approximation for vector single-precision sinh(x) using expm1.
float32x4_t halfsign = vreinterpretq_f32_u32 (
vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5))));
- uint32x4_t special = vcageq_f32 (x, d->oflow_bound);
-
/* Up to the point that expm1f overflows, we can use it to calculate sinhf
- using a slight rearrangement of the definition of asinh. This allows us
+ using a slight rearrangement of the definition of asinh. This allows us
to retain acceptable accuracy for very small inputs. */
float32x4_t t = expm1f_inline (ax, &d->expm1f_consts);
- t = vaddq_f32 (t, vdivq_f32 (t, vaddq_f32 (t, v_f32 (1.0))));
- /* Fall back to the scalar variant for any lanes that should trigger an
- exception. */
+ /* Check for special cases. */
+ uint32x4_t special = vcageq_f32 (x, d->special_bound);
+
+ /* Fall back to vectorised special case for any lanes which would cause
+ expm1 to overflow. */
if (__glibc_unlikely (v_any_u32 (special)))
return special_case (x, t, halfsign, special);
+ /* Complete fast path if no special lanes. */
+ t = vaddq_f32 (t, vdivq_f32 (t, vaddq_f32 (t, v_f32 (1.0))));
return vmulq_f32 (t, halfsign);
}
libmvec_hidden_def (V_NAME_F1 (sinh))
<https://www.gnu.org/licenses/>. */
#include "sv_expm1f_inline.h"
+#include "sv_expf_inline.h"
#include "sv_math.h"
static const struct data
{
struct sv_expm1f_data expm1f_consts;
- uint32_t halff, large_bound;
+ struct sv_expf_data expf_consts;
+ float32_t special_bound, cosh_9;
+ uint32_t halff;
} data = {
.expm1f_consts = SV_EXPM1F_DATA,
+ .expf_consts = SV_EXPF_DATA,
.halff = 0x3f000000,
- /* 0x1.61814ep+6, above which expm1f helper overflows. */
- .large_bound = 0x42b0c0a7,
+ /* ~ 88.37 above which expm1f helper overflows. */
+ .special_bound = 0x1.61814ap+6,
+ .cosh_9 = 0x1.fa715845p+11, /* cosh(9) ~ 4051.54. */
};
-static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t pg)
+/* Uses the compound angle formula to adjust x back into an approximable range:
+ sinh (A + B) = cosh(A)cosh(B) + sinh(A)sinh(B)
+ By choosing sufficiently large values whereby after rounding sinh == cosh,
+ this can be simplified into: sinh (A + B) = sinh(A) * e^B. */
+static inline svfloat32_t
+special_case (const svbool_t pg, svbool_t special, svfloat32_t ax,
+ svfloat32_t x, svfloat32_t t, const struct data *d)
{
- return sv_call_f32 (sinhf, x, y, pg);
+ /* Preserve the sign bit to return final calcualtion to correct sign. */
+ svuint32_t sign
+ = sveor_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (ax));
+
+ /* Finish fastpass to compute values for non-special cases. */
+ svfloat32_t halfsign = svreinterpret_f32 (svorr_x (pg, sign, d->halff));
+ t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
+ svfloat32_t y = svmul_x (svptrue_b32 (), t, halfsign);
+
+ /* The input `x` is reduced by an offset of 9.0 to allow for accurate
+ approximation on the interval x > SpecialBound ~ 88.37. */
+ ax = svsub_x (svptrue_b32 (), ax, 9.0);
+ svfloat32_t e = expf_inline (ax, svptrue_b32 (), &d->expf_consts);
+
+ /* Multiply the result e by cosh(9) = exp(9)/2 for special lanes only. */
+ svfloat32_t sinhf_sum = svmul_x (svptrue_b32 (), e, d->cosh_9);
+
+ /* Check for overflow in exponential for special case lanes. */
+ svbool_t is_inf = svcmpge (special, ax, d->special_bound);
+
+ /* Set overflowing lines to inf and set none over flowing to result. */
+ svfloat32_t special_y = svsel (is_inf, sv_f32 (INFINITY), sinhf_sum);
+
+ /* Change sign back to original and return. */
+ special_y = svreinterpret_f32 (
+ svorr_x (svptrue_b32 (), sign, svreinterpret_u32 (special_y)));
+
+ /* Return special_y for special lanes and y for none special lanes. */
+ return svsel (special, special_y, y);
}
/* Approximation for SVE single-precision sinh(x) using expm1.
sinh(x) = (exp(x) - exp(-x)) / 2.
- The maximum error is 2.26 ULP:
- _ZGVsMxv_sinhf (0x1.e34a9ep-4) got 0x1.e469ep-4
- want 0x1.e469e4p-4. */
+ Maximum error is 2.76 +0.5 ULP:
+ _ZGVsMxv_sinhf (0x1.6587e8p+6) got 0x1.ef3f98p+127
+ want 0x1.ef3f92p+127. */
svfloat32_t SV_NAME_F1 (sinh) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svfloat32_t ax = svabs_x (pg, x);
- svuint32_t sign
- = sveor_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (ax));
- svfloat32_t halfsign = svreinterpret_f32 (svorr_x (pg, sign, d->halff));
- svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->large_bound);
+ /* Use absolute number for calculations for accuracy. */
+ svfloat32_t ax = svabs_x (pg, x);
/* Up to the point that expm1f overflows, we can use it to calculate sinhf
- using a slight rearrangement of the definition of asinh. This allows us to
- retain acceptable accuracy for very small inputs. */
+ using a slight rearrangement of the definition of asinh. This allows us to
+ retain acceptable accuracy for very small inputs. */
svfloat32_t t = expm1f_inline (ax, pg, &d->expm1f_consts);
- t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
- /* Fall back to the scalar variant for any lanes which would cause
- expm1f to overflow. */
+ /* Check for special cases and fall back to vectorised special case for any
+ lanes which would cause expm1f to overflow. */
+ svbool_t special = svacge (pg, x, d->special_bound);
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svmul_x (pg, t, halfsign), special);
+ return special_case (pg, special, ax, x, t, d);
+ /* Preserve the sign bit to return final calcualtion to correct sign. */
+ svuint32_t sign
+ = sveor_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (ax));
+ svfloat32_t halfsign = svreinterpret_f32 (svorr_x (pg, sign, d->halff));
+ /* Complete fast path if no special lanes. */
+ t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
return svmul_x (svptrue_b32 (), t, halfsign);
}
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
-#include "v_expm1_inline.h"
static const struct data
{
- struct v_expm1_data d;
- uint64x2_t thresh, tiny_bound;
+ float64x2_t c2, c4, c6, c8;
+ float64x2_t two_over_ln2;
+ int64x2_t exponent_bias;
+ double c1, c3, c5, c7, c9, c10;
+ double ln2_hi_lo[2];
+ float64x2_t special_bound;
} data = {
- .d = V_EXPM1_DATA,
- .tiny_bound = V2 (0x3e40000000000000), /* asuint64 (0x1p-27). */
- /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */
- .thresh = V2 (0x01f241bf835f9d5f),
+ .c1 = 0x1.5555555555559p-3,
+ .c2 = V2 (0x1.555555555554bp-5),
+ .c3 = 0x1.111111110f663p-7,
+ .c4 = V2 (0x1.6c16c16c1b5f3p-10),
+ .c5 = 0x1.a01a01affa35dp-13,
+ .c6 = V2 (0x1.a01a018b4ecbbp-16),
+ .c7 = 0x1.71ddf82db5bb4p-19,
+ .c8 = V2 (0x1.27e517fc0d54bp-22),
+ .c9 = 0x1.af5eedae67435p-26,
+ .c10 = 0x1.1f143d060a28ap-29,
+ .ln2_hi_lo = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 },
+ .two_over_ln2 = V2 (0x1.71547652b82fep1),
+ .exponent_bias = V2 (0x3ff0000000000000),
+ /* Bound past which function returns signed 1 as the result. */
+ .special_bound = V2 (0x1.2cccccccccccdp+4), /* 18.80. */
};
+/* e^2x - 1 inline helper. */
+static inline float64x2_t
+e2xm1_inline (float64x2_t x, const struct data *d)
+{
+ float64x2_t ln2_hi_lo = vld1q_f64 (&d->ln2_hi_lo[0]);
+
+ /* Reduce argument to smaller range:
+ Let i = round(x / ln2)
+ and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where 2^i is exact because i is an integer. */
+ float64x2_t n = vrndaq_f64 (vmulq_f64 (x, d->two_over_ln2));
+ int64x2_t i = vcvtq_s64_f64 (n);
+ float64x2_t f = vaddq_f64 (x, x);
+ f = vfmsq_laneq_f64 (f, n, ln2_hi_lo, 0);
+ f = vfmsq_laneq_f64 (f, n, ln2_hi_lo, 1);
+
+ /* Approximate expm1(f) using polynomial.
+ Taylor expansion for expm1(x) has the form:
+ x + ax^2 + bx^3 + cx^4 ....
+ So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+ and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t f4 = vmulq_f64 (f2, f2);
+ float64x2_t lane_consts_13 = vld1q_f64 (&d->c1);
+ float64x2_t lane_consts_57 = vld1q_f64 (&d->c5);
+ float64x2_t lane_consts_910 = vld1q_f64 (&d->c9);
+ float64x2_t p01 = vfmaq_laneq_f64 (v_f64 (0.5), f, lane_consts_13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, f, lane_consts_13, 1);
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, f, lane_consts_57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, f, lane_consts_57, 1);
+ float64x2_t p03 = vfmaq_f64 (p01, f2, p23);
+ float64x2_t p47 = vfmaq_f64 (p45, f2, p67);
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, f, lane_consts_910, 0);
+ float64x2_t p = vfmaq_laneq_f64 (p89, f2, lane_consts_910, 1);
+ p = vfmaq_f64 (p47, f4, p);
+ p = vfmaq_f64 (p03, f4, p);
+
+ p = vfmaq_f64 (f, f2, p);
+
+ /* Assemble the result.
+ expm1(x) ~= 2^i * (p + 1) - 1
+ Let t = 2^i. */
+ int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias);
+ float64x2_t t = vreinterpretq_f64_s64 (u);
+
+ /* expm1(x) ~= p * t + (t - 1). */
+ return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
+}
+
static float64x2_t NOINLINE VPCS_ATTR
-special_case (float64x2_t x, float64x2_t q, float64x2_t qp2,
- uint64x2_t special)
+special_case (float64x2_t x, float64x2_t q, uint64x2_t special)
{
- return v_call_f64 (tanh, x, vdivq_f64 (q, qp2), special);
+ const struct data *d = ptr_barrier (&data);
+ /* Complete fast path. */
+ float64x2_t y = vdivq_f64 (q, (vaddq_f64 (q, v_f64 (2.0))));
+
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+
+ /* expm1 exponent bias is +1.0f. */
+ uint64x2_t one_bits = vreinterpretq_u64_s64 (d->exponent_bias);
+
+ /* Mask selecting only the sign bit in each lane. */
+ uint64x2_t sign_mask = vdupq_n_u64 (0x8000000000000000ULL);
+
+ /* Produce signed 1 for return of special cases:
+ sign bit taken from ix
+ all other bits taken from +1.0f (one_bits). */
+ uint64x2_t special_bits = vbslq_u64 (sign_mask, ix, one_bits);
+ float64x2_t special_y = vreinterpretq_f64_u64 (special_bits);
+
+ /* Select between special case or regular case and return value. */
+ return vbslq_f64 (special, special_y, y);
}
/* Vector approximation for double-precision tanh(x), using a simplified
{
const struct data *d = ptr_barrier (&data);
- uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
-
- /* Trigger special-cases for tiny, boring and infinity/NaN. */
- uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia, d->tiny_bound), d->thresh);
-
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
- float64x2_t twox = vaddq_f64 (x, x);
- float64x2_t q = expm1_inline (twox, &d->d);
- float64x2_t qp2 = vaddq_f64 (q, v_f64 (2.0));
+ float64x2_t q = e2xm1_inline (x, d);
+ /* Check for special cases. */
+ uint64x2_t special = vcagtq_f64 (x, d->special_bound);
+ /* For sufficiently high inputs, the result of tanh(|x|) is 1 when correctly
+ rounded, at this point we can return 1 directly, with sign correction.
+ This will also act as a guard against our approximation overflowing.
+ Kept as a special case to avoid slow down in fast path. */
if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (x, q, qp2, special);
- return vdivq_f64 (q, qp2);
+ return special_case (x, q, special);
+
+ /* Complete fast path if no special lanes. */
+ return vdivq_f64 (q, (vaddq_f64 (q, v_f64 (2.0))));
}
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include "v_expm1f_inline.h"
+#include "v_math.h"
static const struct data
{
- struct v_expm1f_data expm1f_consts;
- uint32x4_t boring_bound, large_bound, onef;
+ float32x4_t special_bound, two;
+ float32x4_t c0, c2;
+ int32x4_t exponent_bias;
+ float c1, c3, two_over_ln2, c4;
+ float ln2_hi, ln2_lo;
} data = {
- .expm1f_consts = V_EXPM1F_DATA,
- /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
- .boring_bound = V4 (0x41102cb3),
- .large_bound = V4 (0x7f800000),
+ /* 9.01, above which tanhf rounds to 1 (or -1 for negative). */
+ .special_bound = V4 (0x1.205966p+3),
+ .two = V4 (0x1.0p+1), /* 2.0. */
+ /* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
+ log(2)/2]. Exponent bias is asuint(1.0f). */
+ .c0 = V4 (0x1.fffffep-2),
+ .c1 = 0x1.5554aep-3,
+ .c2 = V4 (0x1.555736p-5),
+ .c3 = 0x1.12287cp-7,
+ .c4 = 0x1.6b55a2p-10,
+ .exponent_bias = V4 (0x3f800000),
+ .two_over_ln2 = 0x1.715476p+1f,
+ .ln2_hi = 0x1.62e4p-1f,
+ .ln2_lo = 0x1.7f7d1cp-20f,
};
+/* e^2x - 1 inline helper. */
+static inline float32x4_t
+e2xm1f_inline (float32x4_t x, const struct data *d)
+{
+ float32x2_t ln2 = vld1_f32 (&d->ln2_hi);
+ float32x4_t lane_consts = vld1q_f32 (&d->c1);
+
+ /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
+ float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2));
+ int32x4_t i = vcvtq_s32_f32 (j);
+ float32x4_t f = vaddq_f32 (x, x);
+ f = vfmsq_lane_f32 (f, j, ln2, 0);
+ f = vfmsq_lane_f32 (f, j, ln2, 1);
+
+ /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). */
+ float32x4_t f2 = vmulq_f32 (f, f);
+ float32x4_t f4 = vmulq_f32 (f2, f2);
+ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1);
+ float32x4_t poly = vfmaq_f32 (p01, f2, p23);
+ poly = vfmaq_laneq_f32 (poly, f4, lane_consts, 3);
+ poly = vfmaq_f32 (f, f2, poly);
+
+ /* scale = 2^i. */
+ int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
+ float32x4_t scale = vreinterpretq_f32_s32 (u);
+ /* expm1(x) ~= poly * scale + (scale - 1). */
+ return vfmaq_f32 (vsubq_f32 (scale, v_f32 (1.0f)), poly, scale);
+}
+
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring,
- float32x4_t q, uint32x4_t special)
+special_case (float32x4_t x, float32x4_t q, uint32x4_t special)
{
- return v_call_f32 (
- tanhf, x,
- vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))),
- special);
+ const struct data *d = ptr_barrier (&data);
+
+ /* Complete fast path. */
+ float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, d->two));
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+
+ /* expm1 exponent bias is +1.0f. */
+ uint32x4_t one_bits = vreinterpretq_u32_s32 (d->exponent_bias);
+
+ /* Mask selecting only the sign bit in each lane. */
+ uint32x4_t sign_mask = vdupq_n_u32 (0x80000000u);
+
+ /* Produce signed 1 for return of special cases:
+ sign bit taken from ix
+ all other bits taken from +1.0f (one_bits). */
+ uint32x4_t special_bits = vbslq_u32 (sign_mask, ix, one_bits);
+ float32x4_t special_y = vreinterpretq_f32_u32 (special_bits);
+
+ /* Select between special case or regular case and return value. */
+ return vbslq_f32 (special, special_y, y);
}
/* Approximation for single-precision vector tanh(x), using a simplified
- version of expm1f. The maximum error is 2.58 ULP:
+ version of expm1f. The maximum error is 2.08 + 0.5 ULP:
_ZGVnN4v_tanhf (0x1.fa5eep-5) got 0x1.f9ba02p-5
want 0x1.f9ba08p-5. */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- uint32x4_t ix = vreinterpretq_u32_f32 (x);
- float32x4_t ax = vabsq_f32 (x);
- uint32x4_t iax = vreinterpretq_u32_f32 (ax);
- uint32x4_t sign = veorq_u32 (ix, iax);
- uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
- /* expm1 exponent bias is 1.0f reinterpreted to int. */
- float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (
- sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias)));
-
- uint32x4_t special = vcgtq_u32 (iax, d->large_bound);
-
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
- float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
+ float32x4_t q = e2xm1f_inline (x, d);
+
+ /* Check for special cases. */
+ uint32x4_t special = vcagtq_f32 (x, d->special_bound);
+ /* Fall back to vectorised special case for any lanes which would cause
+ expm1 to overflow. */
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q,
- special);
+ return special_case (x, q, special);
- float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
- return vbslq_f32 (is_boring, boring, y);
+ /* Complete fast path if no special lanes. */
+ return vdivq_f32 (q, vaddq_f32 (q, d->two));
}
libmvec_hidden_def (V_NAME_F1 (tanh))
HALF_WIDTH_ALIAS_F1 (tanh)
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include "sv_expm1f_inline.h"
+#include "sv_math.h"
/* Largest value of x for which tanhf(x) rounds to 1 (or -1 for negative). */
-#define BoringBound 0x1.205966p+3f
+#define SpecialBound 0x1.205966p+3f /* ~9.01. */
static const struct data
{
- struct sv_expm1f_data expm1f_consts;
- uint32_t onef, special_bound;
- float boring_bound;
+ /* These 4 are grouped together so they can be loaded as one quadword, then
+ used with _lane forms of svmla/svmls. */
+ float32_t c2, c4, ln2_hi, ln2_lo;
+ float c0, two_over_ln2, c1, c3, special_bound;
} data = {
- .expm1f_consts = SV_EXPM1F_DATA,
- .onef = 0x3f800000,
- .special_bound = 0x7f800000,
- .boring_bound = BoringBound,
+ .special_bound = SpecialBound,
+ /* Coefficients generated using fpminimax. */
+ .c0 = 0x1.fffffep-2,
+ .c1 = 0x1.5554aep-3,
+ /* 2/ln2. */
+ .two_over_ln2 = 0x1.715476p+1f,
+ .c2 = 0x1.555736p-5,
+ .c3 = 0x1.12287cp-7,
+ .c4 = 0x1.6b55a2p-10,
+ .ln2_lo = 0x1.7f7d1cp-20f,
+ .ln2_hi = 0x1.62e4p-1f,
};
+/* An expm1 inspired helper function that returns an accurate
+ estimate for e^2x - 1. */
+static inline svfloat32_t
+e2xm1f_inline (svfloat32_t x, svbool_t pg, const struct data *d)
+{
+ /* This vector is reliant on layout of data - it contains constants
+ that can be used with _lane forms of svmla/svmls. Values are:
+ [ coeff_2, coeff_4, ln2_hi, ln2_lo ]. */
+ svfloat32_t lane_constants = svld1rq (svptrue_b32 (), &d->c2);
+
+ /* Reduce argument to smaller range:
+ Let i = round(x / (2 * ln2))
+ and f = (x + x) - i * ln2, then f is in [-ln2/2, ln2/2].
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where 2^i is exact because i is an integer. */
+ svfloat32_t j = svmul_x (svptrue_b32 (), x, d->two_over_ln2);
+ j = svrinta_x (pg, j);
+ svfloat32_t f = svadd_x (pg, x, x);
+ f = svmls_lane (f, j, lane_constants, 2);
+ f = svmls_lane (f, j, lane_constants, 3);
+
+ /* Approximate expm1(f) using polynomial.
+ Taylor expansion for expm1(x) has the form:
+ x + ax^2 + bx^3 + cx^4 ....
+ So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+ and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), f, lane_constants, 0);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), f, lane_constants, 1);
+ svfloat32_t f2 = svmul_x (svptrue_b32 (), f, f);
+ svfloat32_t p = svmla_x (pg, p12, f2, p34);
+ p = svmla_x (pg, sv_f32 (d->c0), f, p);
+ p = svmla_x (pg, f, f2, p);
+
+ /* Assemble the result.
+ expm1(x) ~= 2^i * (p + 1) - 1
+ Let t = 2^i. */
+ svfloat32_t t = svscale_x (pg, sv_f32 (1.0f), svcvt_s32_x (pg, j));
+ return svmla_x (pg, svsub_x (pg, t, 1.0f), p, t);
+}
+
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svbool_t pg, svbool_t is_boring,
- svfloat32_t boring, svfloat32_t q, svbool_t special)
+special_case (svfloat32_t x, svbool_t pg, svbool_t special, svfloat32_t q)
{
- svfloat32_t y
- = svsel_f32 (is_boring, boring, svdiv_x (pg, q, svadd_x (pg, q, 2.0)));
- return sv_call_f32 (tanhf, x, y, special);
+ /* Finish fastpass to compute values for non-special cases. */
+ svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0));
+
+ /* Make special values positive for best accuracy. */
+ svfloat32_t ax = svabs_x (svptrue_b32 (), x);
+ svuint32_t iax = svreinterpret_u32 (ax);
+
+ /* Preserve the sign bit to return final calcualtion to correct sign. */
+ svuint32_t sign = sveor_x (svptrue_b32 (), svreinterpret_u32 (x), iax);
+
+ /* Set overflowing lanes to signed 1. */
+ svfloat32_t special_y = svreinterpret_f32 (
+ svorr_x (svptrue_b32 (), sign, sv_u32 (0x3f800000)));
+
+ /* Return special_y for special lanes and y for none special lanes. */
+ return svsel_f32 (special, special_y, y);
}
/* Approximation for single-precision SVE tanh(x), using a simplified
- version of expm1f. The maximum error is 2.57 ULP:
+ version of expm1f.
+ Maximum error is 2.06 +0.5 ULP:
_ZGVsMxv_tanhf (0x1.fc1832p-5) got 0x1.fb71a4p-5
want 0x1.fb71aap-5. */
svfloat32_t SV_NAME_F1 (tanh) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svfloat32_t ax = svabs_x (pg, x);
- svuint32_t iax = svreinterpret_u32 (ax);
- svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
- svfloat32_t boring = svreinterpret_f32 (svorr_x (pg, sign, d->onef));
- svbool_t special = svcmpgt (pg, iax, d->special_bound);
- svbool_t is_boring = svacgt (pg, x, d->boring_bound);
-
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
- svfloat32_t q = expm1f_inline (svmul_x (svptrue_b32 (), x, 2.0), pg,
- &d->expm1f_consts);
+ svfloat32_t q = e2xm1f_inline (x, pg, d);
+ svbool_t special = svacgt (pg, x, d->special_bound);
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, pg, is_boring, boring, q, special);
- svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0));
- return svsel_f32 (is_boring, boring, y);
+ return special_case (x, pg, special, q);
+
+ return svdiv_x (pg, q, svadd_x (pg, q, 2.0));
}