From: James Chesterman Date: Fri, 28 Nov 2025 11:18:53 +0000 (+0000) Subject: aarch64: Optimise AdvSIMD atanhf X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=f9bb6bcff604357b7a53a268381d265b998b040d;p=thirdparty%2Fglibc.git aarch64: Optimise AdvSIMD atanhf Optimise AdvSIMD atanhf by vectorising the special case. There are asymptotes at x = -1 and x = 1. So return inf for these. Values for which |x| > 1, return NaN. R.Throughput difference on V2 with GCC@15: 58-60% improvement in special cases. No regression in fast pass. --- diff --git a/sysdeps/aarch64/fpu/atanhf_advsimd.c b/sysdeps/aarch64/fpu/atanhf_advsimd.c index 93c41fbdc6..8d2c956e13 100644 --- a/sysdeps/aarch64/fpu/atanhf_advsimd.c +++ b/sysdeps/aarch64/fpu/atanhf_advsimd.c @@ -23,46 +23,68 @@ const static struct data { struct v_log1pf_data log1pf_consts; + uint32x4_t abs_mask; + float32x4_t half; uint32x4_t one; + float32x4_t pinf, minf, nan; } data = { .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, + .abs_mask = V4 (0x7fffffff), + .half = V4 (0.5f), .one = V4 (0x3f800000), + .pinf = V4 (INFINITY), + .minf = V4 (-INFINITY), + .nan = V4 (NAN), }; -#define AbsMask v_u32 (0x7fffffff) -#define Half v_u32 (0x3f000000) - -static float32x4_t NOINLINE VPCS_ATTR -special_case (float32x4_t x, float32x4_t halfsign, float32x4_t y, - uint32x4_t special) +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, float32x4_t halfsign, + uint32x4_t special, const struct data *d) { - return v_call_f32 (atanhf, vbslq_f32 (AbsMask, x, halfsign), - vmulq_f32 (halfsign, y), special); + y = log1pf_inline (y, &d->log1pf_consts); + y = vmulq_f32 (halfsign, y); + + float32x4_t float_one = vreinterpretq_f32_u32 (d->one); + /* Need the NOT here to make an input of NaN return true. */ + uint32x4_t ret_nan = vmvnq_u32 (vcaltq_f32 (x, float_one)); + uint32x4_t ret_pinf = vceqzq_f32 (vsubq_f32 (x, float_one)); + uint32x4_t ret_minf = vceqzq_f32 (vaddq_f32 (x, float_one)); + + y = vbslq_f32 (ret_nan, d->nan, y); + y = vbslq_f32 (ret_pinf, d->pinf, y); + y = vbslq_f32 (ret_minf, d->minf, y); + return y; } /* Approximation for vector single-precision atanh(x) using modified log1p. The maximum error is 2.93 ULP: _ZGVnN4v_atanhf(0x1.f43d7p-5) got 0x1.f4dcfep-5 want 0x1.f4dcf8p-5. */ -VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atanh) (float32x4_t x) { const struct data *d = ptr_barrier (&data); - float32x4_t halfsign = vbslq_f32 (AbsMask, v_f32 (0.5), x); + /* This computes a 1/2 with the sign bit being the sign bit of x. + This is because: + atanh(x) = 1/2 ln ((1+x)/(1-x)) + = 1/2 log1pf ((1+x)/(1-x) - 1) + = 1/2 log1pf (2x/(1-x)). */ + float32x4_t halfsign = vbslq_f32 (d->abs_mask, d->half, x); float32x4_t ax = vabsq_f32 (x); uint32x4_t iax = vreinterpretq_u32_f32 (ax); + /* Values between but not including -1 and 1 are treated appropriately by the + fast pass. -1 and 1 are asymptotes of atanh(x), so they are treated as + special cases along with any input outside of these bounds. */ uint32x4_t special = vcgeq_u32 (iax, d->one); - float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), + float32x4_t r = vdivq_f32 (vaddq_f32 (ax, ax), vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax)); - y = log1pf_inline (y, &d->log1pf_consts); - /* If exceptions not required, pass ax to special-case for shorter dependency - chain. If exceptions are required ax will have been zerofied, so have to - pass x. */ if (__glibc_unlikely (v_any_u32 (special))) - return special_case (ax, halfsign, y, special); + return special_case (x, r, halfsign, special, d); + + float32x4_t y = log1pf_inline (r, &d->log1pf_consts); return vmulq_f32 (halfsign, y); } libmvec_hidden_def (V_NAME_F1 (atanh))