From: James Chesterman Date: Wed, 19 Nov 2025 14:11:40 +0000 (+0000) Subject: aarch64: Optimise AdvSIMD log10f X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=57215df30e4978c6eb3b6b78dee8230ff575acbb;p=thirdparty%2Fglibc.git aarch64: Optimise AdvSIMD log10f Optimise AdvSIMD log10f by vectorising the special case. Use scaling technique on subnormal values, then check for inf and nan values. The scaling technique will sqrt the input then multiply the output by 2 because: log(sqrt(x)) = 1/2(log(x)), so log(x) = 2log(sqrt(x)) Reviewed-by: Adhemerval Zanella --- diff --git a/sysdeps/aarch64/fpu/log10f_advsimd.c b/sysdeps/aarch64/fpu/log10f_advsimd.c index 092abeac73..35ec0d9e80 100644 --- a/sysdeps/aarch64/fpu/log10f_advsimd.c +++ b/sysdeps/aarch64/fpu/log10f_advsimd.c @@ -23,9 +23,11 @@ static const struct data { float32x4_t c0, c2, c4, c6, inv_ln10, ln2; uint32x4_t off, offset_lower_bound; - uint16x8_t special_bound; + uint32x4_t special_bound; + uint16x8_t special_bound_u16; uint32x4_t mantissa_mask; float c1, c3, c5, c7; + float32x4_t pinf, minf, nan; } data = { /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */ @@ -42,43 +44,20 @@ static const struct data /* Lower bound is the smallest positive normal float 0x00800000. For optimised register use subnormals are detected after offset has been subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ + .off = V4 (0x3f2aaaab), /* 0.666667. */ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab), - .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */ - .off = V4 (0x3f2aaaab), /* 0.666667. */ + .special_bound = V4 (0x7f000000), /* asuint32(inf) - 0x00800000. */ + .special_bound_u16 = V8 (0x7f00), .mantissa_mask = V4 (0x007fffff), + .pinf = V4 (INFINITY), + .minf = V4 (-INFINITY), + .nan = V4 (NAN), }; -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2, - uint16x4_t cmp, const struct data *d) +static inline float32x4_t VPCS_ATTR +inline_log10f (uint32x4_t u_off, float32x4_t n, const struct data *d) { - /* Fall back to scalar code. */ - return v_call_f32 (log10f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)), - vfmaq_f32 (y, p, r2), vmovl_u16 (cmp)); -} - -/* Fast implementation of AdvSIMD log10f, - uses a similar approach as AdvSIMD logf with the same offset (i.e., 2/3) and - an order 9 polynomial. - Maximum error: 3.305ulps (nearest rounding.) - _ZGVnN4v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4 - want 0x1.ffe2f4p-4. */ -float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x) -{ - const struct data *d = ptr_barrier (&data); float32x4_t c1357 = vld1q_f32 (&d->c1); - /* To avoid having to mov x out of the way, keep u after offset has been - applied, and recover x by adding the offset back in the special-case - handler. */ - uint32x4_t u_off = vreinterpretq_u32_f32 (x); - - /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - u_off = vsubq_u32 (u_off, d->off); - float32x4_t n = vcvtq_f32_s32 ( - vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */ - - uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), - vget_low_u16 (d->special_bound)); uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off); float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); @@ -99,9 +78,66 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x) float32x4_t y = vfmaq_f32 (r, d->ln2, n); y = vmulq_f32 (y, d->inv_ln10); - if (__glibc_unlikely (v_any_u16h (special))) - return special_case (y, u_off, poly, r2, special, d); return vfmaq_f32 (y, poly, r2); } + +static inline float32x4_t VPCS_ATTR +special_case (float32x4_t x, const struct data *d) +{ + float32x4_t x_sqrt = vsqrtq_f32 (x); + + uint32x4_t u_off = vsubq_u32 (vreinterpretq_u32_f32 (x_sqrt), d->off); + float32x4_t n = vcvtq_f32_s32 ( + vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */ + + float32x4_t y = inline_log10f (u_off, n, d); + + /* Scale down by multiplying output by two. + Because log(x) = 2log(sqrt(x)). */ + y = vmulq_f32 (y, v_f32 (2.0f)); + + /* Is true for +/- inf, +/- nan as well as all negative numbers. */ + uint32x4_t is_infnan + = vcgeq_u32 (vreinterpretq_u32_f32 (x), vreinterpretq_u32_f32 (d->pinf)); + uint32x4_t infnan_or_zero = vorrq_u32 (is_infnan, vceqzq_f32 (x)); + + y = vbslq_f32 (infnan_or_zero, d->nan, y); + uint32x4_t ret_pinf = vceqq_f32 (x, d->pinf); + uint32x4_t ret_minf = vceqzq_f32 (x); + y = vbslq_f32 (ret_pinf, d->pinf, y); + y = vbslq_f32 (ret_minf, d->minf, y); + return y; +} + +/* Fast implementation of AdvSIMD log10f, + uses a similar approach as AdvSIMD logf with the same offset (i.e., 2/3) and + an order 9 polynomial. + Maximum error: 2.81 + 0.5 + _ZGVnN4v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4 + want 0x1.ffe2f4p-4. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ + uint32x4_t u_off = vsubq_u32 (vreinterpretq_u32_f32 (x), d->off); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + float32x4_t n = vcvtq_f32_s32 ( + vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */ + + uint32x4_t special + = vcgeq_u32 (vsubq_u32 (u_off, d->offset_lower_bound), d->special_bound); + uint16x4_t special_u16 = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), + vget_low_u16 (d->special_bound_u16)); + + if (__glibc_unlikely (v_any_u16h (special_u16))) + return vbslq_f32 (special, special_case (x, d), + inline_log10f (u_off, n, d)); + return inline_log10f (u_off, n, d); +} + libmvec_hidden_def (V_NAME_F1 (log10)) HALF_WIDTH_ALIAS_F1 (log10)