From: Richard.Wild@arm.com
Date: Thu, 26 Feb 2026 11:04:46 +0000 (+0000)
Subject: AArch64: Vectorise SVE log/log2/log10 single and double precision special cases.
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=447a8a74dcf0bbebbd3bf1702b6c7a4f149097e8;p=thirdparty%2Fglibc.git

AArch64: Vectorise SVE log/log2/log10 single and double precision special cases.

This patch vectorises the scalar special-case fallbacks of the SVE log,
log2 and log10 routines, in both single and double precision.
Special-case performance improves by around 3x for double precision and
5x for single precision.  The rework also brings small fast-path gains:
most fast paths improve by 2-3%, two are unchanged, and there are no
regressions.  Benchmarked on Neoverse V2 with GCC 15.
---

diff --git a/sysdeps/aarch64/fpu/log10_sve.c b/sysdeps/aarch64/fpu/log10_sve.c
index d9980feacc..b87ef297ec 100644
--- a/sysdeps/aarch64/fpu/log10_sve.c
+++ b/sysdeps/aarch64/fpu/log10_sve.c
@@ -18,20 +18,16 @@
    . */
 
 #include "sv_math.h"
-#include "poly_sve_f64.h"
 
-#define Min 0x0010000000000000
-#define Max 0x7ff0000000000000
-#define Thres 0x7fe0000000000000 /* Max - Min. */
 #define N (1 << V_LOG10_TABLE_BITS)
 
 static const struct data
 {
-  double c0, c2;
+  double two_2_52, log10_52;
+  double c0, c2, c4;
   double c1, c3;
   double invln10, log10_2;
-  double c4;
-  uint64_t off;
+  uint64_t off, thresh;
 } data = {
   .c0 = -0x1.bcb7b1526e506p-3,
   .c1 = 0x1.287a7636be1d1p-3,
@@ -40,28 +36,18 @@ static const struct data
   .c4 = -0x1.287461742fee4p-4,
   .invln10 = 0x1.bcb7b1526e50ep-2,
   .log10_2 = 0x1.34413509f79ffp-2,
+  .two_2_52 = 0x1p52, /* 2^52. */
+  .log10_52 = -0x1.f4e9f6303263ep+3, /* -(log10(2) * 52) ~ -15.65. */
   .off = 0x3fe6900900000000,
+  /* The threshold is computed from the lower and upper bounds:
+     respectively the smallest normalised number, min = 0x0010000000000000,
+     and infinity, 0x7ff0000000000000. */
+  .thresh = 0x7fe0000000000000, /* infinity - min. */
 };
 
-static svfloat64_t NOINLINE
-special_case (svfloat64_t hi, svuint64_t tmp, svfloat64_t y, svfloat64_t r2,
-              svbool_t special, const struct data *d)
-{
-  svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off));
-  return sv_call_f64 (log10, x, svmla_x (svptrue_b64 (), hi, r2, y), special);
-}
-
-/* Double-precision SVE log10 routine.
-   Maximum measured error is 2.46 ulps.
-   SV_NAME_D1 (log10)(0x1.131956cd4b627p+0) got 0x1.fffbdf6eaa669p-6
-					     want 0x1.fffbdf6eaa667p-6. */
-svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg)
+static inline svfloat64_t
+v_log10_inline (svuint64_t ix, const svbool_t pg, const struct data *d)
 {
-  const struct data *d = ptr_barrier (&data);
-
-  svuint64_t ix = svreinterpret_u64 (x);
-  svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres);
-
   /* x = 2^k z; where z is in range [Off,2*Off) and exact.
      The range is split into N subintervals.
      The ith subinterval contains z and c is near its center. */
@@ -95,7 +81,58 @@ svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg)
   y = svmla_x (pg, y, r2, d->c4);
   y = svmla_x (pg, p, r2, y);
 
-  if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (hi, tmp, y, r2, special, d);
   return svmla_x (pg, hi, r2, y);
 }
+
+/* The special-case path is a series of selects which choose the correct
+   result for the special lanes: inf, -inf or nan.  Subnormal lanes are
+   instead multiplied by 2^52 (52 being the number of mantissa bits) to
+   normalise the number on entry to the log function, and log10(2) * 52 is
+   then subtracted to rescale the output to the correct result.  */
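+/* (For any positive x, log10 (x) == log10 (x * 0x1p52) + log10_52, since
+   log10_52 == -(52 * log10 (2)); this identity is what makes the subnormal
+   rescaling exact apart from rounding.)  */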
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svbool_t pg, svbool_t special)
+{
+  const struct data *d = ptr_barrier (&data);
+  /* This check covers more than the subnormal range, but standard-case
+     lanes and +inf are handled separately.  */
+  svbool_t is_sub = svcmpgt_f64 (pg, x, sv_f64 (0));
+  /* Check for 0, for which the result is -Infinity.  */
+  svbool_t is_minf = svcmpeq_f64 (pg, x, sv_f64 (0));
+  svbool_t is_pinf = svcmpeq_f64 (pg, x, sv_f64 (INFINITY));
+
+  /* Scale up x on special lanes to normalise subnormals.  */
+  x = svmul_m (special, x, d->two_2_52);
+  svuint64_t ix = svreinterpret_u64 (x);
+
+  /* Select the correct special-case correction depending on x.  */
+  svfloat64_t special_log = svsel (is_sub, sv_f64 (d->log10_52), sv_f64 (NAN));
+  special_log = svsel (is_minf, sv_f64 (-INFINITY), special_log);
+  special_log = svsel (is_pinf, sv_f64 (INFINITY), special_log);
+
+  /* Compute log10 for both the rescaled special lanes and the non-special
+     lanes.  */
+  svfloat64_t log_sum = v_log10_inline (ix, pg, d);
+
+  /* Adjust the output on special lanes: complete the subnormal calculation,
+     or add inf, -inf or nan depending on special_log.  Non-special lanes
+     are returned without correction.  */
+  return svadd_m (special, log_sum, special_log);
+}
+
+/* Double-precision SVE log10 routine.
+   Maximum measured error is 2.46 ulps.
+   SV_NAME_D1 (log10)(0x1.131956cd4b627p+0) got 0x1.fffbdf6eaa669p-6
+					     want 0x1.fffbdf6eaa667p-6. */
+svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  svuint64_t ix = svreinterpret_u64 (x);
+  /* Special cases: x is subnormal, x <= 0, x == inf, x == nan. */
+  svbool_t special
+      = svcmpge (pg, svsub_x (pg, ix, 0x0010000000000000), d->thresh);
+  if (__glibc_unlikely (svptest_any (special, special)))
+    return special_case (x, pg, special);
+
+  /* If no lanes are special, just return the plain vector log10.  */
+  return v_log10_inline (ix, pg, d);
+}
diff --git a/sysdeps/aarch64/fpu/log10f_sve.c b/sysdeps/aarch64/fpu/log10f_sve.c
index a38a6615f0..365c606535 100644
--- a/sysdeps/aarch64/fpu/log10f_sve.c
+++ b/sysdeps/aarch64/fpu/log10f_sve.c
@@ -21,54 +21,40 @@
 
 static const struct data
 {
-  float poly_0246[4];
-  float poly_1357[4];
+  float c1, c3, c5, c7;
+  float c0, c2, c4, c6;
   float ln2, inv_ln10;
-  uint32_t off, lower;
+  float log10_23, two_2_23;
+  uint32_t off, lower, thresh;
 } data = {
-  .poly_1357 = {
-    /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs
-       1, 3, 5 and 7 can be loaded as a single quad-word, hence used with _lane
-       variant of MLA intrinsic. */
-    0x1.2879c8p-3f, 0x1.6408f8p-4f, 0x1.f0e514p-5f, 0x1.f5f76ap-5f
-  },
-  .poly_0246 = { -0x1.bcb79cp-3f, -0x1.bcd472p-4f, -0x1.246f8p-4f,
-		 -0x1.0fc92cp-4f },
+  /* Coefficients copied from the AdvSIMD routine, then rearranged so that
+     coeffs 1, 3, 5 and 7 can be loaded as a single quad-word, hence used with
+     _lane variant of MLA intrinsic. */
+  .c0 = -0x1.bcb79cp-3f,
+  .c1 = 0x1.2879c8p-3f,
+  .c2 = -0x1.bcd472p-4f,
+  .c3 = 0x1.6408f8p-4f,
+  .c4 = -0x1.246f8p-4f,
+  .c5 = 0x1.f0e514p-5f,
+  .c6 = -0x1.0fc92cp-4f,
+  .c7 = 0x1.f5f76ap-5f,
   .ln2 = 0x1.62e43p-1f,
   .inv_ln10 = 0x1.bcb7b2p-2f,
   .off = 0x3f2aaaab,
   /* Lower bound is the smallest positive normal float 0x00800000.  For
      optimised register use, subnormals are detected after the offset has
      been subtracted, so the lower bound is 0x00800000 - offset (which
     wraps around).  */
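   /* (0x00800000 - 0x3f2aaaab wraps to 0xc1555555 in uint32 arithmetic.)  */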
-  .lower = 0x00800000 - 0x3f2aaaab
+  .lower = 0x00800000 - 0x3f2aaaab,
+  .log10_23 = -0x1.bb1dbcp+2, /* -(log10(2) * 23) ~ -6.92. */
+  .two_2_23 = 0x1p23, /* 2^23. */
+  .thresh = 0x7f000000, /* asuint32(inf) - 0x00800000. */
 };
 
-#define Thres 0x7f000000 /* asuint32(inf) - 0x00800000. */
 #define MantissaMask 0x007fffff
 
-static svfloat32_t NOINLINE
-special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
-              svbool_t cmp)
-{
-  return sv_call_f32 (
-      log10f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
-      svmla_x (svptrue_b32 (), p, r2, y), cmp);
-}
-
-/* Optimised implementation of SVE log10f using the same algorithm and
-   polynomial as AdvSIMD log10f.
-   Maximum error is 3.31ulps:
-   SV_NAME_F1 (log10)(0x1.555c16p+0) got 0x1.ffe2fap-4
-				     want 0x1.ffe2f4p-4. */
-svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg)
+static inline svfloat32_t
+v_log10f_inline (svuint32_t u_off, const svbool_t pg, const struct data *d)
 {
-  const struct data *d = ptr_barrier (&data);
-
-  svuint32_t u_off = svreinterpret_u32 (x);
-
-  u_off = svsub_x (pg, u_off, d->off);
-  svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thres);
-
   /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
   svfloat32_t n = svcvt_f32_x (
       pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* signextend. */
@@ -82,20 +68,74 @@ svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg)
      log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3). */
   svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
   svfloat32_t r4 = svmul_x (svptrue_b32 (), r2, r2);
-  svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]);
-  svfloat32_t q_01 = svmla_lane (sv_f32 (d->poly_0246[0]), r, p_1357, 0);
-  svfloat32_t q_23 = svmla_lane (sv_f32 (d->poly_0246[1]), r, p_1357, 1);
-  svfloat32_t q_45 = svmla_lane (sv_f32 (d->poly_0246[2]), r, p_1357, 2);
-  svfloat32_t q_67 = svmla_lane (sv_f32 (d->poly_0246[3]), r, p_1357, 3);
+  svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->c1);
+  svfloat32_t q_01 = svmla_lane (sv_f32 (d->c0), r, p_1357, 0);
+  svfloat32_t q_23 = svmla_lane (sv_f32 (d->c2), r, p_1357, 1);
+  svfloat32_t q_45 = svmla_lane (sv_f32 (d->c4), r, p_1357, 2);
+  svfloat32_t q_67 = svmla_lane (sv_f32 (d->c6), r, p_1357, 3);
   svfloat32_t q_47 = svmla_x (pg, q_45, r2, q_67);
   svfloat32_t q_03 = svmla_x (pg, q_01, r2, q_23);
   svfloat32_t y = svmla_x (pg, q_03, r4, q_47);
 
   /* Using hi = Log10(2)*n + r*InvLn(10) is faster but less accurate. */
-  svfloat32_t hi = svmla_x (pg, r, n, d->ln2);
-  hi = svmul_x (pg, hi, d->inv_ln10);
+  svfloat32_t ln2_inv_ln10 = svld1rq (svptrue_b32 (), &d->ln2);
+  svfloat32_t hi = svmla_lane (r, n, ln2_inv_ln10, 0);
+  hi = svmul_lane (hi, ln2_inv_ln10, 1);
 
-  if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (u_off, hi, r2, y, special);
   return svmla_x (svptrue_b32 (), hi, r2, y);
 }
+
+/* The special-case path is a series of selects which choose the correct
+   result for the special lanes: inf, -inf or nan.  Subnormal lanes are
+   instead multiplied by 2^23 (23 being the number of mantissa bits) to
+   normalise the number on entry to the log function, and log10(2) * 23 is
+   then subtracted to rescale the output to the correct result.  */
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svbool_t pg, svbool_t special,
+              const struct data *d)
+{
+  /* This check covers more than the subnormal range, but standard-case
+     lanes and +inf are handled separately.  */
+  svbool_t is_sub = svcmpgt_f32 (pg, x, sv_f32 (0));
+  /* Check for 0, for which the result is -Infinity.  */
+  svbool_t is_minf = svcmpeq_f32 (pg, x, sv_f32 (0));
+  svbool_t is_pinf = svcmpeq_f32 (pg, x, sv_f32 (INFINITY));
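+  /* Lanes that are negative or NaN match none of these checks, so they
+     fall through to the NaN default in the selects below.  */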
+
+  /* Scale up x on special lanes to normalise subnormals.  */
+  x = svmul_m (special, x, d->two_2_23);
+  svuint32_t u_off = svreinterpret_u32 (x);
+  u_off = svsub_m (pg, u_off, d->off);
+
+  /* Select the correct special-case correction depending on x.  */
+  svfloat32_t special_log = svsel (is_sub, sv_f32 (d->log10_23), sv_f32 (NAN));
+  special_log = svsel (is_minf, sv_f32 (-INFINITY), special_log);
+  special_log = svsel (is_pinf, sv_f32 (INFINITY), special_log);
+
+  /* Compute log10f for both the rescaled special lanes and the non-special
+     lanes.  */
+  svfloat32_t ret_log = v_log10f_inline (u_off, svptrue_b32 (), d);
+
+  /* Adjust the output on special lanes: complete the subnormal calculation,
+     or add inf, -inf or nan depending on special_log.  Non-special lanes
+     are returned without correction.  */
+  return svadd_m (special, ret_log, special_log);
+}
+
+/* Optimised implementation of SVE log10f using the same algorithm and
+   polynomial as AdvSIMD log10f.
+   Maximum error is 3.31ulps:
+   SV_NAME_F1 (log10)(0x1.555c16p+0) got 0x1.ffe2fap-4
+				     want 0x1.ffe2f4p-4. */
+svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  svuint32_t u_off = svreinterpret_u32 (x);
+  u_off = svsub_x (pg, u_off, d->off);
+  /* Special cases: x is subnormal, x <= 0, x == inf, x == nan. */
+  svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), d->thresh);
+  if (__glibc_unlikely (svptest_any (special, special)))
+    return special_case (x, pg, special, d);
+
+  /* If no lanes are special, just return the plain vector log10f.  */
+  return v_log10f_inline (u_off, pg, d);
+}
diff --git a/sysdeps/aarch64/fpu/log2_sve.c b/sysdeps/aarch64/fpu/log2_sve.c
index c3ffb6c071..918ed42025 100644
--- a/sysdeps/aarch64/fpu/log2_sve.c
+++ b/sysdeps/aarch64/fpu/log2_sve.c
@@ -18,19 +18,16 @@
    . */
 
 #include "sv_math.h"
-#include "poly_sve_f64.h"
 
 #define N (1 << V_LOG2_TABLE_BITS)
-#define Max (0x7ff0000000000000)
-#define Min (0x0010000000000000)
-#define Thresh (0x7fe0000000000000) /* Max - Min. */
 
 static const struct data
 {
-  double c0, c2;
   double c1, c3;
   double invln2, c4;
-  uint64_t off;
+  double c0, c2;
+  double two_2_52;
+  uint64_t off, thresh;
 } data = {
   .c0 = -0x1.71547652b83p-1,
   .c1 = 0x1.ec709dc340953p-2,
@@ -38,40 +35,26 @@ static const struct data
   .c3 = 0x1.2777ebe12dda5p-2,
   .c4 = -0x1.ec738d616fe26p-3,
   .invln2 = 0x1.71547652b82fep0,
+  .two_2_52 = 0x1p52, /* 2^52. */
   .off = 0x3fe6900900000000,
+  /* The threshold is computed from the lower and upper bounds:
+     respectively the smallest normalised number, min = 0x0010000000000000,
+     and infinity, 0x7ff0000000000000. */
+  .thresh = 0x7fe0000000000000, /* infinity - min. */
 };
 
-static svfloat64_t NOINLINE
-special_case (svfloat64_t w, svuint64_t tmp, svfloat64_t y, svfloat64_t r2,
-              svbool_t special, const struct data *d)
-{
-  svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off));
-  return sv_call_f64 (log2, x, svmla_x (svptrue_b64 (), w, r2, y), special);
-}
-
-/* Double-precision SVE log2 routine.
-   Implements the same algorithm as AdvSIMD log10, with coefficients and table
-   entries scaled in extended precision.
-   The maximum observed error is 2.58 ULP:
-   SV_NAME_D1 (log2)(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5
-					    want 0x1.fffb34198d9ddp-5. */
-svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg)
+static inline svfloat64_t
+v_log2_inline (svuint64_t ix, const svbool_t pg, const struct data *d)
 {
-  const struct data *d = ptr_barrier (&data);
-
-  svuint64_t ix = svreinterpret_u64 (x);
-  svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh);
-
   /* x = 2^k z; where z is in range [Off,2*Off) and exact.
      The range is split into N subintervals.
      The ith subinterval contains z and c is near its center. */
   svuint64_t tmp = svsub_x (pg, ix, d->off);
-  svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG2_TABLE_BITS);
-  i = svand_x (pg, i, (N - 1) << 1);
   svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52));
   svfloat64_t z = svreinterpret_f64 (
       svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)));
-
+  svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG2_TABLE_BITS);
+  i = svand_x (pg, i, (N - 1) << 1);
   svfloat64_t invc = svld1_gather_index (pg, &__v_log2_data.table[0].invc, i);
   svfloat64_t log2c
       = svld1_gather_index (pg, &__v_log2_data.table[0].log2c, i);
@@ -83,14 +66,67 @@ svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg)
   svfloat64_t w = svmla_lane_f64 (log2c, r, invln2_and_c4, 0);
   w = svadd_x (pg, k, w);
 
-  svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1);
   svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
-  svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1);
+  svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1);
   svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0);
+  svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1);
   y = svmla_lane_f64 (y, r2, invln2_and_c4, 1);
   y = svmla_x (pg, p, r2, y);
 
-  if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (w, tmp, y, r2, special, d);
   return svmla_x (pg, w, r2, y);
 }
+
+/* The special-case path is a series of selects which choose the correct
+   result for the special lanes: inf, -inf or nan.  Subnormal lanes are
+   instead multiplied by 2^52 (52 being the number of mantissa bits) to
+   normalise the number on entry to the log function, and 52
+   (= log2(2^52)) is then subtracted to rescale the output to the correct
+   result.  */
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svbool_t pg, svbool_t special)
+{
+  const struct data *d = ptr_barrier (&data);
+  /* This check covers more than the subnormal range, but standard-case
+     lanes and +inf are handled separately.  */
+  svbool_t is_sub = svcmpgt_f64 (pg, x, sv_f64 (0));
+  /* Check for 0, for which the result is -Infinity.  */
+  svbool_t is_minf = svcmpeq_f64 (pg, x, sv_f64 (0));
+  svbool_t is_pinf = svcmpeq_f64 (pg, x, sv_f64 (INFINITY));
+
+  /* Scale up x on special lanes to normalise subnormals.  */
+  x = svmul_m (special, x, d->two_2_52);
+  svuint64_t ix = svreinterpret_u64 (x);
+
+  /* Select the correct special-case correction depending on x.  */
+  svfloat64_t special_log = svsel (is_sub, sv_f64 (-52), sv_f64 (NAN));
+  special_log = svsel (is_minf, sv_f64 (-INFINITY), special_log);
+  special_log = svsel (is_pinf, sv_f64 (INFINITY), special_log);
+
+  /* Compute log2 for both the rescaled special lanes and the non-special
+     lanes.  */
+  svfloat64_t log_sum = v_log2_inline (ix, pg, d);
+
+  /* Adjust the output on special lanes: complete the subnormal calculation,
+     or add inf, -inf or nan depending on special_log.  Non-special lanes
+     are returned without correction.  */
+  return svadd_m (special, log_sum, special_log);
+}
+
+/* Double-precision SVE log2 routine.
+   Implements the same algorithm as AdvSIMD log10, with coefficients and table
+   entries scaled in extended precision.
+   The maximum observed error is 2.58 ULP:
+   SV_NAME_D1 (log2)(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5
+					    want 0x1.fffb34198d9ddp-5. */
+svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  svuint64_t ix = svreinterpret_u64 (x);
+  /* Special cases: x is subnormal, x <= 0, x == inf, x == nan. */
+  svbool_t special
+      = svcmpge (pg, svsub_x (pg, ix, 0x0010000000000000), d->thresh);
+  if (__glibc_unlikely (svptest_any (special, special)))
+    return special_case (x, pg, special);
+
+  /* If no lanes are special, just return the plain vector log2.  */
+  return v_log2_inline (ix, pg, d);
+}
diff --git a/sysdeps/aarch64/fpu/log2f_sve.c b/sysdeps/aarch64/fpu/log2f_sve.c
index 1a1e0d60b0..89d676e5a9 100644
--- a/sysdeps/aarch64/fpu/log2f_sve.c
+++ b/sysdeps/aarch64/fpu/log2f_sve.c
@@ -21,35 +21,93 @@
 
 static const struct data
 {
-  float poly_02468[5];
-  float poly_1357[4];
-  uint32_t off, lower;
+  float c1, c3, c5, c7;
+  float c0, c2, c4, c6, c8;
+  float two_2_23;
+  uint32_t off, lower, thresh;
 } data = {
-  .poly_1357 = {
-    /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs
-       1, 3, 5 and 7 can be loaded as a single quad-word, hence used with _lane
-       variant of MLA intrinsic. */
-    -0x1.715458p-1f, -0x1.7171a4p-2f, -0x1.e5143ep-3f, -0x1.c675bp-3f
-  },
-  .poly_02468 = { 0x1.715476p0f, 0x1.ec701cp-2f, 0x1.27a0b8p-2f,
-		  0x1.9d8ecap-3f, 0x1.9e495p-3f },
+  /* Coefficients copied from the AdvSIMD routine, then rearranged so that
+     coeffs 1, 3, 5 and 7 can be loaded as a single quad-word, hence used with
+     _lane variant of MLA intrinsic. */
+  .c0 = 0x1.715476p0f,
+  .c1 = -0x1.715458p-1f,
+  .c2 = 0x1.ec701cp-2f,
+  .c3 = -0x1.7171a4p-2f,
+  .c4 = 0x1.27a0b8p-2f,
+  .c5 = -0x1.e5143ep-3f,
+  .c6 = 0x1.9d8ecap-3f,
+  .c7 = -0x1.c675bp-3f,
+  .c8 = 0x1.9e495p-3f,
   .off = 0x3f2aaaab,
   /* Lower bound is the smallest positive normal float 0x00800000.  For
      optimised register use, subnormals are detected after the offset has
     been subtracted, so the lower bound is 0x00800000 - offset (which
     wraps around). */
-  .lower = 0x00800000 - 0x3f2aaaab
+  .lower = 0xC1555555, /* 0x00800000 - 0x3f2aaaab. */
+  .two_2_23 = 0x1p23, /* 2^23. */
+  .thresh = 0x7f000000, /* asuint32(inf) - 0x00800000. */
 };
 
-#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000. */
-#define MantissaMask (0x007fffff)
+#define MantissaMask 0x007fffff
 
-static svfloat32_t NOINLINE
-special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
-              svbool_t cmp)
+static inline svfloat32_t
+v_log2f_inline (svuint32_t u_off, svbool_t pg, const struct data *d)
 {
-  return sv_call_f32 (
-      log2f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
-      svmla_x (svptrue_b32 (), p, r2, y), cmp);
+  svuint32_t u = svand_x (pg, u_off, MantissaMask);
+  /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
+  svfloat32_t n = svcvt_f32_x (
+      pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */
+  u = svadd_x (pg, u, d->off);
+  svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);
+
+  /* Evaluate polynomial using pairwise Horner scheme.  */
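+  /* With q_ij = c_i + r * c_j, the scheme below computes
+     y = q_01 + r2 * (q_23 + r2 * (q_45 + r2 * (q_67 + r2 * c8))),
+     and the result is n + r * y.  (svld1rq replicates one 128-bit
+     quad-word across the whole vector, and svmla_lane indexes within
+     each 128-bit segment.)  */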
+  svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->c1);
+  svfloat32_t q_01 = svmla_lane (sv_f32 (d->c0), r, p_1357, 0);
+  svfloat32_t q_23 = svmla_lane (sv_f32 (d->c2), r, p_1357, 1);
+  svfloat32_t q_45 = svmla_lane (sv_f32 (d->c4), r, p_1357, 2);
+  svfloat32_t q_67 = svmla_lane (sv_f32 (d->c6), r, p_1357, 3);
+  /* y = log2(1+r) + n. */
+  svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
+  svfloat32_t y = svmla_x (pg, q_67, r2, sv_f32 (d->c8));
+  y = svmla_x (pg, q_45, r2, y);
+  y = svmla_x (pg, q_23, r2, y);
+  y = svmla_x (pg, q_01, r2, y);
+
+  return svmla_x (svptrue_b32 (), n, r, y);
+}
+
+/* The special-case path is a series of selects which choose the correct
+   result for the special lanes: inf, -inf or nan.  Subnormal lanes are
+   instead multiplied by 2^23 (23 being the number of mantissa bits) to
+   normalise the number on entry to the log function, and 23
+   (= log2(2^23)) is then subtracted to rescale the output to the correct
+   result.  */
+static inline svfloat32_t
+special_case (svfloat32_t x, svbool_t pg, svbool_t special,
+              const struct data *d)
+{
+  /* This check covers more than the subnormal range, but standard-case
+     lanes and +inf are handled separately.  */
+  svbool_t is_sub = svcmpgt_f32 (pg, x, sv_f32 (0));
+  /* Check for 0, for which the result is -Infinity.  */
+  svbool_t is_minf = svcmpeq_f32 (pg, x, sv_f32 (0));
+  svbool_t is_pinf = svcmpeq_f32 (pg, x, sv_f32 (INFINITY));
+
+  /* Scale up x on special lanes to normalise subnormals.  */
+  x = svmul_m (special, x, d->two_2_23);
+  svuint32_t u_off = svreinterpret_u32 (x);
+  u_off = svsub_m (pg, u_off, d->off);
+
+  /* Select the correct special-case correction depending on x.  */
+  svfloat32_t special_log = svsel (is_sub, sv_f32 (-23), sv_f32 (NAN));
+  special_log = svsel (is_minf, sv_f32 (-INFINITY), special_log);
+  special_log = svsel (is_pinf, sv_f32 (INFINITY), special_log);
+
+  /* Compute log2f for both the rescaled special lanes and the non-special
+     lanes.  */
+  svfloat32_t ret_log = v_log2f_inline (u_off, svptrue_b32 (), d);
+
+  /* Adjust the output on special lanes: complete the subnormal calculation,
+     or add inf, -inf or nan depending on special_log.  Non-special lanes
+     are returned without correction.  */
+  return svadd_m (special, ret_log, special_log);
 }
 
 /* Optimised implementation of SVE log2f, using the same algorithm
@@ -62,32 +120,12 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg)
   const struct data *d = ptr_barrier (&data);
 
   svuint32_t u_off = svreinterpret_u32 (x);
-
   u_off = svsub_x (pg, u_off, d->off);
-  svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh);
-
-  /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
-  svfloat32_t n = svcvt_f32_x (
-      pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */
-  svuint32_t u = svand_x (pg, u_off, MantissaMask);
-  u = svadd_x (pg, u, d->off);
-  svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);
-
-  /* y = log2(1+r) + n. */
-  svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
-
-  /* Evaluate polynomial using pairwise Horner scheme. */
-  svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]);
-  svfloat32_t q_01 = svmla_lane (sv_f32 (d->poly_02468[0]), r, p_1357, 0);
-  svfloat32_t q_23 = svmla_lane (sv_f32 (d->poly_02468[1]), r, p_1357, 1);
-  svfloat32_t q_45 = svmla_lane (sv_f32 (d->poly_02468[2]), r, p_1357, 2);
-  svfloat32_t q_67 = svmla_lane (sv_f32 (d->poly_02468[3]), r, p_1357, 3);
-  svfloat32_t y = svmla_x (pg, q_67, r2, sv_f32 (d->poly_02468[4]));
-  y = svmla_x (pg, q_45, r2, y);
-  y = svmla_x (pg, q_23, r2, y);
-  y = svmla_x (pg, q_01, r2, y);
+  /* Special cases: x is subnormal, x <= 0, x == inf, x == nan. */
+  svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), d->thresh);
+  if (__glibc_unlikely (svptest_any (special, special)))
+    return special_case (x, pg, special, d);
 
-  if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (u_off, n, r, y, special);
-  return svmla_x (svptrue_b32 (), n, r, y);
+  /* If no lanes are special, just return the plain vector log2f.  */
+  return v_log2f_inline (u_off, pg, d);
 }
diff --git a/sysdeps/aarch64/fpu/log_sve.c b/sysdeps/aarch64/fpu/log_sve.c
index 590b0c315a..5eaae0ae56 100644
--- a/sysdeps/aarch64/fpu/log_sve.c
+++ b/sysdeps/aarch64/fpu/log_sve.c
@@ -20,16 +20,14 @@
 #include "sv_math.h"
 
 #define N (1 << V_LOG_TABLE_BITS)
-#define Max (0x7ff0000000000000)
-#define Min (0x0010000000000000)
-#define Thresh (0x7fe0000000000000) /* Max - Min. */
 
 static const struct data
 {
-  double c0, c2;
   double c1, c3;
   double ln2, c4;
-  uint64_t off;
+  double c0, c2;
+  double two_2_52, ln2_52;
+  uint64_t off, thresh;
 } data = {
   .c0 = -0x1.ffffffffffff7p-2,
   .c1 = 0x1.55555555170d4p-2,
@@ -37,28 +35,18 @@ static const struct data
   .c3 = 0x1.999b2e90e94cap-3,
   .c4 = -0x1.554e550bd501ep-3,
   .ln2 = 0x1.62e42fefa39efp-1,
+  .two_2_52 = 0x1p52, /* 2^52. */
+  .ln2_52 = -0x1.205966f2b4f12p+5, /* -(ln(2) * 52) ~ -36.04. */
   .off = 0x3fe6900900000000,
+  /* The threshold is computed from the lower and upper bounds:
+     respectively the smallest normalised number, min = 0x0010000000000000,
+     and infinity, 0x7ff0000000000000. */
+  .thresh = 0x7fe0000000000000, /* infinity - min. */
 };
 
-static svfloat64_t NOINLINE
-special_case (svfloat64_t hi, svuint64_t tmp, svfloat64_t y, svfloat64_t r2,
-              svbool_t special, const struct data *d)
+static inline svfloat64_t
+v_log_inline (svuint64_t ix, const svbool_t pg, const struct data *d)
 {
-  svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off));
-  return sv_call_f64 (log, x, svmla_x (svptrue_b64 (), hi, r2, y), special);
-}
-
-/* Double-precision SVE log routine.
-   Maximum measured error is 2.64 ulp:
-   SV_NAME_D1 (log)(0x1.95e54bc91a5e2p+184) got 0x1.fffffffe88cacp+6
-					     want 0x1.fffffffe88cafp+6. */
-svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg)
-{
-  const struct data *d = ptr_barrier (&data);
-
-  svuint64_t ix = svreinterpret_u64 (x);
-  svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh);
-
   /* x = 2^k z; where z is in range [Off,2*Off) and exact.
      The range is split into N subintervals.
     The ith subinterval contains z and c is near its center. */
@@ -89,7 +77,58 @@ svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg)
   y = svmla_lane_f64 (y, r2, ln2_and_c4, 1);
   y = svmla_x (pg, p, r2, y);
 
-  if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (hi, tmp, y, r2, special, d);
   return svmla_x (pg, hi, r2, y);
 }
+
+/* The special-case path is a series of selects which choose the correct
+   result for the special lanes: inf, -inf or nan.  Subnormal lanes are
+   instead multiplied by 2^52 (52 being the number of mantissa bits) to
+   normalise the number on entry to the log function, and ln(2) * 52 is
+   then subtracted to rescale the output to the correct result.  */
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svbool_t pg, svbool_t special)
+{
+  const struct data *d = ptr_barrier (&data);
+  /* This check covers more than the subnormal range, but standard-case
+     lanes and +inf are handled separately.  */
+  svbool_t is_sub = svcmpgt_f64 (pg, x, sv_f64 (0));
+  /* Check for 0, for which the result is -Infinity.  */
+  svbool_t is_minf = svcmpeq_f64 (pg, x, sv_f64 (0));
+  svbool_t is_pinf = svcmpeq_f64 (pg, x, sv_f64 (INFINITY));
+
+  /* Scale up x on special lanes to normalise subnormals.  */
+  x = svmul_m (special, x, d->two_2_52);
+  svuint64_t ix = svreinterpret_u64 (x);
+
+  /* Select the correct special-case correction depending on x.  */
+  svfloat64_t special_log = svsel (is_sub, sv_f64 (d->ln2_52), sv_f64 (NAN));
+  special_log = svsel (is_minf, sv_f64 (-INFINITY), special_log);
+  special_log = svsel (is_pinf, sv_f64 (INFINITY), special_log);
+
+  /* Compute log for both the rescaled special lanes and the non-special
+     lanes.  */
+  svfloat64_t log_sum = v_log_inline (ix, pg, d);
+
+  /* Adjust the output on special lanes: complete the subnormal calculation,
+     or add inf, -inf or nan depending on special_log.  Non-special lanes
+     are returned without correction.  */
+  return svadd_m (special, log_sum, special_log);
+}
+
+/* Double-precision SVE log routine.
+   Maximum measured error is 2.64 ulp:
+   SV_NAME_D1 (log)(0x1.95e54bc91a5e2p+184) got 0x1.fffffffe88cacp+6
+					     want 0x1.fffffffe88cafp+6. */
+svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  svuint64_t ix = svreinterpret_u64 (x);
+  /* Special cases: x is subnormal, x <= 0, x == inf, x == nan. */
+  svbool_t special
+      = svcmpge (pg, svsub_x (pg, ix, 0x0010000000000000), d->thresh);
+  if (__glibc_unlikely (svptest_any (special, special)))
+    return special_case (x, pg, special);
+
+  /* If no lanes are special, just return the plain vector log.  */
+  return v_log_inline (ix, pg, d);
+}
diff --git a/sysdeps/aarch64/fpu/logf_sve.c b/sysdeps/aarch64/fpu/logf_sve.c
index 16a9fcfd2c..95af72519f 100644
--- a/sysdeps/aarch64/fpu/logf_sve.c
+++ b/sysdeps/aarch64/fpu/logf_sve.c
@@ -21,73 +21,111 @@
 
 static const struct data
 {
-  float poly_0135[4];
-  float poly_246[3];
-  float ln2;
-  uint32_t off, lower;
+  float c0, c1, c3, c5;
+  float c2, c4, c6, ln2;
+  float ln2_23, two_2_23;
+  uint32_t off, lower, thresh;
 } data = {
-  .poly_0135 = {
-    /* Coefficients copied from the AdvSIMD routine in math/, then rearranged so
-       that coeffs 0, 1, 3 and 5 can be loaded as a single quad-word, hence used
-       with _lane variant of MLA intrinsic. */
-    -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, 0x1.961348p-3f, 0x1.555d7cp-2f
-  },
-  .poly_246 = { -0x1.4f9934p-3f, -0x1.00187cp-2f, -0x1.ffffc8p-2f },
+  /* Coefficients copied from the AdvSIMD routine, then rearranged so that
+     coeffs 0, 1, 3 and 5 can be loaded as a single quad-word, hence used
+     with _lane variant of MLA intrinsic. */
+  .c0 = -0x1.3e737cp-3f,
+  .c1 = 0x1.5a9aa2p-3f,
+  .c2 = -0x1.4f9934p-3f,
+  .c3 = 0x1.961348p-3f,
+  .c4 = -0x1.00187cp-2f,
+  .c5 = 0x1.555d7cp-2f,
+  .c6 = -0x1.ffffc8p-2f,
   .ln2 = 0x1.62e43p-1f,
   .off = 0x3f2aaaab,
   /* Lower bound is the smallest positive normal float 0x00800000.  For
     optimised register use, subnormals are detected after the offset has
     been subtracted, so the lower bound is 0x00800000 - offset (which
     wraps around). */
-  .lower = 0x00800000 - 0x3f2aaaab
+  .lower = 0x00800000 - 0x3f2aaaab,
+  .ln2_23 = -0x1.fe2804p+3, /* -(ln(2) * 23) ~ -15.94. */
+  .two_2_23 = 0x1p23, /* 2^23. */
+  .thresh = 0x7f000000, /* asuint32(inf) - 0x00800000. */
 };
 
-#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000. */
-#define Mask (0x007fffff)
+#define MantissaMask 0x007fffff
 
-static svfloat32_t NOINLINE
-special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
-              svbool_t cmp)
+static inline svfloat32_t
+v_logf_inline (svuint32_t u_off, const svbool_t pg, const struct data *d)
 {
-  return sv_call_f32 (
-      logf, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
-      svmla_x (svptrue_b32 (), p, r2, y), cmp);
-}
-
-/* Optimised implementation of SVE logf, using the same algorithm and
-   polynomial as the AdvSIMD routine. Maximum error is 3.34 ULPs:
-   SV_NAME_F1 (log)(0x1.557298p+0) got 0x1.26edecp-2
-				   want 0x1.26ede6p-2. */
-svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
-{
-  const struct data *d = ptr_barrier (&data);
-
-  svuint32_t u_off = svreinterpret_u32 (x);
-
-  u_off = svsub_x (pg, u_off, d->off);
-  svbool_t cmp = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh);
-
+  svuint32_t u = svand_x (pg, u_off, MantissaMask);
   /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
   svfloat32_t n = svcvt_f32_x (
       pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */
-
-  svuint32_t u = svand_x (pg, u_off, Mask);
   u = svadd_x (pg, u, d->off);
   svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);
 
   /* y = log(1+r) + n*ln2. */
   svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
   /* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))). */
-  svfloat32_t p_0135 = svld1rq (svptrue_b32 (), &d->poly_0135[0]);
-  svfloat32_t p = svmla_lane (sv_f32 (d->poly_246[0]), r, p_0135, 1);
-  svfloat32_t q = svmla_lane (sv_f32 (d->poly_246[1]), r, p_0135, 2);
-  svfloat32_t y = svmla_lane (sv_f32 (d->poly_246[2]), r, p_0135, 3);
+  svfloat32_t p_0135 = svld1rq (svptrue_b32 (), &d->c0);
+  svfloat32_t p = svmla_lane (sv_f32 (d->c2), r, p_0135, 1);
+  svfloat32_t q = svmla_lane (sv_f32 (d->c4), r, p_0135, 2);
+  svfloat32_t y = svmla_lane (sv_f32 (d->c6), r, p_0135, 3);
   p = svmla_lane (p, r2, p_0135, 0);
   q = svmla_x (pg, q, r2, p);
   y = svmla_x (pg, y, r2, q);
   p = svmla_x (pg, r, n, d->ln2);
 
-  if (__glibc_unlikely (svptest_any (pg, cmp)))
-    return special_case (u_off, p, r2, y, cmp);
-  return svmla_x (pg, p, r2, y);
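+  /* Final result: log(x) = n*ln2 + r + r2 * poly(r).  */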
+  return svmla_x (svptrue_b32 (), p, r2, y);
+}
+
+/* The special-case path is a series of selects which choose the correct
+   result for the special lanes: inf, -inf or nan.  Subnormal lanes are
+   instead multiplied by 2^23 (23 being the number of mantissa bits) to
+   normalise the number on entry to the log function, and ln(2) * 23 is
+   then subtracted to rescale the output to the correct result.  */
+static inline svfloat32_t
+special_case (svfloat32_t x, svbool_t pg, svbool_t special,
+              const struct data *d)
+{
+  /* This check covers more than the subnormal range, but standard-case
+     lanes and +inf are handled separately.  */
+  svbool_t is_sub = svcmpgt_f32 (pg, x, sv_f32 (0));
+  /* Check for 0, for which the result is -Infinity.  */
+  svbool_t is_minf = svcmpeq_f32 (pg, x, sv_f32 (0));
+  svbool_t is_pinf = svcmpeq_f32 (pg, x, sv_f32 (INFINITY));
+
+  /* Scale up x on special lanes to normalise subnormals.  */
+  x = svmul_m (special, x, d->two_2_23);
+  svuint32_t u_off = svreinterpret_u32 (x);
+  u_off = svsub_m (pg, u_off, d->off);
+
+  /* Select the correct special-case correction depending on x.  */
+  svfloat32_t special_log = svsel (is_sub, sv_f32 (d->ln2_23), sv_f32 (NAN));
+  special_log = svsel (is_minf, sv_f32 (-INFINITY), special_log);
+  special_log = svsel (is_pinf, sv_f32 (INFINITY), special_log);
+
+  /* Compute logf for both the rescaled special lanes and the non-special
+     lanes.  */
+  svfloat32_t ret_log = v_logf_inline (u_off, svptrue_b32 (), d);
+
+  /* Adjust the output on special lanes: complete the subnormal calculation,
+     or add inf, -inf or nan depending on special_log.  Non-special lanes
+     are returned without correction.  */
+  return svadd_m (special, ret_log, special_log);
+}
+
+/* Optimised implementation of SVE logf, using the same algorithm and
+   polynomial as the AdvSIMD routine. Maximum error is 3.34 ULPs:
+   SV_NAME_F1 (log)(0x1.557298p+0) got 0x1.26edecp-2
+				   want 0x1.26ede6p-2. */
+svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  svuint32_t u_off = svreinterpret_u32 (x);
+  u_off = svsub_x (pg, u_off, d->off);
+  /* Special cases: x is subnormal, x <= 0, x == inf, x == nan. */
+  svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), d->thresh);
+  if (__glibc_unlikely (svptest_any (special, special)))
+    return special_case (x, pg, special, d);
+
+  /* If no lanes are special, just return the plain vector logf.  */
+  return v_logf_inline (u_off, pg, d);
+}