From: Pierre Blanchard
Date: Tue, 18 Nov 2025 15:09:05 +0000 (+0000)
Subject: AArch64: Fix and improve SVE pow(f) special cases
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=bb6519de1e6fe73d79bc71588ec4e5668907f080;p=thirdparty%2Fglibc.git

AArch64: Fix and improve SVE pow(f) special cases

powf: Update scalar special case function to best use the new interface.

pow: Make specialcase NOINLINE to prevent str/ldr leaking into the fast path.
Remove the dependency on sv_call2, as the new callback implementation is not a
performance gain.  Replace it with a vectorised specialcase, since the
structure of the scalar routine is fairly simple.
Throughput gain of about 5-10% on V1 for large values and 25% for subnormal
`x`.

Reviewed-by: Wilco Dijkstra
---

diff --git a/sysdeps/aarch64/fpu/pow_sve.c b/sysdeps/aarch64/fpu/pow_sve.c
index 8fe51b7d4b..e09a52f115 100644
--- a/sysdeps/aarch64/fpu/pow_sve.c
+++ b/sysdeps/aarch64/fpu/pow_sve.c
@@ -31,8 +31,8 @@
    The SVE algorithm drops the tail in the exp computation at the price of
    a lower accuracy, slightly above 1ULP.
    The SVE algorithm also drops the special treatement of small (< 2^-65) and
-   large (> 2^63) finite values of |y|, as they only affect non-round to nearest
-   modes.
+   large (> 2^63) finite values of |y|, as they only affect non-round to
+   nearest modes.
 
    Maximum measured error is 1.04 ULPs:
    SV_NAME_D2 (pow) (0x1.3d2d45bc848acp+63, -0x1.a48a38b40cd43p-12)
@@ -156,42 +156,22 @@ sv_zeroinfnan (svbool_t pg, svuint64_t i)
    a double.  (int32_t)KI is the k used in the argument reduction and exponent
    adjustment of scale, positive k here means the result may overflow and
    negative k means the result may underflow.  */
-static inline double
-specialcase (double tmp, uint64_t sbits, uint64_t ki)
-{
-  double scale;
-  if ((ki & 0x80000000) == 0)
-    {
-      /* k > 0, the exponent of scale might have overflowed by <= 460.  */
-      sbits -= 1009ull << 52;
-      scale = asdouble (sbits);
-      return 0x1p1009 * (scale + scale * tmp);
-    }
-  /* k < 0, need special care in the subnormal range.  */
-  sbits += 1022ull << 52;
-  /* Note: sbits is signed scale.  */
-  scale = asdouble (sbits);
-  double y = scale + scale * tmp;
-  return 0x1p-1022 * y;
-}
-
-/* Scalar fallback for special cases of SVE pow's exp.  */
 static inline svfloat64_t
-sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2,
-                     svfloat64_t y, svbool_t cmp)
+specialcase (svfloat64_t tmp, svuint64_t sbits, svuint64_t ki, svbool_t cmp)
 {
-  svbool_t p = svpfirst (cmp, svpfalse ());
-  while (svptest_any (cmp, p))
-    {
-      double sx1 = svclastb (p, 0, x1);
-      uint64_t su1 = svclastb (p, 0, u1);
-      uint64_t su2 = svclastb (p, 0, u2);
-      double elem = specialcase (sx1, su1, su2);
-      svfloat64_t y2 = sv_f64 (elem);
-      y = svsel (p, y2, y);
-      p = svpnext_b64 (cmp, p);
-    }
-  return y;
+  svbool_t p_pos = svcmpge_n_f64 (cmp, svreinterpret_f64_u64 (ki), 0.0);
+
+  /* Scale up or down depending on sign of k.  */
+  svint64_t offset
+      = svsel_s64 (p_pos, sv_s64 (1009ull << 52), sv_s64 (-1022ull << 52));
+  svfloat64_t factor
+      = svsel_f64 (p_pos, sv_f64 (0x1p1009), sv_f64 (0x1p-1022));
+
+  svuint64_t offset_sbits
+      = svsub_u64_x (cmp, sbits, svreinterpret_u64_s64 (offset));
+  svfloat64_t scale = svreinterpret_f64_u64 (offset_sbits);
+  svfloat64_t res = svmad_f64_x (cmp, scale, tmp, scale);
+  return svmul_f64_x (cmp, res, factor);
 }
 
 /* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about
@@ -214,8 +194,8 @@ sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail,
 
   /* log(x) = k*Ln2 + log(c) + log1p(z/c-1).  */
   /* SVE lookup requires 3 separate lookup tables, as opposed to scalar version
-     that uses array of structures. We also do the lookup earlier in the code to
-     make sure it finishes as early as possible.  */
+     that uses array of structures. We also do the lookup earlier in the code
+     to make sure it finishes as early as possible.  */
   svfloat64_t invc = svld1_gather_index (pg, __v_pow_log_data.invc, i);
   svfloat64_t logc = svld1_gather_index (pg, __v_pow_log_data.logc, i);
   svfloat64_t logctail = svld1_gather_index (pg, __v_pow_log_data.logctail, i);
@@ -325,14 +305,14 @@ sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
   svbool_t oflow = svcmpge (pg, abstop, HugeExp);
   oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow));
 
-  /* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can overflow
-     or underflow.  */
+  /* Handle underflow and overflow in scale.
+     For large |x| values (512 < |x| < 1024), scale * (1 + TMP) can
+     overflow or underflow.  */
   svbool_t special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));
+  if (__glibc_unlikely (svptest_any (pg, special)))
+    z = svsel (special, specialcase (tmp, sbits, ki, special), z);
 
-  /* Update result with special and large cases.  */
-  z = sv_call_specialcase (tmp, sbits, ki, z, special);
-
-  /* Handle underflow and overflow.  */
+  /* Handle underflow and overflow in exp.  */
   svbool_t x_is_neg = svcmplt (pg, x, 0);
   svuint64_t sign_mask = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS);
 
@@ -353,7 +333,7 @@ sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
 }
 
 static inline double
-pow_sc (double x, double y)
+pow_specialcase (double x, double y)
 {
   uint64_t ix = asuint64 (x);
   uint64_t iy = asuint64 (y);
@@ -382,6 +362,14 @@ pow_sc (double x, double y)
   return x;
 }
 
+/* Scalar fallback for special case routines with custom signature.  */
+static svfloat64_t NOINLINE
+sv_pow_specialcase (svfloat64_t x1, svfloat64_t x2, svfloat64_t y,
+                    svbool_t cmp)
+{
+  return sv_call2_f64 (pow_specialcase, x1, x2, y, cmp);
+}
+
 svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
@@ -444,7 +432,7 @@ svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
 
   /* Cases of zero/inf/nan x or y.  */
   if (__glibc_unlikely (svptest_any (svptrue_b64 (), special)))
-    vz = sv_call2_f64 (pow_sc, x, y, vz, special);
+    vz = sv_pow_specialcase (x, y, vz, special);
 
   return vz;
 }
diff --git a/sysdeps/aarch64/fpu/powf_sve.c b/sysdeps/aarch64/fpu/powf_sve.c
index 22e6cc54fb..cbe2044926 100644
--- a/sysdeps/aarch64/fpu/powf_sve.c
+++ b/sysdeps/aarch64/fpu/powf_sve.c
@@ -116,11 +116,10 @@ zeroinfnan (uint32_t ix)
    preamble of scalar powf except that we do not update ix and sign_bias.
    This is done in the preamble of the SVE powf.  */
 static inline float
-powf_specialcase (float x, float y, float z)
+powf_specialcase (float x, float y)
 {
   uint32_t ix = asuint (x);
   uint32_t iy = asuint (y);
-  /* Either (x < 0x1p-126 or inf or nan) or (y is 0 or inf or nan).  */
   if (__glibc_unlikely (zeroinfnan (iy)))
     {
       if (2 * iy == 0)
@@ -142,32 +141,15 @@ powf_specialcase (float x, float y, float z)
         x2 = -x2;
       return iy & 0x80000000 ? 1 / x2 : x2;
     }
-  /* We need a return here in case x<0 and y is integer, but all other tests
-     need to be run.  */
-  return z;
+  /* Return x for convenience, but make sure result is never used.  */
+  return x;
 }
 
 /* Scalar fallback for special case routines with custom signature.  */
 static svfloat32_t NOINLINE
-sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y)
+sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp)
 {
-  /* Special cases of x or y: zero, inf and nan.  */
-  svbool_t xspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x1));
-  svbool_t yspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x2));
-  svbool_t cmp = svorr_z (svptrue_b32 (), xspecial, yspecial);
-
-  svbool_t p = svpfirst (cmp, svpfalse ());
-  while (svptest_any (cmp, p))
-    {
-      float sx1 = svclastb (p, 0, x1);
-      float sx2 = svclastb (p, 0, x2);
-      float elem = svclastb (p, 0, y);
-      elem = powf_specialcase (sx1, sx2, elem);
-      svfloat32_t y2 = sv_f32 (elem);
-      y = svsel (p, y2, y);
-      p = svpnext_b32 (cmp, p);
-    }
-  return y;
+  return sv_call2_f32 (powf_specialcase, x1, x2, y, cmp);
 }
 
 /* Compute core for half of the lanes in double precision.  */
@@ -330,7 +312,7 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
   ret = svsel (yint_or_xpos, ret, sv_f32 (__builtin_nanf ("")));
 
   if (__glibc_unlikely (svptest_any (cmp, cmp)))
-    return sv_call_powf_sc (x, y, ret);
+    return sv_call_powf_sc (x, y, ret, cmp);
 
   return ret;
 }
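
Note (editorial, not part of the patch): as a rough guide to the rescaling the
new vectorised specialcase performs per lane, the sketch below is a minimal
standalone scalar model mirroring the scalar routine removed above.
`as_double' and `specialcase_model' are hypothetical names introduced here;
as_double stands in for glibc's internal asdouble helper.

#include <stdint.h>
#include <string.h>

/* Reinterpret a 64-bit pattern as a double (stand-in for asdouble).  */
double
as_double (uint64_t bits)
{
  double d;
  memcpy (&d, &bits, sizeof d);
  return d;
}

/* Scalar model of the per-lane work done by the vectorised specialcase:
   for k > 0 the exponent of scale may have overflowed, so it is lowered by
   1009 and the result multiplied back by 2^1009; for k < 0 the exponent is
   raised by 1022 and the result multiplied by 2^-1022 so the final rounding
   happens in the subnormal range.  */
double
specialcase_model (double tmp, uint64_t sbits, uint64_t ki)
{
  if ((ki & 0x80000000) == 0)
    {
      /* k > 0: undo the possible exponent overflow.  */
      double scale = as_double (sbits - (1009ull << 52));
      return 0x1p1009 * (scale + scale * tmp);
    }
  /* k < 0: take special care in the subnormal range.  */
  double scale = as_double (sbits + (1022ull << 52));
  return 0x1p-1022 * (scale + scale * tmp);
}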