From: Adhemerval Zanella Date: Wed, 26 Nov 2025 17:06:16 +0000 (-0300) Subject: math: New generic fmaf implementation X-Git-Tag: glibc-2.43~150 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=8a0152b61bdf0d3cb1d174d3627adee79db9ee07;p=thirdparty%2Fglibc.git math: New generic fmaf implementation The current implementation relies on setting the rounding mode for different calculations (FE_TOWARDZERO) to obtain correctly rounded results. For most CPUs, this adds significant performance overhead because it requires executing a typically slow instruction (to get/set the floating-point status), necessitates flushing the pipeline, and breaks some compiler assumptions/optimizations. The original implementation adds tests to handle underflow in corner cases, whereas this implementation uses a different strategy that checks both the mantissa and the result to determine whether the result is not subject to double rounding. I tested this implementation on various targets (x86_64, i686, arm, aarch64, powerpc), including some by manually disabling the compiler instructions. Performance-wise, it shows large improvements: reciprocal-throughput master patched improvement x86_64 [1] 58.09 7.96 7.33x i686 [1] 279.41 16.97 16.46x aarch64 [2] 26.09 4.10 6.35x armhf [2] 30.25 4.20 7.18x powerpc [3] 9.46 1.46 6.45x latency master patched improvement x86_64 64.50 14.25 4.53x i686 304.39 61.04 4.99x aarch64 27.71 5.74 4.82x armhf 33.46 7.34 4.55x powerpc 10.96 2.65 4.13x Checked on x86_64-linux-gnu and i686-linux-gnu with —disable-multi-arch, and on arm-linux-gnueabihf. [1] gcc 15.2.1, Zen3 [2] gcc 15.2.1, Neoverse N1 [3] gcc 15.2.1, POWER10 Signed-off-by: Szabolcs Nagy Co-authored-by: Adhemerval Zanella Co-authored-by: Wilco Dijkstra Reviewed-by: Wilco Dijkstra  --- diff --git a/sysdeps/i386/Makefile b/sysdeps/i386/Makefile index 11ddbd402d..e53bb0c5cd 100644 --- a/sysdeps/i386/Makefile +++ b/sysdeps/i386/Makefile @@ -14,6 +14,7 @@ CFLAGS-s_erf.c += -fexcess-precision=standard CFLAGS-s_erfc.c += -fexcess-precision=standard CFLAGS-s_erf_common.c += -fexcess-precision=standard CFLAGS-s_fma.c += -fexcess-precision=standard +CFLAGS-s_fmaf.c += -fexcess-precision=standard endif ifeq ($(subdir),gmon) diff --git a/sysdeps/ieee754/dbl-64/s_fmaf.c b/sysdeps/ieee754/dbl-64/s_fmaf.c index 26ced15465..9fabe86586 100644 --- a/sysdeps/ieee754/dbl-64/s_fmaf.c +++ b/sysdeps/ieee754/dbl-64/s_fmaf.c @@ -19,16 +19,9 @@ #define NO_MATH_REDIRECT #include #include -#include -#include -#include #include - -/* This implementation relies on double being more than twice as - precise as float and uses rounding to odd in order to avoid problems - with double rounding. - See a paper by Boldo and Melquiond: - http://www.lri.fr/~melquion/doc/08-tc.pdf */ +#include +#include "math_config.h" float __fmaf (float x, float y, float z) @@ -36,34 +29,40 @@ __fmaf (float x, float y, float z) #if USE_FMAF_BUILTIN return __builtin_fmaf (x, y, z); #else - /* Use generic implementation. */ - fenv_t env; - /* Multiplication is always exact. */ - double temp = (double) x * (double) y; - - /* Ensure correct sign of an exact zero result by performing the - addition in the original rounding mode in that case. */ - if (temp == (double) -z) - return (float) temp + z; - - union ieee754_double u; + double xy = (double) x * (double) y; + double result = xy + z; - libc_feholdexcept_setround (&env, FE_TOWARDZERO); + uint64_t u = asuint64 (result); + /* If not exact or at round to even boundary, the result is correct in + all rounding modes. */ + if (__glibc_likely ((u & 0xfffffff) != 0)) + return result; - /* Perform addition with round to odd. */ - u.d = temp + (double) z; - /* Ensure the addition is not scheduled after fetestexcept call. */ - math_force_eval (u.d); + /* Also check if the double result appears exact when it might not be and + thus it will not set the underflow flag if denormal. */ + if ((u & 0x10000000) == 0 + && ((u >> MANTISSA_WIDTH) & 0x7ff) > EXPONENT_BIAS - 126) + return result; - /* Reset rounding mode and test for inexact simultaneously. */ - int j = libc_feupdateenv_test (&env, FE_INEXACT) != 0; + /* Return if result is exact in all rounding modes. */ + if (result - xy == z && result - z == xy) + return result; - if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff) - u.ieee.mantissa1 |= j; + /* This is where 'double-rouding' might return a wrong value, and thus + needs adjusting the low-order bits in the direction of the error. */ + double err; + int neg = u >> 63; + if (neg == (z > xy)) + err = xy - result + z; + else + err = z - result + xy; - /* And finally truncation with round to nearest. */ - return (float) u.d; + if (neg == (err < 0)) + u++; + else + u--; + return asdouble (u); #endif /* ! USE_FMAF_BUILTIN */ } #ifndef __fmaf