From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Wed, 26 Nov 2025 17:06:16 +0000 (-0300)
Subject: math: New generic fmaf implementation
X-Git-Tag: glibc-2.43~150
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=8a0152b61bdf0d3cb1d174d3627adee79db9ee07;p=thirdparty%2Fglibc.git

math: New generic fmaf implementation

The current implementation relies on setting the rounding mode for
different calculations (FE_TOWARDZERO) to obtain correctly rounded
results.  For most CPUs, this adds significant performance overhead
because it requires executing a typically slow instruction (to
get/set the floating-point status), necessitates flushing the
pipeline, and breaks some compiler assumptions/optimizations.

The original implementation adds tests to handle underflow in corner
cases, whereas this implementation uses a different strategy that
checks both the mantissa and the result to determine whether the
result is not subject to double rounding.

I tested this implementation on various targets (x86_64, i686, arm,
aarch64, powerpc), including some by manually disabling the compiler
instructions.

Performance-wise, it shows large improvements:

reciprocal-throughput       master       patched       improvement
x86_64 [1]                   58.09          7.96             7.33x
i686 [1]                    279.41         16.97            16.46x
aarch64 [2]                  26.09          4.10             6.35x
armhf [2]                    30.25          4.20             7.18x
powerpc [3]                   9.46          1.46             6.45x

latency                     master       patched       improvement
x86_64                       64.50         14.25             4.53x
i686                        304.39         61.04             4.99x
aarch64                      27.71          5.74             4.82x
armhf                        33.46          7.34             4.55x
powerpc                      10.96          2.65             4.13x

Checked on x86_64-linux-gnu and i686-linux-gnu with —disable-multi-arch,
and on arm-linux-gnueabihf.

[1] gcc 15.2.1, Zen3
[2] gcc 15.2.1, Neoverse N1
[3] gcc 15.2.1, POWER10

Signed-off-by: Szabolcs Nagy <nsz@gcc.gnu.org>
Co-authored-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Co-authored-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
---

diff --git a/sysdeps/i386/Makefile b/sysdeps/i386/Makefile
index 11ddbd402d..e53bb0c5cd 100644
--- a/sysdeps/i386/Makefile
+++ b/sysdeps/i386/Makefile
@@ -14,6 +14,7 @@ CFLAGS-s_erf.c += -fexcess-precision=standard
 CFLAGS-s_erfc.c += -fexcess-precision=standard
 CFLAGS-s_erf_common.c += -fexcess-precision=standard
 CFLAGS-s_fma.c += -fexcess-precision=standard
+CFLAGS-s_fmaf.c += -fexcess-precision=standard
 endif
 
 ifeq ($(subdir),gmon)
diff --git a/sysdeps/ieee754/dbl-64/s_fmaf.c b/sysdeps/ieee754/dbl-64/s_fmaf.c
index 26ced15465..9fabe86586 100644
--- a/sysdeps/ieee754/dbl-64/s_fmaf.c
+++ b/sysdeps/ieee754/dbl-64/s_fmaf.c
@@ -19,16 +19,9 @@
 #define NO_MATH_REDIRECT
 #include <math.h>
 #include <fenv.h>
-#include <ieee754.h>
-#include <math-barriers.h>
-#include <fenv_private.h>
 #include <libm-alias-float.h>
-
-/* This implementation relies on double being more than twice as
-   precise as float and uses rounding to odd in order to avoid problems
-   with double rounding.
-   See a paper by Boldo and Melquiond:
-   http://www.lri.fr/~melquion/doc/08-tc.pdf  */
+#include <math-use-builtins.h>
+#include "math_config.h"
 
 float
 __fmaf (float x, float y, float z)
@@ -36,34 +29,40 @@ __fmaf (float x, float y, float z)
 #if USE_FMAF_BUILTIN
   return __builtin_fmaf (x, y, z);
 #else
-  /* Use generic implementation.  */
-  fenv_t env;
-
   /* Multiplication is always exact.  */
-  double temp = (double) x * (double) y;
-
-  /* Ensure correct sign of an exact zero result by performing the
-     addition in the original rounding mode in that case.  */
-  if (temp == (double) -z)
-    return (float) temp + z;
-
-  union ieee754_double u;
+  double xy = (double) x * (double) y;
+  double result = xy + z;
 
-  libc_feholdexcept_setround (&env, FE_TOWARDZERO);
+  uint64_t u = asuint64 (result);
+  /* If not exact or at round to even boundary, the result is correct in
+     all rounding modes.  */
+  if (__glibc_likely ((u & 0xfffffff) != 0))
+    return result;
 
-  /* Perform addition with round to odd.  */
-  u.d = temp + (double) z;
-  /* Ensure the addition is not scheduled after fetestexcept call.  */
-  math_force_eval (u.d);
+  /* Also check if the double result appears exact when it might not be and
+     thus it will not set the underflow flag if denormal.  */
+  if ((u & 0x10000000) == 0
+      && ((u >> MANTISSA_WIDTH) & 0x7ff) > EXPONENT_BIAS - 126)
+    return result;
 
-  /* Reset rounding mode and test for inexact simultaneously.  */
-  int j = libc_feupdateenv_test (&env, FE_INEXACT) != 0;
+  /* Return if result is exact in all rounding modes.  */
+  if (result - xy == z && result - z == xy)
+    return result;
 
-  if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
-    u.ieee.mantissa1 |= j;
+  /* This is where 'double-rouding' might return a wrong value, and thus
+     needs adjusting the low-order bits in the direction of the error.  */
+  double err;
+  int neg = u >> 63;
+  if (neg == (z > xy))
+    err = xy - result + z;
+  else
+    err = z - result + xy;
 
-  /* And finally truncation with round to nearest.  */
-  return (float) u.d;
+  if (neg == (err < 0))
+    u++;
+  else
+    u--;
+  return asdouble (u);
 #endif /* ! USE_FMAF_BUILTIN  */
 }
 #ifndef __fmaf