From 82a4f50b4e53df728430a9fe8d5939f57038db16 Mon Sep 17 00:00:00 2001 From: Adhemerval Zanella Date: Fri, 10 Oct 2025 09:50:21 -0300 Subject: [PATCH] math: Optimize fma call on asinpif The fma is required only for x == +/-0x1.6371e8p-4f in FE_TOWARDZERO to provide correctly rounded results. Checked on x86_64-linux-gnu and aarch64-linux-gnu. Reviewed-by: Wilco Dijkstra --- math/auto-libm-test-in | 2 ++ math/auto-libm-test-out-asinpi | 50 ++++++++++++++++++++++++++++++ sysdeps/ieee754/flt-32/s_asinpif.c | 9 ++++-- 3 files changed, 59 insertions(+), 2 deletions(-) diff --git a/math/auto-libm-test-in b/math/auto-libm-test-in index 198dac5455..7e8cb4cef8 100644 --- a/math/auto-libm-test-in +++ b/math/auto-libm-test-in @@ -524,6 +524,8 @@ asinpi 0x1.f1c012p-1 asinpi -0x1.8805060cb885cp-3 asinpi 0x8.14d7e32b5c44642p-4 asinpi -0xa.7ca6c96caefe80b9d757de58a578p-4 +asinpi 0x1.6371e8p-4 +asinpi -0x1.6371e8p-4 atan inf atan -inf diff --git a/math/auto-libm-test-out-asinpi b/math/auto-libm-test-out-asinpi index 31fe806411..80f83eb654 100644 --- a/math/auto-libm-test-out-asinpi +++ b/math/auto-libm-test-out-asinpi @@ -2780,3 +2780,53 @@ asinpi -0xa.7ca6c96caefe80b9d757de58a578p-4 = asinpi tonearest ibm128 -0xa.7ca6c96caefe80b9d757de58a8p-4 : -0x3.a3e55379cf8d0f73aac00cc2e5p-4 : inexact-ok = asinpi towardzero ibm128 -0xa.7ca6c96caefe80b9d757de58a8p-4 : -0x3.a3e55379cf8d0f73aac00cc2e4p-4 : inexact-ok = asinpi upward ibm128 -0xa.7ca6c96caefe80b9d757de58a8p-4 : -0x3.a3e55379cf8d0f73aac00cc2e4p-4 : inexact-ok +asinpi 0x1.6371e8p-4 += asinpi downward binary32 0x1.6371e8p-4 : 0x7.148bcp-8 : inexact-ok += asinpi tonearest binary32 0x1.6371e8p-4 : 0x7.148bc8p-8 : inexact-ok += asinpi towardzero binary32 0x1.6371e8p-4 : 0x7.148bcp-8 : inexact-ok += asinpi upward binary32 0x1.6371e8p-4 : 0x7.148bc8p-8 : inexact-ok += asinpi downward binary64 0x1.6371e8p-4 : 0x7.148bc7fffff78p-8 : inexact-ok += asinpi tonearest binary64 0x1.6371e8p-4 : 0x7.148bc7fffff7cp-8 : inexact-ok += asinpi towardzero binary64 0x1.6371e8p-4 : 0x7.148bc7fffff78p-8 : inexact-ok += asinpi upward binary64 0x1.6371e8p-4 : 0x7.148bc7fffff7cp-8 : inexact-ok += asinpi downward intel96 0x1.6371e8p-4 : 0x7.148bc7fffff7af9p-8 : inexact-ok += asinpi tonearest intel96 0x1.6371e8p-4 : 0x7.148bc7fffff7af98p-8 : inexact-ok += asinpi towardzero intel96 0x1.6371e8p-4 : 0x7.148bc7fffff7af9p-8 : inexact-ok += asinpi upward intel96 0x1.6371e8p-4 : 0x7.148bc7fffff7af98p-8 : inexact-ok += asinpi downward m68k96 0x1.6371e8p-4 : 0x7.148bc7fffff7af9p-8 : inexact-ok += asinpi tonearest m68k96 0x1.6371e8p-4 : 0x7.148bc7fffff7af98p-8 : inexact-ok += asinpi towardzero m68k96 0x1.6371e8p-4 : 0x7.148bc7fffff7af9p-8 : inexact-ok += asinpi upward m68k96 0x1.6371e8p-4 : 0x7.148bc7fffff7af98p-8 : inexact-ok += asinpi downward binary128 0x1.6371e8p-4 : 0x7.148bc7fffff7af94c63520731f08p-8 : inexact-ok += asinpi tonearest binary128 0x1.6371e8p-4 : 0x7.148bc7fffff7af94c63520731f08p-8 : inexact-ok += asinpi towardzero binary128 0x1.6371e8p-4 : 0x7.148bc7fffff7af94c63520731f08p-8 : inexact-ok += asinpi upward binary128 0x1.6371e8p-4 : 0x7.148bc7fffff7af94c63520731f0cp-8 : inexact-ok += asinpi downward ibm128 0x1.6371e8p-4 : 0x7.148bc7fffff7af94c63520731ep-8 : inexact-ok += asinpi tonearest ibm128 0x1.6371e8p-4 : 0x7.148bc7fffff7af94c63520732p-8 : inexact-ok += asinpi towardzero ibm128 0x1.6371e8p-4 : 0x7.148bc7fffff7af94c63520731ep-8 : inexact-ok += asinpi upward ibm128 0x1.6371e8p-4 : 0x7.148bc7fffff7af94c63520732p-8 : inexact-ok +asinpi -0x1.6371e8p-4 += asinpi downward binary32 -0x1.6371e8p-4 : -0x7.148bc8p-8 : inexact-ok += asinpi tonearest binary32 -0x1.6371e8p-4 : -0x7.148bc8p-8 : inexact-ok += asinpi towardzero binary32 -0x1.6371e8p-4 : -0x7.148bcp-8 : inexact-ok += asinpi upward binary32 -0x1.6371e8p-4 : -0x7.148bcp-8 : inexact-ok += asinpi downward binary64 -0x1.6371e8p-4 : -0x7.148bc7fffff7cp-8 : inexact-ok += asinpi tonearest binary64 -0x1.6371e8p-4 : -0x7.148bc7fffff7cp-8 : inexact-ok += asinpi towardzero binary64 -0x1.6371e8p-4 : -0x7.148bc7fffff78p-8 : inexact-ok += asinpi upward binary64 -0x1.6371e8p-4 : -0x7.148bc7fffff78p-8 : inexact-ok += asinpi downward intel96 -0x1.6371e8p-4 : -0x7.148bc7fffff7af98p-8 : inexact-ok += asinpi tonearest intel96 -0x1.6371e8p-4 : -0x7.148bc7fffff7af98p-8 : inexact-ok += asinpi towardzero intel96 -0x1.6371e8p-4 : -0x7.148bc7fffff7af9p-8 : inexact-ok += asinpi upward intel96 -0x1.6371e8p-4 : -0x7.148bc7fffff7af9p-8 : inexact-ok += asinpi downward m68k96 -0x1.6371e8p-4 : -0x7.148bc7fffff7af98p-8 : inexact-ok += asinpi tonearest m68k96 -0x1.6371e8p-4 : -0x7.148bc7fffff7af98p-8 : inexact-ok += asinpi towardzero m68k96 -0x1.6371e8p-4 : -0x7.148bc7fffff7af9p-8 : inexact-ok += asinpi upward m68k96 -0x1.6371e8p-4 : -0x7.148bc7fffff7af9p-8 : inexact-ok += asinpi downward binary128 -0x1.6371e8p-4 : -0x7.148bc7fffff7af94c63520731f0cp-8 : inexact-ok += asinpi tonearest binary128 -0x1.6371e8p-4 : -0x7.148bc7fffff7af94c63520731f08p-8 : inexact-ok += asinpi towardzero binary128 -0x1.6371e8p-4 : -0x7.148bc7fffff7af94c63520731f08p-8 : inexact-ok += asinpi upward binary128 -0x1.6371e8p-4 : -0x7.148bc7fffff7af94c63520731f08p-8 : inexact-ok += asinpi downward ibm128 -0x1.6371e8p-4 : -0x7.148bc7fffff7af94c63520732p-8 : inexact-ok += asinpi tonearest ibm128 -0x1.6371e8p-4 : -0x7.148bc7fffff7af94c63520732p-8 : inexact-ok += asinpi towardzero ibm128 -0x1.6371e8p-4 : -0x7.148bc7fffff7af94c63520731ep-8 : inexact-ok += asinpi upward ibm128 -0x1.6371e8p-4 : -0x7.148bc7fffff7af94c63520731ep-8 : inexact-ok diff --git a/sysdeps/ieee754/flt-32/s_asinpif.c b/sysdeps/ieee754/flt-32/s_asinpif.c index f9e93533d4..d50de7fcd7 100644 --- a/sysdeps/ieee754/flt-32/s_asinpif.c +++ b/sysdeps/ieee754/flt-32/s_asinpif.c @@ -79,8 +79,13 @@ __asinpif (float x) c0 += c2 * z2; c4 += c6 * z2; c0 += c4 * z4; - double r = fma (-c0, copysign (f, x), copysign (0.5, x)); - return r; +#ifndef __FP_FAST_FMA + /* The fma is required only for x == 0x1.6371e8p-4f in FE_TOWARDZERO + to provide correctly rounded results. */ + if (__glibc_likely (ax != 0x1.6371e8p-4f)) + return copysign (0.5, x) - c0 * copysign (f, x); +#endif + return fma (-c0, copysign (f, x), copysign (0.5, x)); } } libm_alias_float (__asinpi, asinpi) -- 2.47.3