From: Adhemerval Zanella Date: Fri, 10 Oct 2025 18:15:32 +0000 (-0300) Subject: math: Consolidate erf/erfc definitions X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e4d812c980cd6cd1f774bbd500b126aec28ab8db;p=thirdparty%2Fglibc.git math: Consolidate erf/erfc definitions The common code definitions are consolidated in s_erf_common.h and s_erf_common.c. Checked on x86_64-linux-gnu, aarch64-linux-gnu, and powerpc64le-linux-gnu. Reviewed-by: DJ Delorie --- diff --git a/math/Makefile b/math/Makefile index 4658357f11..94258d160c 100644 --- a/math/Makefile +++ b/math/Makefile @@ -363,6 +363,7 @@ type-double-routines := \ math_err \ s_asincosh_data \ s_atanh_data \ + s_erf_common \ s_erf_data \ s_erfc_data \ sincostab \ diff --git a/sysdeps/ieee754/dbl-64/s_erf.c b/sysdeps/ieee754/dbl-64/s_erf.c index 4c90eb5ac9..7a885a5426 100644 --- a/sysdeps/ieee754/dbl-64/s_erf.c +++ b/sysdeps/ieee754/dbl-64/s_erf.c @@ -30,191 +30,12 @@ SOFTWARE. #include #include #include "math_config.h" -#include "s_erf_data.h" +#include "s_erf_common.h" /* CH+CL is a double-double approximation of 2/sqrt(pi) to nearest */ static const double CH = 0x1.20dd750429b6dp+0; static const double CL = 0x1.1ae3a914fed8p-56; -/* Add a + b, such that *hi + *lo approximates a + b. - Assumes |a| >= |b|. - For rounding to nearest we have hi + lo = a + b exactly. - For directed rounding, we have - (a) hi + lo = a + b exactly when the exponent difference between a and b - is at most 53 (the binary64 precision) - (b) otherwise |(a+b)-(hi+lo)| <= 2^-105 min(|a+b|,|hi|) - (see https://hal.inria.fr/hal-03798376) - We also have |lo| < ulp(hi). 
*/ -static inline void -fast_two_sum (double *hi, double *lo, double a, double b) -{ - double e; - - *hi = a + b; - e = *hi - a; /* exact */ - *lo = b - e; /* exact */ -} - -/* Reference: https://hal.science/hal-01351529v3/document */ -static inline void -two_sum (double *hi, double *lo, double a, double b) -{ - *hi = a + b; - double aa = *hi - b; - double bb = *hi - aa; - double da = a - aa; - double db = b - bb; - *lo = da + db; -} - -// Multiply exactly a and b, such that *hi + *lo = a * b. -static inline void -a_mul (double *hi, double *lo, double a, double b) -{ - *hi = a * b; - *lo = fma (a, b, -*hi); -} - -/* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an approximation - of erf(z). Return err the maximal relative error: - |(h + l)/erf(z) - 1| < err*|h+l| */ -static double -cr_erf_fast (double *h, double *l, double z) -{ - double th, tl; - /* we split [0,0x1.7afb48dc96626p+2] into intervals i/16 <= z < (i+1)/16, - and for each interval, we use a minimax polynomial: - * for i=0 (0 <= z < 1/16) we use a polynomial evaluated at zero, - since if we evaluate in the middle 1/32, we will get bad accuracy - for tiny z, and moreover z-1/32 might not be exact - * for 1 <= i <= 94, we use a polynomial evaluated in the middle of - the interval, namely i/16+1/32 - */ - if (z < 0.0625) /* z < 1/16 */ - { - double z2h, z2l, z4; - a_mul (&z2h, &z2l, z, z); - z4 = z2h * z2h; - double c9 = fma (C0[7], z2h, C0[6]); - double c5 = fma (C0[5], z2h, C0[4]); - c5 = fma (c9, z4, c5); - /* compute C0[2] + C0[3] + z2h*c5 */ - a_mul (&th, &tl, z2h, c5); - fast_two_sum (h, l, C0[2], th); - *l += tl + C0[3]; - /* compute C0[0] + C0[1] + (z2h + z2l)*(h + l) */ - double h_copy = *h; - a_mul (&th, &tl, z2h, *h); - tl += fma (z2h, *l, C0[1]); - fast_two_sum (h, l, C0[0], th); - *l += fma (z2l, h_copy, tl); - /* multiply (h,l) by z */ - a_mul (h, &tl, *h, z); - *l = fma (*l, z, tl); - return 0x1.78p-69; /* err < 2.48658249618372e-21, cf Analyze0() */ - } - double v = floor (16.0 * z); 
- uint32_t i = 16.0 * z; - /* i/16 <= z < (i+1)/16 */ - /* For 0.0625 0 <= z <= 0x1.7afb48dc96626p+2, z - 0.03125 is exact: - (1) either z - 0.03125 is in the same binade as z, then 0.03125 is - an integer multiple of ulp(z), so is z - 0.03125 - (2) if z - 0.03125 is in a smaller binade, both z and 0.03125 are - integer multiple of the ulp() of that smaller binade. - Also, subtracting 0.0625 * v is exact. */ - z = (z - 0.03125) - 0.0625 * v; - /* now |z| <= 1/32 */ - const double *c = C[i - 1]; - double z2 = z * z, z4 = z2 * z2; - /* the degree-10 coefficient is c[12] */ - double c9 = fma (c[12], z, c[11]); - double c7 = fma (c[10], z, c[9]); - double c5 = fma (c[8], z, c[7]); - double c3h, c3l; - /* c3h, c3l <- c[5] + z*c[6] */ - fast_two_sum (&c3h, &c3l, c[5], z * c[6]); - c7 = fma (c9, z2, c7); - /* c3h, c3l <- c3h, c3l + c5*z2 */ - fast_two_sum (&c3h, &tl, c3h, c5 * z2); - c3l += tl; - /* c3h, c3l <- c3h, c3l + c7*z4 */ - fast_two_sum (&c3h, &tl, c3h, c7 * z4); - c3l += tl; - /* c2h, c2l <- c[4] + z*(c3h + c3l) */ - double c2h, c2l; - a_mul (&th, &tl, z, c3h); - fast_two_sum (&c2h, &c2l, c[4], th); - c2l += fma (z, c3l, tl); - /* compute c[2] + c[3] + z*(c2h + c2l) */ - a_mul (&th, &tl, z, c2h); - fast_two_sum (h, l, c[2], th); - *l += tl + fma (z, c2l, c[3]); - /* compute c[0] + c[1] + z*(h + l) */ - a_mul (&th, &tl, z, *h); - tl = fma (z, *l, tl); /* tl += z*l */ - fast_two_sum (h, l, c[0], th); - *l += tl + c[1]; - return 0x1.11p-69; /* err < 1.80414390200020e-21, cf analyze_p(1) - (larger values of i yield smaller error bounds) */ -} - -/* for |z| < 1/8, assuming z >= 2^-61, thus no underflow can occur */ -static void -cr_erf_accurate_tiny (double *h, double *l, double z) -{ - uint64_t i, j, k; - /* use dichotomy */ - for (i = 0, j = EXCEPTIONS_LEN; i + 1 < j;) - { - k = (i + j) / 2; - if (EXCEPTIONS_TINY[k][0] <= z) - i = k; - else - j = k; - } - /* Either i=0 and z < exceptions[i+1][0], - or i=n-1 and exceptions[i][0] <= z - or 0 < i < n-1 and 
exceptions[i][0] <= z < exceptions[i+1][0]. - In all cases z can only equal exceptions[i][0]. */ - if (z == EXCEPTIONS_TINY[i][0]) - { - *h = EXCEPTIONS_TINY[i][1]; - *l = EXCEPTIONS_TINY[i][2]; - return; - } - - double z2 = z * z, th, tl; - *h = P[21 / 2 + 4]; /* degree 21 */ - for (int a = 19; a > 11; a -= 2) - *h = fma (*h, z2, P[a / 2 + 4]); /* degree j */ - *l = 0; - for (int a = 11; a > 7; a -= 2) - { - /* multiply h+l by z^2 */ - a_mul (&th, &tl, *h, z); - tl = fma (*l, z, tl); - a_mul (h, l, th, z); - *l = fma (tl, z, *l); - /* add p[j] to h + l */ - fast_two_sum (h, &tl, P[a / 2 + 4], *h); - *l += tl; - } - for (int a = 7; a >= 1; a -= 2) - { - /* multiply h+l by z^2 */ - a_mul (&th, &tl, *h, z); - tl = fma (*l, z, tl); - a_mul (h, l, th, z); - *l = fma (tl, z, *l); - fast_two_sum (h, &tl, P[a - 1], *h); - *l += P[a] + tl; - } - /* multiply by z */ - a_mul (h, &tl, *h, z); - *l = fma (*l, z, tl); - return; -} - /* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an accurate approximation of erf(z). Assumes z >= 2^-61, thus no underflow can occur. */ @@ -238,10 +59,7 @@ cr_erf_accurate (double *h, double *l, double z) the interval, namely i/8+1/16 */ if (z < 0.125) /* z < 1/8 */ - { - cr_erf_accurate_tiny (h, l, z); - return; - } + return __cr_erf_accurate_tiny (h, l, z, true); double v = floor (8.0 * z); uint32_t i = 8.0 * z; z = (z - 0.0625) - 0.125 * v; @@ -313,7 +131,7 @@ __erf (double x) return res; } /* now z >= 2^-61 */ - err = cr_erf_fast (&h, &l, z); + err = __cr_erf_fast (&h, &l, z); uint64_t u = asuint64 (h), v = asuint64 (l); t = asuint64 (x); u ^= t & SIGN_MASK; diff --git a/sysdeps/ieee754/dbl-64/s_erf_common.c b/sysdeps/ieee754/dbl-64/s_erf_common.c new file mode 100644 index 0000000000..56ac93bf1b --- /dev/null +++ b/sysdeps/ieee754/dbl-64/s_erf_common.c @@ -0,0 +1,170 @@ +/* Common definitions for erf/erfc implementation. 
+ +Copyright (c) 2023-2025 Paul Zimmermann + +The original version of this file was copied from the CORE-MATH +project (file src/binary64/erf/erf.c, revision 384ed01d). + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include +#include +#include +#include +#include "s_erf_common.h" + +double +__cr_erf_fast (double *h, double *l, double z) +{ + double th, tl; + /* we split [0,0x1.7afb48dc96626p+2] into intervals i/16 <= z < (i+1)/16, + and for each interval, we use a minimax polynomial: + * for i=0 (0 <= z < 1/16) we use a polynomial evaluated at zero, + since if we evaluate in the middle 1/32, we will get bad accuracy + for tiny z, and moreover z-1/32 might not be exact + * for 1 <= i <= 94, we use a polynomial evaluated in the middle of + the interval, namely i/16+1/32 + */ + if (z < 0.0625) /* z < 1/16 */ + { + double z2h, z2l, z4; + a_mul (&z2h, &z2l, z, z); + z4 = z2h * z2h; + double c9 = fma (C0[7], z2h, C0[6]); + double c5 = fma (C0[5], z2h, C0[4]); + c5 = fma (c9, z4, c5); + /* compute C0[2] + C0[3] + z2h*c5 */ + a_mul (&th, &tl, z2h, c5); + fast_two_sum (h, l, C0[2], th); + *l += tl + C0[3]; + /* compute C0[0] + C0[1] + (z2h + z2l)*(h + l) */ + double h_copy = *h; + a_mul (&th, &tl, z2h, *h); + tl += fma (z2h, *l, C0[1]); + fast_two_sum (h, l, C0[0], th); + *l += fma (z2l, h_copy, tl); + /* multiply (h,l) by z */ + a_mul (h, &tl, *h, z); + *l = fma (*l, z, tl); + return 0x1.78p-69; /* err < 2.48658249618372e-21, cf Analyze0() */ + } + double v = floor (16.0 * z); + uint32_t i = 16.0 * z; + /* i/16 <= z < (i+1)/16 */ + /* For 0.0625 0 <= z <= 0x1.7afb48dc96626p+2, z - 0.03125 is exact: + (1) either z - 0.03125 is in the same binade as z, then 0.03125 is + an integer multiple of ulp(z), so is z - 0.03125 + (2) if z - 0.03125 is in a smaller binade, both z and 0.03125 are + integer multiple of the ulp() of that smaller binade. + Also, subtracting 0.0625 * v is exact. 
*/ + z = (z - 0.03125) - 0.0625 * v; + /* now |z| <= 1/32 */ + const double *c = C[i - 1]; + double z2 = z * z, z4 = z2 * z2; + /* the degree-10 coefficient is c[12] */ + double c9 = fma (c[12], z, c[11]); + double c7 = fma (c[10], z, c[9]); + double c5 = fma (c[8], z, c[7]); + double c3h, c3l; + /* c3h, c3l <- c[5] + z*c[6] */ + fast_two_sum (&c3h, &c3l, c[5], z * c[6]); + c7 = fma (c9, z2, c7); + /* c3h, c3l <- c3h, c3l + c5*z2 */ + fast_two_sum (&c3h, &tl, c3h, c5 * z2); + c3l += tl; + /* c3h, c3l <- c3h, c3l + c7*z4 */ + fast_two_sum (&c3h, &tl, c3h, c7 * z4); + c3l += tl; + /* c2h, c2l <- c[4] + z*(c3h + c3l) */ + double c2h, c2l; + a_mul (&th, &tl, z, c3h); + fast_two_sum (&c2h, &c2l, c[4], th); + c2l += fma (z, c3l, tl); + /* compute c[2] + c[3] + z*(c2h + c2l) */ + a_mul (&th, &tl, z, c2h); + fast_two_sum (h, l, c[2], th); + *l += tl + fma (z, c2l, c[3]); + /* compute c[0] + c[1] + z*(h + l) */ + a_mul (&th, &tl, z, *h); + tl = fma (z, *l, tl); /* tl += z*l */ + fast_two_sum (h, l, c[0], th); + *l += tl + c[1]; + return 0x1.11p-69; /* err < 1.80414390200020e-21, cf analyze_p(1) + (larger values of i yield smaller error bounds) */ +} + +void +__cr_erf_accurate_tiny (double *h, double *l, double z, bool exceptions) +{ + if (exceptions) + { + uint64_t i, j, k; + /* use dichotomy */ + for (i = 0, j = EXCEPTIONS_TINY_LEN; i + 1 < j;) + { + k = (i + j) / 2; + if (EXCEPTIONS_TINY[k][0] <= z) + i = k; + else + j = k; + } + /* Either i=0 and z < exceptions[i+1][0], + or i=n-1 and exceptions[i][0] <= z + or 0 < i < n-1 and exceptions[i][0] <= z < exceptions[i+1][0]. + In all cases z can only equal exceptions[i][0]. 
*/ + if (z == EXCEPTIONS_TINY[i][0]) + { + *h = EXCEPTIONS_TINY[i][1]; + *l = EXCEPTIONS_TINY[i][2]; + return; + } + } + + double z2 = z * z, th, tl; + *h = P[21 / 2 + 4]; /* degree 21 */ + for (int a = 19; a > 11; a -= 2) + *h = fma (*h, z2, P[a / 2 + 4]); /* degree j */ + *l = 0; + for (int a = 11; a > 7; a -= 2) + { + /* multiply h+l by z^2 */ + a_mul (&th, &tl, *h, z); + tl = fma (*l, z, tl); + a_mul (h, l, th, z); + *l = fma (tl, z, *l); + /* add p[j] to h + l */ + fast_two_sum (h, &tl, P[a / 2 + 4], *h); + *l += tl; + } + for (int a = 7; a >= 1; a -= 2) + { + /* multiply h+l by z^2 */ + a_mul (&th, &tl, *h, z); + tl = fma (*l, z, tl); + a_mul (h, l, th, z); + *l = fma (tl, z, *l); + fast_two_sum (h, &tl, P[a - 1], *h); + *l += P[a] + tl; + } + /* multiply by z */ + a_mul (h, &tl, *h, z); + *l = fma (*l, z, tl); + return; +} diff --git a/sysdeps/ieee754/dbl-64/s_erf_common.h b/sysdeps/ieee754/dbl-64/s_erf_common.h new file mode 100644 index 0000000000..b85e8da5aa --- /dev/null +++ b/sysdeps/ieee754/dbl-64/s_erf_common.h @@ -0,0 +1,80 @@ +/* Common definitions for erf/erfc implementation. + +Copyright (c) 2023-2025 Paul Zimmermann + +This file is part of the CORE-MATH project +(https://core-math.gitlabpages.inria.fr/). + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#ifndef _S_ERF_COMMON_H +#define _S_ERF_COMMON_H + +#include "s_erf_data.h" + +/* Add a + b, such that *hi + *lo approximates a + b. + Assumes |a| >= |b|. + For rounding to nearest we have hi + lo = a + b exactly. + For directed rounding, we have + (a) hi + lo = a + b exactly when the exponent difference between a and b + is at most 53 (the binary64 precision) + (b) otherwise |(a+b)-(hi+lo)| <= 2^-105 min(|a+b|,|hi|) + (see https://hal.inria.fr/hal-03798376) + We also have |lo| < ulp(hi). */ +static inline void +fast_two_sum (double *hi, double *lo, double a, double b) +{ + double e; + + *hi = a + b; + e = *hi - a; /* exact */ + *lo = b - e; /* exact */ +} + +/* Reference: https://hal.science/hal-01351529v3/document */ +static inline void +two_sum (double *hi, double *lo, double a, double b) +{ + *hi = a + b; + double aa = *hi - b; + double bb = *hi - aa; + double da = a - aa; + double db = b - bb; + *lo = da + db; +} + +// Multiply exactly a and b, such that *hi + *lo = a * b. +static inline void +a_mul (double *hi, double *lo, double a, double b) +{ + *hi = a * b; + *lo = fma (a, b, -*hi); +} + +/* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an approximation + of erf(z). 
Return err the maximal relative error: + |(h + l)/erf(z) - 1| < err*|h+l| */ +double __cr_erf_fast (double *h, double *l, double z) attribute_hidden; + +/* for |z| < 1/8, assuming z >= 2^-61, thus no underflow can occur */ +void __cr_erf_accurate_tiny (double *h, double *l, double z, bool exceptions) + attribute_hidden; + +#endif diff --git a/sysdeps/ieee754/dbl-64/s_erfc.c b/sysdeps/ieee754/dbl-64/s_erfc.c index 2688e944da..d240263cfe 100644 --- a/sysdeps/ieee754/dbl-64/s_erfc.c +++ b/sysdeps/ieee754/dbl-64/s_erfc.c @@ -48,129 +48,9 @@ SOFTWARE. #include #include #include "math_config.h" -#include "s_erf_data.h" +#include "s_erf_common.h" #include "s_erfc_data.h" -static inline void -fast_two_sum (double *hi, double *lo, double a, double b) -{ - double e; - - *hi = a + b; - e = *hi - a; /* exact */ - *lo = b - e; /* exact */ -} - -static inline void -two_sum (double *hi, double *lo, double a, double b) -{ - *hi = a + b; - double aa = *hi - b; - double bb = *hi - aa; - double da = a - aa; - double db = b - bb; - *lo = da + db; -} - -static inline void -a_mul (double *hi, double *lo, double a, double b) -{ - *hi = a * b; - *lo = fma (a, b, -*hi); -} - -/* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an approximation - of erf(z). 
Return err the maximal relative error: - |(h + l)/erf(z) - 1| < err*|h+l| */ -static double -cr_erf_fast (double *h, double *l, double z) -{ - double th, tl; - if (z < 0.0625) /* z < 1/16 */ - { - double z2h, z2l, z4; - a_mul (&z2h, &z2l, z, z); - z4 = z2h * z2h; - double c9 = fma (C0[7], z2h, C0[6]); - double c5 = fma (C0[5], z2h, C0[4]); - c5 = fma (c9, z4, c5); - a_mul (&th, &tl, z2h, c5); - fast_two_sum (h, l, C0[2], th); - *l += tl + C0[3]; - double h_copy = *h; - a_mul (&th, &tl, z2h, *h); - tl += fma (z2h, *l, C0[1]); - fast_two_sum (h, l, C0[0], th); - *l += fma (z2l, h_copy, tl); - a_mul (h, &tl, *h, z); - *l = fma (*l, z, tl); - return 0x1.78p-69; - } - double v = floor (16.0 * z); - uint32_t i = 16.0 * z; - z = (z - 0.03125) - 0.0625 * v; - const double *c = C[i - 1]; - double z2 = z * z, z4 = z2 * z2; - double c9 = fma (c[12], z, c[11]); - double c7 = fma (c[10], z, c[9]); - double c5 = fma (c[8], z, c[7]); - double c3h, c3l; - fast_two_sum (&c3h, &c3l, c[5], z * c[6]); - c7 = fma (c9, z2, c7); - fast_two_sum (&c3h, &tl, c3h, c5 * z2); - c3l += tl; - fast_two_sum (&c3h, &tl, c3h, c7 * z4); - c3l += tl; - double c2h, c2l; - a_mul (&th, &tl, z, c3h); - fast_two_sum (&c2h, &c2l, c[4], th); - c2l += fma (z, c3l, tl); - a_mul (&th, &tl, z, c2h); - fast_two_sum (h, l, c[2], th); - *l += tl + fma (z, c2l, c[3]); - a_mul (&th, &tl, z, *h); - tl = fma (z, *l, tl); /* tl += z*l */ - fast_two_sum (h, l, c[0], th); - *l += tl + c[1]; - return 0x1.11p-69; -} - -/* for |z| < 1/8, assuming z >= 2^-61, thus no underflow can occur */ -static void -cr_erf_accurate_tiny (double *h, double *l, double z) -{ - double z2 = z * z, th, tl; - *h = P[21 / 2 + 4]; /* degree 21 */ - for (int j = 19; j > 11; j -= 2) - *h = fma (*h, z2, P[j / 2 + 4]); /* degree j */ - *l = 0; - for (int j = 11; j > 7; j -= 2) - { - /* multiply h+l by z^2 */ - a_mul (&th, &tl, *h, z); - tl = fma (*l, z, tl); - a_mul (h, l, th, z); - *l = fma (tl, z, *l); - /* add P[j] to h + l */ - fast_two_sum (h, 
&tl, P[j / 2 + 4], *h); - *l += tl; - } - for (int j = 7; j >= 1; j -= 2) - { - /* multiply h+l by z^2 */ - a_mul (&th, &tl, *h, z); - tl = fma (*l, z, tl); - a_mul (h, l, th, z); - *l = fma (tl, z, *l); - fast_two_sum (h, &tl, P[j - 1], *h); - *l += P[j] + tl; - } - /* multiply by z */ - a_mul (h, &tl, *h, z); - *l = fma (*l, z, tl); - return; -} - /* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an accurate approximation of erf(z). Assumes z >= 2^-61, thus no underflow can occur. */ @@ -179,7 +59,7 @@ cr_erf_accurate (double *h, double *l, double z) { double th, tl; if (z < 0.125) /* z < 1/8 */ - return cr_erf_accurate_tiny (h, l, z); + return __cr_erf_accurate_tiny (h, l, z, false); double v = floor (8.0 * z); uint32_t i = 8.0 * z; z = (z - 0.0625) - 0.125 * v; @@ -507,7 +387,7 @@ cr_erfc_fast (double *h, double *l, double x) throughput is about 44 cycles */ if (x < 0) // erfc(x) = 1 - erf(x) = 1 + erf(-x) { - double err = cr_erf_fast (h, l, -x); + double err = __cr_erf_fast (h, l, -x); /* h+l approximates erf(-x), with relative error bounded by err, where err <= 0x1.78p-69 */ err = err * *h; /* convert into absolute error */ @@ -533,7 +413,7 @@ cr_erfc_fast (double *h, double *l, double x) the average reciprocal throughput is about 59 cycles */ else if (x <= THRESHOLD1) { - double err = cr_erf_fast (h, l, x); + double err = __cr_erf_fast (h, l, x); /* h+l approximates erf(x), with relative error bounded by err, where err <= 0x1.78p-69 */ err = err * *h; /* convert into absolute error */