From: Adhemerval Zanella Date: Fri, 10 Oct 2025 18:15:32 +0000 (-0300) Subject: math: Consolidate erf/erfc definitions X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e4d812c980cd6cd1f774bbd500b126aec28ab8db;p=thirdparty%2Fglibc.git math: Consolidate erf/erfc definitions The common code definitions are consolidated in s_erf_common.h and s_erf_common.c. Checked on x86_64-linux-gnu, aarch64-linux-gnu, and powerpc64le-linux-gnu. Reviewed-by: DJ Delorie --- diff --git a/math/Makefile b/math/Makefile index 4658357f11..94258d160c 100644 --- a/math/Makefile +++ b/math/Makefile @@ -363,6 +363,7 @@ type-double-routines := \ math_err \ s_asincosh_data \ s_atanh_data \ + s_erf_common \ s_erf_data \ s_erfc_data \ sincostab \ diff --git a/sysdeps/ieee754/dbl-64/s_erf.c b/sysdeps/ieee754/dbl-64/s_erf.c index 4c90eb5ac9..7a885a5426 100644 --- a/sysdeps/ieee754/dbl-64/s_erf.c +++ b/sysdeps/ieee754/dbl-64/s_erf.c @@ -30,191 +30,12 @@ SOFTWARE. #include #include #include "math_config.h" -#include "s_erf_data.h" +#include "s_erf_common.h" /* CH+CL is a double-double approximation of 2/sqrt(pi) to nearest */ static const double CH = 0x1.20dd750429b6dp+0; static const double CL = 0x1.1ae3a914fed8p-56; -/* Add a + b, such that *hi + *lo approximates a + b. - Assumes |a| >= |b|. - For rounding to nearest we have hi + lo = a + b exactly. - For directed rounding, we have - (a) hi + lo = a + b exactly when the exponent difference between a and b - is at most 53 (the binary64 precision) - (b) otherwise |(a+b)-(hi+lo)| <= 2^-105 min(|a+b|,|hi|) - (see https://hal.inria.fr/hal-03798376) - We also have |lo| < ulp(hi). 
*/ -static inline void -fast_two_sum (double *hi, double *lo, double a, double b) -{ - double e; - - *hi = a + b; - e = *hi - a; /* exact */ - *lo = b - e; /* exact */ -} - -/* Reference: https://hal.science/hal-01351529v3/document */ -static inline void -two_sum (double *hi, double *lo, double a, double b) -{ - *hi = a + b; - double aa = *hi - b; - double bb = *hi - aa; - double da = a - aa; - double db = b - bb; - *lo = da + db; -} - -// Multiply exactly a and b, such that *hi + *lo = a * b. -static inline void -a_mul (double *hi, double *lo, double a, double b) -{ - *hi = a * b; - *lo = fma (a, b, -*hi); -} - -/* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an approximation - of erf(z). Return err the maximal relative error: - |(h + l)/erf(z) - 1| < err*|h+l| */ -static double -cr_erf_fast (double *h, double *l, double z) -{ - double th, tl; - /* we split [0,0x1.7afb48dc96626p+2] into intervals i/16 <= z < (i+1)/16, - and for each interval, we use a minimax polynomial: - * for i=0 (0 <= z < 1/16) we use a polynomial evaluated at zero, - since if we evaluate in the middle 1/32, we will get bad accuracy - for tiny z, and moreover z-1/32 might not be exact - * for 1 <= i <= 94, we use a polynomial evaluated in the middle of - the interval, namely i/16+1/32 - */ - if (z < 0.0625) /* z < 1/16 */ - { - double z2h, z2l, z4; - a_mul (&z2h, &z2l, z, z); - z4 = z2h * z2h; - double c9 = fma (C0[7], z2h, C0[6]); - double c5 = fma (C0[5], z2h, C0[4]); - c5 = fma (c9, z4, c5); - /* compute C0[2] + C0[3] + z2h*c5 */ - a_mul (&th, &tl, z2h, c5); - fast_two_sum (h, l, C0[2], th); - *l += tl + C0[3]; - /* compute C0[0] + C0[1] + (z2h + z2l)*(h + l) */ - double h_copy = *h; - a_mul (&th, &tl, z2h, *h); - tl += fma (z2h, *l, C0[1]); - fast_two_sum (h, l, C0[0], th); - *l += fma (z2l, h_copy, tl); - /* multiply (h,l) by z */ - a_mul (h, &tl, *h, z); - *l = fma (*l, z, tl); - return 0x1.78p-69; /* err < 2.48658249618372e-21, cf Analyze0() */ - } - double v = floor (16.0 * z); 
- uint32_t i = 16.0 * z; - /* i/16 <= z < (i+1)/16 */ - /* For 0.0625 0 <= z <= 0x1.7afb48dc96626p+2, z - 0.03125 is exact: - (1) either z - 0.03125 is in the same binade as z, then 0.03125 is - an integer multiple of ulp(z), so is z - 0.03125 - (2) if z - 0.03125 is in a smaller binade, both z and 0.03125 are - integer multiple of the ulp() of that smaller binade. - Also, subtracting 0.0625 * v is exact. */ - z = (z - 0.03125) - 0.0625 * v; - /* now |z| <= 1/32 */ - const double *c = C[i - 1]; - double z2 = z * z, z4 = z2 * z2; - /* the degree-10 coefficient is c[12] */ - double c9 = fma (c[12], z, c[11]); - double c7 = fma (c[10], z, c[9]); - double c5 = fma (c[8], z, c[7]); - double c3h, c3l; - /* c3h, c3l <- c[5] + z*c[6] */ - fast_two_sum (&c3h, &c3l, c[5], z * c[6]); - c7 = fma (c9, z2, c7); - /* c3h, c3l <- c3h, c3l + c5*z2 */ - fast_two_sum (&c3h, &tl, c3h, c5 * z2); - c3l += tl; - /* c3h, c3l <- c3h, c3l + c7*z4 */ - fast_two_sum (&c3h, &tl, c3h, c7 * z4); - c3l += tl; - /* c2h, c2l <- c[4] + z*(c3h + c3l) */ - double c2h, c2l; - a_mul (&th, &tl, z, c3h); - fast_two_sum (&c2h, &c2l, c[4], th); - c2l += fma (z, c3l, tl); - /* compute c[2] + c[3] + z*(c2h + c2l) */ - a_mul (&th, &tl, z, c2h); - fast_two_sum (h, l, c[2], th); - *l += tl + fma (z, c2l, c[3]); - /* compute c[0] + c[1] + z*(h + l) */ - a_mul (&th, &tl, z, *h); - tl = fma (z, *l, tl); /* tl += z*l */ - fast_two_sum (h, l, c[0], th); - *l += tl + c[1]; - return 0x1.11p-69; /* err < 1.80414390200020e-21, cf analyze_p(1) - (larger values of i yield smaller error bounds) */ -} - -/* for |z| < 1/8, assuming z >= 2^-61, thus no underflow can occur */ -static void -cr_erf_accurate_tiny (double *h, double *l, double z) -{ - uint64_t i, j, k; - /* use dichotomy */ - for (i = 0, j = EXCEPTIONS_LEN; i + 1 < j;) - { - k = (i + j) / 2; - if (EXCEPTIONS_TINY[k][0] <= z) - i = k; - else - j = k; - } - /* Either i=0 and z < exceptions[i+1][0], - or i=n-1 and exceptions[i][0] <= z - or 0 < i < n-1 and 
exceptions[i][0] <= z < exceptions[i+1][0]. - In all cases z can only equal exceptions[i][0]. */ - if (z == EXCEPTIONS_TINY[i][0]) - { - *h = EXCEPTIONS_TINY[i][1]; - *l = EXCEPTIONS_TINY[i][2]; - return; - } - - double z2 = z * z, th, tl; - *h = P[21 / 2 + 4]; /* degree 21 */ - for (int a = 19; a > 11; a -= 2) - *h = fma (*h, z2, P[a / 2 + 4]); /* degree j */ - *l = 0; - for (int a = 11; a > 7; a -= 2) - { - /* multiply h+l by z^2 */ - a_mul (&th, &tl, *h, z); - tl = fma (*l, z, tl); - a_mul (h, l, th, z); - *l = fma (tl, z, *l); - /* add p[j] to h + l */ - fast_two_sum (h, &tl, P[a / 2 + 4], *h); - *l += tl; - } - for (int a = 7; a >= 1; a -= 2) - { - /* multiply h+l by z^2 */ - a_mul (&th, &tl, *h, z); - tl = fma (*l, z, tl); - a_mul (h, l, th, z); - *l = fma (tl, z, *l); - fast_two_sum (h, &tl, P[a - 1], *h); - *l += P[a] + tl; - } - /* multiply by z */ - a_mul (h, &tl, *h, z); - *l = fma (*l, z, tl); - return; -} - /* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an accurate approximation of erf(z). Assumes z >= 2^-61, thus no underflow can occur. */ @@ -238,10 +59,7 @@ cr_erf_accurate (double *h, double *l, double z) the interval, namely i/8+1/16 */ if (z < 0.125) /* z < 1/8 */ - { - cr_erf_accurate_tiny (h, l, z); - return; - } + return __cr_erf_accurate_tiny (h, l, z, true); double v = floor (8.0 * z); uint32_t i = 8.0 * z; z = (z - 0.0625) - 0.125 * v; @@ -313,7 +131,7 @@ __erf (double x) return res; } /* now z >= 2^-61 */ - err = cr_erf_fast (&h, &l, z); + err = __cr_erf_fast (&h, &l, z); uint64_t u = asuint64 (h), v = asuint64 (l); t = asuint64 (x); u ^= t & SIGN_MASK; diff --git a/sysdeps/ieee754/dbl-64/s_erf_common.c b/sysdeps/ieee754/dbl-64/s_erf_common.c new file mode 100644 index 0000000000..56ac93bf1b --- /dev/null +++ b/sysdeps/ieee754/dbl-64/s_erf_common.c @@ -0,0 +1,170 @@ +/* Common definitions for erf/erfc implementation. 
+ +Copyright (c) 2023-2025 Paul Zimmermann + +The original version of this file was copied from the CORE-MATH +project (file src/binary64/erf/erf.c, revision 384ed01d). + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include +#include +#include +#include +#include "s_erf_common.h" + +double +__cr_erf_fast (double *h, double *l, double z) +{ + double th, tl; + /* we split [0,0x1.7afb48dc96626p+2] into intervals i/16 <= z < (i+1)/16, + and for each interval, we use a minimax polynomial: + * for i=0 (0 <= z < 1/16) we use a polynomial evaluated at zero, + since if we evaluate in the middle 1/32, we will get bad accuracy + for tiny z, and moreover z-1/32 might not be exact + * for 1 <= i <= 94, we use a polynomial evaluated in the middle of + the interval, namely i/16+1/32 + */ + if (z < 0.0625) /* z < 1/16 */ + { + double z2h, z2l, z4; + a_mul (&z2h, &z2l, z, z); + z4 = z2h * z2h; + double c9 = fma (C0[7], z2h, C0[6]); + double c5 = fma (C0[5], z2h, C0[4]); + c5 = fma (c9, z4, c5); + /* compute C0[2] + C0[3] + z2h*c5 */ + a_mul (&th, &tl, z2h, c5); + fast_two_sum (h, l, C0[2], th); + *l += tl + C0[3]; + /* compute C0[0] + C0[1] + (z2h + z2l)*(h + l) */ + double h_copy = *h; + a_mul (&th, &tl, z2h, *h); + tl += fma (z2h, *l, C0[1]); + fast_two_sum (h, l, C0[0], th); + *l += fma (z2l, h_copy, tl); + /* multiply (h,l) by z */ + a_mul (h, &tl, *h, z); + *l = fma (*l, z, tl); + return 0x1.78p-69; /* err < 2.48658249618372e-21, cf Analyze0() */ + } + double v = floor (16.0 * z); + uint32_t i = 16.0 * z; + /* i/16 <= z < (i+1)/16 */ + /* For 0.0625 0 <= z <= 0x1.7afb48dc96626p+2, z - 0.03125 is exact: + (1) either z - 0.03125 is in the same binade as z, then 0.03125 is + an integer multiple of ulp(z), so is z - 0.03125 + (2) if z - 0.03125 is in a smaller binade, both z and 0.03125 are + integer multiple of the ulp() of that smaller binade. + Also, subtracting 0.0625 * v is exact. 
*/ + z = (z - 0.03125) - 0.0625 * v; + /* now |z| <= 1/32 */ + const double *c = C[i - 1]; + double z2 = z * z, z4 = z2 * z2; + /* the degree-10 coefficient is c[12] */ + double c9 = fma (c[12], z, c[11]); + double c7 = fma (c[10], z, c[9]); + double c5 = fma (c[8], z, c[7]); + double c3h, c3l; + /* c3h, c3l <- c[5] + z*c[6] */ + fast_two_sum (&c3h, &c3l, c[5], z * c[6]); + c7 = fma (c9, z2, c7); + /* c3h, c3l <- c3h, c3l + c5*z2 */ + fast_two_sum (&c3h, &tl, c3h, c5 * z2); + c3l += tl; + /* c3h, c3l <- c3h, c3l + c7*z4 */ + fast_two_sum (&c3h, &tl, c3h, c7 * z4); + c3l += tl; + /* c2h, c2l <- c[4] + z*(c3h + c3l) */ + double c2h, c2l; + a_mul (&th, &tl, z, c3h); + fast_two_sum (&c2h, &c2l, c[4], th); + c2l += fma (z, c3l, tl); + /* compute c[2] + c[3] + z*(c2h + c2l) */ + a_mul (&th, &tl, z, c2h); + fast_two_sum (h, l, c[2], th); + *l += tl + fma (z, c2l, c[3]); + /* compute c[0] + c[1] + z*(h + l) */ + a_mul (&th, &tl, z, *h); + tl = fma (z, *l, tl); /* tl += z*l */ + fast_two_sum (h, l, c[0], th); + *l += tl + c[1]; + return 0x1.11p-69; /* err < 1.80414390200020e-21, cf analyze_p(1) + (larger values of i yield smaller error bounds) */ +} + +void +__cr_erf_accurate_tiny (double *h, double *l, double z, bool exceptions) +{ + if (exceptions) + { + uint64_t i, j, k; + /* use dichotomy */ + for (i = 0, j = EXCEPTIONS_TINY_LEN; i + 1 < j;) + { + k = (i + j) / 2; + if (EXCEPTIONS_TINY[k][0] <= z) + i = k; + else + j = k; + } + /* Either i=0 and z < exceptions[i+1][0], + or i=n-1 and exceptions[i][0] <= z + or 0 < i < n-1 and exceptions[i][0] <= z < exceptions[i+1][0]. + In all cases z can only equal exceptions[i][0]. 
*/ + if (z == EXCEPTIONS_TINY[i][0]) + { + *h = EXCEPTIONS_TINY[i][1]; + *l = EXCEPTIONS_TINY[i][2]; + return; + } + } + + double z2 = z * z, th, tl; + *h = P[21 / 2 + 4]; /* degree 21 */ + for (int a = 19; a > 11; a -= 2) + *h = fma (*h, z2, P[a / 2 + 4]); /* degree j */ + *l = 0; + for (int a = 11; a > 7; a -= 2) + { + /* multiply h+l by z^2 */ + a_mul (&th, &tl, *h, z); + tl = fma (*l, z, tl); + a_mul (h, l, th, z); + *l = fma (tl, z, *l); + /* add p[j] to h + l */ + fast_two_sum (h, &tl, P[a / 2 + 4], *h); + *l += tl; + } + for (int a = 7; a >= 1; a -= 2) + { + /* multiply h+l by z^2 */ + a_mul (&th, &tl, *h, z); + tl = fma (*l, z, tl); + a_mul (h, l, th, z); + *l = fma (tl, z, *l); + fast_two_sum (h, &tl, P[a - 1], *h); + *l += P[a] + tl; + } + /* multiply by z */ + a_mul (h, &tl, *h, z); + *l = fma (*l, z, tl); + return; +} diff --git a/sysdeps/ieee754/dbl-64/s_erf_common.h b/sysdeps/ieee754/dbl-64/s_erf_common.h new file mode 100644 index 0000000000..b85e8da5aa --- /dev/null +++ b/sysdeps/ieee754/dbl-64/s_erf_common.h @@ -0,0 +1,80 @@ +/* Common definitions for erf/erfc implementation. + +Copyright (c) 2023-2025 Paul Zimmermann + +This file is part of the CORE-MATH project +(https://core-math.gitlabpages.inria.fr/). + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#ifndef _S_ERF_COMMON_H +#define _S_ERF_COMMON_H + +#include "s_erf_data.h" + +/* Add a + b, such that *hi + *lo approximates a + b. + Assumes |a| >= |b|. + For rounding to nearest we have hi + lo = a + b exactly. + For directed rounding, we have + (a) hi + lo = a + b exactly when the exponent difference between a and b + is at most 53 (the binary64 precision) + (b) otherwise |(a+b)-(hi+lo)| <= 2^-105 min(|a+b|,|hi|) + (see https://hal.inria.fr/hal-03798376) + We also have |lo| < ulp(hi). */ +static inline void +fast_two_sum (double *hi, double *lo, double a, double b) +{ + double e; + + *hi = a + b; + e = *hi - a; /* exact */ + *lo = b - e; /* exact */ +} + +/* Reference: https://hal.science/hal-01351529v3/document */ +static inline void +two_sum (double *hi, double *lo, double a, double b) +{ + *hi = a + b; + double aa = *hi - b; + double bb = *hi - aa; + double da = a - aa; + double db = b - bb; + *lo = da + db; +} + +// Multiply exactly a and b, such that *hi + *lo = a * b. +static inline void +a_mul (double *hi, double *lo, double a, double b) +{ + *hi = a * b; + *lo = fma (a, b, -*hi); +} + +/* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an approximation + of erf(z). 
Return err the maximal relative error: + |(h + l)/erf(z) - 1| < err*|h+l| */ +double __cr_erf_fast (double *h, double *l, double z) attribute_hidden; + +/* for |z| < 1/8, assuming z >= 2^-61, thus no underflow can occur */ +void __cr_erf_accurate_tiny (double *h, double *l, double z, bool exceptions) + attribute_hidden; + +#endif diff --git a/sysdeps/ieee754/dbl-64/s_erfc.c b/sysdeps/ieee754/dbl-64/s_erfc.c index 2688e944da..d240263cfe 100644 --- a/sysdeps/ieee754/dbl-64/s_erfc.c +++ b/sysdeps/ieee754/dbl-64/s_erfc.c @@ -48,129 +48,9 @@ SOFTWARE. #include #include #include "math_config.h" -#include "s_erf_data.h" +#include "s_erf_common.h" #include "s_erfc_data.h" -static inline void -fast_two_sum (double *hi, double *lo, double a, double b) -{ - double e; - - *hi = a + b; - e = *hi - a; /* exact */ - *lo = b - e; /* exact */ -} - -static inline void -two_sum (double *hi, double *lo, double a, double b) -{ - *hi = a + b; - double aa = *hi - b; - double bb = *hi - aa; - double da = a - aa; - double db = b - bb; - *lo = da + db; -} - -static inline void -a_mul (double *hi, double *lo, double a, double b) -{ - *hi = a * b; - *lo = fma (a, b, -*hi); -} - -/* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an approximation - of erf(z). 
Return err the maximal relative error: - |(h + l)/erf(z) - 1| < err*|h+l| */ -static double -cr_erf_fast (double *h, double *l, double z) -{ - double th, tl; - if (z < 0.0625) /* z < 1/16 */ - { - double z2h, z2l, z4; - a_mul (&z2h, &z2l, z, z); - z4 = z2h * z2h; - double c9 = fma (C0[7], z2h, C0[6]); - double c5 = fma (C0[5], z2h, C0[4]); - c5 = fma (c9, z4, c5); - a_mul (&th, &tl, z2h, c5); - fast_two_sum (h, l, C0[2], th); - *l += tl + C0[3]; - double h_copy = *h; - a_mul (&th, &tl, z2h, *h); - tl += fma (z2h, *l, C0[1]); - fast_two_sum (h, l, C0[0], th); - *l += fma (z2l, h_copy, tl); - a_mul (h, &tl, *h, z); - *l = fma (*l, z, tl); - return 0x1.78p-69; - } - double v = floor (16.0 * z); - uint32_t i = 16.0 * z; - z = (z - 0.03125) - 0.0625 * v; - const double *c = C[i - 1]; - double z2 = z * z, z4 = z2 * z2; - double c9 = fma (c[12], z, c[11]); - double c7 = fma (c[10], z, c[9]); - double c5 = fma (c[8], z, c[7]); - double c3h, c3l; - fast_two_sum (&c3h, &c3l, c[5], z * c[6]); - c7 = fma (c9, z2, c7); - fast_two_sum (&c3h, &tl, c3h, c5 * z2); - c3l += tl; - fast_two_sum (&c3h, &tl, c3h, c7 * z4); - c3l += tl; - double c2h, c2l; - a_mul (&th, &tl, z, c3h); - fast_two_sum (&c2h, &c2l, c[4], th); - c2l += fma (z, c3l, tl); - a_mul (&th, &tl, z, c2h); - fast_two_sum (h, l, c[2], th); - *l += tl + fma (z, c2l, c[3]); - a_mul (&th, &tl, z, *h); - tl = fma (z, *l, tl); /* tl += z*l */ - fast_two_sum (h, l, c[0], th); - *l += tl + c[1]; - return 0x1.11p-69; -} - -/* for |z| < 1/8, assuming z >= 2^-61, thus no underflow can occur */ -static void -cr_erf_accurate_tiny (double *h, double *l, double z) -{ - double z2 = z * z, th, tl; - *h = P[21 / 2 + 4]; /* degree 21 */ - for (int j = 19; j > 11; j -= 2) - *h = fma (*h, z2, P[j / 2 + 4]); /* degree j */ - *l = 0; - for (int j = 11; j > 7; j -= 2) - { - /* multiply h+l by z^2 */ - a_mul (&th, &tl, *h, z); - tl = fma (*l, z, tl); - a_mul (h, l, th, z); - *l = fma (tl, z, *l); - /* add P[j] to h + l */ - fast_two_sum (h, 
&tl, P[j / 2 + 4], *h); - *l += tl; - } - for (int j = 7; j >= 1; j -= 2) - { - /* multiply h+l by z^2 */ - a_mul (&th, &tl, *h, z); - tl = fma (*l, z, tl); - a_mul (h, l, th, z); - *l = fma (tl, z, *l); - fast_two_sum (h, &tl, P[j - 1], *h); - *l += P[j] + tl; - } - /* multiply by z */ - a_mul (h, &tl, *h, z); - *l = fma (*l, z, tl); - return; -} - /* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an accurate approximation of erf(z). Assumes z >= 2^-61, thus no underflow can occur. */ @@ -179,7 +59,7 @@ cr_erf_accurate (double *h, double *l, double z) { double th, tl; if (z < 0.125) /* z < 1/8 */ - return cr_erf_accurate_tiny (h, l, z); + return __cr_erf_accurate_tiny (h, l, z, false); double v = floor (8.0 * z); uint32_t i = 8.0 * z; z = (z - 0.0625) - 0.125 * v; @@ -507,7 +387,7 @@ cr_erfc_fast (double *h, double *l, double x) throughput is about 44 cycles */ if (x < 0) // erfc(x) = 1 - erf(x) = 1 + erf(-x) { - double err = cr_erf_fast (h, l, -x); + double err = __cr_erf_fast (h, l, -x); /* h+l approximates erf(-x), with relative error bounded by err, where err <= 0x1.78p-69 */ err = err * *h; /* convert into absolute error */ @@ -533,7 +413,7 @@ cr_erfc_fast (double *h, double *l, double x) the average reciprocal throughput is about 59 cycles */ else if (x <= THRESHOLD1) { - double err = cr_erf_fast (h, l, x); + double err = __cr_erf_fast (h, l, x); /* h+l approximates erf(x), with relative error bounded by err, where err <= 0x1.78p-69 */ err = err * *h; /* convert into absolute error */