]> git.ipfire.org Git - thirdparty/glibc.git/commitdiff
powerpc: Add a POWER8-optimized version of cosf()
authorPaul Clarke <pc@us.ibm.com>
Wed, 17 May 2017 19:35:33 +0000 (16:35 -0300)
committerTulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
Wed, 17 May 2017 21:37:48 +0000 (18:37 -0300)
This implementation is based on the one already used at
sysdeps/powerpc/powerpc64/fpu/multiarch/s_sinf-power8.S.

* sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
[$(subdir) = math] (libm-sysdep_routines): Add s_cosf-power8 and
s_cosf-ppc64.
* sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf-power8.S: New file.
* sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf-ppc64.c: Likewise.
* sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf.c: Likewise.
* sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S: Likewise.

ChangeLog
sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf-power8.S [new file with mode: 0644]
sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf-ppc64.c [new file with mode: 0644]
sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf.c [new file with mode: 0644]
sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S [new file with mode: 0644]

index ef284095cf26676eae0626452f96bebe12d9796b..6c03b363a2611022fd5a2c6d72dd6179a1e513b7 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+2017-05-17  Paul A. Clarke  <pc@us.ibm.com>
+
+       * sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
+       [$(subdir) = math] (libm-sysdep_routines): Add s_cosf-power8 and
+       s_cosf-ppc64.
+       * sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf-power8.S: New file.
+       * sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf-ppc64.c: Likewise.
+       * sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf.c: Likewise.
+       * sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S: Likewise.
+
 2017-05-17  Gabriel F. T. Gomes  <gftg@linux.vnet.ibm.com>
 
        * math/Makefile (libm-calls): Move e_exp2F to gen-libm-calls.
index ff4228825ec121780b3aec90df048bf899c9ae0c..317a988854ca60b2788d9706df3539eedf7fc1fc 100644 (file)
@@ -26,7 +26,8 @@ libm-sysdep_routines += s_isnan-power7 s_isnan-power6x s_isnan-power6 \
                        s_isnan-power8 s_isinf-power8 s_finite-power8 \
                        s_llrint-power8 s_llround-power8 \
                        e_expf-power8 e_expf-ppc64 \
-                       s_sinf-ppc64 s_sinf-power8
+                       s_sinf-ppc64 s_sinf-power8 \
+                       s_cosf-ppc64 s_cosf-power8
 
 CFLAGS-s_logbf-power7.c = -mcpu=power7
 CFLAGS-s_logbl-power7.c = -mcpu=power7
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf-power8.S b/sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf-power8.S
new file mode 100644 (file)
index 0000000..ee00a2c
--- /dev/null
@@ -0,0 +1,26 @@
+/* cosf function.  PowerPC64/power8 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef weak_alias
+#define weak_alias(a,b)
+
+#define __cosf __cosf_power8
+
+#include <sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S>
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf-ppc64.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf-ppc64.c
new file mode 100644 (file)
index 0000000..635624c
--- /dev/null
@@ -0,0 +1,26 @@
+/* cosf function.  PowerPC64 default version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef weak_alias
+#define weak_alias(a, b)
+
+#define __cosf __cosf_ppc64
+
+#include <sysdeps/powerpc/fpu/s_cosf.c>
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf.c
new file mode 100644 (file)
index 0000000..acf2a59
--- /dev/null
@@ -0,0 +1,31 @@
+/* Multiple versions of cosf.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <math.h>
+#include <shlib-compat.h>
+#include "init-arch.h"
+
+extern __typeof (__cosf) __cosf_ppc64 attribute_hidden;
+extern __typeof (__cosf) __cosf_power8 attribute_hidden;
+
+libc_ifunc (__cosf,
+           (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+           ? __cosf_power8
+           : __cosf_ppc64);
+
+weak_alias (__cosf, cosf)
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S
new file mode 100644 (file)
index 0000000..8dfa007
--- /dev/null
@@ -0,0 +1,508 @@
+/* Optimized cosf().  PowerPC64/POWER8 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#define _ERRNO_H       1
+#include <bits/errno.h>
+
+#define FRAMESIZE (FRAME_MIN_SIZE+16)
+
+#define FLOAT_EXPONENT_SHIFT   23
+#define FLOAT_EXPONENT_BIAS    127
+#define INTEGER_BITS           3
+
+#define PI_4           0x3f490fdb      /* PI/4 */
+#define NINEPI_4       0x40e231d6      /* 9 * PI/4 */
+#define TWO_PN5                0x3d000000      /* 2^-5 */
+#define TWO_PN27       0x32000000      /* 2^-27 */
+#define INFINITY       0x7f800000
+#define TWO_P23                0x4b000000      /* 2^23 */
+#define FX_FRACTION_1_28 0x9249250     /* 0x100000000 / 28 + 1 */
+
+       /* Implements the function
+
+          float [fp1] cosf (float [fp1] x)  */
+
+       .machine power8
+EALIGN(__cosf, 4, 0)
+       addis   r9,r2,L(anchor)@toc@ha
+       addi    r9,r9,L(anchor)@toc@l
+
+       lis     r4,PI_4@h
+       ori     r4,r4,PI_4@l
+
+       xscvdpspn v0,v1
+       mfvsrd  r8,v0
+       rldicl  r3,r8,32,33             /* Remove sign bit.  */
+
+       cmpw    r3,r4
+       bge     L(greater_or_equal_pio4)
+
+       lis     r4,TWO_PN5@h
+       ori     r4,r4,TWO_PN5@l
+
+       cmpw    r3,r4
+       blt     L(less_2pn5)
+
+       /* Chebyshev polynomial of the form:
+        * 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).  */
+
+       lfd     fp9,(L(C0)-L(anchor))(r9)
+       lfd     fp10,(L(C1)-L(anchor))(r9)
+       lfd     fp11,(L(C2)-L(anchor))(r9)
+       lfd     fp12,(L(C3)-L(anchor))(r9)
+       lfd     fp13,(L(C4)-L(anchor))(r9)
+
+       fmul    fp2,fp1,fp1             /* x^2 */
+       lfd     fp3,(L(DPone)-L(anchor))(r9)
+
+       fmadd   fp4,fp2,fp13,fp12       /* C3+x^2*C4 */
+       fmadd   fp4,fp2,fp4,fp11        /* C2+x^2*(C3+x^2*C4) */
+       fmadd   fp4,fp2,fp4,fp10        /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */
+       fmadd   fp4,fp2,fp4,fp9         /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */
+       fmadd   fp1,fp2,fp4,fp3         /* 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */
+       frsp    fp1,fp1                 /* Round to single precision.  */
+
+       blr
+
+       .balign 16
+L(greater_or_equal_pio4):
+       lis     r4,NINEPI_4@h
+       ori     r4,r4,NINEPI_4@l
+       cmpw    r3,r4
+       bge     L(greater_or_equal_9pio4)
+
+       /* Calculate quotient of |x|/(PI/4).  */
+       lfd     fp2,(L(invpio4)-L(anchor))(r9)
+       fabs    fp1,fp1                 /* |x| */
+       fmul    fp2,fp1,fp2             /* |x|/(PI/4) */
+       fctiduz fp2,fp2
+       mfvsrd  r3,v2                   /* n = |x| mod PI/4 */
+
+       /* Now use that quotient to find |x| mod (PI/2).  */
+       addi    r7,r3,1
+       rldicr  r5,r7,2,60              /* ((n+1) >> 1) << 3 */
+       addi    r6,r9,(L(pio2_table)-L(anchor))
+       lfdx    fp4,r5,r6
+       fsub    fp1,fp1,fp4
+
+       .balign 16
+L(reduced):
+       /* Now we are in the range -PI/4 to PI/4.  */
+
+       /* Work out if we are in a positive or negative primary interval.  */
+       addi    r7,r7,2
+       rldicl  r4,r7,62,63             /* ((n+3) >> 2) & 1 */
+
+       /* Load a 1.0 or -1.0.  */
+       addi    r5,r9,(L(ones)-L(anchor))
+       sldi    r4,r4,3
+       lfdx    fp0,r4,r5
+
+       /* Are we in the primary interval of sin or cos?  */
+       andi.   r4,r7,0x2
+       bne     L(cos)
+
+       /* Chebyshev polynomial of the form:
+          x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).  */
+
+       lfd     fp9,(L(S0)-L(anchor))(r9)
+       lfd     fp10,(L(S1)-L(anchor))(r9)
+       lfd     fp11,(L(S2)-L(anchor))(r9)
+       lfd     fp12,(L(S3)-L(anchor))(r9)
+       lfd     fp13,(L(S4)-L(anchor))(r9)
+
+       fmul    fp2,fp1,fp1             /* x^2 */
+       fmul    fp3,fp2,fp1             /* x^3 */
+
+       fmadd   fp4,fp2,fp13,fp12       /* S3+x^2*S4 */
+       fmadd   fp4,fp2,fp4,fp11        /* S2+x^2*(S3+x^2*S4) */
+       fmadd   fp4,fp2,fp4,fp10        /* S1+x^2*(S2+x^2*(S3+x^2*S4)) */
+       fmadd   fp4,fp2,fp4,fp9         /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */
+       fmadd   fp4,fp3,fp4,fp1         /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */
+       fmul    fp4,fp4,fp0             /* Add in the sign.  */
+       frsp    fp1,fp4                 /* Round to single precision.  */
+
+       blr
+
+       .balign 16
+L(cos):
+       /* Chebyshev polynomial of the form:
+          1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).  */
+
+       lfd     fp9,(L(C0)-L(anchor))(r9)
+       lfd     fp10,(L(C1)-L(anchor))(r9)
+       lfd     fp11,(L(C2)-L(anchor))(r9)
+       lfd     fp12,(L(C3)-L(anchor))(r9)
+       lfd     fp13,(L(C4)-L(anchor))(r9)
+
+       fmul    fp2,fp1,fp1             /* x^2 */
+       lfd     fp3,(L(DPone)-L(anchor))(r9)
+
+       fmadd   fp4,fp2,fp13,fp12       /* C3+x^2*C4 */
+       fmadd   fp4,fp2,fp4,fp11        /* C2+x^2*(C3+x^2*C4) */
+       fmadd   fp4,fp2,fp4,fp10        /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */
+       fmadd   fp4,fp2,fp4,fp9         /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */
+       fmadd   fp4,fp2,fp4,fp3         /* 1.0 + x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */
+       fmul    fp4,fp4,fp0             /* Add in the sign.  */
+       frsp    fp1,fp4                 /* Round to single precision.  */
+
+       blr
+
+       .balign 16
+L(greater_or_equal_9pio4):
+       lis     r4,INFINITY@h
+       ori     r4,r4,INFINITY@l
+       cmpw    r3,r4
+       bge     L(inf_or_nan)
+
+       lis     r4,TWO_P23@h
+       ori     r4,r4,TWO_P23@l
+       cmpw    r3,r4
+       bge     L(greater_or_equal_2p23)
+
+       fabs    fp1,fp1                 /* |x| */
+
+       /* Calculate quotient of |x|/(PI/4).  */
+       lfd     fp2,(L(invpio4)-L(anchor))(r9)
+
+       lfd     fp3,(L(DPone)-L(anchor))(r9)
+       lfd     fp4,(L(DPhalf)-L(anchor))(r9)
+       fmul    fp2,fp1,fp2             /* |x|/(PI/4) */
+       friz    fp2,fp2                 /* n = floor(|x|/(PI/4)) */
+
+       /* Calculate (n + 1) / 2.  */
+       fadd    fp2,fp2,fp3             /* n + 1 */
+       fmul    fp3,fp2,fp4             /* (n + 1) / 2 */
+       friz    fp3,fp3
+
+       lfd     fp4,(L(pio2hi)-L(anchor))(r9)
+       lfd     fp5,(L(pio2lo)-L(anchor))(r9)
+
+       fmul    fp6,fp4,fp3
+       fadd    fp6,fp6,fp1
+       fmadd   fp1,fp5,fp3,fp6
+
+       fctiduz fp2,fp2
+       mfvsrd  r7,v2                   /* n + 1 */
+
+       b       L(reduced)
+
+       .balign 16
+L(inf_or_nan):
+       bne     L(skip_errno_setting)   /* Is a NAN?  */
+
+       /* We delayed the creation of the stack frame, as well as the saving of
+          the link register, because only at this point, we are sure that
+          doing so is actually needed.  */
+
+       stfd    fp1,-8(r1)
+
+       /* Save the link register.  */
+       mflr    r0
+       std     r0,16(r1)
+       cfi_offset(lr, 16)
+
+       /* Create the stack frame.  */
+       stdu    r1,-FRAMESIZE(r1)
+       cfi_adjust_cfa_offset(FRAMESIZE)
+
+       bl      JUMPTARGET(__errno_location)
+       nop
+
+       /* Restore the stack frame.  */
+       addi    r1,r1,FRAMESIZE
+       cfi_adjust_cfa_offset(-FRAMESIZE)
+       /* Restore the link register.  */
+       ld      r0,16(r1)
+       mtlr    r0
+
+       lfd     fp1,-8(r1)
+
+       /* errno = EDOM */
+       li      r4,EDOM
+       stw     r4,0(r3)
+
+L(skip_errno_setting):
+       fsub    fp1,fp1,fp1             /* x - x */
+       blr
+
+       .balign 16
+L(greater_or_equal_2p23):
+       fabs    fp1,fp1
+
+       srwi    r4,r3,FLOAT_EXPONENT_SHIFT
+       subi    r4,r4,FLOAT_EXPONENT_BIAS
+
+       /* We reduce the input modulo pi/4, so we need 3 bits of integer
+          to determine where in 2*pi we are. Index into our array
+          accordingly.  */
+       addi r4,r4,INTEGER_BITS
+
+       /* To avoid an expensive divide, for the range we care about (0 - 127)
+          we can transform x/28 into:
+
+          x/28 = (x * ((0x100000000 / 28) + 1)) >> 32
+
+          mulhwu returns the top 32 bits of the 64 bit result, doing the
+          shift for us in the same instruction. The top 32 bits are undefined,
+          so we have to mask them.  */
+
+       lis     r6,FX_FRACTION_1_28@h
+       ori     r6,r6,FX_FRACTION_1_28@l
+       mulhwu  r5,r4,r6
+       clrldi  r5,r5,32
+
+       /* Get our pointer into the invpio4_table array.  */
+       sldi    r4,r5,3
+       addi    r6,r9,(L(invpio4_table)-L(anchor))
+       add     r4,r4,r6
+
+       lfd     fp2,0(r4)
+       lfd     fp3,8(r4)
+       lfd     fp4,16(r4)
+       lfd     fp5,24(r4)
+
+       fmul    fp6,fp2,fp1
+       fmul    fp7,fp3,fp1
+       fmul    fp8,fp4,fp1
+       fmul    fp9,fp5,fp1
+
+       /* Mask off larger integer bits in highest double word that we don't
+          care about to avoid losing precision when combining with smaller
+          values.  */
+       fctiduz fp10,fp6
+       mfvsrd  r7,v10
+       rldicr  r7,r7,0,(63-INTEGER_BITS)
+       mtvsrd  v10,r7
+       fcfidu  fp10,fp10               /* Integer bits.  */
+
+       fsub    fp6,fp6,fp10            /* highest -= integer bits */
+
+       /* Work out the integer component, rounded down. Use the top two
+          limbs for this.  */
+       fadd    fp10,fp6,fp7            /* highest + higher */
+
+       fctiduz fp10,fp10
+       mfvsrd  r7,v10
+       andi.   r0,r7,1
+       fcfidu  fp10,fp10
+
+       /* Subtract integer component from highest limb.  */
+       fsub    fp12,fp6,fp10
+
+       beq     L(even_integer)
+
+       /* Our integer component is odd, so we are in the -PI/4 to 0 primary
+          region. We need to shift our result down by PI/4, and to do this
+          in the mod (4/PI) space we simply subtract 1.  */
+       lfd     fp11,(L(DPone)-L(anchor))(r9)
+       fsub    fp12,fp12,fp11
+
+       /* Now add up all the limbs in order.  */
+       fadd    fp12,fp12,fp7
+       fadd    fp12,fp12,fp8
+       fadd    fp12,fp12,fp9
+
+       /* And finally multiply by pi/4.  */
+       lfd     fp13,(L(pio4)-L(anchor))(r9)
+       fmul    fp1,fp12,fp13
+
+       addi    r7,r7,1
+       b       L(reduced)
+
+L(even_integer):
+       lfd     fp11,(L(DPone)-L(anchor))(r9)
+
+       /* Now add up all the limbs in order.  */
+       fadd    fp12,fp12,fp7
+       fadd    fp12,r12,fp8
+       fadd    fp12,r12,fp9
+
+       /* We need to check if the addition of all the limbs resulted in us
+          overflowing 1.0.  */
+       fcmpu   0,fp12,fp11
+       bgt     L(greater_than_one)
+
+       /* And finally multiply by pi/4.  */
+       lfd     fp13,(L(pio4)-L(anchor))(r9)
+       fmul    fp1,fp12,fp13
+
+       addi    r7,r7,1
+       b       L(reduced)
+
+L(greater_than_one):
+       /* We did overflow 1.0 when adding up all the limbs. Add 1.0 to our
+          integer, and subtract 1.0 from our result. Since that makes the
+          integer component odd, we need to subtract another 1.0 as
+          explained above.  */
+       addi    r7,r7,1
+
+       lfd     fp11,(L(DPtwo)-L(anchor))(r9)
+       fsub    fp12,fp12,fp11
+
+       /* And finally multiply by pi/4.  */
+       lfd     fp13,(L(pio4)-L(anchor))(r9)
+       fmul    fp1,fp12,fp13
+
+       addi    r7,r7,1
+       b       L(reduced)
+
+       .balign 16
+L(less_2pn5):
+       lis     r4,TWO_PN27@h
+       ori     r4,r4,TWO_PN27@l
+
+       cmpw    r3,r4
+       blt     L(less_2pn27)
+
+       /* A simpler Chebyshev approximation is close enough for this range:
+          1.0+x^2*(CC0+x^3*CC1).  */
+
+       lfd     fp10,(L(CC0)-L(anchor))(r9)
+       lfd     fp11,(L(CC1)-L(anchor))(r9)
+
+       fmul    fp2,fp1,fp1             /* x^2 */
+       fmul    fp3,fp2,fp1             /* x^3 */
+       lfd     fp1,(L(DPone)-L(anchor))(r9)
+
+       fmadd   fp4,fp3,fp11,fp10       /* CC0+x^3*CC1 */
+       fmadd   fp1,fp2,fp4,fp1         /* 1.0+x^2*(CC0+x^3*CC1) */
+
+       frsp    fp1,fp1                 /* Round to single precision.  */
+
+       blr
+
+       .balign 16
+L(less_2pn27):
+       /* Handle some special cases:
+
+          cosf(subnormal) raises inexact
+          cosf(min_normalized) raises inexact
+          cosf(normalized) raises inexact.  */
+
+       lfd     fp2,(L(DPone)-L(anchor))(r9)
+
+       fabs    fp1,fp1                 /* |x| */
+       fsub    fp1,fp2,fp1             /* 1.0-|x| */
+
+       frsp    fp1,fp1
+
+       blr
+
+END (__cosf)
+
+       .section .rodata, "a"
+
+       .balign 8
+
+L(anchor):
+
+       /* Chebyshev constants for sin, range -PI/4 - PI/4.  */
+L(S0): .8byte  0xbfc5555555551cd9
+L(S1): .8byte  0x3f81111110c2688b
+L(S2): .8byte  0xbf2a019f8b4bd1f9
+L(S3): .8byte  0x3ec71d7264e6b5b4
+L(S4): .8byte  0xbe5a947e1674b58a
+
+       /* Chebyshev constants for cos, range 2^-27 - 2^-5.  */
+L(CC0):        .8byte  0xbfdfffffff5cc6fd
+L(CC1):        .8byte  0x3fa55514b178dac5
+
+       /* Chebyshev constants for cos, range -PI/4 - PI/4.  */
+L(C0): .8byte  0xbfdffffffffe98ae
+L(C1): .8byte  0x3fa55555545c50c7
+L(C2): .8byte  0xbf56c16b348b6874
+L(C3): .8byte  0x3efa00eb9ac43cc0
+L(C4): .8byte  0xbe923c97dd8844d7
+
+L(invpio2):
+       .8byte  0x3fe45f306dc9c883      /* 2/PI */
+
+L(invpio4):
+       .8byte  0x3ff45f306dc9c883      /* 4/PI */
+
+L(invpio4_table):
+       .8byte  0x0000000000000000
+       .8byte  0x3ff45f306c000000
+       .8byte  0x3e3c9c882a000000
+       .8byte  0x3c54fe13a8000000
+       .8byte  0x3aaf47d4d0000000
+       .8byte  0x38fbb81b6c000000
+       .8byte  0x3714acc9e0000000
+       .8byte  0x3560e4107c000000
+       .8byte  0x33bca2c756000000
+       .8byte  0x31fbd778ac000000
+       .8byte  0x300b7246e0000000
+       .8byte  0x2e5d2126e8000000
+       .8byte  0x2c97003248000000
+       .8byte  0x2ad77504e8000000
+       .8byte  0x290921cfe0000000
+       .8byte  0x274deb1cb0000000
+       .8byte  0x25829a73e0000000
+       .8byte  0x23fd1046be000000
+       .8byte  0x2224baed10000000
+       .8byte  0x20709d338e000000
+       .8byte  0x1e535a2f80000000
+       .8byte  0x1cef904e64000000
+       .8byte  0x1b0d639830000000
+       .8byte  0x1964ce7d24000000
+       .8byte  0x17b908bf16000000
+
+L(pio4):
+       .8byte  0x3fe921fb54442d18      /* PI/4 */
+
+/* PI/2 as a sum of two doubles. We only use 32 bits of the upper limb
+   to avoid losing significant bits when multiplying with up to
+   (2^22)/(pi/2).  */
+L(pio2hi):
+       .8byte  0xbff921fb54400000
+
+L(pio2lo):
+       .8byte  0xbdd0b4611a626332
+
+L(pio2_table):
+       .8byte  0
+       .8byte  0x3ff921fb54442d18      /* 1 * PI/2 */
+       .8byte  0x400921fb54442d18      /* 2 * PI/2 */
+       .8byte  0x4012d97c7f3321d2      /* 3 * PI/2 */
+       .8byte  0x401921fb54442d18      /* 4 * PI/2 */
+       .8byte  0x401f6a7a2955385e      /* 5 * PI/2 */
+       .8byte  0x4022d97c7f3321d2      /* 6 * PI/2 */
+       .8byte  0x4025fdbbe9bba775      /* 7 * PI/2 */
+       .8byte  0x402921fb54442d18      /* 8 * PI/2 */
+       .8byte  0x402c463abeccb2bb      /* 9 * PI/2 */
+       .8byte  0x402f6a7a2955385e      /* 10 * PI/2 */
+
+L(small):
+       .8byte  0x3cd0000000000000      /* 2^-50 */
+
+L(ones):
+       .8byte  0x3ff0000000000000      /* +1.0 */
+       .8byte  0xbff0000000000000      /* -1.0 */
+
+L(DPhalf):
+       .8byte  0x3fe0000000000000      /* 0.5 */
+
+L(DPone):
+       .8byte  0x3ff0000000000000      /* 1.0 */
+
+L(DPtwo):
+       .8byte  0x4000000000000000      /* 2.0 */
+
+weak_alias(__cosf, cosf)