[thirdparty/glibc.git] / sysdeps / x86_64 / fpu / s_cosf.S

/* Optimized cosf function.
   Copyright (C) 2012-2014 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#define __need_Emath
#include <bits/errno.h>

/* Short algorithm description:
 *
 *  1) if |x| == 0: return 1.0-|x|.
 *  2) if |x| <  2^-27: return 1.0-|x|.
 *  3) if |x| <  2^-5 : return 1.0+x^2*DP_COS2_0+x^4*DP_COS2_1.
 *  4) if |x| <   Pi/4: return 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).
 *  5) if |x| < 9*Pi/4:
 *      5.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+3,
 *           t=|x|-j*Pi/4.
 *      5.2) Reconstruction:
 *          s = (-1.0)^((n>>2)&1)
 *          if(n&2 != 0) {
 *              using cos(t) polynomial for |t|<Pi/4, result is
 *              s     * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))).
 *          } else {
 *              using sin(t) polynomial for |t|<Pi/4, result is
 *              s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))).
 *          }
 *  6) if |x| < 2^23, large args:
 *      6.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3,
 *           t=|x|-j*Pi/4.
 *      6.2) Reconstruction same as (5.2).
 *  7) if |x| >= 2^23, very large args:
 *      7.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3,
 *           t=|x|-j*Pi/4.
 *      7.2) Reconstruction same as (5.2).
 *  8) if x is Inf, return x-x, and set errno=EDOM.
 *  9) if x is NaN, return x-x.
 *
 * Special cases:
 *  cos(+-0) = 1 not raising inexact,
 *  cos(subnormal) raises inexact,
 *  cos(min_normalized) raises inexact,
 *  cos(normalized) raises inexact,
 *  cos(Inf) = NaN, raises invalid, sets errno to EDOM,
 *  cos(NaN) = NaN.
 */

	.text
ENTRY(__cosf)
	/* float __cosf (float x)
	   In:  %xmm0 = single precision x.
	   Out: %xmm0 = single precision cos(x).
	   Register roles set up here and relied on by the other paths:
	     %eax  = bit pattern of |x|
	     %xmm0 = x converted to double (made |x| once |x|>=Pi/4)
	     %xmm7 = untouched copy of the SP argument.  */

	movaps	%xmm0, %xmm7		/* keep the original x */
	movd	%xmm0, %eax		/* raw bits of x */
	cvtss2sd %xmm0, %xmm0		/* widen x to DP */
	andl	$0x7fffffff, %eax	/* drop the sign: bits of |x| */

	cmpl	$0x3f490fdb, %eax	/* below Pi/4?  */
	jb	L(arg_less_pio4)

	/* |x|>=Pi/4 from here on: work with DP |x|.  */
	andpd	L(DP_ABS_MASK)(%rip), %xmm0

	cmpl	$0x40e231d6, %eax	/* at or above 9*Pi/4?  */
	jae	L(large_args)

	/* Pi/4<=|x|<9*Pi/4: the octant count is small enough to be
	   obtained from a single precision product.  */
	movaps	%xmm7, %xmm1
	andps	L(SP_ABS_MASK)(%rip), %xmm1	/* SP |x| */
	mulss	L(SP_INVPIO4)(%rip), %xmm1	/* SP |x|*(4/Pi) */
	cvttss2si %xmm1, %eax		/* k = trunc(|x|/(Pi/4)) */
	leal	1(%rax), %edx		/* k+1 */
	lea	L(PIO4J)(%rip), %rsi
	andl	$0x0e, %edx		/* j = (k+1)&0x0e */
	addl	$3, %eax		/* n = k+3 */
	subsd	(%rsi,%rdx,8), %xmm0	/* t = |x| - PIO4J[j] */

L(reconstruction):
	/* Common tail of every range-reduced path.
	   Input: %eax=n, %xmm0=t.
	   Bit 1 of n selects the polynomial (set: cos(t), clear: sin(t));
	   bit 2 of n selects the sign, via an index into L(DP_ONES).  */
	testl	$2, %eax		/* n&2 != 0?  */
	jz	L(sin_poly)

/*L(cos_poly):*/
	/* Here if cos(x) calculated using cos(t) polynomial for |t|<Pi/4:
	 * y = t*t; z = y*y;
	 * s = (-1.0)^((n>>2)&1)
	 *     (no sign(x) factor: t was derived from |x| and the only sign
	 *      applied below is the L(DP_ONES) entry)
	 * result = s     * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))))
	 *
	 * The polynomial is split into two chains in z, one in %xmm4
	 * (C0, C2, C4) and one in %xmm3 (C1, C3), which are summed at the end.
	 */
	shrl	$2, %eax		/* n>>2 */
	mulsd	%xmm0, %xmm0		/* y=t^2 */
	andl	$1, %eax		/* (n>>2)&1 */
	movaps	%xmm0, %xmm1		/* y */
	mulsd	%xmm0, %xmm0		/* z=t^4 */

	movsd	L(DP_C4)(%rip), %xmm4	/* C4 */
	mulsd	%xmm0, %xmm4		/* z*C4 */
	movsd	L(DP_C3)(%rip), %xmm3	/* C3 */
	mulsd	%xmm0, %xmm3		/* z*C3 */
	lea	L(DP_ONES)(%rip), %rsi	/* base of the {+1.0,-1.0} table */
	addsd	L(DP_C2)(%rip), %xmm4	/* C2+z*C4 */
	mulsd	%xmm0, %xmm4		/* z*(C2+z*C4) */
	addsd	L(DP_C1)(%rip), %xmm3	/* C1+z*C3 */
	mulsd	%xmm0, %xmm3		/* z*(C1+z*C3) */
	addsd	L(DP_C0)(%rip), %xmm4	/* C0+z*(C2+z*C4) */
	mulsd	%xmm1, %xmm4		/* y*(C0+z*(C2+z*C4)) */

	addsd	%xmm4, %xmm3		/* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
	/* 1.0+y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
	addsd	L(DP_ONES)(%rip), %xmm3

	/* Multiply by s = DP_ONES[(n>>2)&1], i.e. +1.0 or -1.0.  */
	mulsd	(%rsi,%rax,8), %xmm3	/* DP result */
	cvtsd2ss %xmm3, %xmm0		/* SP result */
	ret

	.p2align	4
L(sin_poly):
	/* Here if cos(x) calculated using sin(t) polynomial for |t|<Pi/4:
	 * y = t*t; z = y*y;
	 * s = (-1.0)^((n>>2)&1)
	 *     (no sign(x) factor: t was derived from |x| and the only sign
	 *      applied below is the L(DP_ONES) entry)
	 * result = s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))))
	 *
	 * Input: %eax=n, %xmm0=t.  The polynomial is split into two chains
	 * in z, one in %xmm2 (S0, S2, S4) and one in %xmm3 (S1, S3).
	 */

	movaps	%xmm0, %xmm4		/* t */
	shrl	$2, %eax		/* n>>2 */
	mulsd	%xmm0, %xmm0		/* y=t^2 */
	andl	$1, %eax		/* (n>>2)&1 */
	movaps	%xmm0, %xmm1		/* y */
	mulsd	%xmm0, %xmm0		/* z=t^4 */

	movsd	L(DP_S4)(%rip), %xmm2	/* S4 */
	mulsd	%xmm0, %xmm2		/* z*S4 */
	movsd	L(DP_S3)(%rip), %xmm3	/* S3 */
	mulsd	%xmm0, %xmm3		/* z*S3 */
	lea	L(DP_ONES)(%rip), %rsi	/* base of the {+1.0,-1.0} table */
	addsd	L(DP_S2)(%rip), %xmm2	/* S2+z*S4 */
	mulsd	%xmm0, %xmm2		/* z*(S2+z*S4) */
	addsd	L(DP_S1)(%rip), %xmm3	/* S1+z*S3 */
	mulsd	%xmm0, %xmm3		/* z*(S1+z*S3) */
	addsd	L(DP_S0)(%rip), %xmm2	/* S0+z*(S2+z*S4) */
	mulsd	%xmm1, %xmm2		/* y*(S0+z*(S2+z*S4)) */
	/* t*s, where s = (-1.0)^((n>>2)&1) = DP_ONES[(n>>2)&1] */
	mulsd	(%rsi,%rax,8), %xmm4
	/* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
	addsd	%xmm2, %xmm3
	/* t*s*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
	mulsd	%xmm4, %xmm3
	/* t*s*(1.0+y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))) */
	addsd	%xmm4, %xmm3
	cvtsd2ss %xmm3, %xmm0		/* SP result */
	ret

	.p2align	4
L(large_args):
	/* Reached with |x|>=9*Pi/4.
	   In: %eax = bits of |x|, %xmm0 = DP |x|.  */
	cmpl	$0x7f800000, %eax	/* Inf or NaN?  (target re-tests these flags) */
	jae	L(arg_inf_or_nan)

	/* |x| is finite.  */
	cmpl	$0x4b000000, %eax	/* at or above 2^23?  */
	jae	L(very_large_args)

	/* 9*Pi/4<=|x|<2^23: k still fits a 32-bit integer, and Pi/4 is
	   applied as a high and a low piece.  Both pieces are stored
	   negated, so the subtraction of j*Pi/4 is done with additions.  */
	movapd	%xmm0, %xmm1		/* |x| */
	mulsd	L(DP_INVPIO4)(%rip), %xmm1 /* |x|*(4/Pi) */
	cvttsd2si %xmm1, %eax		/* k=trunc(|x|/(Pi/4)) */
	leal	1(%rax), %edx		/* k+1 */
	addl	$3, %eax		/* n=k+3 */
	andl	$0xfffffffe, %edx	/* j=(k+1)&0xfffffffe */
	cvtsi2sdl %edx, %xmm4		/* DP j */
	movapd	%xmm4, %xmm2		/* second copy of j */
	mulsd	L(DP_PIO4HI)(%rip), %xmm2 /* -j*PIO4HI */
	mulsd	L(DP_PIO4LO)(%rip), %xmm4 /* -j*PIO4LO */
	addsd	%xmm2, %xmm0		/* |x| - j*PIO4HI */
	addsd	%xmm4, %xmm0		/* t = |x| - j*PIO4HI - j*PIO4LO */
	jmp	L(reconstruction)

	.p2align	4
L(very_large_args):
	/* Here if finite |x|>=2^23.
	   Input: %eax = bits of |x|, %xmm0 = DP |x|.
	   |x| is multiplied by four consecutive pieces of 4/Pi taken from
	   L(_FPI).  The integer part of the partial sum yields k, and the
	   remainder, scaled by Pi/4, yields t.
	   Output: %eax = n = k+3, %xmm0 = t, then jumps to L(reconstruction).  */

	/* bitpos = (ix>>23) - BIAS_32 + 59; */
	shrl	$23, %eax		/* eb = biased exponent of x */
	/* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */
	subl	$68, %eax
	movl	$28, %ecx		/* %cl=28 */
	movl	%eax, %edx		/* bitpos copy */

	/* j = bitpos/28;
	   8-bit divide: dividend in %ax, quotient to %al, remainder to %ah.
	   The compares in L(large_args) bound eb to [150,254] here, so
	   bitpos is in [82,186] and the quotient is in [2,6].  */
	div	%cl			/* j in register %al=%ax/%cl */
	movapd	%xmm0, %xmm3		/* |x| */
	/* clear unneeded remainder from %ah */
	andl	$0xff, %eax

	imull	$28, %eax, %ecx		/* j*28 */
	lea	L(_FPI)(%rip), %rsi
	movsd	L(DP_HI_MASK)(%rip), %xmm4 /* DP_HI_MASK */
	movapd	%xmm0, %xmm5		/* |x| */
	mulsd	-16(%rsi,%rax,8), %xmm3	/* tmp3 = FPI[j-2]*|x| */
	movapd	%xmm0, %xmm1		/* |x| */
	mulsd	-8(%rsi,%rax,8), %xmm5	/* tmp2 = FPI[j-1]*|x| */
	mulsd	(%rsi,%rax,8), %xmm0	/* tmp0 = FPI[j]*|x| */
	addl	$19, %ecx		/* j*28+19 */
	mulsd	8(%rsi,%rax,8), %xmm1	/* tmp1 = FPI[j+1]*|x| */
	cmpl	%ecx, %edx		/* bitpos>=j*28+19?  */
	jl	L(very_large_skip1)

	/* Here if bitpos>=j*28+19: strip the part of tmp3 selected by
	   DP_HI_MASK.  NOTE(review): presumably that part is a multiple
	   of 8 and so cannot change the octant -- confirm against the
	   reference C implementation.  */
	andpd	%xmm3, %xmm4		/* HI(tmp3) */
	subsd	%xmm4, %xmm3		/* tmp3 = tmp3 - HI(tmp3) */
L(very_large_skip1):

	/* Round tmp5 = tmp3 + tmp2 to an integer by adding and then
	   removing 2^52; the low 32 bits of the intermediate sum hold
	   that integer.  */
	movsd	L(DP_2POW52)(%rip), %xmm6
	movapd	%xmm5, %xmm2		/* tmp2 copy */
	addsd	%xmm3, %xmm5		/* tmp5 = tmp3 + tmp2 */
	movl	$1, %edx		/* mask for k&1 (bitpos is no longer needed) */
	addsd	%xmm5, %xmm6		/* tmp6 = tmp5 + 2^52 */
	movsd	8+L(DP_2POW52)(%rip), %xmm4	/* -2^52 */
	movd	%xmm6, %eax		/* k = I64_LO(tmp6); */
	addsd	%xmm6, %xmm4		/* tmp4 = tmp6 - 2^52 */
	comisd	%xmm5, %xmm4		/* tmp4 > tmp5?  */
	jbe	L(very_large_skip2)

	/* Here if tmp4 > tmp5: the rounding went up, so step back by one
	   to get the integer part of tmp5.  */
	subl	$1, %eax		/* k-- */
	addsd	8+L(DP_ONES)(%rip), %xmm4 /* tmp4 -= 1.0 */
L(very_large_skip2):

	andl	%eax, %edx		/* k&1 */
	lea	L(DP_ZERONE)(%rip), %rsi
	subsd	%xmm4, %xmm3		/* tmp3 -= tmp4 */
	/* DP_ZERONE[1] is -1.0: for odd k one more unit is removed, which
	   matches subtracting j=(k+1)&0xfffffffe instead of k.  */
	addsd	(%rsi,%rdx,8), %xmm3	/* t  = DP_ZERONE[k&1] + tmp3 */
	addsd	%xmm2, %xmm3		/* t += tmp2 */
	addsd	%xmm3, %xmm0		/* t += tmp0 */
	addl	$3, %eax		/* n=k+3 */
	addsd	%xmm1, %xmm0		/* t += tmp1 */
	mulsd	L(DP_PIO4)(%rip), %xmm0	/* t *= PI04 */

	jmp	L(reconstruction)	/* end of very_large_args path */

	.p2align	4
L(arg_less_pio4):
	/* Reached with |x|<Pi/4.
	   In: %eax = bits of |x|, %xmm0 = DP x.  */
	cmpl	$0x3d000000, %eax	/* below 2^-5?  */
	jl	L(arg_less_2pn5)

	/* 2^-5<=|x|<Pi/4: no reduction needed, evaluate
	   1.0+y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) with y=x^2 as two chains
	   in z=y^2: %xmm4 carries C0/C2/C4, %xmm1 carries C1/C3.  */
	mulsd	%xmm0, %xmm0		/* y=x^2 */
	movapd	%xmm0, %xmm2
	mulsd	%xmm0, %xmm2		/* z=y*y=x^4 */
	movsd	L(DP_C4)(%rip), %xmm4	/* C4 */
	movsd	L(DP_C3)(%rip), %xmm1	/* C3 */
	mulsd	%xmm2, %xmm4		/* z*C4 */
	mulsd	%xmm2, %xmm1		/* z*C3 */
	addsd	L(DP_C2)(%rip), %xmm4	/* C2+z*C4 */
	addsd	L(DP_C1)(%rip), %xmm1	/* C1+z*C3 */
	mulsd	%xmm2, %xmm4		/* z*(C2+z*C4) */
	mulsd	%xmm2, %xmm1		/* z*(C1+z*C3) */
	addsd	L(DP_C0)(%rip), %xmm4	/* C0+z*(C2+z*C4) */
	mulsd	%xmm0, %xmm4		/* y*(C0+z*(C2+z*C4)) */
	addsd	%xmm1, %xmm4		/* sum of both chains */
	addsd	L(DP_ONES)(%rip), %xmm4	/* 1.0 + polynomial */
	cvtsd2ss %xmm4, %xmm0		/* narrow to SP result */
	ret

	.p2align	4
L(arg_less_2pn5):
	/* Reached with |x|<2^-5.
	   In: %eax = bits of |x|, %xmm0 = DP x.  */
	cmpl	$0x32000000, %eax	/* below 2^-27?  */
	jl	L(arg_less_2pn27)

	/* 2^-27<=|x|<2^-5: the short polynomial
	   1.0+x^2*DP_COS2_0+x^4*DP_COS2_1 is used.  */
	mulsd	%xmm0, %xmm0		/* y=x^2 */
	movapd	%xmm0, %xmm1
	mulsd	L(DP_COS2_1)(%rip), %xmm1 /* y*DP_COS2_1 */
	addsd	L(DP_COS2_0)(%rip), %xmm1 /* DP_COS2_0+y*DP_COS2_1 */
	mulsd	%xmm0, %xmm1		/* y*DP_COS2_0+y^2*DP_COS2_1 */
	addsd	L(DP_ONES)(%rip), %xmm1	/* 1.0 + polynomial */
	cvtsd2ss %xmm1, %xmm0		/* narrow to SP result */
	ret

	.p2align	4
L(arg_less_2pn27):
	/* Reached with |x|<2^-27 (zero included).
	   In: %xmm7 = original SP x.
	   The result is formed as 1.0-|x| in single precision.  */
	movss	L(SP_ONE)(%rip), %xmm0	/* 1.0 */
	andps	L(SP_ABS_MASK)(%rip), %xmm7 /* |x| */
	subss	%xmm7, %xmm0		/* 1.0-|x| */
	ret

	.p2align	4
L(arg_inf_or_nan):
	/* Here if |x| is Inf or NaN.
	   In: %xmm7 = original SP x.  The flags are still those of the
	   "cmpl $0x7f800000, %eax" in L(large_args): ZF set means Inf,
	   ZF clear means NaN.  */
	jne	L(skip_errno_setting)	/* in case of x is NaN */

	/* Here if x is Inf. Set errno to EDOM.
	   This function has not touched %rsp, so it is 8 bytes off a
	   16-byte boundary here; reserving 8 bytes aligns it for the
	   call.  The slot is also used to keep x, because %xmm7 is not
	   preserved across calls.  */
	subq	$8, %rsp
	cfi_adjust_cfa_offset (8)
	movss	%xmm7, (%rsp)		/* save x across the call */
	call	JUMPTARGET(__errno_location)
	movl	$EDOM, (%rax)
	movss	(%rsp), %xmm7		/* restore x */
	addq	$8, %rsp
	cfi_adjust_cfa_offset (-8)

	.p2align	4
L(skip_errno_setting):
	/* Here if |x| is Inf or NaN. Continued.
	   x-x yields NaN for both Inf and NaN inputs.  */
	movaps	%xmm7, %xmm0		/* load x */
	subss	%xmm0, %xmm0		/* Result is NaN */
	ret
END(__cosf)

	.section .rodata, "a"
	.p2align 3
/* Each double below is emitted as two 32-bit words, low word first.
   The medium-range path (Pi/4<=|x|<9*Pi/4) indexes this table with
   j = (k+1)&0x0e, so only the even-numbered entries are read by it.  */
L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */
	.long	0x00000000,0x00000000	/* j=0:  0 */
	.long	0x54442d18,0x3fe921fb	/* j=1:  Pi/4 */
	.long	0x54442d18,0x3ff921fb	/* j=2:  Pi/2 */
	.long	0x7f3321d2,0x4002d97c	/* j=3:  3*Pi/4 */
	.long	0x54442d18,0x400921fb	/* j=4:  Pi */
	.long	0x2955385e,0x400f6a7a	/* j=5:  5*Pi/4 */
	.long	0x7f3321d2,0x4012d97c	/* j=6:  3*Pi/2 */
	.long	0xe9bba775,0x4015fdbb	/* j=7:  7*Pi/4 */
	.long	0x54442d18,0x401921fb	/* j=8:  2*Pi */
	.long	0xbeccb2bb,0x401c463a	/* j=9:  9*Pi/4 */
	.long	0x2955385e,0x401f6a7a	/* j=10: 5*Pi/2 */
	.type L(PIO4J), @object
	ASM_SIZE_DIRECTIVE(L(PIO4J))

	.p2align 3
/* 25 doubles, each emitted as two 32-bit words, low word first.
   Entry 0 is 0.0; the remaining entries are successively smaller pieces
   whose low-order significand bits are all zero.
   L(very_large_args) reads the four entries FPI[j-2]..FPI[j+1], where
   j = bitpos/28.  */
L(_FPI): /* 4/Pi broken into sum of positive DP values */
	.long	0x00000000,0x00000000
	.long	0x6c000000,0x3ff45f30
	.long	0x2a000000,0x3e3c9c88
	.long	0xa8000000,0x3c54fe13
	.long	0xd0000000,0x3aaf47d4
	.long	0x6c000000,0x38fbb81b
	.long	0xe0000000,0x3714acc9
	.long	0x7c000000,0x3560e410
	.long	0x56000000,0x33bca2c7
	.long	0xac000000,0x31fbd778
	.long	0xe0000000,0x300b7246
	.long	0xe8000000,0x2e5d2126
	.long	0x48000000,0x2c970032
	.long	0xe8000000,0x2ad77504
	.long	0xe0000000,0x290921cf
	.long	0xb0000000,0x274deb1c
	.long	0xe0000000,0x25829a73
	.long	0xbe000000,0x23fd1046
	.long	0x10000000,0x2224baed
	.long	0x8e000000,0x20709d33
	.long	0x80000000,0x1e535a2f
	.long	0x64000000,0x1cef904e
	.long	0x30000000,0x1b0d6398
	.long	0x24000000,0x1964ce7d
	.long	0x16000000,0x17b908bf
	.type L(_FPI), @object
	ASM_SIZE_DIRECTIVE(L(_FPI))

/* Coefficients of polynomial
   for cos(x)~=1.0+x^2*DP_COS2_0+x^4*DP_COS2_1, |x|<2^-5.
   Used only by L(arg_less_2pn5).  Each double is emitted as two 32-bit
   words, low word first.  */
	.p2align 3
L(DP_COS2_0):
	.long	0xff5cc6fd,0xbfdfffff	/* close to -1/2 */
	.type L(DP_COS2_0), @object
	ASM_SIZE_DIRECTIVE(L(DP_COS2_0))

	.p2align 3
L(DP_COS2_1):
	.long	0xb178dac5,0x3fa55514	/* close to 1/24 */
	.type L(DP_COS2_1), @object
	ASM_SIZE_DIRECTIVE(L(DP_COS2_1))

	.p2align 3
/* Two-entry table indexed by k&1 in L(very_large_args): the selected
   value is added to the running sum, so an odd k removes one extra unit.  */
L(DP_ZERONE):
	.long	0x00000000,0x00000000	/* 0.0 */
	.long	0x00000000,0xbff00000	/* -1.0 */
	.type L(DP_ZERONE), @object
	ASM_SIZE_DIRECTIVE(L(DP_ZERONE))

	.p2align 3
/* Two-entry sign table indexed by (n>>2)&1 in the reconstruction code.
   Entry 0 also serves as the constant 1.0, and entry 1 (at offset 8) as
   the constant -1.0.  */
L(DP_ONES):
	.long	0x00000000,0x3ff00000	/* +1.0 */
	.long	0x00000000,0xbff00000	/* -1.0 */
	.type L(DP_ONES), @object
	ASM_SIZE_DIRECTIVE(L(DP_ONES))

/* Coefficients of polynomial
   for sin(t)~=t+t^3*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))), |t|<Pi/4.
   Used by L(sin_poly).  They are not stored in index order, and each
   double is emitted as two 32-bit words, low word first.  The values
   are close to, but not equal to, the Taylor coefficients noted below.  */
	.p2align 3
L(DP_S3):
	.long	0x64e6b5b4,0x3ec71d72	/* close to 1/9! */
	.type L(DP_S3), @object
	ASM_SIZE_DIRECTIVE(L(DP_S3))

	.p2align 3
L(DP_S1):
	.long	0x10c2688b,0x3f811111	/* close to 1/5! */
	.type L(DP_S1), @object
	ASM_SIZE_DIRECTIVE(L(DP_S1))

	.p2align 3
L(DP_S4):
	.long	0x1674b58a,0xbe5a947e	/* close to -1/11! */
	.type L(DP_S4), @object
	ASM_SIZE_DIRECTIVE(L(DP_S4))

	.p2align 3
L(DP_S2):
	.long	0x8b4bd1f9,0xbf2a019f	/* close to -1/7! */
	.type L(DP_S2),@object
	ASM_SIZE_DIRECTIVE(L(DP_S2))

	.p2align 3
L(DP_S0):
	.long	0x55551cd9,0xbfc55555	/* close to -1/3! */
	.type L(DP_S0), @object
	ASM_SIZE_DIRECTIVE(L(DP_S0))

/* Coefficients of polynomial
   for cos(t)~=1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))), |t|<Pi/4.
   Used by the cos polynomial in the reconstruction code and by
   L(arg_less_pio4).  They are not stored in index order, and each
   double is emitted as two 32-bit words, low word first.  The values
   are close to, but not equal to, the Taylor coefficients noted below.  */
	.p2align 3
L(DP_C3):
	.long	0x9ac43cc0,0x3efa00eb	/* close to 1/8! */
	.type L(DP_C3), @object
	ASM_SIZE_DIRECTIVE(L(DP_C3))

	.p2align 3
L(DP_C1):
	.long	0x545c50c7,0x3fa55555	/* close to 1/4! */
	.type L(DP_C1), @object
	ASM_SIZE_DIRECTIVE(L(DP_C1))

	.p2align 3
L(DP_C4):
	.long	0xdd8844d7,0xbe923c97	/* close to -1/10! */
	.type L(DP_C4), @object
	ASM_SIZE_DIRECTIVE(L(DP_C4))

	.p2align 3
L(DP_C2):
	.long	0x348b6874,0xbf56c16b	/* close to -1/6! */
	.type L(DP_C2), @object
	ASM_SIZE_DIRECTIVE(L(DP_C2))

	.p2align 3
L(DP_C0):
	.long	0xfffe98ae,0xbfdfffff	/* close to -1/2! */
	.type L(DP_C0), @object
	ASM_SIZE_DIRECTIVE(L(DP_C0))

	.p2align 3
/* Final scale factor applied to t in L(very_large_args).  */
L(DP_PIO4):
	.long	0x54442d18,0x3fe921fb	/* Pi/4 */
	.type L(DP_PIO4), @object
	ASM_SIZE_DIRECTIVE(L(DP_PIO4))

	.p2align 3
/* Pair used in L(very_large_args) to round a double to an integer:
   entry 0 is added, then entry 1 (at offset 8) is added to undo it.  */
L(DP_2POW52):
	.long	0x00000000,0x43300000	/* +2^52 */
	.long	0x00000000,0xc3300000	/* -2^52 */
	.type L(DP_2POW52), @object
	ASM_SIZE_DIRECTIVE(L(DP_2POW52))

	.p2align 3
L(DP_INVPIO4):
	.long	0x6dc9c883,0x3ff45f30	/* 4/Pi */
	.type L(DP_INVPIO4), @object
	ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))

	.p2align 3
/* Note the sign bit: the stored value is negative, so L(large_args)
   adds j times this constant in order to subtract j*Pi/4.  */
L(DP_PIO4HI):
	.long	0x54000000,0xbfe921fb	/* High part of -Pi/4 */
	.type L(DP_PIO4HI), @object
	ASM_SIZE_DIRECTIVE(L(DP_PIO4HI))

	.p2align 3
/* Stored negated as well, for the same reason as L(DP_PIO4HI).  */
L(DP_PIO4LO):
	.long	0x11A62633,0xbe010b46	/* Low part of -Pi/4 */
	.type L(DP_PIO4LO), @object
	ASM_SIZE_DIRECTIVE(L(DP_PIO4LO))

	.p2align 2
/* Single precision 4/Pi, used for the octant count when |x|<9*Pi/4.  */
L(SP_INVPIO4):
	.long	0x3fa2f983		/* 4/Pi */
	.type L(SP_INVPIO4), @object
	ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))

	.p2align 4
/* 16 bytes wide and 16-byte aligned: it is used as the memory operand
   of a packed andpd.  Clears the sign bit of each 64-bit lane.  */
L(DP_ABS_MASK): /* Mask for getting DP absolute value */
	.long	0xffffffff,0x7fffffff
	.long	0xffffffff,0x7fffffff
	.type L(DP_ABS_MASK), @object
	ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK))

	.p2align 3
/* Keeps the upper 32 bits of a double (sign, exponent and the top 20
   stored fraction bits) and clears the lower 32 bits.  The "21 bits"
   below presumably counts the implicit leading significand bit.  */
L(DP_HI_MASK): /* Mask for getting high 21 bits of DP value */
	.long	0x00000000,0xffffffff
	.type L(DP_HI_MASK), @object
	ASM_SIZE_DIRECTIVE(L(DP_HI_MASK))

	.p2align 4
/* 16 bytes wide and 16-byte aligned: it is used as the memory operand
   of a packed andps.  Clears the sign bit of each 32-bit lane.  */
L(SP_ABS_MASK): /* Mask for getting SP absolute value */
	.long	0x7fffffff,0x7fffffff
	.long	0x7fffffff,0x7fffffff
	.type L(SP_ABS_MASK), @object
	ASM_SIZE_DIRECTIVE(L(SP_ABS_MASK))

	.p2align 2
/* Minuend of the 1.0-|x| result returned for |x|<2^-27.  */
L(SP_ONE):
	.long	0x3f800000		/* 1.0 */
	.type L(SP_ONE), @object
	ASM_SIZE_DIRECTIVE(L(SP_ONE))

/* Provide the public name cosf as a weak alias of __cosf.  */
weak_alias(__cosf, cosf)
Commit	Line	Data
4ffffbd2	1	/* Optimized cosf function.
d4697bc9	2	Copyright (C) 2012-2014 Free Software Foundation, Inc.
4ffffbd2 LD	3	This file is part of the GNU C Library.
	4
	5	The GNU C Library is free software; you can redistribute it and/or
	6	modify it under the terms of the GNU Lesser General Public
	7	License as published by the Free Software Foundation; either
	8	version 2.1 of the License, or (at your option) any later version.
	9
	10	The GNU C Library is distributed in the hope that it will be useful,
	11	but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	Lesser General Public License for more details.
	14
	15	You should have received a copy of the GNU Lesser General Public
	16	License along with the GNU C Library; if not, see
	17	<http://www.gnu.org/licenses/>. */
	18
	19	#include <sysdep.h>
	20	#define __need_Emath
	21	#include <bits/errno.h>
	22
	23	/* Short algorithm description:
	24	*
	25	* 1) if \|x\| == 0: return 1.0-\|x\|.
	26	* 2) if \|x\| < 2^-27: return 1.0-\|x\|.
	27	* 3) if \|x\| < 2^-5 : return 1.0+x^2DP_COS2_0+x^5DP_COS2_1.
	28	* 4) if \|x\| < Pi/4: return 1.0+x^2(C0+x^2(C1+x^2(C2+x^2(C3+x^2*C4)))).
	29	* 5) if \|x\| < 9*Pi/4:
	30	* 5.1) Range reduction: k=trunc(\|x\|/(Pi/4)), j=(k+1)&0x0e, n=k+3,
	31	* t=\|x\|-j*Pi/4.
	32	* 5.2) Reconstruction:
	33	* s = (-1.0)^((n>>2)&1)
	34	* if(n&2 != 0) {
	35	* using cos(t) polynomial for \|t\|<Pi/4, result is
	36	* s * (1.0+t^2(C0+t^2(C1+t^2(C2+t^2(C3+t^2*C4))))).
	37	* } else {
	38	* using sin(t) polynomial for \|t\|<Pi/4, result is
	39	* s * t * (1.0+t^2(S0+t^2(S1+t^2(S2+t^2(S3+t^2*S4))))).
	40	* }
	41	* 6) if \|x\| < 2^23, large args:
	42	* 6.1) Range reduction: k=trunc(\|x\|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3,
	43	* t=\|x\|-j*Pi/4.
	44	* 6.2) Reconstruction same as (5.2).
	45	* 7) if \|x\| >= 2^23, very large args:
	46	* 7.1) Range reduction: k=trunc(\|x\|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3,
	47	* t=\|x\|-j*Pi/4.
	48	* 7.2) Reconstruction same as (5.2).
	49	* 8) if x is Inf, return x-x, and set errno=EDOM.
	50	* 9) if x is NaN, return x-x.
	51	*
	52	* Special cases:
80ccd52c LD	53	* cos(+-0) = 1 not raising inexact,
	54	* cos(subnormal) raises inexact,
	55	* cos(min_normalized) raises inexact,
	56	* cos(normalized) raises inexact,
	57	* cos(Inf) = NaN, raises invalid, sets errno to EDOM,
	58	* cos(NaN) = NaN.
4ffffbd2 LD	59	*/
	60
	61	.text
	62	ENTRY(__cosf)
	63	/* Input: single precision x in %xmm0 */
	64
	65	movd %xmm0, %eax /* Bits of x */
	66	movaps %xmm0, %xmm7 /* Copy of x */
	67	cvtss2sd %xmm0, %xmm0 /* DP x */
	68	movss L(SP_ABS_MASK)(%rip), %xmm3
	69	andl $0x7fffffff, %eax /* \|x\| */
	70
	71	cmpl $0x3f490fdb, %eax /* \|x\|<Pi/4? */
	72	jb L(arg_less_pio4)
	73
	74	/* Here if \|x\|>=Pi/4 */
	75	andps %xmm7, %xmm3 /* SP \|x\| */
	76	andpd L(DP_ABS_MASK)(%rip), %xmm0 /* DP \|x\| */
	77	movss L(SP_INVPIO4)(%rip), %xmm2 /* SP 1/(Pi/4) */
	78
	79	cmpl $0x40e231d6, %eax /* \|x\|<9Pi/4? /
	80	jae L(large_args)
	81
	82	/* Here if Pi/4<=\|x\|<9Pi/4 /
	83	mulss %xmm3, %xmm2 /* SP \|x\|/(Pi/4) */
	84	cvttss2si %xmm2, %eax /* k, number of Pi/4 in x */
	85	lea L(PIO4J)(%rip), %rsi
	86	addl $1, %eax /* k+1 */
	87	movl $0x0e, %edx
	88	andl %eax, %edx /* j = (k+1)&0x0e */
	89	addl $2, %eax /* n */
	90	subsd (%rsi,%rdx,8), %xmm0 /* t = \|x\| - j * Pi/4 */
	91
	92	L(reconstruction):
	93	/* Input: %eax=n, %xmm0=t */
	94	testl $2, %eax /* n&2 != 0? */
	95	jz L(sin_poly)
	96
	97	/L(cos_poly):/
	98	/* Here if cos(x) calculated using cos(t) polynomial for \|t\|<Pi/4:
	99	* y = tt; z = yy;
	100	* s = sign(x) * (-1.0)^((n>>2)&1)
	101	* result = s * (1.0+t^2(C0+t^2(C1+t^2(C2+t^2(C3+t^2*C4)))))
	102	*/
	103	shrl $2, %eax /* n>>2 */
	104	mulsd %xmm0, %xmm0 /* y=t^2 */
	105	andl $1, %eax /* (n>>2)&1 */
	106	movaps %xmm0, %xmm1 /* y */
	107	mulsd %xmm0, %xmm0 /* z=t^4 */
	108
	109	movsd L(DP_C4)(%rip), %xmm4 /* C4 */
	110	mulsd %xmm0, %xmm4 /* zC4 /
	111	movsd L(DP_C3)(%rip), %xmm3 /* C3 */
	112	mulsd %xmm0, %xmm3 /* zC3 /
	113	lea L(DP_ONES)(%rip), %rsi
	114	addsd L(DP_C2)(%rip), %xmm4 /* C2+zC4 /
	115	mulsd %xmm0, %xmm4 /* z(C2+zC4) */
	116	addsd L(DP_C1)(%rip), %xmm3 /* C1+zC3 /
	117	mulsd %xmm0, %xmm3 /* z(C1+zC3) */
	118	addsd L(DP_C0)(%rip), %xmm4 /* C0+z(C2+zC4) */
	119	mulsd %xmm1, %xmm4 /* y(C0+z(C2+zC4)) /
	120
	121	addsd %xmm4, %xmm3 /* y(C0+y(C1+y(C2+y(C3+yC4)))) /
	122	/* 1.0+y(C0+y(C1+y(C2+y(C3+yC4)))) /
123	addsd L(DP_ONES)(%rip), %xmm3
124
125	mulsd (%rsi,%rax,8), %xmm3 /* DP result */
126	cvtsd2ss %xmm3, %xmm0 /* SP result */
127	ret
128
129	.p2align 4
130	L(sin_poly):
131	/* Here if cos(x) calculated using sin(t) polynomial for \|t\|<Pi/4:
132	* y = tt; z = yy;
133	* s = sign(x) * (-1.0)^((n>>2)&1)
134	* result = s * t * (1.0+t^2(S0+t^2(S1+t^2(S2+t^2(S3+t^2*S4)))))
135	*/
136
137	movaps %xmm0, %xmm4 /* t */
138	shrl $2, %eax /* n>>2 */
139	mulsd %xmm0, %xmm0 /* y=t^2 */
140	andl $1, %eax /* (n>>2)&1 */
141	movaps %xmm0, %xmm1 /* y */
142	mulsd %xmm0, %xmm0 /* z=t^4 */
143
144	movsd L(DP_S4)(%rip), %xmm2 /* S4 */
145	mulsd %xmm0, %xmm2 /* zS4 /
146	movsd L(DP_S3)(%rip), %xmm3 /* S3 */
147	mulsd %xmm0, %xmm3 /* zS3 /
148	lea L(DP_ONES)(%rip), %rsi
149	addsd L(DP_S2)(%rip), %xmm2 /* S2+zS4 /
150	mulsd %xmm0, %xmm2 /* z(S2+zS4) */
151	addsd L(DP_S1)(%rip), %xmm3 /* S1+zS3 /
152	mulsd %xmm0, %xmm3 /* z(S1+zS3) */
153	addsd L(DP_S0)(%rip), %xmm2 /* S0+z(S2+zS4) */
154	mulsd %xmm1, %xmm2 /* y(S0+z(S2+zS4)) /
155	/* ts, where s = sign(x) (-1.0)^((n>>2)&1) */
156	mulsd (%rsi,%rax,8), %xmm4
157	/* y(S0+y(S1+y(S2+y(S3+yS4)))) /
158	addsd %xmm2, %xmm3
159	/* tsy(S0+y(S1+y(S2+y(S3+yS4)))) /
160	mulsd %xmm4, %xmm3
161	/* ts(1.0+y(S0+y(S1+y(S2+y(S3+yS4)))) /
162	addsd %xmm4, %xmm3
163	cvtsd2ss %xmm3, %xmm0 /* SP result */
164	ret
165
4ffffbd2 LD	166	.p2align 4
	167	L(large_args):
	168	/* Here if \|x\|>=9Pi/4 /
	169	cmpl $0x7f800000, %eax /* x is Inf or NaN? */
	170	jae L(arg_inf_or_nan)
	171
	172	/* Here if finite \|x\|>=9Pi/4 /
	173	cmpl $0x4b000000, %eax /* \|x\|<2^23? */
	174	jae L(very_large_args)
	175
	176	/* Here if 9Pi/4<=\|x\|<2^23 /
	177	movsd L(DP_INVPIO4)(%rip), %xmm1 /* 1/(Pi/4) */
	178	mulsd %xmm0, %xmm1 /* \|x\|/(Pi/4) */
	179	cvttsd2si %xmm1, %eax /* k=trunc(\|x\|/(Pi/4)) */
	180	addl $1, %eax /* k+1 */
	181	movl %eax, %edx
	182	andl $0xfffffffe, %edx /* j=(k+1)&0xfffffffe */
	183	cvtsi2sdl %edx, %xmm4 /* DP j */
	184	movsd L(DP_PIO4HI)(%rip), %xmm2 /* -PIO4HI = high part of -Pi/4 */
	185	mulsd %xmm4, %xmm2 /* -jPIO4HI /
	186	movsd L(DP_PIO4LO)(%rip), %xmm3 /* -PIO4LO = low part of -Pi/4 */
	187	addsd %xmm2, %xmm0 /* \|x\| - jPIO4HI /
	188	addl $2, %eax /* n */
	189	mulsd %xmm3, %xmm4 /* jPIO4LO /
	190	addsd %xmm4, %xmm0 /* t = \|x\| - jPIO4HI - jPIO4LO */
	191	jmp L(reconstruction)
	192
	193	.p2align 4
	194	L(very_large_args):
	195	/* Here if finite \|x\|>=2^23 */
	196
	197	/* bitpos = (ix>>23) - BIAS_32 + 59; */
	198	shrl $23, %eax /* eb = biased exponent of x */
	199	/* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */
	200	subl $68, %eax
	201	movl $28, %ecx /* %cl=28 */
	202	movl %eax, %edx /* bitpos copy */
	203
	204	/* j = bitpos/28; */
	205	div %cl /* j in register %al=%ax/%cl */
	206	movapd %xmm0, %xmm3 /* \|x\| */
	207	/* clear unneeded remainder from %ah */
	208	andl $0xff, %eax
	209
	210	imull $28, %eax, %ecx /* j28 /
	211	lea L(_FPI)(%rip), %rsi
	212	movsd L(DP_HI_MASK)(%rip), %xmm4 /* DP_HI_MASK */
	213	movapd %xmm0, %xmm5 /* \|x\| */
	214	mulsd -16(%rsi,%rax,8), %xmm3 /* tmp3 = FPI[j-2]\|x\| /
	215	movapd %xmm0, %xmm1 /* \|x\| */
	216	mulsd -8(%rsi,%rax,8), %xmm5 /* tmp2 = FPI[j-1]\|x\| /
	217	mulsd (%rsi,%rax,8), %xmm0 /* tmp0 = FPI[j]\|x\| /
	218	addl $19, %ecx /* j28+19 /
	219	mulsd 8(%rsi,%rax,8), %xmm1 /* tmp1 = FPI[j+1]\|x\| /
	220	cmpl %ecx, %edx /* bitpos>=j28+19? /
	221	jl L(very_large_skip1)
	222
	223	/* Here if bitpos>=j28+19 /
	224	andpd %xmm3, %xmm4 /* HI(tmp3) */
	225	subsd %xmm4, %xmm3 /* tmp3 = tmp3 - HI(tmp3) */
	226	L(very_large_skip1):
	227
	228	movsd L(DP_2POW52)(%rip), %xmm6
	229	movapd %xmm5, %xmm2 /* tmp2 copy */
230	addsd %xmm3, %xmm5 /* tmp5 = tmp3 + tmp2 */
231	movl $1, %edx
232	addsd %xmm5, %xmm6 /* tmp6 = tmp5 + 2^52 */
233	movsd 8+L(DP_2POW52)(%rip), %xmm4
234	movd %xmm6, %eax /* k = I64_LO(tmp6); */
235	addsd %xmm6, %xmm4 /* tmp4 = tmp6 - 2^52 */
236	comisd %xmm5, %xmm4 /* tmp4 > tmp5? */
237	jbe L(very_large_skip2)
238
239	/* Here if tmp4 > tmp5 */
240	subl $1, %eax /* k-- */
241	addsd 8+L(DP_ONES)(%rip), %xmm4 /* tmp4 -= 1.0 */
242	L(very_large_skip2):
243
244	andl %eax, %edx /* k&1 */
245	lea L(DP_ZERONE)(%rip), %rsi
246	subsd %xmm4, %xmm3 /* tmp3 -= tmp4 */
247	addsd (%rsi,%rdx,8), %xmm3 /* t = DP_ZERONE[k&1] + tmp3 */
248	addsd %xmm2, %xmm3 /* t += tmp2 */
249	addsd %xmm3, %xmm0 /* t += tmp0 */
250	addl $3, %eax /* n=k+3 */
251	addsd %xmm1, %xmm0 /* t += tmp1 */
252	mulsd L(DP_PIO4)(%rip), %xmm0 /* t = PI04 /
253
254	jmp L(reconstruction) /* end of very_large_args peth */
255
4ffffbd2 LD	256	.p2align 4
	257	L(arg_less_pio4):
	258	/* Here if \|x\|<Pi/4 */
	259	cmpl $0x3d000000, %eax /* \|x\|<2^-5? */
	260	jl L(arg_less_2pn5)
	261
	262	/* Here if 2^-5<=\|x\|<Pi/4 */
	263	mulsd %xmm0, %xmm0 /* y=x^2 */
	264	movaps %xmm0, %xmm1 /* y */
	265	mulsd %xmm0, %xmm0 /* z=x^4 */
	266	movsd L(DP_C4)(%rip), %xmm3 /* C4 */
	267	mulsd %xmm0, %xmm3 /* zC4 /
	268	movsd L(DP_C3)(%rip), %xmm5 /* C3 */
	269	mulsd %xmm0, %xmm5 /* zC3 /
	270	addsd L(DP_C2)(%rip), %xmm3 /* C2+zC4 /
	271	mulsd %xmm0, %xmm3 /* z(C2+zC4) */
	272	addsd L(DP_C1)(%rip), %xmm5 /* C1+zC3 /
	273	mulsd %xmm0, %xmm5 /* z(C1+zC3) */
	274	addsd L(DP_C0)(%rip), %xmm3 /* C0+z(C2+zC4) */
	275	mulsd %xmm1, %xmm3 /* y(C0+z(C2+zC4)) /
	276	/* y(C0+y(C1+y(C2+y(C3+yC4)))) /
	277	addsd %xmm5, %xmm3
	278	/* 1.0 + y(C0+y(C1+y(C2+y(C3+yC4)))) /
	279	addsd L(DP_ONES)(%rip), %xmm3
	280	cvtsd2ss %xmm3, %xmm0 /* SP result */
	281	ret
	282
	283	.p2align 4
	284	L(arg_less_2pn5):
	285	/* Here if \|x\|<2^-5 */
	286	cmpl $0x32000000, %eax /* \|x\|<2^-27? */
	287	jl L(arg_less_2pn27)
	288
	289	/* Here if 2^-27<=\|x\|<2^-5 */
	290	mulsd %xmm0, %xmm0 /* DP x^2 */
	291	movsd L(DP_COS2_1)(%rip), %xmm3 /* DP DP_COS2_1 */
	292	mulsd %xmm0, %xmm3 /* DP x^2DP_COS2_1 /
	293	addsd L(DP_COS2_0)(%rip), %xmm3 /* DP DP_COS2_0+x^2DP_COS2_1 /
	294	mulsd %xmm0, %xmm3 /* DP x^2DP_COS2_0+x^4DP_COS2_1 */
	295	/* DP 1.0+x^2DP_COS2_0+x^4DP_COS2_1 */
	296	addsd L(DP_ONES)(%rip), %xmm3
	297	cvtsd2ss %xmm3, %xmm0 /* SP result */
	298	ret
	299
	300	.p2align 4
	301	L(arg_less_2pn27):
	302	/* Here if \|x\|<2^-27 */
	303	andps L(SP_ABS_MASK)(%rip),%xmm7 /* \|x\| */
	304	movss L(SP_ONE)(%rip), %xmm0 /* 1.0 */
	305	subss %xmm7, %xmm0 /* result is 1.0-\|x\| */
	306	ret
	307
	308	.p2align 4
	309	L(arg_inf_or_nan):
	310	/* Here if \|x\| is Inf or NAN */
	311	jne L(skip_errno_setting) /* in case of x is NaN */
	312
	313	/* Here if x is Inf. Set errno to EDOM. */
	314	call JUMPTARGET(__errno_location)
4ffffbd2 LD	315	movl $EDOM, (%rax)
	316
	317	.p2align 4
	318	L(skip_errno_setting):
	319	/* Here if \|x\| is Inf or NAN. Continued. */
	320	movaps %xmm7, %xmm0 /* load x */
	321	subss %xmm0, %xmm0 /* Result is NaN */
	322	ret
	323	END(__cosf)
	324
4ffffbd2 LD	325	.section .rodata, "a"
	326	.p2align 3
	327	L(PIO4J): /* Table of jPi/4, for j=0,1,..,10 /
	328	.long 0x00000000,0x00000000
	329	.long 0x54442d18,0x3fe921fb
	330	.long 0x54442d18,0x3ff921fb
	331	.long 0x7f3321d2,0x4002d97c
	332	.long 0x54442d18,0x400921fb
	333	.long 0x2955385e,0x400f6a7a
	334	.long 0x7f3321d2,0x4012d97c
	335	.long 0xe9bba775,0x4015fdbb
	336	.long 0x54442d18,0x401921fb
	337	.long 0xbeccb2bb,0x401c463a
	338	.long 0x2955385e,0x401f6a7a
	339	.type L(PIO4J), @object
	340	ASM_SIZE_DIRECTIVE(L(PIO4J))
	341
	342	.p2align 3
	343	L(_FPI): /* 4/Pi broken into sum of positive DP values */
	344	.long 0x00000000,0x00000000
	345	.long 0x6c000000,0x3ff45f30
	346	.long 0x2a000000,0x3e3c9c88
	347	.long 0xa8000000,0x3c54fe13
	348	.long 0xd0000000,0x3aaf47d4
	349	.long 0x6c000000,0x38fbb81b
	350	.long 0xe0000000,0x3714acc9
	351	.long 0x7c000000,0x3560e410
	352	.long 0x56000000,0x33bca2c7
	353	.long 0xac000000,0x31fbd778
	354	.long 0xe0000000,0x300b7246
	355	.long 0xe8000000,0x2e5d2126
	356	.long 0x48000000,0x2c970032
	357	.long 0xe8000000,0x2ad77504
	358	.long 0xe0000000,0x290921cf
	359	.long 0xb0000000,0x274deb1c
	360	.long 0xe0000000,0x25829a73
	361	.long 0xbe000000,0x23fd1046
	362	.long 0x10000000,0x2224baed
	363	.long 0x8e000000,0x20709d33
	364	.long 0x80000000,0x1e535a2f
	365	.long 0x64000000,0x1cef904e
	366	.long 0x30000000,0x1b0d6398
	367	.long 0x24000000,0x1964ce7d
	368	.long 0x16000000,0x17b908bf
	369	.type L(_FPI), @object
	370	ASM_SIZE_DIRECTIVE(L(_FPI))
	371
	372	/* Coefficients of polynomial
	373	for cos(x)~=1.0+x^2DP_COS2_0+x^4DP_COS2_1, \|x\|<2^-5. */
	374	.p2align 3
	375	L(DP_COS2_0):
	376	.long 0xff5cc6fd,0xbfdfffff
	377	.type L(DP_COS2_0), @object
	378	ASM_SIZE_DIRECTIVE(L(DP_COS2_0))
	379
	380	.p2align 3
	381	L(DP_COS2_1):
	382	.long 0xb178dac5,0x3fa55514
	383	.type L(DP_COS2_1), @object
	384	ASM_SIZE_DIRECTIVE(L(DP_COS2_1))
	385
	386	.p2align 3
	387	L(DP_ZERONE):
	388	.long 0x00000000,0x00000000 /* 0.0 */
389	.long 0x00000000,0xbff00000 /* 1.0 */
390	.type L(DP_ZERONE), @object
391	ASM_SIZE_DIRECTIVE(L(DP_ZERONE))
392
393	.p2align 3
394	L(DP_ONES):
395	.long 0x00000000,0x3ff00000 /* +1.0 */
396	.long 0x00000000,0xbff00000 /* -1.0 */
397	.type L(DP_ONES), @object
398	ASM_SIZE_DIRECTIVE(L(DP_ONES))
399
400	/* Coefficients of polynomial
401	for sin(t)~=t+t^3(S0+t^2(S1+t^2(S2+t^2(S3+t^2S4)))), \|t\|<Pi/4. /
402	.p2align 3
403	L(DP_S3):
404	.long 0x64e6b5b4,0x3ec71d72
405	.type L(DP_S3), @object
406	ASM_SIZE_DIRECTIVE(L(DP_S3))
407
408	.p2align 3
409	L(DP_S1):
410	.long 0x10c2688b,0x3f811111
411	.type L(DP_S1), @object
412	ASM_SIZE_DIRECTIVE(L(DP_S1))
413
414	.p2align 3
415	L(DP_S4):
416	.long 0x1674b58a,0xbe5a947e
417	.type L(DP_S4), @object
418	ASM_SIZE_DIRECTIVE(L(DP_S4))
419
420	.p2align 3
421	L(DP_S2):
422	.long 0x8b4bd1f9,0xbf2a019f
423	.type L(DP_S2),@object
424	ASM_SIZE_DIRECTIVE(L(DP_S2))
425
426	.p2align 3
427	L(DP_S0):
428	.long 0x55551cd9,0xbfc55555
429	.type L(DP_S0), @object
430	ASM_SIZE_DIRECTIVE(L(DP_S0))
431
432	/* Coefficients of polynomial
433	for cos(t)~=1.0+t^2(C0+t^2(C1+t^2(C2+t^2(C3+t^2C4)))), \|t\|<Pi/4. /
434	.p2align 3
435	L(DP_C3):
436	.long 0x9ac43cc0,0x3efa00eb
437	.type L(DP_C3), @object
438	ASM_SIZE_DIRECTIVE(L(DP_C3))
439
440	.p2align 3
441	L(DP_C1):
442	.long 0x545c50c7,0x3fa55555
443	.type L(DP_C1), @object
444	ASM_SIZE_DIRECTIVE(L(DP_C1))
445
446	.p2align 3
447	L(DP_C4):
448	.long 0xdd8844d7,0xbe923c97
449	.type L(DP_C4), @object
450	ASM_SIZE_DIRECTIVE(L(DP_C4))
451
452	.p2align 3
453	L(DP_C2):
454	.long 0x348b6874,0xbf56c16b
455	.type L(DP_C2), @object
456	ASM_SIZE_DIRECTIVE(L(DP_C2))
457
458	.p2align 3
459	L(DP_C0):
460	.long 0xfffe98ae,0xbfdfffff
461	.type L(DP_C0), @object
462	ASM_SIZE_DIRECTIVE(L(DP_C0))
463
464	.p2align 3
465	L(DP_PIO4):
466	.long 0x54442d18,0x3fe921fb /* Pi/4 */
467	.type L(DP_PIO4), @object
468	ASM_SIZE_DIRECTIVE(L(DP_PIO4))
469
470	.p2align 3
471	L(DP_2POW52):
472	.long 0x00000000,0x43300000 /* +2^52 */
473	.long 0x00000000,0xc3300000 /* -2^52 */
474	.type L(DP_2POW52), @object
475	ASM_SIZE_DIRECTIVE(L(DP_2POW52))
476
477	.p2align 3
478	L(DP_INVPIO4):
479	.long 0x6dc9c883,0x3ff45f30 /* 4/Pi */
480	.type L(DP_INVPIO4), @object
481	ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))
482
483	.p2align 3
484	L(DP_PIO4HI):
485	.long 0x54000000,0xbfe921fb /* High part of Pi/4 */
486	.type L(DP_PIO4HI), @object
487	ASM_SIZE_DIRECTIVE(L(DP_PIO4HI))
488
489	.p2align 3
490	L(DP_PIO4LO):
491	.long 0x11A62633,0xbe010b46 /* Low part of Pi/4 */
492	.type L(DP_PIO4LO), @object
493	ASM_SIZE_DIRECTIVE(L(DP_PIO4LO))
494
495	.p2align 2
496	L(SP_INVPIO4):
497	.long 0x3fa2f983 /* 4/Pi */
498	.type L(SP_INVPIO4), @object
499	ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))
500
501	.p2align 4
502	L(DP_ABS_MASK): /* Mask for getting DP absolute value */
503	.long 0xffffffff,0x7fffffff
504	.long 0xffffffff,0x7fffffff
505	.type L(DP_ABS_MASK), @object
506	ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK))
507
508	.p2align 3
509	L(DP_HI_MASK): /* Mask for getting high 21 bits of DP value */
510	.long 0x00000000,0xffffffff
80ccd52c LD	511	.type L(DP_HI_MASK), @object
80ccd52c LD	512	ASM_SIZE_DIRECTIVE(L(DP_HI_MASK))
4ffffbd2 LD	513
	514	.p2align 4
	515	L(SP_ABS_MASK): /* Mask for getting SP absolute value */
	516	.long 0x7fffffff,0x7fffffff
	517	.long 0x7fffffff,0x7fffffff
	518	.type L(SP_ABS_MASK), @object
	519	ASM_SIZE_DIRECTIVE(L(SP_ABS_MASK))
	520
	521	.p2align 2
	522	L(SP_ONE):
	523	.long 0x3f800000 /* 1.0 */
	524	.type L(SP_ONE), @object
	525	ASM_SIZE_DIRECTIVE(L(SP_ONE))
	526
	527	weak_alias(__cosf, cosf)