[thirdparty/glibc.git] / sysdeps / i386 / i686 / fpu / multiarch / s_cosf-sse2.S

/* Optimized with sse2 version of cosf
   Copyright (C) 2012-2015 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#define __need_Emath
#include <bits/errno.h>

/* Short algorithm description:
 *
 *  1) if |x| == 0: return 1.0-|x|.
 *  2) if |x| <  2^-27: return 1.0-|x|.
 *  3) if |x| <  2^-5 : return 1.0+x^2*DP_COS2_0+x^4*DP_COS2_1.
 *  4) if |x| <   Pi/4: return 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).
 *  5) if |x| < 9*Pi/4:
 *      5.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+3,
 *           t=|x|-j*Pi/4.
 *      5.2) Reconstruction:
 *          s = (-1.0)^((n>>2)&1)
 *          if(n&2 != 0) {
 *              using cos(t) polynomial for |t|<Pi/4, result is
 *              s     * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))).
 *          } else {
 *              using sin(t) polynomial for |t|<Pi/4, result is
 *              s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))).
 *          }
 *  6) if |x| < 2^23, large args:
 *      6.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3,
 *           t=|x|-j*Pi/4.
 *      6.2) Reconstruction same as (5.2).
 *  7) if |x| >= 2^23, very large args:
 *      7.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3,
 *           t=|x|-j*Pi/4.
 *      7.2) Reconstruction same as (5.2).
 *  8) if x is Inf, return x-x, and set errno=EDOM.
 *  9) if x is NaN, return x-x.
 *
 * Special cases:
 *  cos(+-0) = 1 not raising inexact,
 *  cos(subnormal) raises inexact,
 *  cos(min_normalized) raises inexact,
 *  cos(normalized) raises inexact,
 *  cos(Inf) = NaN, raises invalid, sets errno to EDOM,
 *  cos(NaN) = NaN.
 */

/* Build-mode helpers.  Both variants define the same macro names, so the
   code below is written only once:
     MO1(sym)          - memory operand for the local symbol L(sym);
     MO2(sym,reg,sc)   - same, additionally indexed by reg*sc;
     ENTRANCE / RETURN - function prologue / epilogue-and-return;
     ARG_X             - stack slot holding the float argument x.  */
#ifdef	PIC
/* PIC: data is addressed as @GOTOFF relative to %ebx, so %ebx is pushed
   on entry (with matching CFI) and popped before every ret.
   LOAD_PIC_REG is defined outside this file; presumably it loads the
   GOT address into %ebx -- confirm against sysdep.h.  */
# define MO1(symbol)			L(symbol)##@GOTOFF(%ebx)
# define MO2(symbol,reg2,_scale)	L(symbol)##@GOTOFF(%ebx,reg2,_scale)
# define CFI_PUSH(REG)	cfi_adjust_cfa_offset(4); cfi_rel_offset(REG,0)
# define CFI_POP(REG)	cfi_adjust_cfa_offset(-4); cfi_restore(REG)
# define PUSH(REG)			pushl REG; CFI_PUSH(REG)
# define POP(REG)			popl REG; CFI_POP(REG)
# define ENTRANCE			PUSH(%ebx); LOAD_PIC_REG(bx)
/* RETURN is used in the middle of the function body; the CFI_PUSH that
   follows ret re-declares the "%ebx is pushed" unwind state for the
   code placed after the RETURN in the file.  */
# define RETURN				POP(%ebx); ret; CFI_PUSH(%ebx)
/* x is 4 bytes further from %esp than in the non-PIC case because of
   the %ebx pushed by ENTRANCE.  */
# define ARG_X				8(%esp)
#else
/* Non-PIC: symbols are addressed directly, %ebx is not touched, and no
   prologue is needed, so x is right above the return address.  */
# define MO1(symbol)			L(symbol)
# define MO2(symbol,reg2,_scale)	L(symbol)(,reg2,_scale)
# define ENTRANCE
# define RETURN				ret
# define ARG_X				4(%esp)
#endif

	.text
/* float __cosf_sse2 (float x)
 *
 * In:   x, single precision, on the stack at ARG_X.
 * Out:  cos(x) on the x87 stack: each RETURN site in this function is
 *       preceded by an flds/fldl of the result.
 * Uses: %eax, %ecx, %edx and %xmm0-%xmm6 as scratch; %ebx only through
 *       the ENTRANCE/RETURN and MO1/MO2 macros.
 *
 * This first part classifies |x| by comparing its bit pattern against
 * float constants and handles Pi/4 <= |x| < 9*Pi/4 inline; all other
 * ranges branch to their own sections below.
 */
ENTRY(__cosf_sse2)
	/* Input: single precision x on stack at address ARG_X */

	ENTRANCE
	movl	ARG_X, %eax		/* Bits of x */
	cvtss2sd ARG_X, %xmm0		/* DP x */
	/* With the sign bit cleared, the integer compares below order the
	   bit patterns the same way as the float magnitudes.  */
	andl	$0x7fffffff, %eax	/* |x| */

	cmpl	$0x3f490fdb, %eax	/* |x|<Pi/4?  */
	jb	L(arg_less_pio4)

	/* Here if |x|>=Pi/4 */
	/* %xmm0 (DP |x|) is also an input of L(large_args); %xmm3 and
	   %xmm2 are only consumed by the fall-through path.  */
	movd	%eax, %xmm3		/* SP |x| */
	andpd	MO1(DP_ABS_MASK),%xmm0	/* DP |x| */
	movss	MO1(SP_INVPIO4), %xmm2	/* SP 1/(Pi/4) */

	cmpl	$0x40e231d6, %eax	/* |x|<9*Pi/4?  */
	jae	L(large_args)

	/* Here if Pi/4<=|x|<9*Pi/4 */
	mulss	%xmm3, %xmm2		/* SP |x|/(Pi/4) */
	cvttss2si %xmm2, %eax		/* k, number of Pi/4 in x */
	addl	$1, %eax		/* k+1 */
	movl	$0x0e, %edx
	andl	%eax, %edx		/* j = (k+1)&0x0e */
	addl	$2, %eax		/* n */
	/* j indexes the table of 8-byte doubles L(PIO4J) directly.  */
	subsd	MO2(PIO4J,%edx,8), %xmm0 /* t = |x| - j * Pi/4 */


L(reconstruction):
	/* Input: %eax=n, %xmm0=t */
	/* Bit 1 of n selects the polynomial (set: cos, clear: sin);
	   bit 2 of n selects the sign of the result.  */
	testl	$2, %eax		/* n&2 != 0?  */
	jz	L(sin_poly)

/*L(cos_poly):*/
	/* Here if cos(x) calculated using cos(t) polynomial for |t|<Pi/4:
	 * y = t*t; z = y*y;
	 * s = (-1.0)^((n>>2)&1)
	 * result = s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))))
	 *
	 * The polynomial in y is evaluated as two independent chains in z,
	 *   %xmm4 = y*(C0+z*(C2+z*C4))  and  %xmm3 = z*(C1+z*C3),
	 * which are then added.  The 8-byte stack slot exists only to move
	 * the double result from SSE to the x87 stack; the result is
	 * returned as a double-precision value in st(0).
	 */
	shrl	$2, %eax		/* n>>2 */
	mulsd	%xmm0, %xmm0		/* y=t^2 */
	andl	$1, %eax		/* (n>>2)&1 */
	movaps	%xmm0, %xmm1		/* y */
	mulsd	%xmm0, %xmm0		/* z=t^4 */

	movsd	MO1(DP_C4), %xmm4	/* C4 */
	mulsd	%xmm0, %xmm4		/* z*C4 */
	movsd	MO1(DP_C3), %xmm3	/* C3 */
	mulsd	%xmm0, %xmm3		/* z*C3 */
	addsd	MO1(DP_C2), %xmm4	/* C2+z*C4 */
	mulsd	%xmm0, %xmm4		/* z*(C2+z*C4) */
	lea	-8(%esp), %esp		/* Borrow 8 bytes of stack frame */
	/* Keep the unwind info in step with %esp while the slot is live.  */
	cfi_adjust_cfa_offset (8)
	addsd	MO1(DP_C1), %xmm3	/* C1+z*C3 */
	mulsd	%xmm0, %xmm3		/* z*(C1+z*C3) */
	addsd	MO1(DP_C0), %xmm4	/* C0+z*(C2+z*C4) */
	mulsd	%xmm1, %xmm4		/* y*(C0+z*(C2+z*C4)) */

	addsd	%xmm4, %xmm3		/* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
	/* 1.0+y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
	addsd	MO1(DP_ONES), %xmm3

	/* DP_ONES[(n>>2)&1] is +1.0 or -1.0: apply the sign s.  */
	mulsd	MO2(DP_ONES,%eax,8), %xmm3 /* DP result */
	movsd	%xmm3, 0(%esp)		/* Move result from sse...  */
	fldl	0(%esp)			/* ...to FPU.  */
	/* Return back 8 bytes of stack frame */
	lea	8(%esp), %esp
	cfi_adjust_cfa_offset (-8)
	RETURN

	.p2align	4
L(sin_poly):
	/* Here if cos(x) calculated using sin(t) polynomial for |t|<Pi/4:
	 * y = t*t; z = y*y;
	 * s = (-1.0)^((n>>2)&1)
	 * result = s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))))
	 *
	 * Input: %eax=n, %xmm0=t.
	 * The polynomial in y is evaluated as two independent chains in z,
	 *   %xmm2 = y*(S0+z*(S2+z*S4))  and  %xmm3 = z*(S1+z*S3),
	 * which are then added.  The final step computes
	 *   (t*s)*poly + (t*s)  rather than  (t*s)*(1.0+poly).
	 * The 8-byte stack slot exists only to move the double result from
	 * SSE to the x87 stack.
	 */

	movaps	%xmm0, %xmm4		/* t */
	shrl	$2, %eax		/* n>>2 */
	mulsd	%xmm0, %xmm0		/* y=t^2 */
	andl	$1, %eax		/* (n>>2)&1 */
	movaps	%xmm0, %xmm1		/* y */
	mulsd	%xmm0, %xmm0		/* z=t^4 */

	movsd	MO1(DP_S4), %xmm2	/* S4 */
	mulsd	%xmm0, %xmm2		/* z*S4 */
	movsd	MO1(DP_S3), %xmm3	/* S3 */
	mulsd	%xmm0, %xmm3		/* z*S3 */
	lea	-8(%esp), %esp		/* Borrow 8 bytes of stack frame */
	/* Keep the unwind info in step with %esp while the slot is live.  */
	cfi_adjust_cfa_offset (8)
	addsd	MO1(DP_S2), %xmm2	/* S2+z*S4 */
	mulsd	%xmm0, %xmm2		/* z*(S2+z*S4) */
	addsd	MO1(DP_S1), %xmm3	/* S1+z*S3 */
	mulsd	%xmm0, %xmm3		/* z*(S1+z*S3) */
	addsd	MO1(DP_S0), %xmm2	/* S0+z*(S2+z*S4) */
	mulsd	%xmm1, %xmm2		/* y*(S0+z*(S2+z*S4)) */
	/* t*s, where s = (-1.0)^((n>>2)&1) */
	mulsd	MO2(DP_ONES,%eax,8), %xmm4
	addsd	%xmm2, %xmm3		/* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
	/* t*s*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
	mulsd	%xmm4, %xmm3
	/* t*s*(1.0+y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))) */
	addsd	%xmm4, %xmm3
	movsd	%xmm3, 0(%esp)		/* Move result from sse...   */
	fldl	0(%esp)			/* ...to FPU.  */
	/* Return back 8 bytes of stack frame */
	lea	8(%esp), %esp
	cfi_adjust_cfa_offset (-8)
	RETURN

	.p2align	4
L(large_args):
	/* Here if |x|>=9*Pi/4 */
	/* Input: %eax = bits of |x|, %xmm0 = DP |x|.
	   The flags set by the compare below are still live when
	   L(arg_inf_or_nan) is entered; it tests them again to tell
	   Inf (equal) from NaN (above).  */
	cmpl	$0x7f800000, %eax	/* x is Inf or NaN?  */
	jae	L(arg_inf_or_nan)

	/* Here if finite |x|>=9*Pi/4 */
	cmpl	$0x4b000000, %eax	/* |x|<2^23?  */
	jae	L(very_large_args)

	/* Here if 9*Pi/4<=|x|<2^23 */
	/* Range reduction in double precision.  DP_PIO4HI and DP_PIO4LO
	   hold the NEGATED high and low parts of Pi/4, so t is formed
	   with additions only.  */
	movsd	MO1(DP_INVPIO4), %xmm1	/* 1/(Pi/4) */
	mulsd	%xmm0, %xmm1		/* |x|/(Pi/4) */
	cvttsd2si %xmm1, %eax		/* k=trunc(|x|/(Pi/4)) */
	addl	$1, %eax		/* k+1 */
	movl	%eax, %edx
	andl	$0xfffffffe, %edx	/* j=(k+1)&0xfffffffe */
	cvtsi2sdl %edx, %xmm4		/* DP j */
	movsd	MO1(DP_PIO4HI), %xmm2	/* -PIO4HI = high part of -Pi/4 */
	mulsd	%xmm4, %xmm2		/* -j*PIO4HI */
	movsd	MO1(DP_PIO4LO), %xmm3	/* -PIO4LO = low part of -Pi/4 */
	addsd	%xmm2, %xmm0		/* |x| - j*PIO4HI */
	addl	$2, %eax		/* n = k+3 */
	mulsd	%xmm3, %xmm4		/* -j*PIO4LO */
	addsd	%xmm4, %xmm0		/* t = |x| - j*PIO4HI - j*PIO4LO */
	jmp	L(reconstruction)

	.p2align	4
L(very_large_args):
	/* Here if finite |x|>=2^23 */
	/* Input: %eax = bits of |x|, %xmm0 = DP |x|.
	   |x| is multiplied by four consecutive pieces of 4/Pi taken from
	   the table L(_FPI); the pieces are selected by the exponent of x.
	   The integer part k of the product and a fractional remainder are
	   then extracted, and the remainder is scaled by Pi/4 to give t.

	   Because 2^23 <= |x| < Inf here, the biased exponent eb lies in
	   150..254, so bitpos lies in 82..186 and j in 2..6: the 16-bit
	   dividend of the 8-bit div has a zero high byte, the quotient
	   fits in %al, and the table indices j-2..j+1 stay in 0..7.  */

	/* bitpos = (ix>>23) - BIAS_32 + 59; */
	shrl	$23, %eax		/* eb = biased exponent of x */
	/* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */
	subl	$68, %eax
	movl	$28, %ecx		/* %cl=28 */
	movl	%eax, %edx		/* bitpos copy */

	/* j = bitpos/28; */
	div	%cl			/* j in register %al=%ax/%cl */
	movapd	%xmm0, %xmm3		/* |x| */
	/* clear unneeded remainder from %ah */
	andl	$0xff, %eax

	imull	$28, %eax, %ecx		/* j*28 */
	movsd	MO1(DP_HI_MASK), %xmm4	/* DP_HI_MASK */
	movapd	%xmm0, %xmm5		/* |x| */
	mulsd	-2*8+MO2(_FPI,%eax,8), %xmm3	/* tmp3 = FPI[j-2]*|x| */
	movapd	%xmm0, %xmm1		/* |x| */
	mulsd	-1*8+MO2(_FPI,%eax,8), %xmm5	/* tmp2 = FPI[j-1]*|x| */
	mulsd	0*8+MO2(_FPI,%eax,8), %xmm0	/* tmp0 = FPI[j]*|x| */
	addl	$19, %ecx		/* j*28+19 */
	mulsd	1*8+MO2(_FPI,%eax,8), %xmm1	/* tmp1 = FPI[j+1]*|x| */
	cmpl	%ecx, %edx		/* bitpos>=j*28+19?  */
	jl	L(very_large_skip1)

	/* Here if bitpos>=j*28+19 */
	/* Drop the part of tmp3 kept by DP_HI_MASK (its upper 32 bits).  */
	andpd	%xmm3, %xmm4		/* HI(tmp3) */
	subsd	%xmm4, %xmm3		/* tmp3 = tmp3 - HI(tmp3) */
L(very_large_skip1):

	/* Adding 2^52 leaves the integer value of tmp5 in the low bits of
	   the double; its low 32 bits are read out as k.  DP_2POW52[1] is
	   -2^52 and DP_ONES[1] is -1.0, hence the additions below.  */
	movsd	MO1(DP_2POW52), %xmm6
	movapd	%xmm5, %xmm2		/* tmp2 copy */
	addsd	%xmm3, %xmm5		/* tmp5 = tmp3 + tmp2 */
	movl	$1, %edx
	addsd	%xmm5, %xmm6		/* tmp6 = tmp5 + 2^52 */
	movsd	8+MO1(DP_2POW52), %xmm4
	movd	%xmm6, %eax		/* k = I64_LO(tmp6); */
	addsd	%xmm6, %xmm4		/* tmp4 = tmp6 - 2^52 */
	comisd	%xmm5, %xmm4		/* tmp4 > tmp5?  */
	jbe	L(very_large_skip2)

	/* Here if tmp4 > tmp5 */
	/* The addition rounded up: step k and tmp4 back down by one.  */
	subl	$1, %eax		/* k-- */
	addsd	8+MO1(DP_ONES), %xmm4	/* tmp4 -= 1.0 */
L(very_large_skip2):

	andl	%eax, %edx		/* k&1 */
	subsd	%xmm4, %xmm3		/* tmp3 -= tmp4 */
	/* DP_ZERONE[k&1] is 0.0 for even k and -1.0 for odd k.  */
	addsd	MO2(DP_ZERONE,%edx,8), %xmm3 /* t  = DP_ZERONE[k&1] + tmp3 */
	addsd	%xmm2, %xmm3		/* t += tmp2 */
	addsd	%xmm3, %xmm0		/* t += tmp0 */
	addl	$3, %eax		/* n=k+3 */
	addsd	%xmm1, %xmm0		/* t += tmp1 */
	mulsd	MO1(DP_PIO4), %xmm0	/* t *= PIO4 */

	jmp	L(reconstruction)	/* end of very_large_args path */

	.p2align	4
L(arg_less_pio4):
	/* Here if |x|<Pi/4 */
	/* Input: %eax = bits of |x| (sign bit already cleared, so the
	   signed compare below is safe), %xmm0 = DP x.  No range
	   reduction is needed; the cos polynomial is applied to x itself
	   and the result is rounded to single precision.  */
	cmpl	$0x3d000000, %eax	/* |x|<2^-5?  */
	jl	L(arg_less_2pn5)

	/* Here if 2^-5<=|x|<Pi/4 */
	/* Two independent chains in z, added at the end:
	   %xmm3 = y*(C0+z*(C2+z*C4)), %xmm5 = z*(C1+z*C3).  */
	mulsd	%xmm0, %xmm0		/* y=x^2 */
	movaps	%xmm0, %xmm1		/* y */
	mulsd	%xmm0, %xmm0		/* z=x^4 */
	movsd	MO1(DP_C4), %xmm3	/* C4 */
	mulsd	%xmm0, %xmm3		/* z*C4 */
	movsd	MO1(DP_C3), %xmm5	/* C3 */
	mulsd	%xmm0, %xmm5		/* z*C3 */
	addsd	MO1(DP_C2), %xmm3	/* C2+z*C4 */
	mulsd	%xmm0, %xmm3		/* z*(C2+z*C4) */
	addsd	MO1(DP_C1), %xmm5	/* C1+z*C3 */
	mulsd	%xmm0, %xmm5		/* z*(C1+z*C3) */
	addsd	MO1(DP_C0), %xmm3	/* C0+z*(C2+z*C4) */
	mulsd	%xmm1, %xmm3		/* y*(C0+z*(C2+z*C4)) */
	addsd	%xmm5, %xmm3		/* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
	/* 1.0 + y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
	addsd	MO1(DP_ONES), %xmm3
	cvtsd2ss %xmm3, %xmm3		/* SP result */

L(epilogue):
	/* Common exit for the single-precision paths.
	   Input: %xmm3 = SP result.  The 4-byte stack slot exists only to
	   move the value from SSE to the x87 stack.  */
	lea	-4(%esp), %esp		/* Borrow 4 bytes of stack frame */
	/* Keep the unwind info in step with %esp while the slot is live.  */
	cfi_adjust_cfa_offset (4)
	movss	%xmm3, 0(%esp)		/* Move result from sse...  */
	flds	0(%esp)			/* ...to FPU.  */
	/* Return back 4 bytes of stack frame */
	lea	4(%esp), %esp
	cfi_adjust_cfa_offset (-4)
	RETURN

	.p2align	4
L(arg_less_2pn5):
	/* Here if |x|<2^-5 */
	/* Input: %eax = bits of |x| (sign bit already cleared, so the
	   signed compare below is safe), %xmm0 = DP x.
	   A two-term polynomial in x^2 is used in this range; the result
	   is rounded to single precision and returned via L(epilogue).  */
	cmpl	$0x32000000, %eax	/* |x|<2^-27?  */
	jl	L(arg_less_2pn27)

	/* Here if 2^-27<=|x|<2^-5 */
	mulsd	%xmm0, %xmm0		/* DP x^2 */
	movsd	MO1(DP_COS2_1), %xmm3	/* DP DP_COS2_1 */
	mulsd	%xmm0, %xmm3		/* DP x^2*DP_COS2_1 */
	addsd	MO1(DP_COS2_0), %xmm3	/* DP DP_COS2_0+x^2*DP_COS2_1 */
	mulsd	%xmm0, %xmm3		/* DP x^2*DP_COS2_0+x^4*DP_COS2_1 */
	/* DP 1.0+x^2*DP_COS2_0+x^4*DP_COS2_1 */
	addsd	MO1(DP_ONES), %xmm3
	cvtsd2ss %xmm3, %xmm3		/* SP result */
	jmp	L(epilogue)

	.p2align	4
L(arg_less_2pn27):
	/* Here if |x|<2^-27 */
	/* The result is computed as 1.0-|x| in single precision instead of
	   returning the constant 1.0.  Per the special-case list in the
	   file header, the intent is that cos(+-0) returns 1 without
	   raising inexact while nonzero tiny arguments do raise it.
	   x is reloaded from the stack as a float; %xmm0 held DP x.  */
	movss	ARG_X, %xmm0		/* x */
	andps	MO1(SP_ABS_MASK),%xmm0	/* |x| */
	movss	MO1(SP_ONE), %xmm3	/* 1.0 */
	subss	%xmm0, %xmm3		/* result is 1.0-|x| */
	jmp	L(epilogue)

	.p2align	4
L(arg_inf_or_nan):
	/* Here if |x| is Inf or NAN */
	/* The flags are still those of "cmpl $0x7f800000, %eax" in
	   L(large_args); the only instruction executed in between is the
	   taken jae.  Equal means Inf, not-equal (above) means NaN.
	   Do not insert flag-modifying instructions before this jne.  */
	jne	L(skip_errno_setting)	/* in case of x is NaN */

	/* Here if x is Inf. Set errno to EDOM.  */
	/* __errno_location returns the address of errno in %eax.  */
	call	JUMPTARGET(__errno_location)
	movl	$EDOM, (%eax)

	.p2align	4
L(skip_errno_setting):
	/* Here if |x| is Inf or NAN. Continued.  */
	/* x is reloaded from its stack slot, so nothing has to survive
	   the call above in a register.  x-x gives NaN for both Inf and
	   NaN inputs; the file header documents that the Inf case raises
	   invalid.  */
	movss	ARG_X, %xmm3		/* load x */
	subss	%xmm3, %xmm3		/* Result is NaN */
	jmp	L(epilogue)
END(__cosf_sse2)

	.section .rodata, "a"
	.p2align 3
/* Doubles stored as two .long words, low word first.
   Indexed by j = (k+1)&0x0e in the Pi/4 <= |x| < 9*Pi/4 path, so only
   even entries are read by the code in this file.  */
L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */
	.long	0x00000000,0x00000000
	.long	0x54442d18,0x3fe921fb
	.long	0x54442d18,0x3ff921fb
	.long	0x7f3321d2,0x4002d97c
	.long	0x54442d18,0x400921fb
	.long	0x2955385e,0x400f6a7a
	.long	0x7f3321d2,0x4012d97c
	.long	0xe9bba775,0x4015fdbb
	.long	0x54442d18,0x401921fb
	.long	0xbeccb2bb,0x401c463a
	.long	0x2955385e,0x401f6a7a
	.type L(PIO4J), @object
	ASM_SIZE_DIRECTIVE(L(PIO4J))

	.p2align 3
/* Read by L(very_large_args) as the four consecutive entries
   FPI[j-2], FPI[j-1], FPI[j], FPI[j+1], where j = bitpos/28.
   Entry 0 is 0.0; each following entry is a double stored as two
   .long words, low word first.  */
L(_FPI): /* 4/Pi broken into sum of positive DP values */
	.long	0x00000000,0x00000000
	.long	0x6c000000,0x3ff45f30
	.long	0x2a000000,0x3e3c9c88
	.long	0xa8000000,0x3c54fe13
	.long	0xd0000000,0x3aaf47d4
	.long	0x6c000000,0x38fbb81b
	.long	0xe0000000,0x3714acc9
	.long	0x7c000000,0x3560e410
	.long	0x56000000,0x33bca2c7
	.long	0xac000000,0x31fbd778
	.long	0xe0000000,0x300b7246
	.long	0xe8000000,0x2e5d2126
	.long	0x48000000,0x2c970032
	.long	0xe8000000,0x2ad77504
	.long	0xe0000000,0x290921cf
	.long	0xb0000000,0x274deb1c
	.long	0xe0000000,0x25829a73
	.long	0xbe000000,0x23fd1046
	.long	0x10000000,0x2224baed
	.long	0x8e000000,0x20709d33
	.long	0x80000000,0x1e535a2f
	.long	0x64000000,0x1cef904e
	.long	0x30000000,0x1b0d6398
	.long	0x24000000,0x1964ce7d
	.long	0x16000000,0x17b908bf
	.type L(_FPI), @object
	ASM_SIZE_DIRECTIVE(L(_FPI))

/* Coefficients of polynomial
 for cos(x)~=1.0+x^2*DP_COS2_0+x^4*DP_COS2_1, |x|<2^-5.
 Used by L(arg_less_2pn5).  The values are fitted coefficients close to,
 but not equal to, -1/2 and 1/24.  */
	.p2align 3
L(DP_COS2_0):
	.long	0xff5cc6fd,0xbfdfffff	/* approximately -0.5 */
	.type L(DP_COS2_0), @object
	ASM_SIZE_DIRECTIVE(L(DP_COS2_0))

	.p2align 3
L(DP_COS2_1):
	.long	0xb178dac5,0x3fa55514	/* approximately 1/24 */
	.type L(DP_COS2_1), @object
	ASM_SIZE_DIRECTIVE(L(DP_COS2_1))

	.p2align 3
/* Two-entry table indexed by k&1 in L(very_large_args): the offset
   added to the fractional part, 0.0 for even k and -1.0 for odd k.  */
L(DP_ZERONE):
	.long	0x00000000,0x00000000	/* 0.0 */
	.long	0x00000000,0xbff00000	/* -1.0 */
	.type L(DP_ZERONE),@object
	ASM_SIZE_DIRECTIVE(L(DP_ZERONE))

	.p2align 3
/* Two-entry sign table indexed by (n>>2)&1.  Entry 0 is also used as
   the constant 1.0, and entry 1 (at offset 8) as the constant -1.0.  */
L(DP_ONES):
	.long	0x00000000,0x3ff00000	/* +1.0 */
	.long	0x00000000,0xbff00000	/* -1.0 */
	.type L(DP_ONES), @object
	ASM_SIZE_DIRECTIVE(L(DP_ONES))

/* Coefficients of polynomial
 for sin(t)~=t+t^3*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))), |t|<Pi/4.
 Used by L(sin_poly).  The values are fitted coefficients; the
 approximate magnitudes noted below are for orientation only.  */
	.p2align 3
L(DP_S3):
	.long	0x64e6b5b4,0x3ec71d72	/* approximately 1/9! */
	.type L(DP_S3), @object
	ASM_SIZE_DIRECTIVE(L(DP_S3))

	.p2align 3
L(DP_S1):
	.long	0x10c2688b,0x3f811111	/* approximately 1/5! */
	.type L(DP_S1), @object
	ASM_SIZE_DIRECTIVE(L(DP_S1))

	.p2align 3
L(DP_S4):
	.long	0x1674b58a,0xbe5a947e	/* approximately -1/11! */
	.type L(DP_S4), @object
	ASM_SIZE_DIRECTIVE(L(DP_S4))

	.p2align 3
L(DP_S2):
	.long	0x8b4bd1f9,0xbf2a019f	/* approximately -1/7! */
	.type L(DP_S2), @object
	ASM_SIZE_DIRECTIVE(L(DP_S2))

	.p2align 3
L(DP_S0):
	.long	0x55551cd9,0xbfc55555	/* approximately -1/3! */
	.type L(DP_S0), @object
	ASM_SIZE_DIRECTIVE(L(DP_S0))

/* Coefficients of polynomial
 for cos(t)~=1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))), |t|<Pi/4.
 Used by the cos polynomial after L(reconstruction) and by
 L(arg_less_pio4).  The values are fitted coefficients; the approximate
 magnitudes noted below are for orientation only.  */
	.p2align 3
L(DP_C3):
	.long	0x9ac43cc0,0x3efa00eb	/* approximately 1/8! */
	.type L(DP_C3), @object
	ASM_SIZE_DIRECTIVE(L(DP_C3))

	.p2align 3
L(DP_C1):
	.long	0x545c50c7,0x3fa55555	/* approximately 1/4! */
	.type L(DP_C1), @object
	ASM_SIZE_DIRECTIVE(L(DP_C1))

	.p2align 3
L(DP_C4):
	.long	0xdd8844d7,0xbe923c97	/* approximately -1/10! */
	.type L(DP_C4), @object
	ASM_SIZE_DIRECTIVE(L(DP_C4))

	.p2align 3
L(DP_C2):
	.long	0x348b6874,0xbf56c16b	/* approximately -1/6! */
	.type L(DP_C2), @object
	ASM_SIZE_DIRECTIVE(L(DP_C2))

	.p2align 3
L(DP_C0):
	.long	0xfffe98ae,0xbfdfffff	/* approximately -1/2! */
	.type L(DP_C0), @object
	ASM_SIZE_DIRECTIVE(L(DP_C0))

	.p2align 3
/* Final scale factor of L(very_large_args).  */
L(DP_PIO4):
	.long	0x54442d18,0x3fe921fb	/* Pi/4 */
	.type L(DP_PIO4), @object
	ASM_SIZE_DIRECTIVE(L(DP_PIO4))

	.p2align 3
/* Pair used by L(very_large_args) to extract the integer part of a
   double: entry 0 is added, entry 1 (at offset 8) takes it off again.  */
L(DP_2POW52):
	.long	0x00000000,0x43300000	/* +2^52 */
	.long	0x00000000,0xc3300000	/* -2^52 */
	.type L(DP_2POW52), @object
	ASM_SIZE_DIRECTIVE(L(DP_2POW52))

	.p2align 3
L(DP_INVPIO4):
	.long	0x6dc9c883,0x3ff45f30	/* 4/Pi */
	.type L(DP_INVPIO4), @object
	ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))

	.p2align 3
/* Note the sign bit: DP_PIO4HI and DP_PIO4LO store NEGATED values, so
   L(large_args) subtracts j*Pi/4 using additions.  */
L(DP_PIO4HI):
	.long	0x54000000,0xbfe921fb	/* High part of -Pi/4 */
	.type L(DP_PIO4HI), @object
	ASM_SIZE_DIRECTIVE(L(DP_PIO4HI))

	.p2align 3
L(DP_PIO4LO):
	.long	0x11A62633,0xbe010b46	/* Low part of -Pi/4 */
	.type L(DP_PIO4LO), @object
	ASM_SIZE_DIRECTIVE(L(DP_PIO4LO))

	.p2align 2
L(SP_INVPIO4):
	.long	0x3fa2f983		/* 4/Pi */
	.type L(SP_INVPIO4), @object
	ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))

	.p2align 4
/* 16 bytes, 16-byte aligned: used as the memory operand of andpd.  */
L(DP_ABS_MASK): /* Mask for getting DP absolute value */
	.long	0xffffffff,0x7fffffff
	.long	0xffffffff,0x7fffffff
	.type L(DP_ABS_MASK), @object
	ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK))

	.p2align 3
/* Keeps the upper 32 bits of a double (sign, exponent and the top 20
   fraction bits) and clears the lower 32.  "21 bits" presumably counts
   the implicit leading significand bit.  */
L(DP_HI_MASK): /* Mask for getting high 21 bits of DP value */
	.long	0x00000000,0xffffffff
	.type L(DP_HI_MASK), @object
	ASM_SIZE_DIRECTIVE(L(DP_HI_MASK))

	.p2align 4
/* 16 bytes, 16-byte aligned: used as the memory operand of andps.  */
L(SP_ABS_MASK): /* Mask for getting SP absolute value */
	.long	0x7fffffff,0x7fffffff
	.long	0x7fffffff,0x7fffffff
	.type L(SP_ABS_MASK), @object
	ASM_SIZE_DIRECTIVE(L(SP_ABS_MASK))

	.p2align 2
L(SP_ONE):
	.long	0x3f800000		/* 1.0 */
	.type L(SP_ONE), @object
	ASM_SIZE_DIRECTIVE(L(SP_ONE))

/* Export cosf as a weak alias of __cosf.
   NOTE(review): this file defines only __cosf_sse2; __cosf must be
   provided elsewhere (presumably by the multiarch selector) -- confirm
   that the alias is intended to live in this file.  */
weak_alias (__cosf, cosf)
Commit	Line	Data
4ffffbd2	1	/* Optimized with sse2 version of cosf
b168057a	2	Copyright (C) 2012-2015 Free Software Foundation, Inc.
4ffffbd2 LD	3	This file is part of the GNU C Library.
	4
	5	The GNU C Library is free software; you can redistribute it and/or
	6	modify it under the terms of the GNU Lesser General Public
	7	License as published by the Free Software Foundation; either
	8	version 2.1 of the License, or (at your option) any later version.
	9
	10	The GNU C Library is distributed in the hope that it will be useful,
	11	but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	Lesser General Public License for more details.
	14
	15	You should have received a copy of the GNU Lesser General Public
	16	License along with the GNU C Library; if not, see
	17	<http://www.gnu.org/licenses/>. */
	18
	19	#include <sysdep.h>
	20	#define __need_Emath
	21	#include <bits/errno.h>
	22
	23	/* Short algorithm description:
	24	*
	25	* 1) if \|x\| == 0: return 1.0-\|x\|.
	26	* 2) if \|x\| < 2^-27: return 1.0-\|x\|.
	27	* 3) if \|x\| < 2^-5 : return 1.0+x^2DP_COS2_0+x^5DP_COS2_1.
	28	* 4) if \|x\| < Pi/4: return 1.0+x^2(C0+x^2(C1+x^2(C2+x^2(C3+x^2*C4)))).
	29	* 5) if \|x\| < 9*Pi/4:
	30	* 5.1) Range reduction: k=trunc(\|x\|/(Pi/4)), j=(k+1)&0x0e, n=k+3,
	31	* t=\|x\|-j*Pi/4.
	32	* 5.2) Reconstruction:
	33	* s = (-1.0)^((n>>2)&1)
	34	* if(n&2 != 0) {
	35	* using cos(t) polynomial for \|t\|<Pi/4, result is
	36	* s * (1.0+t^2(C0+t^2(C1+t^2(C2+t^2(C3+t^2*C4))))).
	37	* } else {
	38	* using sin(t) polynomial for \|t\|<Pi/4, result is
	39	* s * t * (1.0+t^2(S0+t^2(S1+t^2(S2+t^2(S3+t^2*S4))))).
	40	* }
	41	* 6) if \|x\| < 2^23, large args:
	42	* 6.1) Range reduction: k=trunc(\|x\|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3,
	43	* t=\|x\|-j*Pi/4.
	44	* 6.2) Reconstruction same as (5.2).
	45	* 7) if \|x\| >= 2^23, very large args:
	46	* 7.1) Range reduction: k=trunc(\|x\|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3,
	47	* t=\|x\|-j*Pi/4.
	48	* 7.2) Reconstruction same as (5.2).
	49	* 8) if x is Inf, return x-x, and set errno=EDOM.
	50	* 9) if x is NaN, return x-x.
	51	*
	52	* Special cases:
80ccd52c LD	53	* cos(+-0) = 1 not raising inexact,
	54	* cos(subnormal) raises inexact,
	55	* cos(min_normalized) raises inexact,
	56	* cos(normalized) raises inexact,
	57	* cos(Inf) = NaN, raises invalid, sets errno to EDOM,
	58	* cos(NaN) = NaN.
4ffffbd2 LD	59	*/
	60
	61	#ifdef PIC
	62	# define MO1(symbol) L(symbol)##@GOTOFF(%ebx)
	63	# define MO2(symbol,reg2,_scale) L(symbol)##@GOTOFF(%ebx,reg2,_scale)
80ccd52c LD	64	# define CFI_PUSH(REG) cfi_adjust_cfa_offset(4); cfi_rel_offset(REG,0)
	65	# define CFI_POP(REG) cfi_adjust_cfa_offset(-4); cfi_restore(REG)
	66	# define PUSH(REG) pushl REG; CFI_PUSH(REG)
	67	# define POP(REG) popl REG; CFI_POP(REG)
	68	# define ENTRANCE PUSH(%ebx); LOAD_PIC_REG(bx)
	69	# define RETURN POP(%ebx); ret; CFI_PUSH(%ebx)
4ffffbd2 LD	70	# define ARG_X 8(%esp)
	71	#else
	72	# define MO1(symbol) L(symbol)
	73	# define MO2(symbol,reg2,_scale) L(symbol)(,reg2,_scale)
80ccd52c LD	74	# define ENTRANCE
80ccd52c LD	75	# define RETURN ret
4ffffbd2 LD	76	# define ARG_X 4(%esp)
	77	#endif
	78
	79	.text
	80	ENTRY(__cosf_sse2)
	81	/* Input: single precision x on stack at address ARG_X */
	82
80ccd52c	83	ENTRANCE
4ffffbd2 LD	84	movl ARG_X, %eax /* Bits of x */
	85	cvtss2sd ARG_X, %xmm0 /* DP x */
	86	andl $0x7fffffff, %eax /* \|x\| */
	87
	88	cmpl $0x3f490fdb, %eax /* \|x\|<Pi/4? */
	89	jb L(arg_less_pio4)
	90
	91	/* Here if \|x\|>=Pi/4 */
	92	movd %eax, %xmm3 /* SP \|x\| */
	93	andpd MO1(DP_ABS_MASK),%xmm0 /* DP \|x\| */
	94	movss MO1(SP_INVPIO4), %xmm2 /* SP 1/(Pi/4) */
	95
	96	cmpl $0x40e231d6, %eax /* \|x\|<9Pi/4? /
	97	jae L(large_args)
	98
	99	/* Here if Pi/4<=\|x\|<9Pi/4 /
	100	mulss %xmm3, %xmm2 /* SP \|x\|/(Pi/4) */
	101	cvttss2si %xmm2, %eax /* k, number of Pi/4 in x */
	102	addl $1, %eax /* k+1 */
	103	movl $0x0e, %edx
	104	andl %eax, %edx /* j = (k+1)&0x0e */
	105	addl $2, %eax /* n */
	106	subsd MO2(PIO4J,%edx,8), %xmm0 /* t = \|x\| - j * Pi/4 */
	107
	108	L(reconstruction):
	109	/* Input: %eax=n, %xmm0=t */
	110	testl $2, %eax /* n&2 != 0? */
	111	jz L(sin_poly)
	112
	113	/L(cos_poly):/
	114	/* Here if cos(x) calculated using cos(t) polynomial for \|t\|<Pi/4:
	115	* y = tt; z = yy;
	116	* s = sign(x) * (-1.0)^((n>>2)&1)
	117	* result = s * (1.0+t^2(C0+t^2(C1+t^2(C2+t^2(C3+t^2*C4)))))
	118	*/
	119	shrl $2, %eax /* n>>2 */
	120	mulsd %xmm0, %xmm0 /* y=t^2 */
	121	andl $1, %eax /* (n>>2)&1 */
	122	movaps %xmm0, %xmm1 /* y */
	123	mulsd %xmm0, %xmm0 /* z=t^4 */
	124
	125	movsd MO1(DP_C4), %xmm4 /* C4 */
	126	mulsd %xmm0, %xmm4 /* zC4 /
	127	movsd MO1(DP_C3), %xmm3 /* C3 */
	128	mulsd %xmm0, %xmm3 /* zC3 /
	129	addsd MO1(DP_C2), %xmm4 /* C2+zC4 /
	130	mulsd %xmm0, %xmm4 /* z(C2+zC4) */
	131	lea -8(%esp), %esp /* Borrow 4 bytes of stack frame */
	132	addsd MO1(DP_C1), %xmm3 /* C1+zC3 /
	133	mulsd %xmm0, %xmm3 /* z(C1+zC3) */
	134	addsd MO1(DP_C0), %xmm4 /* C0+z(C2+zC4) */
	135	mulsd %xmm1, %xmm4 /* y(C0+z(C2+zC4)) /
	136
	137	addsd %xmm4, %xmm3 /* y(C0+y(C1+y(C2+y(C3+yC4)))) /
	138	/* 1.0+y(C0+y(C1+y(C2+y(C3+yC4)))) /
	139	addsd MO1(DP_ONES), %xmm3
	140
	141	mulsd MO2(DP_ONES,%eax,8), %xmm3 /* DP result */
	142	movsd %xmm3, 0(%esp) /* Move result from sse... */
	143	fldl 0(%esp) /* ...to FPU. */
	144	/* Return back 4 bytes of stack frame */
	145	lea 8(%esp), %esp
80ccd52c	146	RETURN
4ffffbd2 LD	147
	148	.p2align 4
	149	L(sin_poly):
	150	/* Here if cos(x) calculated using sin(t) polynomial for \|t\|<Pi/4:
	151	* y = tt; z = yy;
	152	* s = sign(x) * (-1.0)^((n>>2)&1)
	153	* result = s * t * (1.0+t^2(S0+t^2(S1+t^2(S2+t^2(S3+t^2*S4)))))
	154	*/
	155
	156	movaps %xmm0, %xmm4 /* t */
	157	shrl $2, %eax /* n>>2 */
	158	mulsd %xmm0, %xmm0 /* y=t^2 */
	159	andl $1, %eax /* (n>>2)&1 */
	160	movaps %xmm0, %xmm1 /* y */
	161	mulsd %xmm0, %xmm0 /* z=t^4 */
	162
	163	movsd MO1(DP_S4), %xmm2 /* S4 */
	164	mulsd %xmm0, %xmm2 /* zS4 /
	165	movsd MO1(DP_S3), %xmm3 /* S3 */
	166	mulsd %xmm0, %xmm3 /* zS3 /
	167	lea -8(%esp), %esp /* Borrow 4 bytes of stack frame */
	168	addsd MO1(DP_S2), %xmm2 /* S2+zS4 /
	169	mulsd %xmm0, %xmm2 /* z(S2+zS4) */
	170	addsd MO1(DP_S1), %xmm3 /* S1+zS3 /
	171	mulsd %xmm0, %xmm3 /* z(S1+zS3) */
	172	addsd MO1(DP_S0), %xmm2 /* S0+z(S2+zS4) */
	173	mulsd %xmm1, %xmm2 /* y(S0+z(S2+zS4)) /
	174	/* ts, where s = sign(x) (-1.0)^((n>>2)&1) */
	175	mulsd MO2(DP_ONES,%eax,8), %xmm4
	176	addsd %xmm2, %xmm3 /* y(S0+y(S1+y(S2+y(S3+yS4)))) /
	177	/* tsy(S0+y(S1+y(S2+y(S3+yS4)))) /
	178	mulsd %xmm4, %xmm3
	179	/* ts(1.0+y(S0+y(S1+y(S2+y(S3+yS4)))) /
	180	addsd %xmm4, %xmm3
	181	movsd %xmm3, 0(%esp) /* Move result from sse... */
	182	fldl 0(%esp) /* ...to FPU. */
	183	/* Return back 4 bytes of stack frame */
	184	lea 8(%esp), %esp
80ccd52c	185	RETURN
4ffffbd2 LD	186
	187	.p2align 4
	188	L(large_args):
	189	/* Here if \|x\|>=9Pi/4 /
	190	cmpl $0x7f800000, %eax /* x is Inf or NaN? */
	191	jae L(arg_inf_or_nan)
	192
	193	/* Here if finite \|x\|>=9Pi/4 /
	194	cmpl $0x4b000000, %eax /* \|x\|<2^23? */
	195	jae L(very_large_args)
	196
	197	/* Here if 9Pi/4<=\|x\|<2^23 /
	198	movsd MO1(DP_INVPIO4), %xmm1 /* 1/(Pi/4) */
	199	mulsd %xmm0, %xmm1 /* \|x\|/(Pi/4) */
	200	cvttsd2si %xmm1, %eax /* k=trunc(\|x\|/(Pi/4)) */
	201	addl $1, %eax /* k+1 */
	202	movl %eax, %edx
	203	andl $0xfffffffe, %edx /* j=(k+1)&0xfffffffe */
	204	cvtsi2sdl %edx, %xmm4 /* DP j */
	205	movsd MO1(DP_PIO4HI), %xmm2 /* -PIO4HI = high part of -Pi/4 */
	206	mulsd %xmm4, %xmm2 /* -jPIO4HI /
	207	movsd MO1(DP_PIO4LO), %xmm3 /* -PIO4LO = low part of -Pi/4 */
	208	addsd %xmm2, %xmm0 /* \|x\| - jPIO4HI /
	209	addl $2, %eax /* n */
	210	mulsd %xmm3, %xmm4 /* jPIO4LO /
	211	addsd %xmm4, %xmm0 /* t = \|x\| - jPIO4HI - jPIO4LO */
	212	jmp L(reconstruction)
	213
	214	.p2align 4
	215	L(very_large_args):
	216	/* Here if finite \|x\|>=2^23 */
	217
	218	/* bitpos = (ix>>23) - BIAS_32 + 59; */
	219	shrl $23, %eax /* eb = biased exponent of x */
	220	/* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */
	221	subl $68, %eax
	222	movl $28, %ecx /* %cl=28 */
	223	movl %eax, %edx /* bitpos copy */
	224
	225	/* j = bitpos/28; */
	226	div %cl /* j in register %al=%ax/%cl */
	227	movapd %xmm0, %xmm3 /* \|x\| */
	228	/* clear unneeded remainder from %ah */
	229	andl $0xff, %eax
	230
	231	imull $28, %eax, %ecx /* j28 /
	232	movsd MO1(DP_HI_MASK), %xmm4 /* DP_HI_MASK */
	233	movapd %xmm0, %xmm5 /* \|x\| */
	234	mulsd -28+MO2(_FPI,%eax,8), %xmm3 / tmp3 = FPI[j-2]\|x\| /
	235	movapd %xmm0, %xmm1 /* \|x\| */
	236	mulsd -18+MO2(_FPI,%eax,8), %xmm5 / tmp2 = FPI[j-1]\|x\| /
	237	mulsd 08+MO2(_FPI,%eax,8), %xmm0 / tmp0 = FPI[j]\|x\| /
	238	addl $19, %ecx /* j28+19 /
	239	mulsd 18+MO2(_FPI,%eax,8), %xmm1 / tmp1 = FPI[j+1]\|x\| /
	240	cmpl %ecx, %edx /* bitpos>=j28+19? /
	241	jl L(very_large_skip1)
	242
	243	/* Here if bitpos>=j28+19 /
	244	andpd %xmm3, %xmm4 /* HI(tmp3) */
	245	subsd %xmm4, %xmm3 /* tmp3 = tmp3 - HI(tmp3) */
	246	L(very_large_skip1):
	247
	248	movsd MO1(DP_2POW52), %xmm6
	249	movapd %xmm5, %xmm2 /* tmp2 copy */
250	addsd %xmm3, %xmm5 /* tmp5 = tmp3 + tmp2 */
251	movl $1, %edx
252	addsd %xmm5, %xmm6 /* tmp6 = tmp5 + 2^52 */
253	movsd 8+MO1(DP_2POW52), %xmm4
254	movd %xmm6, %eax /* k = I64_LO(tmp6); */
255	addsd %xmm6, %xmm4 /* tmp4 = tmp6 - 2^52 */
256	comisd %xmm5, %xmm4 /* tmp4 > tmp5? */
257	jbe L(very_large_skip2)
258
259	/* Here if tmp4 > tmp5 */
260	subl $1, %eax /* k-- */
261	addsd 8+MO1(DP_ONES), %xmm4 /* tmp4 -= 1.0 */
262	L(very_large_skip2):
263
264	andl %eax, %edx /* k&1 */
265	subsd %xmm4, %xmm3 /* tmp3 -= tmp4 */
266	addsd MO2(DP_ZERONE,%edx,8), %xmm3 /* t = DP_ZERONE[k&1] + tmp3 */
267	addsd %xmm2, %xmm3 /* t += tmp2 */
268	addsd %xmm3, %xmm0 /* t += tmp0 */
269	addl $3, %eax /* n=k+3 */
270	addsd %xmm1, %xmm0 /* t += tmp1 */
271	mulsd MO1(DP_PIO4), %xmm0 /* t = PI04 /
272
273	jmp L(reconstruction) /* end of very_large_args peth */
274
4ffffbd2 LD	275	.p2align 4
	276	L(arg_less_pio4):
	277	/* Here if \|x\|<Pi/4 */
	278	cmpl $0x3d000000, %eax /* \|x\|<2^-5? */
	279	jl L(arg_less_2pn5)
	280
	281	/* Here if 2^-5<=\|x\|<Pi/4 */
	282	mulsd %xmm0, %xmm0 /* y=x^2 */
	283	movaps %xmm0, %xmm1 /* y */
	284	mulsd %xmm0, %xmm0 /* z=x^4 */
	285	movsd MO1(DP_C4), %xmm3 /* C4 */
	286	mulsd %xmm0, %xmm3 /* zC4 /
	287	movsd MO1(DP_C3), %xmm5 /* C3 */
	288	mulsd %xmm0, %xmm5 /* zC3 /
	289	addsd MO1(DP_C2), %xmm3 /* C2+zC4 /
	290	mulsd %xmm0, %xmm3 /* z(C2+zC4) */
	291	addsd MO1(DP_C1), %xmm5 /* C1+zC3 /
	292	mulsd %xmm0, %xmm5 /* z(C1+zC3) */
	293	addsd MO1(DP_C0), %xmm3 /* C0+z(C2+zC4) */
	294	mulsd %xmm1, %xmm3 /* y(C0+z(C2+zC4)) /
	295	addsd %xmm5, %xmm3 /* y(C0+y(C1+y(C2+y(C3+yC4)))) /
	296	/* 1.0 + y(C0+y(C1+y(C2+y(C3+yC4)))) /
	297	addsd MO1(DP_ONES), %xmm3
	298	cvtsd2ss %xmm3, %xmm3 /* SP result */
	299
	300	L(epilogue):
	301	lea -4(%esp), %esp /* Borrow 4 bytes of stack frame */
	302	movss %xmm3, 0(%esp) /* Move result from sse... */
	303	flds 0(%esp) /* ...to FPU. */
	304	/* Return back 4 bytes of stack frame */
	305	lea 4(%esp), %esp
80ccd52c	306	RETURN
4ffffbd2 LD	307
	308	.p2align 4
	309	L(arg_less_2pn5):
	310	/* Here if \|x\|<2^-5 */
	311	cmpl $0x32000000, %eax /* \|x\|<2^-27? */
	312	jl L(arg_less_2pn27)
	313
	314	/* Here if 2^-27<=\|x\|<2^-5 */
	315	mulsd %xmm0, %xmm0 /* DP x^2 */
	316	movsd MO1(DP_COS2_1), %xmm3 /* DP DP_COS2_1 */
	317	mulsd %xmm0, %xmm3 /* DP x^2DP_COS2_1 /
	318	addsd MO1(DP_COS2_0), %xmm3 /* DP DP_COS2_0+x^2DP_COS2_1 /
	319	mulsd %xmm0, %xmm3 /* DP x^2DP_COS2_0+x^4DP_COS2_1 */
	320	/* DP 1.0+x^2DP_COS2_0+x^4DP_COS2_1 */
	321	addsd MO1(DP_ONES), %xmm3
	322	cvtsd2ss %xmm3, %xmm3 /* SP result */
	323	jmp L(epilogue)
	324
	325	.p2align 4
	326	L(arg_less_2pn27):
	327	/* Here if \|x\|<2^-27 */
	328	movss ARG_X, %xmm0 /* x */
	329	andps MO1(SP_ABS_MASK),%xmm0 /* \|x\| */
	330	movss MO1(SP_ONE), %xmm3 /* 1.0 */
	331	subss %xmm0, %xmm3 /* result is 1.0-\|x\| */
	332	jmp L(epilogue)
	333
	334	.p2align 4
	335	L(arg_inf_or_nan):
	336	/* Here if \|x\| is Inf or NAN */
	337	jne L(skip_errno_setting) /* in case of x is NaN */
	338
	339	/* Here if x is Inf. Set errno to EDOM. */
	340	call JUMPTARGET(__errno_location)
	341	movl $EDOM, (%eax)
	342
	343	.p2align 4
	344	L(skip_errno_setting):
	345	/* Here if \|x\| is Inf or NAN. Continued. */
	346	movss ARG_X, %xmm3 /* load x */
	347	subss %xmm3, %xmm3 /* Result is NaN */
	348	jmp L(epilogue)
	349	END(__cosf_sse2)
	350
4ffffbd2 LD	351	.section .rodata, "a"
	352	.p2align 3
	353	L(PIO4J): /* Table of jPi/4, for j=0,1,..,10 /
	354	.long 0x00000000,0x00000000
	355	.long 0x54442d18,0x3fe921fb
	356	.long 0x54442d18,0x3ff921fb
	357	.long 0x7f3321d2,0x4002d97c
	358	.long 0x54442d18,0x400921fb
	359	.long 0x2955385e,0x400f6a7a
	360	.long 0x7f3321d2,0x4012d97c
	361	.long 0xe9bba775,0x4015fdbb
	362	.long 0x54442d18,0x401921fb
	363	.long 0xbeccb2bb,0x401c463a
	364	.long 0x2955385e,0x401f6a7a
	365	.type L(PIO4J), @object
	366	ASM_SIZE_DIRECTIVE(L(PIO4J))
	367
	368	.p2align 3
	369	L(_FPI): /* 4/Pi broken into sum of positive DP values */
	370	.long 0x00000000,0x00000000
	371	.long 0x6c000000,0x3ff45f30
	372	.long 0x2a000000,0x3e3c9c88
	373	.long 0xa8000000,0x3c54fe13
	374	.long 0xd0000000,0x3aaf47d4
	375	.long 0x6c000000,0x38fbb81b
	376	.long 0xe0000000,0x3714acc9
	377	.long 0x7c000000,0x3560e410
	378	.long 0x56000000,0x33bca2c7
	379	.long 0xac000000,0x31fbd778
	380	.long 0xe0000000,0x300b7246
	381	.long 0xe8000000,0x2e5d2126
	382	.long 0x48000000,0x2c970032
	383	.long 0xe8000000,0x2ad77504
	384	.long 0xe0000000,0x290921cf
	385	.long 0xb0000000,0x274deb1c
	386	.long 0xe0000000,0x25829a73
	387	.long 0xbe000000,0x23fd1046
	388	.long 0x10000000,0x2224baed
	389	.long 0x8e000000,0x20709d33
	390	.long 0x80000000,0x1e535a2f
	391	.long 0x64000000,0x1cef904e
	392	.long 0x30000000,0x1b0d6398
	393	.long 0x24000000,0x1964ce7d
	394	.long 0x16000000,0x17b908bf
	395	.type L(_FPI), @object
	396	ASM_SIZE_DIRECTIVE(L(_FPI))
	397
	398	/* Coefficients of polynomial
	399	for cos(x)~=1.0+x^2DP_COS2_0+x^4DP_COS2_1, \|x\|<2^-5. */
	400	.p2align 3
	401	L(DP_COS2_0):
	402	.long 0xff5cc6fd,0xbfdfffff
	403	.type L(DP_COS2_0), @object
	404	ASM_SIZE_DIRECTIVE(L(DP_COS2_0))
	405
	406	.p2align 3
	407	L(DP_COS2_1):
	408	.long 0xb178dac5,0x3fa55514
	409	.type L(DP_COS2_1), @object
	410	ASM_SIZE_DIRECTIVE(L(DP_COS2_1))
	411
	412	.p2align 3
	413	L(DP_ZERONE):
	414	.long 0x00000000,0x00000000 /* 0.0 */
415	.long 0x00000000,0xbff00000 /* 1.0 */
416	.type L(DP_ZERONE),@object
417	ASM_SIZE_DIRECTIVE(L(DP_ZERONE))
418
419	.p2align 3
420	L(DP_ONES):
421	.long 0x00000000,0x3ff00000 /* +1.0 */
422	.long 0x00000000,0xbff00000 /* -1.0 */
423	.type L(DP_ONES), @object
424	ASM_SIZE_DIRECTIVE(L(DP_ONES))
425
426	/* Coefficients of polynomial
427	for sin(t)~=t+t^3(S0+t^2(S1+t^2(S2+t^2(S3+t^2S4)))), \|t\|<Pi/4. /
428	.p2align 3
429	L(DP_S3):
430	.long 0x64e6b5b4,0x3ec71d72
431	.type L(DP_S3), @object
432	ASM_SIZE_DIRECTIVE(L(DP_S3))
433
434	.p2align 3
435	L(DP_S1):
436	.long 0x10c2688b,0x3f811111
437	.type L(DP_S1), @object
438	ASM_SIZE_DIRECTIVE(L(DP_S1))
439
440	.p2align 3
441	L(DP_S4):
442	.long 0x1674b58a,0xbe5a947e
443	.type L(DP_S4), @object
444	ASM_SIZE_DIRECTIVE(L(DP_S4))
445
446	.p2align 3
447	L(DP_S2):
448	.long 0x8b4bd1f9,0xbf2a019f
449	.type L(DP_S2), @object
450	ASM_SIZE_DIRECTIVE(L(DP_S2))
451
452	.p2align 3
453	L(DP_S0):
454	.long 0x55551cd9,0xbfc55555
455	.type L(DP_S0), @object
456	ASM_SIZE_DIRECTIVE(L(DP_S0))
457
458	/* Coefficients of polynomial
459	for cos(t)~=1.0+t^2(C0+t^2(C1+t^2(C2+t^2(C3+t^2C4)))), \|t\|<Pi/4. /
460	.p2align 3
461	L(DP_C3):
462	.long 0x9ac43cc0,0x3efa00eb
463	.type L(DP_C3), @object
464	ASM_SIZE_DIRECTIVE(L(DP_C3))
465
466	.p2align 3
467	L(DP_C1):
468	.long 0x545c50c7,0x3fa55555
469	.type L(DP_C1), @object
470	ASM_SIZE_DIRECTIVE(L(DP_C1))
471
472	.p2align 3
473	L(DP_C4):
474	.long 0xdd8844d7,0xbe923c97
475	.type L(DP_C4), @object
476	ASM_SIZE_DIRECTIVE(L(DP_C4))
477
478	.p2align 3
479	L(DP_C2):
480	.long 0x348b6874,0xbf56c16b
481	.type L(DP_C2), @object
482	ASM_SIZE_DIRECTIVE(L(DP_C2))
483
484	.p2align 3
485	L(DP_C0):
486	.long 0xfffe98ae,0xbfdfffff
487	.type L(DP_C0), @object
488	ASM_SIZE_DIRECTIVE(L(DP_C0))
489
490	.p2align 3
491	L(DP_PIO4):
492	.long 0x54442d18,0x3fe921fb /* Pi/4 */
493	.type L(DP_PIO4), @object
494	ASM_SIZE_DIRECTIVE(L(DP_PIO4))
495
496	.p2align 3
497	L(DP_2POW52):
498	.long 0x00000000,0x43300000 /* +2^52 */
499	.long 0x00000000,0xc3300000 /* -2^52 */
500	.type L(DP_2POW52), @object
501	ASM_SIZE_DIRECTIVE(L(DP_2POW52))
502
503	.p2align 3
504	L(DP_INVPIO4):
505	.long 0x6dc9c883,0x3ff45f30 /* 4/Pi */
506	.type L(DP_INVPIO4), @object
507	ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))
508
509	.p2align 3
510	L(DP_PIO4HI):
511	.long 0x54000000,0xbfe921fb /* High part of Pi/4 */
512	.type L(DP_PIO4HI), @object
513	ASM_SIZE_DIRECTIVE(L(DP_PIO4HI))
514
515	.p2align 3
516	L(DP_PIO4LO):
517	.long 0x11A62633,0xbe010b46 /* Low part of Pi/4 */
518	.type L(DP_PIO4LO), @object
519	ASM_SIZE_DIRECTIVE(L(DP_PIO4LO))
520
521	.p2align 2
522	L(SP_INVPIO4):
523	.long 0x3fa2f983 /* 4/Pi */
524	.type L(SP_INVPIO4), @object
525	ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))
526
527	.p2align 4
528	L(DP_ABS_MASK): /* Mask for getting DP absolute value */
529	.long 0xffffffff,0x7fffffff
530	.long 0xffffffff,0x7fffffff
531	.type L(DP_ABS_MASK), @object
532	ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK))
533
534	.p2align 3
535	L(DP_HI_MASK): /* Mask for getting high 21 bits of DP value */
536	.long 0x00000000,0xffffffff
80ccd52c LD	537	.type L(DP_HI_MASK), @object
80ccd52c LD	538	ASM_SIZE_DIRECTIVE(L(DP_HI_MASK))
4ffffbd2 LD	539
	540	.p2align 4
	541	L(SP_ABS_MASK): /* Mask for getting SP absolute value */
	542	.long 0x7fffffff,0x7fffffff
	543	.long 0x7fffffff,0x7fffffff
	544	.type L(SP_ABS_MASK), @object
	545	ASM_SIZE_DIRECTIVE(L(SP_ABS_MASK))
	546
	547	.p2align 2
	548	L(SP_ONE):
	549	.long 0x3f800000 /* 1.0 */
	550	.type L(SP_ONE), @object
	551	ASM_SIZE_DIRECTIVE(L(SP_ONE))
	552
	553	weak_alias (__cosf, cosf)