[thirdparty/glibc.git] / sysdeps / i386 / fpu / s_cbrtl.S

/* Compute cubic root of long double value.
   Copyright (C) 1997-2015 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Dirk Alboth <dirka@uni-paderborn.de> and
   Ulrich Drepper <drepper@cygnus.com>, 1997.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <machine/asm.h>

        .section .rodata

        .align ALIGNARG(4)
        .type f8,@object
f8:	.tfloat 0.161617097923756032
	ASM_SIZE_DIRECTIVE(f8)
        .align ALIGNARG(4)
        .type f7,@object
f7:	.tfloat -0.988553671195413709
	ASM_SIZE_DIRECTIVE(f7)
        .align ALIGNARG(4)
        .type f6,@object
f6:	.tfloat 2.65298938441952296
	ASM_SIZE_DIRECTIVE(f6)
        .align ALIGNARG(4)
        .type f5,@object
f5:	.tfloat -4.11151425200350531
	ASM_SIZE_DIRECTIVE(f5)
        .align ALIGNARG(4)
        .type f4,@object
f4:	.tfloat 4.09559907378707839
	ASM_SIZE_DIRECTIVE(f4)
        .align ALIGNARG(4)
        .type f3,@object
f3:	.tfloat -2.82414939754975962
	ASM_SIZE_DIRECTIVE(f3)
        .align ALIGNARG(4)
        .type f2,@object
f2:	.tfloat 1.67595307700780102
	ASM_SIZE_DIRECTIVE(f2)
        .align ALIGNARG(4)
        .type f1,@object
f1:	.tfloat 0.338058687610520237
	ASM_SIZE_DIRECTIVE(f1)

#define CBRT2		1.2599210498948731648
#define ONE_CBRT2	0.793700525984099737355196796584
#define SQR_CBRT2	1.5874010519681994748
#define ONE_SQR_CBRT2	0.629960524947436582364439673883

	/* We make the entries in the following table all 16 bytes
	   wide to avoid having to implement a multiplication by 10.  */
	.type factor,@object
        .align ALIGNARG(4)
factor:	.tfloat ONE_SQR_CBRT2
	.byte 0, 0, 0, 0, 0, 0
	.tfloat ONE_CBRT2
	.byte 0, 0, 0, 0, 0, 0
	.tfloat 1.0
	.byte 0, 0, 0, 0, 0, 0
	.tfloat CBRT2
	.byte 0, 0, 0, 0, 0, 0
	.tfloat SQR_CBRT2
	ASM_SIZE_DIRECTIVE(factor)

        .type two64,@object
        .align ALIGNARG(4)
two64:  .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43
        ASM_SIZE_DIRECTIVE(two64)

#ifdef PIC
#define MO(op) op##@GOTOFF(%ebx)
#define MOX(op,x) op##@GOTOFF(%ebx,x,1)
#else
#define MO(op) op
#define MOX(op,x) op(x)
#endif

	.text
ENTRY(__cbrtl)
	movl	4(%esp), %ecx
	movl	12(%esp), %eax
	orl	8(%esp), %ecx
	movl	%eax, %edx
	andl	$0x7fff, %eax
	orl	%eax, %ecx
	jz	1f
	xorl	%ecx, %ecx
	cmpl	$0x7fff, %eax
	je	1f

#ifdef PIC
	pushl	%ebx
	cfi_adjust_cfa_offset (4)
	cfi_rel_offset (ebx, 0)
	LOAD_PIC_REG (bx)
#endif

	cmpl	$0, %eax
	jne	2f

#ifdef PIC
	fldt	8(%esp)
#else
	fldt	4(%esp)
#endif
	fmull	MO(two64)
	movl	$-64, %ecx
#ifdef PIC
	fstpt	8(%esp)
	movl	16(%esp), %eax
#else
	fstpt	4(%esp)
	movl	12(%esp), %eax
#endif
	movl	%eax, %edx
	andl	$0x7fff, %eax

2:	andl	$0x8000, %edx
	subl	$16382, %eax
	orl	$0x3ffe, %edx
	addl	%eax, %ecx
#ifdef PIC
	movl	%edx, 16(%esp)

	fldt	8(%esp)			/* xm */
#else
	movl	%edx, 12(%esp)

	fldt	4(%esp)			/* xm */
#endif
	fabs

	/* The following code has two tracks:
	    a) compute the normalized cbrt value
	    b) compute xe/3 and xe%3
	   The right track computes the value for b) and this is done
	   in an optimized way by avoiding division.

	   But why two tracks at all?  Very easy: efficiency.  Some FP
	   instruction can overlap with a certain amount of integer (and
	   FP) instructions.  So we get (except for the imull) all
	   instructions for free.  */

	fldt	MO(f8)			/* f8 : xm */
	fmul	%st(1)			/* f8*xm : xm */

	fldt	MO(f7)
	faddp				/* f7+f8*xm : xm */
	fmul	%st(1)			/* (f7+f8*xm)*xm : xm */
			movl	$1431655766, %eax
	fldt	MO(f6)
	faddp				/* f6+(f7+f8*xm)*xm : xm */
			imull	%ecx
	fmul	%st(1)			/* (f6+(f7+f8*xm)*xm)*xm : xm */
			movl	%ecx, %eax
	fldt	MO(f5)
	faddp				/* f5+(f6+(f7+f8*xm)*xm)*xm : xm */
			sarl	$31, %eax
	fmul	%st(1)			/* (f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */
			subl	%eax, %edx
	fldt	MO(f4)
	faddp				/* f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */
	fmul	%st(1)			/* (f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */
	fldt	MO(f3)
	faddp				/* f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */
	fmul	%st(1)			/* (f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */
	fldt	MO(f2)
	faddp				/* f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */
	fmul	%st(1)			/* (f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */
	fldt	MO(f1)
	faddp				/* u:=f1+(f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */

	fld	%st			/* u : u : xm */
	fmul	%st(1)			/* u*u : u : xm */
	fld	%st(2)			/* xm : u*u : u : xm */
	fadd	%st			/* 2*xm : u*u : u : xm */
	fxch	%st(1)			/* u*u : 2*xm : u : xm */
	fmul	%st(2)			/* t2:=u*u*u : 2*xm : u : xm */
			movl	%edx, %eax
	fadd	%st, %st(1)		/* t2 : t2+2*xm : u : xm */
			leal	(%edx,%edx,2),%edx
	fadd	%st(0)			/* 2*t2 : t2+2*xm : u : xm */
			subl	%edx, %ecx
	faddp	%st, %st(3)		/* t2+2*xm : u : 2*t2+xm */
			shll	$4, %ecx
	fmulp				/* u*(t2+2*xm) : 2*t2+xm */
	fdivp	%st, %st(1)		/* u*(t2+2*xm)/(2*t2+xm) */
	fldt	MOX(32+factor,%ecx)
	fmulp				/* u*(t2+2*xm)/(2*t2+xm)*FACT */
	pushl	%eax
	cfi_adjust_cfa_offset (4)
	fildl	(%esp)			/* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */
	fxch				/* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */
	fscale				/* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */
	popl	%edx
	cfi_adjust_cfa_offset (-4)
#ifdef PIC
	movl	16(%esp), %eax
	popl	%ebx
	cfi_adjust_cfa_offset (-4)
	cfi_restore (ebx)
#else
	movl	12(%esp), %eax
#endif
	testl	$0x8000, %eax
	fstp	%st(1)
	jz	4f
	fchs
4:	ret

	/* Return the argument.  */
1:	fldt	4(%esp)
	ret
END(__cbrtl)
weak_alias (__cbrtl, cbrtl)
Commit	Line	Data
26dee9c4	1	/* Compute cubic root of long double value.
b168057a	2	Copyright (C) 1997-2015 Free Software Foundation, Inc.
26dee9c4 UD	3	This file is part of the GNU C Library.
	4	Contributed by Dirk Alboth <dirka@uni-paderborn.de> and
	5	Ulrich Drepper <drepper@cygnus.com>, 1997.
	6
	7	The GNU C Library is free software; you can redistribute it and/or
41bdb6e2 AJ	8	modify it under the terms of the GNU Lesser General Public
	9	License as published by the Free Software Foundation; either
	10	version 2.1 of the License, or (at your option) any later version.
26dee9c4 UD	11
	12	The GNU C Library is distributed in the hope that it will be useful,
	13	but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
41bdb6e2	15	Lesser General Public License for more details.
26dee9c4	16
41bdb6e2	17	You should have received a copy of the GNU Lesser General Public
59ba27a6 PE	18	License along with the GNU C Library; if not, see
59ba27a6 PE	19	<http://www.gnu.org/licenses/>. */
26dee9c4 UD	20
	21	#include <machine/asm.h>
	22
26dee9c4	23	.section .rodata
26dee9c4 UD	24
26dee9c4 UD	25	.align ALIGNARG(4)
b67e9372	26	.type f8,@object
0413b54c	27	f8: .tfloat 0.161617097923756032
26dee9c4	28	ASM_SIZE_DIRECTIVE(f8)
0413b54c	29	.align ALIGNARG(4)
b67e9372	30	.type f7,@object
0413b54c UD	31	f7: .tfloat -0.988553671195413709
	32	ASM_SIZE_DIRECTIVE(f7)
	33	.align ALIGNARG(4)
b67e9372	34	.type f6,@object
0413b54c UD	35	f6: .tfloat 2.65298938441952296
	36	ASM_SIZE_DIRECTIVE(f6)
	37	.align ALIGNARG(4)
b67e9372	38	.type f5,@object
0413b54c UD	39	f5: .tfloat -4.11151425200350531
	40	ASM_SIZE_DIRECTIVE(f5)
	41	.align ALIGNARG(4)
b67e9372	42	.type f4,@object
0413b54c UD	43	f4: .tfloat 4.09559907378707839
	44	ASM_SIZE_DIRECTIVE(f4)
	45	.align ALIGNARG(4)
b67e9372	46	.type f3,@object
0413b54c UD	47	f3: .tfloat -2.82414939754975962
	48	ASM_SIZE_DIRECTIVE(f3)
	49	.align ALIGNARG(4)
b67e9372	50	.type f2,@object
0413b54c UD	51	f2: .tfloat 1.67595307700780102
	52	ASM_SIZE_DIRECTIVE(f2)
	53	.align ALIGNARG(4)
b67e9372	54	.type f1,@object
0413b54c UD	55	f1: .tfloat 0.338058687610520237
0413b54c UD	56	ASM_SIZE_DIRECTIVE(f1)
26dee9c4	57
0413b54c UD	58	#define CBRT2 1.2599210498948731648
	59	#define ONE_CBRT2 0.793700525984099737355196796584
	60	#define SQR_CBRT2 1.5874010519681994748
	61	#define ONE_SQR_CBRT2 0.629960524947436582364439673883
26dee9c4	62
0413b54c UD	63	/* We make the entries in the following table all 16 bytes
0413b54c UD	64	wide to avoid having to implement a multiplication by 10. */
b67e9372	65	.type factor,@object
0413b54c UD	66	.align ALIGNARG(4)
	67	factor: .tfloat ONE_SQR_CBRT2
	68	.byte 0, 0, 0, 0, 0, 0
	69	.tfloat ONE_CBRT2
	70	.byte 0, 0, 0, 0, 0, 0
	71	.tfloat 1.0
	72	.byte 0, 0, 0, 0, 0, 0
	73	.tfloat CBRT2
	74	.byte 0, 0, 0, 0, 0, 0
	75	.tfloat SQR_CBRT2
26dee9c4 UD	76	ASM_SIZE_DIRECTIVE(factor)
26dee9c4 UD	77
b67e9372	78	.type two64,@object
0413b54c	79	.align ALIGNARG(4)
26dee9c4 UD	80	two64: .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43
	81	ASM_SIZE_DIRECTIVE(two64)
	82
	83	#ifdef PIC
	84	#define MO(op) op##@GOTOFF(%ebx)
0413b54c	85	#define MOX(op,x) op##@GOTOFF(%ebx,x,1)
26dee9c4 UD	86	#else
26dee9c4 UD	87	#define MO(op) op
0413b54c	88	#define MOX(op,x) op(x)
26dee9c4 UD	89	#endif
	90
	91	.text
	92	ENTRY(__cbrtl)
	93	movl 4(%esp), %ecx
	94	movl 12(%esp), %eax
	95	orl 8(%esp), %ecx
	96	movl %eax, %edx
	97	andl $0x7fff, %eax
	98	orl %eax, %ecx
	99	jz 1f
	100	xorl %ecx, %ecx
	101	cmpl $0x7fff, %eax
	102	je 1f
	103
	104	#ifdef PIC
	105	pushl %ebx
fee732e5 UD	106	cfi_adjust_cfa_offset (4)
	107	cfi_rel_offset (ebx, 0)
	108	LOAD_PIC_REG (bx)
26dee9c4 UD	109	#endif
	110
	111	cmpl $0, %eax
0413b54c	112	jne 2f
26dee9c4 UD	113
	114	#ifdef PIC
	115	fldt 8(%esp)
	116	#else
	117	fldt 4(%esp)
	118	#endif
	119	fmull MO(two64)
	120	movl $-64, %ecx
0413b54c UD	121	#ifdef PIC
	122	fstpt 8(%esp)
	123	movl 16(%esp), %eax
	124	#else
26dee9c4 UD	125	fstpt 4(%esp)
26dee9c4 UD	126	movl 12(%esp), %eax
0413b54c	127	#endif
26dee9c4 UD	128	movl %eax, %edx
	129	andl $0x7fff, %eax
	130
	131	2: andl $0x8000, %edx
	132	subl $16382, %eax
	133	orl $0x3ffe, %edx
	134	addl %eax, %ecx
	135	#ifdef PIC
	136	movl %edx, 16(%esp)
	137
	138	fldt 8(%esp) /* xm */
	139	#else
	140	movl %edx, 12(%esp)
	141
	142	fldt 4(%esp) /* xm */
	143	#endif
	144	fabs
	145
0413b54c	146	/* The following code has two tracks:
26dee9c4 UD	147	a) compute the normalized cbrt value
	148	b) compute xe/3 and xe%3
	149	The right track computes the value for b) and this is done
0413b54c	150	in an optimized way by avoiding division.
26dee9c4	151
0413b54c UD	152	But why two tracks at all? Very easy: efficiency. Some FP
	153	instruction can overlap with a certain amount of integer (and
	154	FP) instructions. So we get (except for the imull) all
	155	instructions for free. */
26dee9c4	156
0413b54c UD	157	fldt MO(f8) /* f8 : xm */
	158	fmul %st(1) /* f8xm : xm /
	159
	160	fldt MO(f7)
	161	faddp /* f7+f8xm : xm /
	162	fmul %st(1) /* (f7+f8xm)xm : xm */
26dee9c4	163	movl $1431655766, %eax
0413b54c UD	164	fldt MO(f6)
0413b54c UD	165	faddp /* f6+(f7+f8xm)xm : xm */
26dee9c4	166	imull %ecx
0413b54c	167	fmul %st(1) /* (f6+(f7+f8xm)xm)xm : xm /
26dee9c4	168	movl %ecx, %eax
0413b54c UD	169	fldt MO(f5)
0413b54c UD	170	faddp /* f5+(f6+(f7+f8xm)xm)xm : xm /
26dee9c4	171	sarl $31, %eax
0413b54c	172	fmul %st(1) /* (f5+(f6+(f7+f8xm)xm)xm)xm : xm */
26dee9c4	173	subl %eax, %edx
0413b54c UD	174	fldt MO(f4)
	175	faddp /* f4+(f5+(f6+(f7+f8xm)xm)xm)xm : xm */
	176	fmul %st(1) /* (f4+(f5+(f6+(f7+f8xm)xm)xm)xm)xm : xm /
	177	fldt MO(f3)
	178	faddp /* f3+(f4+(f5+(f6+(f7+f8xm)xm)xm)xm)xm : xm /
	179	fmul %st(1) /* (f3+(f4+(f5+(f6+(f7+f8xm)xm)xm)xm)xm)xm : xm */
	180	fldt MO(f2)
	181	faddp /* f2+(f3+(f4+(f5+(f6+(f7+f8xm)xm)xm)xm)xm)xm : xm */
	182	fmul %st(1) /* (f2+(f3+(f4+(f5+(f6+(f7+f8xm)xm)xm)xm)xm)xm)xm : xm /
	183	fldt MO(f1)
	184	faddp /* u:=f1+(f2+(f3+(f4+(f5+(f6+(f7+f8xm)xm)xm)xm)xm)xm)xm : xm /
26dee9c4 UD	185
	186	fld %st /* u : u : xm */
	187	fmul %st(1) /* uu : u : xm /
	188	fld %st(2) /* xm : uu : u : xm /
	189	fadd %st /* 2xm : uu : u : xm */
	190	fxch %st(1) /* uu : 2xm : u : xm */
	191	fmul %st(2) /* t2:=uuu : 2xm : u : xm /
	192	movl %edx, %eax
	193	fadd %st, %st(1) /* t2 : t2+2xm : u : xm /
	194	leal (%edx,%edx,2),%edx
	195	fadd %st(0) /* 2t2 : t2+2xm : u : xm */
	196	subl %edx, %ecx
	197	faddp %st, %st(3) /* t2+2xm : u : 2t2+xm */
0413b54c	198	shll $4, %ecx
26dee9c4 UD	199	fmulp /* u(t2+2xm) : 2t2+xm /
26dee9c4 UD	200	fdivp %st, %st(1) /* u(t2+2xm)/(2t2+xm) /
0413b54c UD	201	fldt MOX(32+factor,%ecx)
0413b54c UD	202	fmulp /* u(t2+2xm)/(2t2+xm)FACT */
26dee9c4	203	pushl %eax
fee732e5	204	cfi_adjust_cfa_offset (4)
26dee9c4 UD	205	fildl (%esp) /* xe/3 : u(t2+2xm)/(2t2+xm)FACT */
26dee9c4 UD	206	fxch /* u(t2+2xm)/(2t2+xm)FACT : xe/3 */
26dee9c4	207	fscale /* u(t2+2xm)/(2t2+xm)FACT2^xe/3 /
0413b54c	208	popl %edx
fee732e5	209	cfi_adjust_cfa_offset (-4)
26dee9c4	210	#ifdef PIC
0413b54c	211	movl 16(%esp), %eax
26dee9c4	212	popl %ebx
fee732e5 UD	213	cfi_adjust_cfa_offset (-4)
fee732e5 UD	214	cfi_restore (ebx)
0413b54c UD	215	#else
0413b54c UD	216	movl 12(%esp), %eax
26dee9c4	217	#endif
0413b54c UD	218	testl $0x8000, %eax
0413b54c UD	219	fstp %st(1)
26dee9c4 UD	220	jz 4f
	221	fchs
	222	4: ret
	223
	224	/* Return the argument. */
	225	1: fldt 4(%esp)
	226	ret
	227	END(__cbrtl)
	228	weak_alias (__cbrtl, cbrtl)