[thirdparty/glibc.git] / sysdeps / i386 / fpu / s_cbrtf.S

/* Compute cubic root of float value.
   Copyright (C) 1997-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Dirk Alboth <dirka@uni-paderborn.de> and
   Ulrich Drepper <drepper@cygnus.com>, 1997.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <machine/asm.h>
#include <libm-alias-float.h>

        .section .rodata

        .align ALIGNARG(4)
        .type f3,@object
f3:	.double 0.191502161678719066
        ASM_SIZE_DIRECTIVE(f3)
        .type f2,@object
f2:	.double 0.697570460207922770
        ASM_SIZE_DIRECTIVE(f2)
        .type f1,@object
f1:	.double 0.492659620528969547
        ASM_SIZE_DIRECTIVE(f1)

#define CBRT2		1.2599210498948731648
#define ONE_CBRT2	0.793700525984099737355196796584
#define SQR_CBRT2	1.5874010519681994748
#define ONE_SQR_CBRT2	0.629960524947436582364439673883

	.type factor,@object
        .align ALIGNARG(4)
factor:	.double ONE_SQR_CBRT2
	.double ONE_CBRT2
	.double 1.0
	.double CBRT2
	.double SQR_CBRT2
	ASM_SIZE_DIRECTIVE(factor)

        .type two25,@object
two25:	.byte 0, 0, 0, 0x4c
        ASM_SIZE_DIRECTIVE(two25)

#ifdef PIC
#define MO(op) op##@GOTOFF(%ebx)
#define MOX(op,x) op##@GOTOFF(%ebx,x,1)
#else
#define MO(op) op
#define MOX(op,x) op(x)
#endif

	.text
ENTRY(__cbrtf)
	movl	4(%esp), %eax
	xorl	%ecx, %ecx
	movl	%eax, %edx
	andl	$0x7fffffff, %eax
	jz	1f
	cmpl	$0x7f800000, %eax
	jae	1f

#ifdef PIC
	pushl	%ebx
	cfi_adjust_cfa_offset (4)
	cfi_rel_offset (ebx, 0)
	LOAD_PIC_REG (bx)
#endif

	cmpl	$0x00800000, %eax
	jae	2f

#ifdef PIC
	flds	8(%esp)
#else
	flds	4(%esp)
#endif
	fmuls	MO(two25)
	movl	$-25, %ecx
#ifdef PIC
	fstps	8(%esp)
	movl	8(%esp), %eax
#else
	fstps	4(%esp)
	movl	4(%esp), %eax
#endif
	movl	%eax, %edx
	andl	$0x7fffffff, %eax

2:	shrl	$23, %eax
	andl	$0x807fffff, %edx
	subl	$126, %eax
	orl	$0x3f000000, %edx
	addl	%eax, %ecx
#ifdef PIC
	movl	%edx, 8(%esp)

	flds	8(%esp)			/* xm */
#else
	movl	%edx, 4(%esp)

	flds	4(%esp)			/* xm */
#endif
	fabs

	/* The following code has two tracks:
	    a) compute the normalized cbrt value
	    b) compute xe/3 and xe%3
	   The right track computes the value for b) and this is done
	   in an optimized way by avoiding division.

	   But why two tracks at all?  Very easy: efficiency.  Some FP
	   instruction can overlap with a certain amount of integer (and
	   FP) instructions.  So we get (except for the imull) all
	   instructions for free.  */

	fld	%st(0)			/* xm : xm */
	fmull	MO(f3)			/* f3*xm : xm */
			movl	$1431655766, %eax
	fsubrl	MO(f2)			/* f2-f3*xm : xm */
			imull	%ecx
	fmul	%st(1)			/* (f2-f3*xm)*xm : xm */
			movl	%ecx, %eax
	faddl	MO(f1)			/* u:=f1+(f2-f3*xm)*xm : xm */
			sarl	$31, %eax
	fld	%st			/* u : u : xm */
			subl	%eax, %edx
	fmul	%st(1)			/* u*u : u : xm */
	fld	%st(2)			/* xm : u*u : u : xm */
	fadd	%st			/* 2*xm : u*u : u : xm */
	fxch	%st(1)			/* u*u : 2*xm : u : xm */
	fmul	%st(2)			/* t2:=u*u*u : 2*xm : u : xm */
			movl	%edx, %eax
	fadd	%st, %st(1)		/* t2 : t2+2*xm : u : xm */
			leal	(%edx,%edx,2),%edx
	fadd	%st(0)			/* 2*t2 : t2+2*xm : u : xm */
			subl	%edx, %ecx
	faddp	%st, %st(3)		/* t2+2*xm : u : 2*t2+xm */
			shll	$3, %ecx
	fmulp				/* u*(t2+2*xm) : 2*t2+xm */
	fdivp	%st, %st(1)		/* u*(t2+2*xm)/(2*t2+xm) */
	fmull	MOX(16+factor,%ecx)	/* u*(t2+2*xm)/(2*t2+xm)*FACT */
	pushl	%eax
	cfi_adjust_cfa_offset (4)
	fildl	(%esp)			/* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */
	fxch				/* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */
	fscale				/* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */
	popl	%edx
	cfi_adjust_cfa_offset (-4)
#ifdef PIC
	movl	8(%esp), %eax
	popl	%ebx
	cfi_adjust_cfa_offset (-4)
	cfi_restore (ebx)
#else
	movl	4(%esp), %eax
#endif
	testl	%eax, %eax
	fstp	%st(1)
	jns	4f
	fchs
4:	ret

	/* Return the argument.  */
1:	flds	4(%esp)
	ret
END(__cbrtf)
libm_alias_float (__cbrt, cbrt)
Commit	Line	Data
26dee9c4	1	/* Compute cubic root of float value.
04277e02	2	Copyright (C) 1997-2019 Free Software Foundation, Inc.
26dee9c4 UD	3	This file is part of the GNU C Library.
	4	Contributed by Dirk Alboth <dirka@uni-paderborn.de> and
	5	Ulrich Drepper <drepper@cygnus.com>, 1997.
	6
	7	The GNU C Library is free software; you can redistribute it and/or
41bdb6e2 AJ	8	modify it under the terms of the GNU Lesser General Public
	9	License as published by the Free Software Foundation; either
	10	version 2.1 of the License, or (at your option) any later version.
26dee9c4 UD	11
	12	The GNU C Library is distributed in the hope that it will be useful,
	13	but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
41bdb6e2	15	Lesser General Public License for more details.
26dee9c4	16
41bdb6e2	17	You should have received a copy of the GNU Lesser General Public
59ba27a6	18	License along with the GNU C Library; if not, see
5a82c748	19	<https://www.gnu.org/licenses/>. */
26dee9c4 UD	20
26dee9c4 UD	21	#include <machine/asm.h>
e4602cba	22	#include <libm-alias-float.h>
26dee9c4	23
26dee9c4	24	.section .rodata
26dee9c4 UD	25
26dee9c4 UD	26	.align ALIGNARG(4)
b67e9372	27	.type f3,@object
26dee9c4 UD	28	f3: .double 0.191502161678719066
26dee9c4 UD	29	ASM_SIZE_DIRECTIVE(f3)
b67e9372	30	.type f2,@object
0413b54c UD	31	f2: .double 0.697570460207922770
0413b54c UD	32	ASM_SIZE_DIRECTIVE(f2)
b67e9372	33	.type f1,@object
0413b54c UD	34	f1: .double 0.492659620528969547
0413b54c UD	35	ASM_SIZE_DIRECTIVE(f1)
26dee9c4	36
0413b54c UD	37	#define CBRT2 1.2599210498948731648
	38	#define ONE_CBRT2 0.793700525984099737355196796584
	39	#define SQR_CBRT2 1.5874010519681994748
	40	#define ONE_SQR_CBRT2 0.629960524947436582364439673883
26dee9c4	41
b67e9372	42	.type factor,@object
0413b54c UD	43	.align ALIGNARG(4)
	44	factor: .double ONE_SQR_CBRT2
	45	.double ONE_CBRT2
26dee9c4 UD	46	.double 1.0
	47	.double CBRT2
	48	.double SQR_CBRT2
	49	ASM_SIZE_DIRECTIVE(factor)
	50
b67e9372	51	.type two25,@object
26dee9c4 UD	52	two25: .byte 0, 0, 0, 0x4c
	53	ASM_SIZE_DIRECTIVE(two25)
	54
	55	#ifdef PIC
	56	#define MO(op) op##@GOTOFF(%ebx)
0413b54c	57	#define MOX(op,x) op##@GOTOFF(%ebx,x,1)
26dee9c4 UD	58	#else
26dee9c4 UD	59	#define MO(op) op
0413b54c	60	#define MOX(op,x) op(x)
26dee9c4 UD	61	#endif
	62
	63	.text
	64	ENTRY(__cbrtf)
	65	movl 4(%esp), %eax
	66	xorl %ecx, %ecx
	67	movl %eax, %edx
	68	andl $0x7fffffff, %eax
	69	jz 1f
	70	cmpl $0x7f800000, %eax
	71	jae 1f
	72
	73	#ifdef PIC
	74	pushl %ebx
fee732e5 UD	75	cfi_adjust_cfa_offset (4)
	76	cfi_rel_offset (ebx, 0)
	77	LOAD_PIC_REG (bx)
26dee9c4 UD	78	#endif
	79
	80	cmpl $0x00800000, %eax
	81	jae 2f
	82
	83	#ifdef PIC
	84	flds 8(%esp)
	85	#else
	86	flds 4(%esp)
	87	#endif
	88	fmuls MO(two25)
	89	movl $-25, %ecx
	90	#ifdef PIC
	91	fstps 8(%esp)
	92	movl 8(%esp), %eax
	93	#else
	94	fstps 4(%esp)
	95	movl 4(%esp), %eax
	96	#endif
	97	movl %eax, %edx
	98	andl $0x7fffffff, %eax
	99
	100	2: shrl $23, %eax
	101	andl $0x807fffff, %edx
	102	subl $126, %eax
	103	orl $0x3f000000, %edx
	104	addl %eax, %ecx
	105	#ifdef PIC
	106	movl %edx, 8(%esp)
	107
	108	flds 8(%esp) /* xm */
	109	#else
	110	movl %edx, 4(%esp)
	111
	112	flds 4(%esp) /* xm */
	113	#endif
	114	fabs
	115
0413b54c	116	/* The following code has two tracks:
26dee9c4 UD	117	a) compute the normalized cbrt value
	118	b) compute xe/3 and xe%3
	119	The right track computes the value for b) and this is done
0413b54c UD	120	in an optimized way by avoiding division.
	121
	122	But why two tracks at all? Very easy: efficiency. Some FP
	123	instruction can overlap with a certain amount of integer (and
	124	FP) instructions. So we get (except for the imull) all
	125	instructions for free. */
26dee9c4 UD	126
	127	fld %st(0) /* xm : xm */
	128	fmull MO(f3) /* f3xm : xm /
	129	movl $1431655766, %eax
	130	fsubrl MO(f2) /* f2-f3xm : xm /
	131	imull %ecx
	132	fmul %st(1) /* (f2-f3xm)xm : xm */
	133	movl %ecx, %eax
	134	faddl MO(f1) /* u:=f1+(f2-f3xm)xm : xm */
	135	sarl $31, %eax
	136	fld %st /* u : u : xm */
	137	subl %eax, %edx
	138	fmul %st(1) /* uu : u : xm /
	139	fld %st(2) /* xm : uu : u : xm /
	140	fadd %st /* 2xm : uu : u : xm */
	141	fxch %st(1) /* uu : 2xm : u : xm */
	142	fmul %st(2) /* t2:=uuu : 2xm : u : xm /
	143	movl %edx, %eax
	144	fadd %st, %st(1) /* t2 : t2+2xm : u : xm /
	145	leal (%edx,%edx,2),%edx
	146	fadd %st(0) /* 2t2 : t2+2xm : u : xm */
	147	subl %edx, %ecx
	148	faddp %st, %st(3) /* t2+2xm : u : 2t2+xm */
0413b54c	149	shll $3, %ecx
26dee9c4 UD	150	fmulp /* u(t2+2xm) : 2t2+xm /
26dee9c4 UD	151	fdivp %st, %st(1) /* u(t2+2xm)/(2t2+xm) /
0413b54c	152	fmull MOX(16+factor,%ecx) /* u(t2+2xm)/(2t2+xm)FACT */
26dee9c4	153	pushl %eax
fee732e5	154	cfi_adjust_cfa_offset (4)
26dee9c4 UD	155	fildl (%esp) /* xe/3 : u(t2+2xm)/(2t2+xm)FACT */
26dee9c4 UD	156	fxch /* u(t2+2xm)/(2t2+xm)FACT : xe/3 */
26dee9c4	157	fscale /* u(t2+2xm)/(2t2+xm)FACT2^xe/3 /
0413b54c	158	popl %edx
fee732e5	159	cfi_adjust_cfa_offset (-4)
26dee9c4	160	#ifdef PIC
0413b54c	161	movl 8(%esp), %eax
26dee9c4	162	popl %ebx
fee732e5 UD	163	cfi_adjust_cfa_offset (-4)
fee732e5 UD	164	cfi_restore (ebx)
0413b54c UD	165	#else
0413b54c UD	166	movl 4(%esp), %eax
26dee9c4	167	#endif
0413b54c UD	168	testl %eax, %eax
	169	fstp %st(1)
	170	jns 4f
26dee9c4 UD	171	fchs
	172	4: ret
	173
	174	/* Return the argument. */
	175	1: flds 4(%esp)
	176	ret
	177	END(__cbrtf)
e4602cba	178	libm_alias_float (__cbrt, cbrt)