[thirdparty/glibc.git] / sysdeps / i386 / fpu / s_cbrtf.S

/* Compute cubic root of float value.
   Copyright (C) 1997-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <machine/asm.h>
#include <libm-alias-float.h>

        .section .rodata

        .align ALIGNARG(4)
        .type f3,@object
f3:	.double 0.191502161678719066
        ASM_SIZE_DIRECTIVE(f3)
        .type f2,@object
f2:	.double 0.697570460207922770
        ASM_SIZE_DIRECTIVE(f2)
        .type f1,@object
f1:	.double 0.492659620528969547
        ASM_SIZE_DIRECTIVE(f1)

#define CBRT2		1.2599210498948731648
#define ONE_CBRT2	0.793700525984099737355196796584
#define SQR_CBRT2	1.5874010519681994748
#define ONE_SQR_CBRT2	0.629960524947436582364439673883

	.type factor,@object
        .align ALIGNARG(4)
factor:	.double ONE_SQR_CBRT2
	.double ONE_CBRT2
	.double 1.0
	.double CBRT2
	.double SQR_CBRT2
	ASM_SIZE_DIRECTIVE(factor)

        .type two25,@object
two25:	.byte 0, 0, 0, 0x4c
        ASM_SIZE_DIRECTIVE(two25)

#ifdef PIC
#define MO(op) op##@GOTOFF(%ebx)
#define MOX(op,x) op##@GOTOFF(%ebx,x,1)
#else
#define MO(op) op
#define MOX(op,x) op(x)
#endif

	.text
ENTRY(__cbrtf)
	movl	4(%esp), %eax
	xorl	%ecx, %ecx
	movl	%eax, %edx
	andl	$0x7fffffff, %eax
	jz	1f
	cmpl	$0x7f800000, %eax
	jae	1f

#ifdef PIC
	pushl	%ebx
	cfi_adjust_cfa_offset (4)
	cfi_rel_offset (ebx, 0)
	LOAD_PIC_REG (bx)
#endif

	cmpl	$0x00800000, %eax
	jae	2f

#ifdef PIC
	flds	8(%esp)
#else
	flds	4(%esp)
#endif
	fmuls	MO(two25)
	movl	$-25, %ecx
#ifdef PIC
	fstps	8(%esp)
	movl	8(%esp), %eax
#else
	fstps	4(%esp)
	movl	4(%esp), %eax
#endif
	movl	%eax, %edx
	andl	$0x7fffffff, %eax

2:	shrl	$23, %eax
	andl	$0x807fffff, %edx
	subl	$126, %eax
	orl	$0x3f000000, %edx
	addl	%eax, %ecx
#ifdef PIC
	movl	%edx, 8(%esp)

	flds	8(%esp)			/* xm */
#else
	movl	%edx, 4(%esp)

	flds	4(%esp)			/* xm */
#endif
	fabs

	/* The following code has two tracks:
	    a) compute the normalized cbrt value
	    b) compute xe/3 and xe%3
	   The right track computes the value for b) and this is done
	   in an optimized way by avoiding division.

	   But why two tracks at all?  Very easy: efficiency.  Some FP
	   instruction can overlap with a certain amount of integer (and
	   FP) instructions.  So we get (except for the imull) all
	   instructions for free.  */

	fld	%st(0)			/* xm : xm */
	fmull	MO(f3)			/* f3*xm : xm */
			movl	$1431655766, %eax
	fsubrl	MO(f2)			/* f2-f3*xm : xm */
			imull	%ecx
	fmul	%st(1)			/* (f2-f3*xm)*xm : xm */
			movl	%ecx, %eax
	faddl	MO(f1)			/* u:=f1+(f2-f3*xm)*xm : xm */
			sarl	$31, %eax
	fld	%st			/* u : u : xm */
			subl	%eax, %edx
	fmul	%st(1)			/* u*u : u : xm */
	fld	%st(2)			/* xm : u*u : u : xm */
	fadd	%st			/* 2*xm : u*u : u : xm */
	fxch	%st(1)			/* u*u : 2*xm : u : xm */
	fmul	%st(2)			/* t2:=u*u*u : 2*xm : u : xm */
			movl	%edx, %eax
	fadd	%st, %st(1)		/* t2 : t2+2*xm : u : xm */
			leal	(%edx,%edx,2),%edx
	fadd	%st(0)			/* 2*t2 : t2+2*xm : u : xm */
			subl	%edx, %ecx
	faddp	%st, %st(3)		/* t2+2*xm : u : 2*t2+xm */
			shll	$3, %ecx
	fmulp				/* u*(t2+2*xm) : 2*t2+xm */
	fdivp	%st, %st(1)		/* u*(t2+2*xm)/(2*t2+xm) */
	fmull	MOX(16+factor,%ecx)	/* u*(t2+2*xm)/(2*t2+xm)*FACT */
	pushl	%eax
	cfi_adjust_cfa_offset (4)
	fildl	(%esp)			/* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */
	fxch				/* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */
	fscale				/* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */
	popl	%edx
	cfi_adjust_cfa_offset (-4)
#ifdef PIC
	movl	8(%esp), %eax
	popl	%ebx
	cfi_adjust_cfa_offset (-4)
	cfi_restore (ebx)
#else
	movl	4(%esp), %eax
#endif
	testl	%eax, %eax
	fstp	%st(1)
	jns	4f
	fchs
4:	ret

	/* Return the argument.  */
1:	flds	4(%esp)
	ret
END(__cbrtf)
libm_alias_float (__cbrt, cbrt)
Commit	Line	Data
26dee9c4	1	/* Compute cubic root of float value.
2b778ceb	2	Copyright (C) 1997-2021 Free Software Foundation, Inc.
26dee9c4	3	This file is part of the GNU C Library.
26dee9c4 UD	4
26dee9c4 UD	5	The GNU C Library is free software; you can redistribute it and/or
41bdb6e2 AJ	6	modify it under the terms of the GNU Lesser General Public
	7	License as published by the Free Software Foundation; either
	8	version 2.1 of the License, or (at your option) any later version.
26dee9c4 UD	9
	10	The GNU C Library is distributed in the hope that it will be useful,
	11	but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
41bdb6e2	13	Lesser General Public License for more details.
26dee9c4	14
41bdb6e2	15	You should have received a copy of the GNU Lesser General Public
59ba27a6	16	License along with the GNU C Library; if not, see
5a82c748	17	<https://www.gnu.org/licenses/>. */
26dee9c4 UD	18
26dee9c4 UD	19	#include <machine/asm.h>
e4602cba	20	#include <libm-alias-float.h>
26dee9c4	21
26dee9c4	22	.section .rodata
26dee9c4 UD	23
26dee9c4 UD	24	.align ALIGNARG(4)
b67e9372	25	.type f3,@object
26dee9c4 UD	26	f3: .double 0.191502161678719066
26dee9c4 UD	27	ASM_SIZE_DIRECTIVE(f3)
b67e9372	28	.type f2,@object
0413b54c UD	29	f2: .double 0.697570460207922770
0413b54c UD	30	ASM_SIZE_DIRECTIVE(f2)
b67e9372	31	.type f1,@object
0413b54c UD	32	f1: .double 0.492659620528969547
0413b54c UD	33	ASM_SIZE_DIRECTIVE(f1)
26dee9c4	34
0413b54c UD	35	#define CBRT2 1.2599210498948731648
	36	#define ONE_CBRT2 0.793700525984099737355196796584
	37	#define SQR_CBRT2 1.5874010519681994748
	38	#define ONE_SQR_CBRT2 0.629960524947436582364439673883
26dee9c4	39
b67e9372	40	.type factor,@object
0413b54c UD	41	.align ALIGNARG(4)
	42	factor: .double ONE_SQR_CBRT2
	43	.double ONE_CBRT2
26dee9c4 UD	44	.double 1.0
	45	.double CBRT2
	46	.double SQR_CBRT2
	47	ASM_SIZE_DIRECTIVE(factor)
	48
b67e9372	49	.type two25,@object
26dee9c4 UD	50	two25: .byte 0, 0, 0, 0x4c
	51	ASM_SIZE_DIRECTIVE(two25)
	52
	53	#ifdef PIC
	54	#define MO(op) op##@GOTOFF(%ebx)
0413b54c	55	#define MOX(op,x) op##@GOTOFF(%ebx,x,1)
26dee9c4 UD	56	#else
26dee9c4 UD	57	#define MO(op) op
0413b54c	58	#define MOX(op,x) op(x)
26dee9c4 UD	59	#endif
	60
	61	.text
	62	ENTRY(__cbrtf)
	63	movl 4(%esp), %eax
	64	xorl %ecx, %ecx
	65	movl %eax, %edx
	66	andl $0x7fffffff, %eax
	67	jz 1f
	68	cmpl $0x7f800000, %eax
	69	jae 1f
	70
	71	#ifdef PIC
	72	pushl %ebx
fee732e5 UD	73	cfi_adjust_cfa_offset (4)
	74	cfi_rel_offset (ebx, 0)
	75	LOAD_PIC_REG (bx)
26dee9c4 UD	76	#endif
	77
	78	cmpl $0x00800000, %eax
	79	jae 2f
	80
	81	#ifdef PIC
	82	flds 8(%esp)
	83	#else
	84	flds 4(%esp)
	85	#endif
	86	fmuls MO(two25)
	87	movl $-25, %ecx
	88	#ifdef PIC
	89	fstps 8(%esp)
	90	movl 8(%esp), %eax
	91	#else
	92	fstps 4(%esp)
	93	movl 4(%esp), %eax
	94	#endif
	95	movl %eax, %edx
	96	andl $0x7fffffff, %eax
	97
	98	2: shrl $23, %eax
	99	andl $0x807fffff, %edx
	100	subl $126, %eax
	101	orl $0x3f000000, %edx
	102	addl %eax, %ecx
	103	#ifdef PIC
	104	movl %edx, 8(%esp)
	105
	106	flds 8(%esp) /* xm */
	107	#else
	108	movl %edx, 4(%esp)
	109
	110	flds 4(%esp) /* xm */
	111	#endif
	112	fabs
	113
0413b54c	114	/* The following code has two tracks:
26dee9c4 UD	115	a) compute the normalized cbrt value
	116	b) compute xe/3 and xe%3
	117	The right track computes the value for b) and this is done
0413b54c UD	118	in an optimized way by avoiding division.
	119
	120	But why two tracks at all? Very easy: efficiency. Some FP
	121	instruction can overlap with a certain amount of integer (and
	122	FP) instructions. So we get (except for the imull) all
	123	instructions for free. */
26dee9c4 UD	124
	125	fld %st(0) /* xm : xm */
	126	fmull MO(f3) /* f3xm : xm /
	127	movl $1431655766, %eax
	128	fsubrl MO(f2) /* f2-f3xm : xm /
	129	imull %ecx
	130	fmul %st(1) /* (f2-f3xm)xm : xm */
	131	movl %ecx, %eax
	132	faddl MO(f1) /* u:=f1+(f2-f3xm)xm : xm */
	133	sarl $31, %eax
	134	fld %st /* u : u : xm */
	135	subl %eax, %edx
	136	fmul %st(1) /* uu : u : xm /
	137	fld %st(2) /* xm : uu : u : xm /
	138	fadd %st /* 2xm : uu : u : xm */
	139	fxch %st(1) /* uu : 2xm : u : xm */
	140	fmul %st(2) /* t2:=uuu : 2xm : u : xm /
	141	movl %edx, %eax
	142	fadd %st, %st(1) /* t2 : t2+2xm : u : xm /
	143	leal (%edx,%edx,2),%edx
	144	fadd %st(0) /* 2t2 : t2+2xm : u : xm */
	145	subl %edx, %ecx
	146	faddp %st, %st(3) /* t2+2xm : u : 2t2+xm */
0413b54c	147	shll $3, %ecx
26dee9c4 UD	148	fmulp /* u(t2+2xm) : 2t2+xm /
26dee9c4 UD	149	fdivp %st, %st(1) /* u(t2+2xm)/(2t2+xm) /
0413b54c	150	fmull MOX(16+factor,%ecx) /* u(t2+2xm)/(2t2+xm)FACT */
26dee9c4	151	pushl %eax
fee732e5	152	cfi_adjust_cfa_offset (4)
26dee9c4 UD	153	fildl (%esp) /* xe/3 : u(t2+2xm)/(2t2+xm)FACT */
26dee9c4 UD	154	fxch /* u(t2+2xm)/(2t2+xm)FACT : xe/3 */
26dee9c4	155	fscale /* u(t2+2xm)/(2t2+xm)FACT2^xe/3 /
0413b54c	156	popl %edx
fee732e5	157	cfi_adjust_cfa_offset (-4)
26dee9c4	158	#ifdef PIC
0413b54c	159	movl 8(%esp), %eax
26dee9c4	160	popl %ebx
fee732e5 UD	161	cfi_adjust_cfa_offset (-4)
fee732e5 UD	162	cfi_restore (ebx)
0413b54c UD	163	#else
0413b54c UD	164	movl 4(%esp), %eax
26dee9c4	165	#endif
0413b54c UD	166	testl %eax, %eax
	167	fstp %st(1)
	168	jns 4f
26dee9c4 UD	169	fchs
	170	4: ret
	171
	172	/* Return the argument. */
	173	1: flds 4(%esp)
	174	ret
	175	END(__cbrtf)
e4602cba	176	libm_alias_float (__cbrt, cbrt)