[thirdparty/glibc.git] / sysdeps / i386 / fpu / s_cbrt.S

/* Compute cubic root of double value.
   Copyright (C) 1997-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <machine/asm.h>
#include <libm-alias-double.h>

        .section .rodata

        .align ALIGNARG(4)
        .type f7,@object
f7:	.double -0.145263899385486377
	ASM_SIZE_DIRECTIVE(f7)
        .type f6,@object
f6:	.double 0.784932344976639262
	ASM_SIZE_DIRECTIVE(f6)
        .type f5,@object
f5:	.double -1.83469277483613086
	ASM_SIZE_DIRECTIVE(f5)
        .type f4,@object
f4:	.double 2.44693122563534430
	ASM_SIZE_DIRECTIVE(f4)
        .type f3,@object
f3:	.double -2.11499494167371287
	ASM_SIZE_DIRECTIVE(f3)
        .type f2,@object
f2:	.double 1.50819193781584896
	ASM_SIZE_DIRECTIVE(f2)
        .type f1,@object
f1:	.double 0.354895765043919860
	ASM_SIZE_DIRECTIVE(f1)

#define CBRT2		1.2599210498948731648
#define ONE_CBRT2	0.793700525984099737355196796584
#define SQR_CBRT2	1.5874010519681994748
#define ONE_SQR_CBRT2	0.629960524947436582364439673883

	.type factor,@object
factor:	.double ONE_SQR_CBRT2
	.double ONE_CBRT2
	.double 1.0
	.double CBRT2
	.double SQR_CBRT2
	ASM_SIZE_DIRECTIVE(factor)

        .type two54,@object
two54:  .byte 0, 0, 0, 0, 0, 0, 0x50, 0x43
        ASM_SIZE_DIRECTIVE(two54)

#ifdef PIC
#define MO(op) op##@GOTOFF(%ebx)
#define MOX(op,x) op##@GOTOFF(%ebx,x,1)
#else
#define MO(op) op
#define MOX(op,x) op(x)
#endif

	.text
ENTRY(__cbrt)
	movl	4(%esp), %ecx
	movl	8(%esp), %eax
	movl	%eax, %edx
	andl	$0x7fffffff, %eax
	orl	%eax, %ecx
	jz	1f
	xorl	%ecx, %ecx
	cmpl	$0x7ff00000, %eax
	jae	1f

#ifdef PIC
	pushl	%ebx
	cfi_adjust_cfa_offset (4)
	cfi_rel_offset (ebx, 0)
	LOAD_PIC_REG (bx)
#endif

	cmpl	$0x00100000, %eax
	jae	2f

#ifdef PIC
	fldl	8(%esp)
#else
	fldl	4(%esp)
#endif
	fmull	MO(two54)
	movl	$-54, %ecx
#ifdef PIC
	fstpl	8(%esp)
	movl	12(%esp), %eax
#else
	fstpl	4(%esp)
	movl	8(%esp), %eax
#endif
	movl	%eax, %edx
	andl	$0x7fffffff, %eax

2:	shrl	$20, %eax
	andl	$0x800fffff, %edx
	subl	$1022, %eax
	orl	$0x3fe00000, %edx
	addl	%eax, %ecx
#ifdef PIC
	movl	%edx, 12(%esp)

	fldl	8(%esp)			/* xm */
#else
	movl	%edx, 8(%esp)

	fldl	4(%esp)			/* xm */
#endif
	fabs

	/* The following code has two tracks:
	    a) compute the normalized cbrt value
	    b) compute xe/3 and xe%3
	   The right track computes the value for b) and this is done
	   in an optimized way by avoiding division.

	   But why two tracks at all?  Very easy: efficiency.  Some FP
	   instruction can overlap with a certain amount of integer (and
	   FP) instructions.  So we get (except for the imull) all
	   instructions for free.  */

	fld	%st(0)			/* xm : xm */

	fmull	MO(f7)			/* f7*xm : xm */
			movl	$1431655766, %eax
	faddl	MO(f6)			/* f6+f7*xm : xm */
			imull	%ecx
	fmul	%st(1)			/* (f6+f7*xm)*xm : xm */
			movl	%ecx, %eax
	faddl	MO(f5)			/* f5+(f6+f7*xm)*xm : xm */
			sarl	$31, %eax
	fmul	%st(1)			/* (f5+(f6+f7*xm)*xm)*xm : xm */
			subl	%eax, %edx
	faddl	MO(f4)			/* f4+(f5+(f6+f7*xm)*xm)*xm : xm */
	fmul	%st(1)			/* (f4+(f5+(f6+f7*xm)*xm)*xm)*xm : xm */
	faddl	MO(f3)			/* f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm : xm */
	fmul	%st(1)			/* (f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm)*xm : xm */
	faddl	MO(f2)			/* f2+(f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm)*xm : xm */
	fmul	%st(1)			/* (f2+(f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm)*xm)*xm : xm */
	faddl	MO(f1)			/* u:=f1+(f2+(f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm)*xm)*xm : xm */

	fld	%st			/* u : u : xm */
	fmul	%st(1)			/* u*u : u : xm */
	fld	%st(2)			/* xm : u*u : u : xm */
	fadd	%st			/* 2*xm : u*u : u : xm */
	fxch	%st(1)			/* u*u : 2*xm : u : xm */
	fmul	%st(2)			/* t2:=u*u*u : 2*xm : u : xm */
			movl	%edx, %eax
	fadd	%st, %st(1)		/* t2 : t2+2*xm : u : xm */
			leal	(%edx,%edx,2),%edx
	fadd	%st(0)			/* 2*t2 : t2+2*xm : u : xm */
			subl	%edx, %ecx
	faddp	%st, %st(3)		/* t2+2*xm : u : 2*t2+xm */
			shll	$3, %ecx
	fmulp				/* u*(t2+2*xm) : 2*t2+xm */
	fdivp	%st, %st(1)		/* u*(t2+2*xm)/(2*t2+xm) */
	fmull	MOX(16+factor,%ecx)	/* u*(t2+2*xm)/(2*t2+xm)*FACT */
	pushl	%eax
	cfi_adjust_cfa_offset (4)
	fildl	(%esp)			/* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */
	fxch				/* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */
	fscale				/* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */
	popl	%edx
	cfi_adjust_cfa_offset (-4)
#ifdef PIC
	movl	12(%esp), %eax
	popl	%ebx
	cfi_adjust_cfa_offset (-4)
	cfi_restore (ebx)
#else
	movl	8(%esp), %eax
#endif
	testl	%eax, %eax
	fstp	%st(1)
	jns	4f
	fchs
4:	ret

	/* Return the argument.  */
1:	fldl	4(%esp)
	ret
END(__cbrt)
libm_alias_double (__cbrt, cbrt)
Commit	Line	Data
26dee9c4	1	/* Compute cubic root of double value.
dff8da6b	2	Copyright (C) 1997-2024 Free Software Foundation, Inc.
26dee9c4	3	This file is part of the GNU C Library.
26dee9c4 UD	4
26dee9c4 UD	5	The GNU C Library is free software; you can redistribute it and/or
41bdb6e2 AJ	6	modify it under the terms of the GNU Lesser General Public
	7	License as published by the Free Software Foundation; either
	8	version 2.1 of the License, or (at your option) any later version.
26dee9c4 UD	9
	10	The GNU C Library is distributed in the hope that it will be useful,
	11	but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
41bdb6e2	13	Lesser General Public License for more details.
26dee9c4	14
41bdb6e2	15	You should have received a copy of the GNU Lesser General Public
59ba27a6	16	License along with the GNU C Library; if not, see
5a82c748	17	<https://www.gnu.org/licenses/>. */
26dee9c4 UD	18
26dee9c4 UD	19	#include <machine/asm.h>
bc4e8f9b	20	#include <libm-alias-double.h>
26dee9c4	21
26dee9c4	22	.section .rodata
26dee9c4 UD	23
26dee9c4 UD	24	.align ALIGNARG(4)
b67e9372	25	.type f7,@object
26dee9c4 UD	26	f7: .double -0.145263899385486377
26dee9c4 UD	27	ASM_SIZE_DIRECTIVE(f7)
b67e9372	28	.type f6,@object
0413b54c UD	29	f6: .double 0.784932344976639262
0413b54c UD	30	ASM_SIZE_DIRECTIVE(f6)
b67e9372	31	.type f5,@object
0413b54c UD	32	f5: .double -1.83469277483613086
0413b54c UD	33	ASM_SIZE_DIRECTIVE(f5)
b67e9372	34	.type f4,@object
0413b54c UD	35	f4: .double 2.44693122563534430
0413b54c UD	36	ASM_SIZE_DIRECTIVE(f4)
b67e9372	37	.type f3,@object
0413b54c UD	38	f3: .double -2.11499494167371287
0413b54c UD	39	ASM_SIZE_DIRECTIVE(f3)
b67e9372	40	.type f2,@object
0413b54c UD	41	f2: .double 1.50819193781584896
0413b54c UD	42	ASM_SIZE_DIRECTIVE(f2)
b67e9372	43	.type f1,@object
0413b54c UD	44	f1: .double 0.354895765043919860
0413b54c UD	45	ASM_SIZE_DIRECTIVE(f1)
26dee9c4	46
0413b54c UD	47	#define CBRT2 1.2599210498948731648
	48	#define ONE_CBRT2 0.793700525984099737355196796584
	49	#define SQR_CBRT2 1.5874010519681994748
	50	#define ONE_SQR_CBRT2 0.629960524947436582364439673883
26dee9c4	51
b67e9372	52	.type factor,@object
0413b54c UD	53	factor: .double ONE_SQR_CBRT2
0413b54c UD	54	.double ONE_CBRT2
26dee9c4 UD	55	.double 1.0
	56	.double CBRT2
	57	.double SQR_CBRT2
	58	ASM_SIZE_DIRECTIVE(factor)
	59
b67e9372	60	.type two54,@object
26dee9c4 UD	61	two54: .byte 0, 0, 0, 0, 0, 0, 0x50, 0x43
	62	ASM_SIZE_DIRECTIVE(two54)
	63
	64	#ifdef PIC
	65	#define MO(op) op##@GOTOFF(%ebx)
0413b54c	66	#define MOX(op,x) op##@GOTOFF(%ebx,x,1)
26dee9c4 UD	67	#else
26dee9c4 UD	68	#define MO(op) op
0413b54c	69	#define MOX(op,x) op(x)
26dee9c4 UD	70	#endif
	71
	72	.text
	73	ENTRY(__cbrt)
	74	movl 4(%esp), %ecx
	75	movl 8(%esp), %eax
	76	movl %eax, %edx
	77	andl $0x7fffffff, %eax
	78	orl %eax, %ecx
	79	jz 1f
	80	xorl %ecx, %ecx
	81	cmpl $0x7ff00000, %eax
	82	jae 1f
	83
	84	#ifdef PIC
	85	pushl %ebx
fee732e5 UD	86	cfi_adjust_cfa_offset (4)
	87	cfi_rel_offset (ebx, 0)
	88	LOAD_PIC_REG (bx)
26dee9c4 UD	89	#endif
	90
	91	cmpl $0x00100000, %eax
	92	jae 2f
	93
	94	#ifdef PIC
	95	fldl 8(%esp)
	96	#else
	97	fldl 4(%esp)
	98	#endif
	99	fmull MO(two54)
	100	movl $-54, %ecx
0413b54c UD	101	#ifdef PIC
	102	fstpl 8(%esp)
	103	movl 12(%esp), %eax
	104	#else
26dee9c4 UD	105	fstpl 4(%esp)
26dee9c4 UD	106	movl 8(%esp), %eax
0413b54c	107	#endif
26dee9c4 UD	108	movl %eax, %edx
	109	andl $0x7fffffff, %eax
	110
	111	2: shrl $20, %eax
	112	andl $0x800fffff, %edx
	113	subl $1022, %eax
	114	orl $0x3fe00000, %edx
	115	addl %eax, %ecx
	116	#ifdef PIC
	117	movl %edx, 12(%esp)
	118
	119	fldl 8(%esp) /* xm */
	120	#else
	121	movl %edx, 8(%esp)
	122
	123	fldl 4(%esp) /* xm */
	124	#endif
	125	fabs
	126
0413b54c	127	/* The following code has two tracks:
26dee9c4 UD	128	a) compute the normalized cbrt value
	129	b) compute xe/3 and xe%3
	130	The right track computes the value for b) and this is done
0413b54c UD	131	in an optimized way by avoiding division.
	132
	133	But why two tracks at all? Very easy: efficiency. Some FP
	134	instruction can overlap with a certain amount of integer (and
	135	FP) instructions. So we get (except for the imull) all
	136	instructions for free. */
26dee9c4 UD	137
	138	fld %st(0) /* xm : xm */
	139
	140	fmull MO(f7) /* f7xm : xm /
	141	movl $1431655766, %eax
	142	faddl MO(f6) /* f6+f7xm : xm /
	143	imull %ecx
	144	fmul %st(1) /* (f6+f7xm)xm : xm */
	145	movl %ecx, %eax
	146	faddl MO(f5) /* f5+(f6+f7xm)xm : xm */
	147	sarl $31, %eax
	148	fmul %st(1) /* (f5+(f6+f7xm)xm)xm : xm /
	149	subl %eax, %edx
	150	faddl MO(f4) /* f4+(f5+(f6+f7xm)xm)xm : xm /
	151	fmul %st(1) /* (f4+(f5+(f6+f7xm)xm)xm)xm : xm */
	152	faddl MO(f3) /* f3+(f4+(f5+(f6+f7xm)xm)xm)xm : xm */
	153	fmul %st(1) /* (f3+(f4+(f5+(f6+f7xm)xm)xm)xm)xm : xm /
	154	faddl MO(f2) /* f2+(f3+(f4+(f5+(f6+f7xm)xm)xm)xm)xm : xm /
	155	fmul %st(1) /* (f2+(f3+(f4+(f5+(f6+f7xm)xm)xm)xm)xm)xm : xm */
	156	faddl MO(f1) /* u:=f1+(f2+(f3+(f4+(f5+(f6+f7xm)xm)xm)xm)xm)xm : xm */
	157
	158	fld %st /* u : u : xm */
	159	fmul %st(1) /* uu : u : xm /
	160	fld %st(2) /* xm : uu : u : xm /
	161	fadd %st /* 2xm : uu : u : xm */
	162	fxch %st(1) /* uu : 2xm : u : xm */
	163	fmul %st(2) /* t2:=uuu : 2xm : u : xm /
	164	movl %edx, %eax
	165	fadd %st, %st(1) /* t2 : t2+2xm : u : xm /
	166	leal (%edx,%edx,2),%edx
	167	fadd %st(0) /* 2t2 : t2+2xm : u : xm */
	168	subl %edx, %ecx
	169	faddp %st, %st(3) /* t2+2xm : u : 2t2+xm */
0413b54c	170	shll $3, %ecx
26dee9c4 UD	171	fmulp /* u(t2+2xm) : 2t2+xm /
26dee9c4 UD	172	fdivp %st, %st(1) /* u(t2+2xm)/(2t2+xm) /
0413b54c	173	fmull MOX(16+factor,%ecx) /* u(t2+2xm)/(2t2+xm)FACT */
26dee9c4	174	pushl %eax
fee732e5	175	cfi_adjust_cfa_offset (4)
26dee9c4 UD	176	fildl (%esp) /* xe/3 : u(t2+2xm)/(2t2+xm)FACT */
26dee9c4 UD	177	fxch /* u(t2+2xm)/(2t2+xm)FACT : xe/3 */
26dee9c4	178	fscale /* u(t2+2xm)/(2t2+xm)FACT2^xe/3 /
0413b54c	179	popl %edx
fee732e5	180	cfi_adjust_cfa_offset (-4)
26dee9c4	181	#ifdef PIC
0413b54c	182	movl 12(%esp), %eax
26dee9c4	183	popl %ebx
fee732e5 UD	184	cfi_adjust_cfa_offset (-4)
fee732e5 UD	185	cfi_restore (ebx)
0413b54c UD	186	#else
0413b54c UD	187	movl 8(%esp), %eax
26dee9c4	188	#endif
0413b54c UD	189	testl %eax, %eax
	190	fstp %st(1)
	191	jns 4f
26dee9c4 UD	192	fchs
	193	4: ret
	194
	195	/* Return the argument. */
	196	1: fldl 4(%esp)
	197	ret
	198	END(__cbrt)
bc4e8f9b	199	libm_alias_double (__cbrt, cbrt)