]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/i386/fpu/s_cbrtl.S
9c65bc50f393df3468dcc5b32f64e1525427c414
[thirdparty/glibc.git] / sysdeps / i386 / fpu / s_cbrtl.S
1 /* Compute cubic root of long double value.
2 Copyright (C) 1997-2019 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Dirk Alboth <dirka@uni-paderborn.de> and
5 Ulrich Drepper <drepper@cygnus.com>, 1997.
6
7 The GNU C Library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 2.1 of the License, or (at your option) any later version.
11
12 The GNU C Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public
18 License along with the GNU C Library; if not, see
19 <http://www.gnu.org/licenses/>. */
20
21 #include <libm-alias-ldouble.h>
22 #include <machine/asm.h>
23
24 .section .rodata
25
26 .align ALIGNARG(4)
27 .type f8,@object
28 f8: .tfloat 0.161617097923756032
29 ASM_SIZE_DIRECTIVE(f8)
30 .align ALIGNARG(4)
31 .type f7,@object
32 f7: .tfloat -0.988553671195413709
33 ASM_SIZE_DIRECTIVE(f7)
34 .align ALIGNARG(4)
35 .type f6,@object
36 f6: .tfloat 2.65298938441952296
37 ASM_SIZE_DIRECTIVE(f6)
38 .align ALIGNARG(4)
39 .type f5,@object
40 f5: .tfloat -4.11151425200350531
41 ASM_SIZE_DIRECTIVE(f5)
42 .align ALIGNARG(4)
43 .type f4,@object
44 f4: .tfloat 4.09559907378707839
45 ASM_SIZE_DIRECTIVE(f4)
46 .align ALIGNARG(4)
47 .type f3,@object
48 f3: .tfloat -2.82414939754975962
49 ASM_SIZE_DIRECTIVE(f3)
50 .align ALIGNARG(4)
51 .type f2,@object
52 f2: .tfloat 1.67595307700780102
53 ASM_SIZE_DIRECTIVE(f2)
54 .align ALIGNARG(4)
55 .type f1,@object
56 f1: .tfloat 0.338058687610520237
57 ASM_SIZE_DIRECTIVE(f1)
58
59 #define CBRT2 1.2599210498948731648
60 #define ONE_CBRT2 0.793700525984099737355196796584
61 #define SQR_CBRT2 1.5874010519681994748
62 #define ONE_SQR_CBRT2 0.629960524947436582364439673883
63
64 /* We make the entries in the following table all 16 bytes
65 wide to avoid having to implement a multiplication by 10. */
66 .type factor,@object
67 .align ALIGNARG(4)
68 factor: .tfloat ONE_SQR_CBRT2
69 .byte 0, 0, 0, 0, 0, 0
70 .tfloat ONE_CBRT2
71 .byte 0, 0, 0, 0, 0, 0
72 .tfloat 1.0
73 .byte 0, 0, 0, 0, 0, 0
74 .tfloat CBRT2
75 .byte 0, 0, 0, 0, 0, 0
76 .tfloat SQR_CBRT2
77 ASM_SIZE_DIRECTIVE(factor)
78
79 .type two64,@object
80 .align ALIGNARG(4)
81 two64: .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43
82 ASM_SIZE_DIRECTIVE(two64)
83
84 #ifdef PIC
85 #define MO(op) op##@GOTOFF(%ebx)
86 #define MOX(op,x) op##@GOTOFF(%ebx,x,1)
87 #else
88 #define MO(op) op
89 #define MOX(op,x) op(x)
90 #endif
91
92 .text
93 ENTRY(__cbrtl)
94 movl 4(%esp), %ecx
95 movl 12(%esp), %eax
96 orl 8(%esp), %ecx
97 movl %eax, %edx
98 andl $0x7fff, %eax
99 orl %eax, %ecx
100 jz 1f
101 xorl %ecx, %ecx
102 cmpl $0x7fff, %eax
103 je 1f
104
105 #ifdef PIC
106 pushl %ebx
107 cfi_adjust_cfa_offset (4)
108 cfi_rel_offset (ebx, 0)
109 LOAD_PIC_REG (bx)
110 #endif
111
112 cmpl $0, %eax
113 jne 2f
114
115 #ifdef PIC
116 fldt 8(%esp)
117 #else
118 fldt 4(%esp)
119 #endif
120 fmull MO(two64)
121 movl $-64, %ecx
122 #ifdef PIC
123 fstpt 8(%esp)
124 movl 16(%esp), %eax
125 #else
126 fstpt 4(%esp)
127 movl 12(%esp), %eax
128 #endif
129 movl %eax, %edx
130 andl $0x7fff, %eax
131
132 2: andl $0x8000, %edx
133 subl $16382, %eax
134 orl $0x3ffe, %edx
135 addl %eax, %ecx
136 #ifdef PIC
137 movl %edx, 16(%esp)
138
139 fldt 8(%esp) /* xm */
140 #else
141 movl %edx, 12(%esp)
142
143 fldt 4(%esp) /* xm */
144 #endif
145 fabs
146
147 /* The following code has two tracks:
148 a) compute the normalized cbrt value
149 b) compute xe/3 and xe%3
150 The right track computes the value for b) and this is done
151 in an optimized way by avoiding division.
152
153 But why two tracks at all? Very easy: efficiency. Some FP
154 instruction can overlap with a certain amount of integer (and
155 FP) instructions. So we get (except for the imull) all
156 instructions for free. */
157
158 fldt MO(f8) /* f8 : xm */
159 fmul %st(1) /* f8*xm : xm */
160
161 fldt MO(f7)
162 faddp /* f7+f8*xm : xm */
163 fmul %st(1) /* (f7+f8*xm)*xm : xm */
164 movl $1431655766, %eax
165 fldt MO(f6)
166 faddp /* f6+(f7+f8*xm)*xm : xm */
167 imull %ecx
168 fmul %st(1) /* (f6+(f7+f8*xm)*xm)*xm : xm */
169 movl %ecx, %eax
170 fldt MO(f5)
171 faddp /* f5+(f6+(f7+f8*xm)*xm)*xm : xm */
172 sarl $31, %eax
173 fmul %st(1) /* (f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */
174 subl %eax, %edx
175 fldt MO(f4)
176 faddp /* f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */
177 fmul %st(1) /* (f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */
178 fldt MO(f3)
179 faddp /* f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */
180 fmul %st(1) /* (f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */
181 fldt MO(f2)
182 faddp /* f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */
183 fmul %st(1) /* (f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */
184 fldt MO(f1)
185 faddp /* u:=f1+(f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */
186
187 fld %st /* u : u : xm */
188 fmul %st(1) /* u*u : u : xm */
189 fld %st(2) /* xm : u*u : u : xm */
190 fadd %st /* 2*xm : u*u : u : xm */
191 fxch %st(1) /* u*u : 2*xm : u : xm */
192 fmul %st(2) /* t2:=u*u*u : 2*xm : u : xm */
193 movl %edx, %eax
194 fadd %st, %st(1) /* t2 : t2+2*xm : u : xm */
195 leal (%edx,%edx,2),%edx
196 fadd %st(0) /* 2*t2 : t2+2*xm : u : xm */
197 subl %edx, %ecx
198 faddp %st, %st(3) /* t2+2*xm : u : 2*t2+xm */
199 shll $4, %ecx
200 fmulp /* u*(t2+2*xm) : 2*t2+xm */
201 fdivp %st, %st(1) /* u*(t2+2*xm)/(2*t2+xm) */
202 fldt MOX(32+factor,%ecx)
203 fmulp /* u*(t2+2*xm)/(2*t2+xm)*FACT */
204 pushl %eax
205 cfi_adjust_cfa_offset (4)
206 fildl (%esp) /* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */
207 fxch /* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */
208 fscale /* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */
209 popl %edx
210 cfi_adjust_cfa_offset (-4)
211 #ifdef PIC
212 movl 16(%esp), %eax
213 popl %ebx
214 cfi_adjust_cfa_offset (-4)
215 cfi_restore (ebx)
216 #else
217 movl 12(%esp), %eax
218 #endif
219 testl $0x8000, %eax
220 fstp %st(1)
221 jz 4f
222 fchs
223 4: ret
224
225 /* Return the argument. */
226 1: fldt 4(%esp)
227 fadd %st
228 ret
229 END(__cbrtl)
230 libm_alias_ldouble (__cbrt, cbrt)