]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/i386/i686/fpu/multiarch/e_expf-sse2.S
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / i386 / i686 / fpu / multiarch / e_expf-sse2.S
1 /* SSE2 version of __ieee754_expf and __expf_finite
2 Copyright (C) 2012-2015 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
18
19
20 #include <sysdep.h>
21
22 /* Short algorithm description:
23 *
24 * Let K = 64 (table size).
25 * e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
26 * where
27 * x = m*log(2)/K + y, y in [0.0..log(2)/K]
28 * m = n*K + j, m,n,j - signed integer, j in [0..K-1]
29 * values of 2^(j/K) are tabulated as T[j].
30 *
31 * P(y) is a minimax polynomial approximation of expf(x)-1
32 * on small interval [0.0..log(2)/K].
33 *
34 * P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as
35 * z = y*y; P(y) = (P3*z + P1)*z + (P2*z + P0)*y
36 *
37 * Special cases:
38 * __ieee754_expf_sse2(NaN) = NaN
39 * __ieee754_expf_sse2(+INF) = +INF
40 * __ieee754_expf_sse2(-INF) = 0
41 * __ieee754_expf_sse2(x) = 1 for subnormals
42 * for finite argument, only __ieee754_expf_sse2(0)=1 is exact
43 * __ieee754_expf_sse2(x) overflows if x>700
44 * __ieee754_expf_sse2(x) underflows if x<-700
45 *
46 * Note:
47 * For |x|<700, __ieee754_expf_sse2 computes result in double precision,
48 * with accuracy a bit more than needed for expf, and does not round it
49 * to single precision.
50 */
51
52
53 #ifdef PIC
54 # define MO1(symbol) L(symbol)##@GOTOFF(%edx)
55 # define MO2(symbol,reg2,_scale) L(symbol)##@GOTOFF(%edx,reg2,_scale)
56 #else
57 # define MO1(symbol) L(symbol)
58 # define MO2(symbol,reg2,_scale) L(symbol)(,reg2,_scale)
59 #endif
60
61 .text
62 ENTRY(__ieee754_expf_sse2)
63 /* Input: single precision x on stack at address 4(%esp) */
64
65 #ifdef PIC
66 LOAD_PIC_REG(dx)
67 #endif
68
69 cvtss2sd 4(%esp), %xmm1 /* Convert x to double precision */
70 mov 4(%esp), %ecx /* Copy x */
71 movsd MO1(DP_KLN2), %xmm2 /* DP K/log(2) */
72 movsd MO1(DP_P2), %xmm3 /* DP P2 */
73 movl %ecx, %eax /* x */
74 mulsd %xmm1, %xmm2 /* DP x*K/log(2) */
75 andl $0x7fffffff, %ecx /* |x| */
76 cmpl $0x442f0000, %ecx /* |x|<700 ? */
77 movsd MO1(DP_P3), %xmm4 /* DP P3 */
78 addsd MO1(DP_RS), %xmm2 /* DP x*K/log(2)+RS */
79 jae L(special_paths)
80
81 /* Here if |x|<700 */
82 cmpl $0x31800000, %ecx /* |x|<2^(-28) ? */
83 jb L(small_arg)
84
85 /* Main path: here if 2^(-28)<=|x|<700 */
86 cvtsd2ss %xmm2, %xmm2 /* SP x*K/log(2)+RS */
87 movd %xmm2, %eax /* bits of n*K+j with trash */
88 subss MO1(SP_RS), %xmm2 /* SP t=round(x*K/log(2)) */
89 movl %eax, %ecx /* n*K+j with trash */
90 cvtss2sd %xmm2, %xmm2 /* DP t */
91 andl $0x3f, %eax /* bits of j */
92 mulsd MO1(DP_NLN2K), %xmm2 /* DP -t*log(2)/K */
93 andl $0xffffffc0, %ecx /* bits of n */
94 #ifdef __AVX__
95 vaddsd %xmm1, %xmm2, %xmm0 /* DP y=x-t*log(2)/K */
96 vmulsd %xmm0, %xmm0, %xmm2 /* DP z=y*y */
97 #else
98 addsd %xmm1, %xmm2 /* DP y=x-t*log(2)/K */
99 movaps %xmm2, %xmm0 /* DP y */
100 mulsd %xmm2, %xmm2 /* DP z=y*y */
101 #endif
102 mulsd %xmm2, %xmm4 /* DP P3*z */
103 addl $0xffc0, %ecx /* bits of n + DP exponent bias */
104 mulsd %xmm2, %xmm3 /* DP P2*z */
105 shrl $2, %ecx /* High 2 bytes of DP 2^n */
106 pxor %xmm1, %xmm1 /* clear %xmm1 */
107 addsd MO1(DP_P1), %xmm4 /* DP P3*z+P1 */
108 addsd MO1(DP_P0), %xmm3 /* DP P2*z+P0 */
109 pinsrw $3, %ecx, %xmm1 /* DP 2^n */
110 mulsd %xmm2, %xmm4 /* DP (P3*z+P1)*z */
111 mulsd %xmm3, %xmm0 /* DP (P2*z+P0)*y */
112 addsd %xmm4, %xmm0 /* DP P(y) */
113 mulsd MO2(DP_T,%eax,8), %xmm0 /* DP P(y)*T[j] */
114 addsd MO2(DP_T,%eax,8), %xmm0 /* DP T[j]*(P(y)+1) */
115 mulsd %xmm1, %xmm0 /* DP result=2^n*(T[j]*(P(y)+1)) */
116
117 lea -8(%esp), %esp /* Borrow 8 bytes of stack frame */
118 movsd %xmm0, 0(%esp) /* Move result from sse... */
119 fldl 0(%esp) /* ...to FPU. */
120 lea 8(%esp), %esp /* Return back 8 bytes of stack frame */
121 ret
122
123 .p2align 4
124 L(small_arg):
125 /* Here if 0<=|x|<2^(-28) */
126 movss 4(%esp), %xmm0 /* load x */
127 addss MO1(SP_ONE), %xmm0 /* 1.0 + x */
128 /* Return 1.0 with inexact raised, except for x==0 */
129 jmp L(epilogue)
130
131 .p2align 4
132 L(special_paths):
133 /* Here if x is NaN, or Inf, or finite |x|>=700 */
134 movss 4(%esp), %xmm0 /* load x */
135
136 cmpl $0x7f800000, %ecx /* |x| is finite ? */
137 jae L(arg_inf_or_nan)
138
139 /* Here if finite |x|>=700 */
140 testl $0x80000000, %eax /* sign of x nonzero ? */
141 je L(res_overflow)
142
143 /* Here if finite x<=-700 */
144 movss MO1(SP_SMALL), %xmm0 /* load small value 2^(-100) */
145 mulss %xmm0, %xmm0 /* Return underflowed result (zero or subnormal) */
146 jmp L(epilogue)
147
148 .p2align 4
149 L(res_overflow):
150 /* Here if finite x>=700 */
151 movss MO1(SP_LARGE), %xmm0 /* load large value 2^100 */
152 mulss %xmm0, %xmm0 /* Return overflowed result (Inf or max normal) */
153 jmp L(epilogue)
154
155 .p2align 4
156 L(arg_inf_or_nan):
157 /* Here if |x| is Inf or NAN */
158 jne L(arg_nan) /* |x| is Inf ? */
159
160 /* Here if |x| is Inf */
161 shrl $31, %eax /* Get sign bit of x */
162 movss MO2(SP_INF_0,%eax,4), %xmm0/* return zero or Inf, depending on sign of x */
163 jmp L(epilogue)
164
165 .p2align 4
166 L(arg_nan):
167 /* Here if |x| is NaN */
168 addss %xmm0, %xmm0 /* Return x+x (raise invalid) */
169
170 .p2align 4
171 L(epilogue):
172 lea -4(%esp), %esp /* Borrow 4 bytes of stack frame */
173 movss %xmm0, 0(%esp) /* Move result from sse... */
174 flds 0(%esp) /* ...to FPU. */
175 lea 4(%esp), %esp /* Return back 4 bytes of stack frame */
176 ret
177 END(__ieee754_expf_sse2)
178
179 .section .rodata, "a"
180 .p2align 3
181 L(DP_T): /* table of double precision values 2^(j/K) for j=[0..K-1] */
182 .long 0x00000000, 0x3ff00000
183 .long 0x3e778061, 0x3ff02c9a
184 .long 0xd3158574, 0x3ff059b0
185 .long 0x18759bc8, 0x3ff08745
186 .long 0x6cf9890f, 0x3ff0b558
187 .long 0x32d3d1a2, 0x3ff0e3ec
188 .long 0xd0125b51, 0x3ff11301
189 .long 0xaea92de0, 0x3ff1429a
190 .long 0x3c7d517b, 0x3ff172b8
191 .long 0xeb6fcb75, 0x3ff1a35b
192 .long 0x3168b9aa, 0x3ff1d487
193 .long 0x88628cd6, 0x3ff2063b
194 .long 0x6e756238, 0x3ff2387a
195 .long 0x65e27cdd, 0x3ff26b45
196 .long 0xf51fdee1, 0x3ff29e9d
197 .long 0xa6e4030b, 0x3ff2d285
198 .long 0x0a31b715, 0x3ff306fe
199 .long 0xb26416ff, 0x3ff33c08
200 .long 0x373aa9cb, 0x3ff371a7
201 .long 0x34e59ff7, 0x3ff3a7db
202 .long 0x4c123422, 0x3ff3dea6
203 .long 0x21f72e2a, 0x3ff4160a
204 .long 0x6061892d, 0x3ff44e08
205 .long 0xb5c13cd0, 0x3ff486a2
206 .long 0xd5362a27, 0x3ff4bfda
207 .long 0x769d2ca7, 0x3ff4f9b2
208 .long 0x569d4f82, 0x3ff5342b
209 .long 0x36b527da, 0x3ff56f47
210 .long 0xdd485429, 0x3ff5ab07
211 .long 0x15ad2148, 0x3ff5e76f
212 .long 0xb03a5585, 0x3ff6247e
213 .long 0x82552225, 0x3ff66238
214 .long 0x667f3bcd, 0x3ff6a09e
215 .long 0x3c651a2f, 0x3ff6dfb2
216 .long 0xe8ec5f74, 0x3ff71f75
217 .long 0x564267c9, 0x3ff75feb
218 .long 0x73eb0187, 0x3ff7a114
219 .long 0x36cf4e62, 0x3ff7e2f3
220 .long 0x994cce13, 0x3ff82589
221 .long 0x9b4492ed, 0x3ff868d9
222 .long 0x422aa0db, 0x3ff8ace5
223 .long 0x99157736, 0x3ff8f1ae
224 .long 0xb0cdc5e5, 0x3ff93737
225 .long 0x9fde4e50, 0x3ff97d82
226 .long 0x82a3f090, 0x3ff9c491
227 .long 0x7b5de565, 0x3ffa0c66
228 .long 0xb23e255d, 0x3ffa5503
229 .long 0x5579fdbf, 0x3ffa9e6b
230 .long 0x995ad3ad, 0x3ffae89f
231 .long 0xb84f15fb, 0x3ffb33a2
232 .long 0xf2fb5e47, 0x3ffb7f76
233 .long 0x904bc1d2, 0x3ffbcc1e
234 .long 0xdd85529c, 0x3ffc199b
235 .long 0x2e57d14b, 0x3ffc67f1
236 .long 0xdcef9069, 0x3ffcb720
237 .long 0x4a07897c, 0x3ffd072d
238 .long 0xdcfba487, 0x3ffd5818
239 .long 0x03db3285, 0x3ffda9e6
240 .long 0x337b9b5f, 0x3ffdfc97
241 .long 0xe78b3ff6, 0x3ffe502e
242 .long 0xa2a490da, 0x3ffea4af
243 .long 0xee615a27, 0x3ffefa1b
244 .long 0x5b6e4540, 0x3fff5076
245 .long 0x819e90d8, 0x3fffa7c1
246 .type L(DP_T), @object
247 ASM_SIZE_DIRECTIVE(L(DP_T))
248
249 .section .rodata.cst8,"aM",@progbits,8
250 .p2align 3
251 L(DP_KLN2): /* double precision K/log(2) */
252 .long 0x652b82fe, 0x40571547
253 .type L(DP_KLN2), @object
254 ASM_SIZE_DIRECTIVE(L(DP_KLN2))
255
256 .p2align 3
257 L(DP_NLN2K): /* double precision -log(2)/K */
258 .long 0xfefa39ef, 0xbf862e42
259 .type L(DP_NLN2K), @object
260 ASM_SIZE_DIRECTIVE(L(DP_NLN2K))
261
262 .p2align 3
263 L(DP_RS): /* double precision 2^23+2^22 */
264 .long 0x00000000, 0x41680000
265 .type L(DP_RS), @object
266 ASM_SIZE_DIRECTIVE(L(DP_RS))
267
268 .p2align 3
269 L(DP_P3): /* double precision polynomial coefficient P3 */
270 .long 0xeb78fa85, 0x3fa56420
271 .type L(DP_P3), @object
272 ASM_SIZE_DIRECTIVE(L(DP_P3))
273
274 .p2align 3
275 L(DP_P1): /* double precision polynomial coefficient P1 */
276 .long 0x008d6118, 0x3fe00000
277 .type L(DP_P1), @object
278 ASM_SIZE_DIRECTIVE(L(DP_P1))
279
280 .p2align 3
281 L(DP_P2): /* double precision polynomial coefficient P2 */
282 .long 0xda752d4f, 0x3fc55550
283 .type L(DP_P2), @object
284 ASM_SIZE_DIRECTIVE(L(DP_P2))
285
286 .p2align 3
287 L(DP_P0): /* double precision polynomial coefficient P0 */
288 .long 0xffffe7c6, 0x3fefffff
289 .type L(DP_P0), @object
290 ASM_SIZE_DIRECTIVE(L(DP_P0))
291
292 .p2align 2
293 L(SP_INF_0):
294 .long 0x7f800000 /* single precision Inf */
295 .long 0 /* single precision zero */
296 .type L(SP_INF_0), @object
297 ASM_SIZE_DIRECTIVE(L(SP_INF_0))
298
299 .section .rodata.cst4,"aM",@progbits,4
300 .p2align 2
301 L(SP_RS): /* single precision 2^23+2^22 */
302 .long 0x4b400000
303 .type L(SP_RS), @object
304 ASM_SIZE_DIRECTIVE(L(SP_RS))
305
306 .p2align 2
307 L(SP_SMALL): /* single precision small value 2^(-100) */
308 .long 0x0d800000
309 .type L(SP_SMALL), @object
310 ASM_SIZE_DIRECTIVE(L(SP_SMALL))
311
312 .p2align 2
313 L(SP_LARGE): /* single precision large value 2^100 */
314 .long 0x71800000
315 .type L(SP_LARGE), @object
316 ASM_SIZE_DIRECTIVE(L(SP_LARGE))
317
318 .p2align 2
319 L(SP_ONE): /* single precision 1.0 */
320 .long 0x3f800000
321 .type L(SP_ONE), @object
322 ASM_SIZE_DIRECTIVE(L(SP_ONE))
323
324 strong_alias (__ieee754_expf_sse2, __expf_finite_sse2)