]>
Commit | Line | Data |
---|---|---|
30a4e827 MT |
1 | diff -Nrup a/sysdeps/x86_64/fpu/e_expf.S b/sysdeps/x86_64/fpu/e_expf.S |
2 | --- a/sysdeps/x86_64/fpu/e_expf.S 1969-12-31 17:00:00.000000000 -0700 | |
3 | +++ b/sysdeps/x86_64/fpu/e_expf.S 2012-08-20 09:47:15.551971545 -0600 | |
4 | @@ -0,0 +1,339 @@ | |
5 | +/* Optimized __ieee754_expf function. | |
6 | + Copyright (C) 2012 Free Software Foundation, Inc. | |
7 | + Contributed by Intel Corporation. | |
8 | + This file is part of the GNU C Library. | |
9 | + | |
10 | + The GNU C Library is free software; you can redistribute it and/or | |
11 | + modify it under the terms of the GNU Lesser General Public | |
12 | + License as published by the Free Software Foundation; either | |
13 | + version 2.1 of the License, or (at your option) any later version. | |
14 | + | |
15 | + The GNU C Library is distributed in the hope that it will be useful, | |
16 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
18 | + Lesser General Public License for more details. | |
19 | + | |
20 | + You should have received a copy of the GNU Lesser General Public | |
21 | + License along with the GNU C Library; if not, see | |
22 | + <http://www.gnu.org/licenses/>. */ | |
23 | + | |
24 | +#include <sysdep.h> | |
25 | + | |
26 | +/* Short algorithm description: | |
27 | + * | |
28 | + * Let K = 64 (table size). | |
29 | + * e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y)) | |
30 | + * where | |
31 | + * x = m*log(2)/K + y, y in [0.0..log(2)/K] | |
32 | + * m = n*K + j, m,n,j - signed integer, j in [0..K-1] | |
33 | + * values of 2^(j/K) are tabulated as T[j]. | |
34 | + * | |
35 | + * P(y) is a minimax polynomial approximation of expf(x)-1 | |
36 | + * on small interval [0.0..log(2)/K]. | |
37 | + * | |
38 | + * P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as | |
39 | + * z = y*y; P(y) = (P3*z + P1)*z + (P2*z + P0)*y | |
40 | + * | |
41 | + * Special cases: | |
42 | + * expf(NaN) = NaN | |
43 | + * expf(+INF) = +INF | |
44 | + * expf(-INF) = 0 | |
45 | + * expf(x) = 1 for subnormals | |
46 | + * for finite argument, only expf(0)=1 is exact | |
47 | + * expf(x) overflows if x>88.7228317260742190 | |
48 | + * expf(x) underflows if x<-103.972076416015620 | |
49 | + */ | |
50 | + | |
51 | + .text | |
52 | +ENTRY(__ieee754_expf) | |
53 | + /* Input: single precision x in %xmm0 */ | |
54 | + cvtss2sd %xmm0, %xmm1 /* Convert x to double precision */ | |
55 | + movd %xmm0, %ecx /* Copy x */ | |
56 | + movsd L(DP_KLN2)(%rip), %xmm2 /* DP K/log(2) */ | |
57 | + movsd L(DP_P2)(%rip), %xmm3 /* DP P2 */ | |
58 | + movl %ecx, %eax /* x */ | |
59 | + mulsd %xmm1, %xmm2 /* DP x*K/log(2) */ | |
60 | + andl $0x7fffffff, %ecx /* |x| */ | |
61 | + lea L(DP_T)(%rip), %rsi /* address of table T[j] */ | |
62 | + cmpl $0x42ad496b, %ecx /* |x|<125*log(2) ? */ | |
63 | + movsd L(DP_P3)(%rip), %xmm4 /* DP P3 */ | |
64 | + addsd L(DP_RS)(%rip), %xmm2 /* DP x*K/log(2)+RS */ | |
65 | + jae L(special_paths) | |
66 | + | |
67 | + /* Here if |x|<125*log(2) */ | |
68 | + cmpl $0x31800000, %ecx /* |x|<2^(-28) ? */ | |
69 | + jb L(small_arg) | |
70 | + | |
71 | + /* Main path: here if 2^(-28)<=|x|<125*log(2) */ | |
72 | + cvtsd2ss %xmm2, %xmm2 /* SP x*K/log(2)+RS */ | |
73 | + movd %xmm2, %eax /* bits of n*K+j with trash */ | |
74 | + subss L(SP_RS)(%rip), %xmm2 /* SP t=round(x*K/log(2)) */ | |
75 | + movl %eax, %edx /* n*K+j with trash */ | |
76 | + cvtss2sd %xmm2, %xmm2 /* DP t */ | |
77 | + andl $0x3f, %eax /* bits of j */ | |
78 | + mulsd L(DP_NLN2K)(%rip), %xmm2/* DP -t*log(2)/K */ | |
79 | + andl $0xffffffc0, %edx /* bits of n */ | |
80 | +#ifdef __AVX__ | |
81 | + vaddsd %xmm1, %xmm2, %xmm0 /* DP y=x-t*log(2)/K */ | |
82 | + vmulsd %xmm0, %xmm0, %xmm2 /* DP z=y*y */ | |
83 | +#else | |
84 | + addsd %xmm1, %xmm2 /* DP y=x-t*log(2)/K */ | |
85 | + movaps %xmm2, %xmm0 /* DP y */ | |
86 | + mulsd %xmm2, %xmm2 /* DP z=y*y */ | |
87 | +#endif | |
88 | + mulsd %xmm2, %xmm4 /* DP P3*z */ | |
89 | + addl $0x1fc0, %edx /* bits of n + SP exponent bias */ | |
90 | + mulsd %xmm2, %xmm3 /* DP P2*z */ | |
91 | + shll $17, %edx /* SP 2^n */ | |
92 | + addsd L(DP_P1)(%rip), %xmm4 /* DP P3*z+P1 */ | |
93 | + addsd L(DP_P0)(%rip), %xmm3 /* DP P2*z+P0 */ | |
94 | + movd %edx, %xmm1 /* SP 2^n */ | |
95 | + mulsd %xmm2, %xmm4 /* DP (P3*z+P1)*z */ | |
96 | + mulsd %xmm3, %xmm0 /* DP (P2*z+P0)*y */ | |
97 | + addsd %xmm4, %xmm0 /* DP P(y) */ | |
98 | + mulsd (%rsi,%rax,8), %xmm0 /* DP P(y)*T[j] */ | |
99 | + addsd (%rsi,%rax,8), %xmm0 /* DP T[j]*(P(y)+1) */ | |
100 | + cvtsd2ss %xmm0, %xmm0 /* SP T[j]*(P(y)+1) */ | |
101 | + mulss %xmm1, %xmm0 /* SP result=2^n*(T[j]*(P(y)+1)) */ | |
102 | + ret | |
103 | + | |
104 | + .p2align 4 | |
105 | +L(small_arg): | |
106 | + /* Here if 0<=|x|<2^(-28) */ | |
107 | + addss L(SP_ONE)(%rip), %xmm0 /* 1.0 + x */ | |
108 | + /* Return 1.0 with inexact raised, except for x==0 */ | |
109 | + ret | |
110 | + | |
111 | + .p2align 4 | |
112 | +L(special_paths): | |
113 | + /* Here if 125*log(2)<=|x| */ | |
114 | + shrl $31, %eax /* Get sign bit of x, and depending on it: */ | |
115 | + lea L(SP_RANGE)(%rip), %rdx /* load over/underflow bound */ | |
116 | + cmpl (%rdx,%rax,4), %ecx /* |x|<under/overflow bound ? */ | |
117 | + jbe L(near_under_or_overflow) | |
118 | + | |
119 | + /* Here if |x|>under/overflow bound */ | |
120 | + cmpl $0x7f800000, %ecx /* |x| is finite ? */ | |
121 | + jae L(arg_inf_or_nan) | |
122 | + | |
123 | + /* Here if |x|>under/overflow bound, and x is finite */ | |
124 | + testq %rax, %rax /* sign of x nonzero ? */ | |
125 | + je L(res_overflow) | |
126 | + | |
127 | + /* Here if -inf<x<underflow bound (x<0) */ | |
128 | + movss L(SP_SMALL)(%rip), %xmm0/* load small value 2^(-100) */ | |
129 | + mulss %xmm0, %xmm0 /* Return underflowed result (zero or subnormal) */ | |
130 | + ret | |
131 | + | |
132 | + .p2align 4 | |
133 | +L(res_overflow): | |
134 | + /* Here if overflow bound<x<inf (x>0) */ | |
135 | + movss L(SP_LARGE)(%rip), %xmm0/* load large value 2^100 */ | |
136 | + mulss %xmm0, %xmm0 /* Return overflowed result (Inf or max normal) */ | |
137 | + ret | |
138 | + | |
139 | + .p2align 4 | |
140 | +L(arg_inf_or_nan): | |
141 | + /* Here if |x| is Inf or NAN */ | |
142 | + jne L(arg_nan) /* |x| is Inf ? */ | |
143 | + | |
144 | + /* Here if |x| is Inf */ | |
145 | + lea L(SP_INF_0)(%rip), %rdx /* depending on sign of x: */ | |
146 | + movss (%rdx,%rax,4), %xmm0 /* return zero or Inf */ | |
147 | + ret | |
148 | + | |
149 | + .p2align 4 | |
150 | +L(arg_nan): | |
151 | + /* Here if |x| is NaN */ | |
152 | + addss %xmm0, %xmm0 /* Return x+x (raise invalid) */ | |
153 | + ret | |
154 | + | |
155 | + .p2align 4 | |
156 | +L(near_under_or_overflow): | |
157 | + /* Here if 125*log(2)<=|x|<under/overflow bound */ | |
158 | + cvtsd2ss %xmm2, %xmm2 /* SP x*K/log(2)+RS */ | |
159 | + movd %xmm2, %eax /* bits of n*K+j with trash */ | |
160 | + subss L(SP_RS)(%rip), %xmm2 /* SP t=round(x*K/log(2)) */ | |
161 | + movl %eax, %edx /* n*K+j with trash */ | |
162 | + cvtss2sd %xmm2, %xmm2 /* DP t */ | |
163 | + andl $0x3f, %eax /* bits of j */ | |
164 | + mulsd L(DP_NLN2K)(%rip), %xmm2/* DP -t*log(2)/K */ | |
165 | + andl $0xffffffc0, %edx /* bits of n */ | |
166 | +#ifdef __AVX__ | |
167 | + vaddsd %xmm1, %xmm2, %xmm0 /* DP y=x-t*log(2)/K */ | |
168 | + vmulsd %xmm0, %xmm0, %xmm2 /* DP z=y*y */ | |
169 | +#else | |
170 | + addsd %xmm1, %xmm2 /* DP y=x-t*log(2)/K */ | |
171 | + movaps %xmm2, %xmm0 /* DP y */ | |
172 | + mulsd %xmm2, %xmm2 /* DP z=y*y */ | |
173 | +#endif | |
174 | + mulsd %xmm2, %xmm4 /* DP P3*z */ | |
175 | + addl $0xffc0, %edx /* bits of n + DP exponent bias */ | |
176 | + mulsd %xmm2, %xmm3 /* DP P2*z */ | |
177 | + shlq $46, %rdx /* DP 2^n */ | |
178 | + addsd L(DP_P1)(%rip), %xmm4 /* DP P3*z+P1 */ | |
179 | + addsd L(DP_P0)(%rip), %xmm3 /* DP P2*z+P0 */ | |
180 | + movd %rdx, %xmm1 /* DP 2^n */ | |
181 | + mulsd %xmm2, %xmm4 /* DP (P3*z+P1)*z */ | |
182 | + mulsd %xmm3, %xmm0 /* DP (P2*z+P0)*y */ | |
183 | + addsd %xmm4, %xmm0 /* DP P(y) */ | |
184 | + mulsd (%rsi,%rax,8), %xmm0 /* DP P(y)*T[j] */ | |
185 | + addsd (%rsi,%rax,8), %xmm0 /* DP T[j]*(P(y)+1) */ | |
186 | + mulsd %xmm1, %xmm0 /* DP result=2^n*(T[j]*(P(y)+1)) */ | |
187 | + cvtsd2ss %xmm0, %xmm0 /* convert result to single precision */ | |
188 | + ret | |
189 | +END(__ieee754_expf) | |
190 | + | |
191 | + .section .rodata, "a" | |
192 | + .p2align 3 | |
193 | +L(DP_T): /* table of double precision values 2^(j/K) for j=[0..K-1] */ | |
194 | + .long 0x00000000, 0x3ff00000 | |
195 | + .long 0x3e778061, 0x3ff02c9a | |
196 | + .long 0xd3158574, 0x3ff059b0 | |
197 | + .long 0x18759bc8, 0x3ff08745 | |
198 | + .long 0x6cf9890f, 0x3ff0b558 | |
199 | + .long 0x32d3d1a2, 0x3ff0e3ec | |
200 | + .long 0xd0125b51, 0x3ff11301 | |
201 | + .long 0xaea92de0, 0x3ff1429a | |
202 | + .long 0x3c7d517b, 0x3ff172b8 | |
203 | + .long 0xeb6fcb75, 0x3ff1a35b | |
204 | + .long 0x3168b9aa, 0x3ff1d487 | |
205 | + .long 0x88628cd6, 0x3ff2063b | |
206 | + .long 0x6e756238, 0x3ff2387a | |
207 | + .long 0x65e27cdd, 0x3ff26b45 | |
208 | + .long 0xf51fdee1, 0x3ff29e9d | |
209 | + .long 0xa6e4030b, 0x3ff2d285 | |
210 | + .long 0x0a31b715, 0x3ff306fe | |
211 | + .long 0xb26416ff, 0x3ff33c08 | |
212 | + .long 0x373aa9cb, 0x3ff371a7 | |
213 | + .long 0x34e59ff7, 0x3ff3a7db | |
214 | + .long 0x4c123422, 0x3ff3dea6 | |
215 | + .long 0x21f72e2a, 0x3ff4160a | |
216 | + .long 0x6061892d, 0x3ff44e08 | |
217 | + .long 0xb5c13cd0, 0x3ff486a2 | |
218 | + .long 0xd5362a27, 0x3ff4bfda | |
219 | + .long 0x769d2ca7, 0x3ff4f9b2 | |
220 | + .long 0x569d4f82, 0x3ff5342b | |
221 | + .long 0x36b527da, 0x3ff56f47 | |
222 | + .long 0xdd485429, 0x3ff5ab07 | |
223 | + .long 0x15ad2148, 0x3ff5e76f | |
224 | + .long 0xb03a5585, 0x3ff6247e | |
225 | + .long 0x82552225, 0x3ff66238 | |
226 | + .long 0x667f3bcd, 0x3ff6a09e | |
227 | + .long 0x3c651a2f, 0x3ff6dfb2 | |
228 | + .long 0xe8ec5f74, 0x3ff71f75 | |
229 | + .long 0x564267c9, 0x3ff75feb | |
230 | + .long 0x73eb0187, 0x3ff7a114 | |
231 | + .long 0x36cf4e62, 0x3ff7e2f3 | |
232 | + .long 0x994cce13, 0x3ff82589 | |
233 | + .long 0x9b4492ed, 0x3ff868d9 | |
234 | + .long 0x422aa0db, 0x3ff8ace5 | |
235 | + .long 0x99157736, 0x3ff8f1ae | |
236 | + .long 0xb0cdc5e5, 0x3ff93737 | |
237 | + .long 0x9fde4e50, 0x3ff97d82 | |
238 | + .long 0x82a3f090, 0x3ff9c491 | |
239 | + .long 0x7b5de565, 0x3ffa0c66 | |
240 | + .long 0xb23e255d, 0x3ffa5503 | |
241 | + .long 0x5579fdbf, 0x3ffa9e6b | |
242 | + .long 0x995ad3ad, 0x3ffae89f | |
243 | + .long 0xb84f15fb, 0x3ffb33a2 | |
244 | + .long 0xf2fb5e47, 0x3ffb7f76 | |
245 | + .long 0x904bc1d2, 0x3ffbcc1e | |
246 | + .long 0xdd85529c, 0x3ffc199b | |
247 | + .long 0x2e57d14b, 0x3ffc67f1 | |
248 | + .long 0xdcef9069, 0x3ffcb720 | |
249 | + .long 0x4a07897c, 0x3ffd072d | |
250 | + .long 0xdcfba487, 0x3ffd5818 | |
251 | + .long 0x03db3285, 0x3ffda9e6 | |
252 | + .long 0x337b9b5f, 0x3ffdfc97 | |
253 | + .long 0xe78b3ff6, 0x3ffe502e | |
254 | + .long 0xa2a490da, 0x3ffea4af | |
255 | + .long 0xee615a27, 0x3ffefa1b | |
256 | + .long 0x5b6e4540, 0x3fff5076 | |
257 | + .long 0x819e90d8, 0x3fffa7c1 | |
258 | + .type L(DP_T), @object | |
259 | + ASM_SIZE_DIRECTIVE(L(DP_T)) | |
260 | + | |
261 | + .section .rodata.cst8,"aM",@progbits,8 | |
262 | + .p2align 3 | |
263 | +L(DP_KLN2): /* double precision K/log(2) */ | |
264 | + .long 0x652b82fe, 0x40571547 | |
265 | + .type L(DP_KLN2), @object | |
266 | + ASM_SIZE_DIRECTIVE(L(DP_KLN2)) | |
267 | + | |
268 | + .p2align 3 | |
269 | +L(DP_NLN2K): /* double precision -log(2)/K */ | |
270 | + .long 0xfefa39ef, 0xbf862e42 | |
271 | + .type L(DP_NLN2K), @object | |
272 | + ASM_SIZE_DIRECTIVE(L(DP_NLN2K)) | |
273 | + | |
274 | + .p2align 3 | |
275 | +L(DP_RS): /* double precision 2^23+2^22 */ | |
276 | + .long 0x00000000, 0x41680000 | |
277 | + .type L(DP_RS), @object | |
278 | + ASM_SIZE_DIRECTIVE(L(DP_RS)) | |
279 | + | |
280 | + .p2align 3 | |
281 | +L(DP_P3): /* double precision polynomial coefficient P3 */ | |
282 | + .long 0xeb78fa85, 0x3fa56420 | |
283 | + .type L(DP_P3), @object | |
284 | + ASM_SIZE_DIRECTIVE(L(DP_P3)) | |
285 | + | |
286 | + .p2align 3 | |
287 | +L(DP_P1): /* double precision polynomial coefficient P1 */ | |
288 | + .long 0x008d6118, 0x3fe00000 | |
289 | + .type L(DP_P1), @object | |
290 | + ASM_SIZE_DIRECTIVE(L(DP_P1)) | |
291 | + | |
292 | + .p2align 3 | |
293 | +L(DP_P2): /* double precision polynomial coefficient P2 */ | |
294 | + .long 0xda752d4f, 0x3fc55550 | |
295 | + .type L(DP_P2), @object | |
296 | + ASM_SIZE_DIRECTIVE(L(DP_P2)) | |
297 | + | |
298 | + .p2align 3 | |
299 | +L(DP_P0): /* double precision polynomial coefficient P0 */ | |
300 | + .long 0xffffe7c6, 0x3fefffff | |
301 | + .type L(DP_P0), @object | |
302 | + ASM_SIZE_DIRECTIVE(L(DP_P0)) | |
303 | + | |
304 | + .p2align 2 | |
305 | +L(SP_RANGE): /* single precision overflow/underflow bounds */ | |
306 | + .long 0x42b17217 /* if x>this bound, then result overflows */ | |
307 | + .long 0x42cff1b4 /* if x<this bound, then result underflows */ | |
308 | + .type L(SP_RANGE), @object | |
309 | + ASM_SIZE_DIRECTIVE(L(SP_RANGE)) | |
310 | + | |
311 | + .p2align 2 | |
312 | +L(SP_INF_0): | |
313 | + .long 0x7f800000 /* single precision Inf */ | |
314 | + .long 0 /* single precision zero */ | |
315 | + .type L(SP_INF_0), @object | |
316 | + ASM_SIZE_DIRECTIVE(L(SP_INF_0)) | |
317 | + | |
318 | + .section .rodata.cst4,"aM",@progbits,4 | |
319 | + .p2align 2 | |
320 | +L(SP_RS): /* single precision 2^23+2^22 */ | |
321 | + .long 0x4b400000 | |
322 | + .type L(SP_RS), @object | |
323 | + ASM_SIZE_DIRECTIVE(L(SP_RS)) | |
324 | + | |
325 | + .p2align 2 | |
326 | +L(SP_SMALL): /* single precision small value 2^(-100) */ | |
327 | + .long 0x0d800000 | |
328 | + .type L(SP_SMALL), @object | |
329 | + ASM_SIZE_DIRECTIVE(L(SP_SMALL)) | |
330 | + | |
331 | + .p2align 2 | |
332 | +L(SP_LARGE): /* single precision large value 2^100 */ | |
333 | + .long 0x71800000 | |
334 | + .type L(SP_LARGE), @object | |
335 | + ASM_SIZE_DIRECTIVE(L(SP_LARGE)) | |
336 | + | |
337 | + .p2align 2 | |
338 | +L(SP_ONE): /* single precision 1.0 */ | |
339 | + .long 0x3f800000 | |
340 | + .type L(SP_ONE), @object | |
341 | + ASM_SIZE_DIRECTIVE(L(SP_ONE)) | |
342 | + | |
343 | +strong_alias (__ieee754_expf, __expf_finite) |