]>
Commit | Line | Data |
---|---|---|
d7bb4c42 | 1 | /* SSE2 version of __ieee754_expf and __expf_finite |
b168057a | 2 | Copyright (C) 2012-2015 Free Software Foundation, Inc. |
d7bb4c42 LD |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with the GNU C Library; if not, see | |
17 | <http://www.gnu.org/licenses/>. */ | |
18 | ||
19 | ||
20 | #include <sysdep.h> | |
21 | ||
22 | /* Short algorithm description: | |
23 | * | |
24 | * Let K = 64 (table size). | |
25 | * e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y)) | |
26 | * where | |
27 | * x = m*log(2)/K + y, y in [0.0..log(2)/K] | |
28 | * m = n*K + j, m,n,j - signed integer, j in [0..K-1] | |
29 | * values of 2^(j/K) are tabulated as T[j]. | |
30 | * | |
31 | * P(y) is a minimax polynomial approximation of expf(x)-1 | |
32 | * on small interval [0.0..log(2)/K]. | |
33 | * | |
34 | * P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as | |
35 | * z = y*y; P(y) = (P3*z + P1)*z + (P2*z + P0)*y | |
36 | * | |
37 | * Special cases: | |
38 | * __ieee754_expf_sse2(NaN) = NaN | |
39 | * __ieee754_expf_sse2(+INF) = +INF | |
40 | * __ieee754_expf_sse2(-INF) = 0 | |
41 | * __ieee754_expf_sse2(x) = 1 for subnormals | |
42 | * for finite argument, only __ieee754_expf_sse2(0)=1 is exact | |
43 | * __ieee754_expf_sse2(x) overflows if x>700 | |
44 | * __ieee754_expf_sse2(x) underflows if x<-700 | |
45 | * | |
46 | * Note: | |
47 | * For |x|<700, __ieee754_expf_sse2 computes result in double precision, | |
48 | * with accuracy a bit more than needed for expf, and does not round it | |
49 | * to single precision. | |
50 | */ | |
51 | ||
52 | ||
53 | #ifdef PIC | |
54 | # define MO1(symbol) L(symbol)##@GOTOFF(%edx) | |
55 | # define MO2(symbol,reg2,_scale) L(symbol)##@GOTOFF(%edx,reg2,_scale) | |
56 | #else | |
57 | # define MO1(symbol) L(symbol) | |
58 | # define MO2(symbol,reg2,_scale) L(symbol)(,reg2,_scale) | |
59 | #endif | |
60 | ||
61 | .text | |
62 | ENTRY(__ieee754_expf_sse2) | |
63 | /* Input: single precision x on stack at address 4(%esp) */ | |
64 | ||
65 | #ifdef PIC | |
66 | LOAD_PIC_REG(dx) | |
67 | #endif | |
68 | ||
69 | cvtss2sd 4(%esp), %xmm1 /* Convert x to double precision */ | |
70 | mov 4(%esp), %ecx /* Copy x */ | |
71 | movsd MO1(DP_KLN2), %xmm2 /* DP K/log(2) */ | |
72 | movsd MO1(DP_P2), %xmm3 /* DP P2 */ | |
73 | movl %ecx, %eax /* x */ | |
74 | mulsd %xmm1, %xmm2 /* DP x*K/log(2) */ | |
75 | andl $0x7fffffff, %ecx /* |x| */ | |
76 | cmpl $0x442f0000, %ecx /* |x|<700 ? */ | |
77 | movsd MO1(DP_P3), %xmm4 /* DP P3 */ | |
78 | addsd MO1(DP_RS), %xmm2 /* DP x*K/log(2)+RS */ | |
79 | jae L(special_paths) | |
80 | ||
81 | /* Here if |x|<700 */ | |
82 | cmpl $0x31800000, %ecx /* |x|<2^(-28) ? */ | |
83 | jb L(small_arg) | |
84 | ||
85 | /* Main path: here if 2^(-28)<=|x|<700 */ | |
86 | cvtsd2ss %xmm2, %xmm2 /* SP x*K/log(2)+RS */ | |
87 | movd %xmm2, %eax /* bits of n*K+j with trash */ | |
88 | subss MO1(SP_RS), %xmm2 /* SP t=round(x*K/log(2)) */ | |
89 | movl %eax, %ecx /* n*K+j with trash */ | |
90 | cvtss2sd %xmm2, %xmm2 /* DP t */ | |
91 | andl $0x3f, %eax /* bits of j */ | |
92 | mulsd MO1(DP_NLN2K), %xmm2 /* DP -t*log(2)/K */ | |
93 | andl $0xffffffc0, %ecx /* bits of n */ | |
94 | #ifdef __AVX__ | |
95 | vaddsd %xmm1, %xmm2, %xmm0 /* DP y=x-t*log(2)/K */ | |
96 | vmulsd %xmm0, %xmm0, %xmm2 /* DP z=y*y */ | |
97 | #else | |
98 | addsd %xmm1, %xmm2 /* DP y=x-t*log(2)/K */ | |
99 | movaps %xmm2, %xmm0 /* DP y */ | |
100 | mulsd %xmm2, %xmm2 /* DP z=y*y */ | |
101 | #endif | |
102 | mulsd %xmm2, %xmm4 /* DP P3*z */ | |
103 | addl $0xffc0, %ecx /* bits of n + DP exponent bias */ | |
104 | mulsd %xmm2, %xmm3 /* DP P2*z */ | |
105 | shrl $2, %ecx /* High 2 bytes of DP 2^n */ | |
106 | pxor %xmm1, %xmm1 /* clear %xmm1 */ | |
107 | addsd MO1(DP_P1), %xmm4 /* DP P3*z+P1 */ | |
108 | addsd MO1(DP_P0), %xmm3 /* DP P2*z+P0 */ | |
109 | pinsrw $3, %ecx, %xmm1 /* DP 2^n */ | |
110 | mulsd %xmm2, %xmm4 /* DP (P3*z+P1)*z */ | |
111 | mulsd %xmm3, %xmm0 /* DP (P2*z+P0)*y */ | |
112 | addsd %xmm4, %xmm0 /* DP P(y) */ | |
113 | mulsd MO2(DP_T,%eax,8), %xmm0 /* DP P(y)*T[j] */ | |
114 | addsd MO2(DP_T,%eax,8), %xmm0 /* DP T[j]*(P(y)+1) */ | |
115 | mulsd %xmm1, %xmm0 /* DP result=2^n*(T[j]*(P(y)+1)) */ | |
116 | ||
117 | lea -8(%esp), %esp /* Borrow 8 bytes of stack frame */ | |
118 | movsd %xmm0, 0(%esp) /* Move result from sse... */ | |
119 | fldl 0(%esp) /* ...to FPU. */ | |
120 | lea 8(%esp), %esp /* Return back 8 bytes of stack frame */ | |
121 | ret | |
122 | ||
123 | .p2align 4 | |
124 | L(small_arg): | |
125 | /* Here if 0<=|x|<2^(-28) */ | |
126 | movss 4(%esp), %xmm0 /* load x */ | |
127 | addss MO1(SP_ONE), %xmm0 /* 1.0 + x */ | |
128 | /* Return 1.0 with inexact raised, except for x==0 */ | |
129 | jmp L(epilogue) | |
130 | ||
131 | .p2align 4 | |
132 | L(special_paths): | |
133 | /* Here if x is NaN, or Inf, or finite |x|>=700 */ | |
134 | movss 4(%esp), %xmm0 /* load x */ | |
135 | ||
136 | cmpl $0x7f800000, %ecx /* |x| is finite ? */ | |
137 | jae L(arg_inf_or_nan) | |
138 | ||
139 | /* Here if finite |x|>=700 */ | |
140 | testl $0x80000000, %eax /* sign of x nonzero ? */ | |
141 | je L(res_overflow) | |
142 | ||
143 | /* Here if finite x<=-700 */ | |
144 | movss MO1(SP_SMALL), %xmm0 /* load small value 2^(-100) */ | |
145 | mulss %xmm0, %xmm0 /* Return underflowed result (zero or subnormal) */ | |
146 | jmp L(epilogue) | |
147 | ||
148 | .p2align 4 | |
149 | L(res_overflow): | |
150 | /* Here if finite x>=700 */ | |
151 | movss MO1(SP_LARGE), %xmm0 /* load large value 2^100 */ | |
152 | mulss %xmm0, %xmm0 /* Return overflowed result (Inf or max normal) */ | |
153 | jmp L(epilogue) | |
154 | ||
155 | .p2align 4 | |
156 | L(arg_inf_or_nan): | |
157 | /* Here if |x| is Inf or NAN */ | |
158 | jne L(arg_nan) /* |x| is Inf ? */ | |
159 | ||
160 | /* Here if |x| is Inf */ | |
161 | shrl $31, %eax /* Get sign bit of x */ | |
162 | movss MO2(SP_INF_0,%eax,4), %xmm0/* return zero or Inf, depending on sign of x */ | |
163 | jmp L(epilogue) | |
164 | ||
165 | .p2align 4 | |
166 | L(arg_nan): | |
167 | /* Here if |x| is NaN */ | |
168 | addss %xmm0, %xmm0 /* Return x+x (raise invalid) */ | |
169 | ||
170 | .p2align 4 | |
171 | L(epilogue): | |
172 | lea -4(%esp), %esp /* Borrow 4 bytes of stack frame */ | |
173 | movss %xmm0, 0(%esp) /* Move result from sse... */ | |
174 | flds 0(%esp) /* ...to FPU. */ | |
175 | lea 4(%esp), %esp /* Return back 4 bytes of stack frame */ | |
176 | ret | |
177 | END(__ieee754_expf_sse2) | |
178 | ||
179 | .section .rodata, "a" | |
180 | .p2align 3 | |
181 | L(DP_T): /* table of double precision values 2^(j/K) for j=[0..K-1] */ | |
182 | .long 0x00000000, 0x3ff00000 | |
183 | .long 0x3e778061, 0x3ff02c9a | |
184 | .long 0xd3158574, 0x3ff059b0 | |
185 | .long 0x18759bc8, 0x3ff08745 | |
186 | .long 0x6cf9890f, 0x3ff0b558 | |
187 | .long 0x32d3d1a2, 0x3ff0e3ec | |
188 | .long 0xd0125b51, 0x3ff11301 | |
189 | .long 0xaea92de0, 0x3ff1429a | |
190 | .long 0x3c7d517b, 0x3ff172b8 | |
191 | .long 0xeb6fcb75, 0x3ff1a35b | |
192 | .long 0x3168b9aa, 0x3ff1d487 | |
193 | .long 0x88628cd6, 0x3ff2063b | |
194 | .long 0x6e756238, 0x3ff2387a | |
195 | .long 0x65e27cdd, 0x3ff26b45 | |
196 | .long 0xf51fdee1, 0x3ff29e9d | |
197 | .long 0xa6e4030b, 0x3ff2d285 | |
198 | .long 0x0a31b715, 0x3ff306fe | |
199 | .long 0xb26416ff, 0x3ff33c08 | |
200 | .long 0x373aa9cb, 0x3ff371a7 | |
201 | .long 0x34e59ff7, 0x3ff3a7db | |
202 | .long 0x4c123422, 0x3ff3dea6 | |
203 | .long 0x21f72e2a, 0x3ff4160a | |
204 | .long 0x6061892d, 0x3ff44e08 | |
205 | .long 0xb5c13cd0, 0x3ff486a2 | |
206 | .long 0xd5362a27, 0x3ff4bfda | |
207 | .long 0x769d2ca7, 0x3ff4f9b2 | |
208 | .long 0x569d4f82, 0x3ff5342b | |
209 | .long 0x36b527da, 0x3ff56f47 | |
210 | .long 0xdd485429, 0x3ff5ab07 | |
211 | .long 0x15ad2148, 0x3ff5e76f | |
212 | .long 0xb03a5585, 0x3ff6247e | |
213 | .long 0x82552225, 0x3ff66238 | |
214 | .long 0x667f3bcd, 0x3ff6a09e | |
215 | .long 0x3c651a2f, 0x3ff6dfb2 | |
216 | .long 0xe8ec5f74, 0x3ff71f75 | |
217 | .long 0x564267c9, 0x3ff75feb | |
218 | .long 0x73eb0187, 0x3ff7a114 | |
219 | .long 0x36cf4e62, 0x3ff7e2f3 | |
220 | .long 0x994cce13, 0x3ff82589 | |
221 | .long 0x9b4492ed, 0x3ff868d9 | |
222 | .long 0x422aa0db, 0x3ff8ace5 | |
223 | .long 0x99157736, 0x3ff8f1ae | |
224 | .long 0xb0cdc5e5, 0x3ff93737 | |
225 | .long 0x9fde4e50, 0x3ff97d82 | |
226 | .long 0x82a3f090, 0x3ff9c491 | |
227 | .long 0x7b5de565, 0x3ffa0c66 | |
228 | .long 0xb23e255d, 0x3ffa5503 | |
229 | .long 0x5579fdbf, 0x3ffa9e6b | |
230 | .long 0x995ad3ad, 0x3ffae89f | |
231 | .long 0xb84f15fb, 0x3ffb33a2 | |
232 | .long 0xf2fb5e47, 0x3ffb7f76 | |
233 | .long 0x904bc1d2, 0x3ffbcc1e | |
234 | .long 0xdd85529c, 0x3ffc199b | |
235 | .long 0x2e57d14b, 0x3ffc67f1 | |
236 | .long 0xdcef9069, 0x3ffcb720 | |
237 | .long 0x4a07897c, 0x3ffd072d | |
238 | .long 0xdcfba487, 0x3ffd5818 | |
239 | .long 0x03db3285, 0x3ffda9e6 | |
240 | .long 0x337b9b5f, 0x3ffdfc97 | |
241 | .long 0xe78b3ff6, 0x3ffe502e | |
242 | .long 0xa2a490da, 0x3ffea4af | |
243 | .long 0xee615a27, 0x3ffefa1b | |
244 | .long 0x5b6e4540, 0x3fff5076 | |
245 | .long 0x819e90d8, 0x3fffa7c1 | |
b67e9372 | 246 | .type L(DP_T), @object |
d7bb4c42 LD |
247 | ASM_SIZE_DIRECTIVE(L(DP_T)) |
248 | ||
249 | .section .rodata.cst8,"aM",@progbits,8 | |
250 | .p2align 3 | |
251 | L(DP_KLN2): /* double precision K/log(2) */ | |
252 | .long 0x652b82fe, 0x40571547 | |
b67e9372 | 253 | .type L(DP_KLN2), @object |
d7bb4c42 LD |
254 | ASM_SIZE_DIRECTIVE(L(DP_KLN2)) |
255 | ||
256 | .p2align 3 | |
257 | L(DP_NLN2K): /* double precision -log(2)/K */ | |
258 | .long 0xfefa39ef, 0xbf862e42 | |
b67e9372 | 259 | .type L(DP_NLN2K), @object |
d7bb4c42 LD |
260 | ASM_SIZE_DIRECTIVE(L(DP_NLN2K)) |
261 | ||
262 | .p2align 3 | |
263 | L(DP_RS): /* double precision 2^23+2^22 */ | |
264 | .long 0x00000000, 0x41680000 | |
b67e9372 | 265 | .type L(DP_RS), @object |
d7bb4c42 LD |
266 | ASM_SIZE_DIRECTIVE(L(DP_RS)) |
267 | ||
268 | .p2align 3 | |
269 | L(DP_P3): /* double precision polynomial coefficient P3 */ | |
270 | .long 0xeb78fa85, 0x3fa56420 | |
b67e9372 | 271 | .type L(DP_P3), @object |
d7bb4c42 LD |
272 | ASM_SIZE_DIRECTIVE(L(DP_P3)) |
273 | ||
274 | .p2align 3 | |
275 | L(DP_P1): /* double precision polynomial coefficient P1 */ | |
276 | .long 0x008d6118, 0x3fe00000 | |
b67e9372 | 277 | .type L(DP_P1), @object |
d7bb4c42 LD |
278 | ASM_SIZE_DIRECTIVE(L(DP_P1)) |
279 | ||
280 | .p2align 3 | |
281 | L(DP_P2): /* double precision polynomial coefficient P2 */ | |
282 | .long 0xda752d4f, 0x3fc55550 | |
b67e9372 | 283 | .type L(DP_P2), @object |
d7bb4c42 LD |
284 | ASM_SIZE_DIRECTIVE(L(DP_P2)) |
285 | ||
286 | .p2align 3 | |
287 | L(DP_P0): /* double precision polynomial coefficient P0 */ | |
288 | .long 0xffffe7c6, 0x3fefffff | |
b67e9372 | 289 | .type L(DP_P0), @object |
d7bb4c42 LD |
290 | ASM_SIZE_DIRECTIVE(L(DP_P0)) |
291 | ||
292 | .p2align 2 | |
293 | L(SP_INF_0): | |
294 | .long 0x7f800000 /* single precision Inf */ | |
295 | .long 0 /* single precision zero */ | |
b67e9372 | 296 | .type L(SP_INF_0), @object |
d7bb4c42 LD |
297 | ASM_SIZE_DIRECTIVE(L(SP_INF_0)) |
298 | ||
299 | .section .rodata.cst4,"aM",@progbits,4 | |
300 | .p2align 2 | |
301 | L(SP_RS): /* single precision 2^23+2^22 */ | |
302 | .long 0x4b400000 | |
b67e9372 | 303 | .type L(SP_RS), @object |
d7bb4c42 LD |
304 | ASM_SIZE_DIRECTIVE(L(SP_RS)) |
305 | ||
306 | .p2align 2 | |
307 | L(SP_SMALL): /* single precision small value 2^(-100) */ | |
308 | .long 0x0d800000 | |
b67e9372 | 309 | .type L(SP_SMALL), @object |
d7bb4c42 LD |
310 | ASM_SIZE_DIRECTIVE(L(SP_SMALL)) |
311 | ||
312 | .p2align 2 | |
313 | L(SP_LARGE): /* single precision large value 2^100 */ | |
314 | .long 0x71800000 | |
b67e9372 | 315 | .type L(SP_LARGE), @object |
d7bb4c42 LD |
316 | ASM_SIZE_DIRECTIVE(L(SP_LARGE)) |
317 | ||
318 | .p2align 2 | |
319 | L(SP_ONE): /* single precision 1.0 */ | |
320 | .long 0x3f800000 | |
b67e9372 | 321 | .type L(SP_ONE), @object |
d7bb4c42 LD |
322 | ASM_SIZE_DIRECTIVE(L(SP_ONE)) |
323 | ||
324 | strong_alias (__ieee754_expf_sse2, __expf_finite_sse2) |