]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/ia64/fpu/s_expm1f.S
(CFLAGS-tst-align.c): Add -mpreferred-stack-boundary=4.
[thirdparty/glibc.git] / sysdeps / ia64 / fpu / s_expm1f.S
1 .file "exp_m1f.s"
2
3 // Copyright (C) 2000, 2001, Intel Corporation
4 // All rights reserved.
5 //
6 // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
7 // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
11 // met:
12 //
13 // * Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
15 //
16 // * Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
19 //
20 // * The name of Intel Corporation may not be used to endorse or promote
21 // products derived from this software without specific prior written
22 // permission.
23 //
24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 //
36 // Intel Corporation is the author of this code, and requests that all
37 // problem reports or change requests be submitted to it directly at
38 // http://developer.intel.com/opensource.
39 //
40 // HISTORY
41 // 2/02/00 Initial Version
42 // 4/04/00 Unwind support added
43 // 8/15/00 Bundle added after call to __libm_error_support to properly
44 // set [the previously overwritten] GR_Parameter_RESULT.
45 //
46 // *********************************************************************
47 //
48 // Function: Combined expf(x) and expm1f(x), where
49 // x
50 // expf(x) = e , for single precision x values
51 // x
52 // expm1f(x) = e - 1 for single precision x values
53 //
54 // *********************************************************************
55 //
56 // Accuracy: Within .7 ulps for 80-bit floating point values
57 // Very accurate for single precision values
58 //
59 // *********************************************************************
60 //
61 // Resources Used:
62 //
63 // Floating-Point Registers: f8 (Input and Return Value)
64 // f9,f32-f61, f99-f102
65 //
66 // General Purpose Registers:
67 // r32-r61
68 // r62-r65 (Used to pass arguments to error handling routine)
69 //
70 // Predicate Registers: p6-p15
71 //
72 // *********************************************************************
73 //
74 // IEEE Special Conditions:
75 //
76 // Denormal fault raised on denormal inputs
77 // Overflow exceptions raised when appropriate for exp and expm1
78 // Underflow exceptions raised when appropriate for exp and expm1
79 // (Error Handling Routine called for overflow and Underflow)
80 // Inexact raised when appropriate by algorithm
81 //
82 // expf(inf) = inf
83 // expf(-inf) = +0
84 // expf(SNaN) = QNaN
85 // expf(QNaN) = QNaN
86 // expf(0) = 1
87 // expf(EM_special Values) = QNaN
88 // expf(inf) = inf
89 // expm1f(-inf) = -1
90 // expm1f(SNaN) = QNaN
91 // expm1f(QNaN) = QNaN
92 // expm1f(0) = 0
93 // expm1f(EM_special Values) = QNaN
94 //
95 // *********************************************************************
96 //
97 // Implementation and Algorithm Notes:
98 //
99 // ker_exp_64( in_FR : X,
100 // in_GR : Flag,
101 // in_GR : Expo_Range
102 // out_FR : Y_hi,
103 // out_FR : Y_lo,
104 // out_FR : scale,
105 // out_PR : Safe )
106 //
107 // On input, X is in register format and
108 // Flag = 0 for exp,
109 // Flag = 1 for expm1,
110 //
111 // On output, provided X and X_cor are real numbers, then
112 //
113 // scale*(Y_hi + Y_lo) approximates expf(X) if Flag is 0
114 // scale*(Y_hi + Y_lo) approximates expf(X)-1 if Flag is 1
115 //
116 // The accuracy is sufficient for a highly accurate 64 sig.
117 // bit implementation. Safe is set if there is no danger of
118 // overflow/underflow when the result is composed from scale,
119 // Y_hi and Y_lo. Thus, we can have a fast return if Safe is set.
120 // Otherwise, one must prepare to handle the possible exception
121 // appropriately. Note that SAFE not set (false) does not mean
122 // that overflow/underflow will occur; only the setting of SAFE
123 // guarantees the opposite.
124 //
125 // **** High Level Overview ****
126 //
127 // The method consists of three cases.
128 //
129 // If |X| < Tiny use case exp_tiny;
130 // else if |X| < 2^(-6) use case exp_small;
131 // else use case exp_regular;
132 //
133 // Case exp_tiny:
134 //
135 // 1 + X can be used to approximate expf(X) or expf(X+X_cor);
136 // X + X^2/2 can be used to approximate expf(X) - 1
137 //
138 // Case exp_small:
139 //
140 // Here, expf(X), expf(X+X_cor), and expf(X) - 1 can all be
141 // appproximated by a relatively simple polynomial.
142 //
143 // This polynomial resembles the truncated Taylor series
144 //
145 // expf(w) = 1 + w + w^2/2! + w^3/3! + ... + w^n/n!
146 //
147 // Case exp_regular:
148 //
149 // Here we use a table lookup method. The basic idea is that in
150 // order to compute expf(X), we accurately decompose X into
151 //
152 // X = N * log(2)/(2^12) + r, |r| <= log(2)/2^13.
153 //
154 // Hence
155 //
156 // expf(X) = 2^( N / 2^12 ) * expf(r).
157 //
158 // The value 2^( N / 2^12 ) is obtained by simple combinations
159 // of values calculated beforehand and stored in table; expf(r)
160 // is approximated by a short polynomial because |r| is small.
161 //
162 // We elaborate this method in 4 steps.
163 //
164 // Step 1: Reduction
165 //
166 // The value 2^12/log(2) is stored as a double-extended number
167 // L_Inv.
168 //
169 // N := round_to_nearest_integer( X * L_Inv )
170 //
171 // The value log(2)/2^12 is stored as two numbers L_hi and L_lo so
172 // that r can be computed accurately via
173 //
174 // r := (X - N*L_hi) - N*L_lo
175 //
176 // We pick L_hi such that N*L_hi is representable in 64 sig. bits
177 // and thus the FMA X - N*L_hi is error free. So r is the
178 // 1 rounding error from an exact reduction with respect to
179 //
180 // L_hi + L_lo.
181 //
182 // In particular, L_hi has 30 significant bit and can be stored
183 // as a double-precision number; L_lo has 64 significant bits and
184 // stored as a double-extended number.
185 //
186 // In the case Flag = 2, we further modify r by
187 //
188 // r := r + X_cor.
189 //
190 // Step 2: Approximation
191 //
192 // expf(r) - 1 is approximated by a short polynomial of the form
193 //
194 // r + A_1 r^2 + A_2 r^3 + A_3 r^4 .
195 //
196 // Step 3: Composition from Table Values
197 //
198 // The value 2^( N / 2^12 ) can be composed from a couple of tables
199 // of precalculated values. First, express N as three integers
200 // K, M_1, and M_2 as
201 //
202 // N = K * 2^12 + M_1 * 2^6 + M_2
203 //
204 // Where 0 <= M_1, M_2 < 2^6; and K can be positive or negative.
205 // When N is represented in 2's complement, M_2 is simply the 6
206 // lsb's, M_1 is the next 6, and K is simply N shifted right
207 // arithmetically (sign extended) by 12 bits.
208 //
209 // Now, 2^( N / 2^12 ) is simply
210 //
211 // 2^K * 2^( M_1 / 2^6 ) * 2^( M_2 / 2^12 )
212 //
213 // Clearly, 2^K needs no tabulation. The other two values are less
214 // trivial because if we store each accurately to more than working
215 // precision, than its product is too expensive to calculate. We
216 // use the following method.
217 //
218 // Define two mathematical values, delta_1 and delta_2, implicitly
219 // such that
220 //
221 // T_1 = expf( [M_1 log(2)/2^6] - delta_1 )
222 // T_2 = expf( [M_2 log(2)/2^12] - delta_2 )
223 //
224 // are representable as 24 significant bits. To illustrate the idea,
225 // we show how we define delta_1:
226 //
227 // T_1 := round_to_24_bits( expf( M_1 log(2)/2^6 ) )
228 // delta_1 = (M_1 log(2)/2^6) - log( T_1 )
229 //
230 // The last equality means mathematical equality. We then tabulate
231 //
232 // W_1 := expf(delta_1) - 1
233 // W_2 := expf(delta_2) - 1
234 //
235 // Both in double precision.
236 //
237 // From the tabulated values T_1, T_2, W_1, W_2, we compose the values
238 // T and W via
239 //
240 // T := T_1 * T_2 ...exactly
241 // W := W_1 + (1 + W_1)*W_2
242 //
243 // W approximates expf( delta ) - 1 where delta = delta_1 + delta_2.
244 // The mathematical product of T and (W+1) is an accurate representation
245 // of 2^(M_1/2^6) * 2^(M_2/2^12).
246 //
247 // Step 4. Reconstruction
248 //
249 // Finally, we can reconstruct expf(X), expf(X) - 1.
250 // Because
251 //
252 // X = K * log(2) + (M_1*log(2)/2^6 - delta_1)
253 // + (M_2*log(2)/2^12 - delta_2)
254 // + delta_1 + delta_2 + r ...accurately
255 // We have
256 //
257 // expf(X) ~=~ 2^K * ( T + T*[expf(delta_1+delta_2+r) - 1] )
258 // ~=~ 2^K * ( T + T*[expf(delta + r) - 1] )
259 // ~=~ 2^K * ( T + T*[(expf(delta)-1)
260 // + expf(delta)*(expf(r)-1)] )
261 // ~=~ 2^K * ( T + T*( W + (1+W)*poly(r) ) )
262 // ~=~ 2^K * ( Y_hi + Y_lo )
263 //
264 // where Y_hi = T and Y_lo = T*(W + (1+W)*poly(r))
265 //
266 // For expf(X)-1, we have
267 //
268 // expf(X)-1 ~=~ 2^K * ( Y_hi + Y_lo ) - 1
269 // ~=~ 2^K * ( Y_hi + Y_lo - 2^(-K) )
270 //
271 // and we combine Y_hi + Y_lo - 2^(-N) into the form of two
272 // numbers Y_hi + Y_lo carefully.
273 //
274 // **** Algorithm Details ****
275 //
276 // A careful algorithm must be used to realize the mathematical ideas
277 // accurately. We describe each of the three cases. We assume SAFE
278 // is preset to be TRUE.
279 //
280 // Case exp_tiny:
281 //
282 // The important points are to ensure an accurate result under
283 // different rounding directions and a correct setting of the SAFE
284 // flag.
285 //
286 // If Flag is 1, then
287 // SAFE := False ...possibility of underflow
288 // Scale := 1.0
289 // Y_hi := X
290 // Y_lo := 2^(-17000)
291 // Else
292 // Scale := 1.0
293 // Y_hi := 1.0
294 // Y_lo := X ...for different rounding modes
295 // Endif
296 //
297 // Case exp_small:
298 //
299 // Here we compute a simple polynomial. To exploit parallelism, we split
300 // the polynomial into several portions.
301 //
302 // Let r = X
303 //
304 // If Flag is not 1 ...i.e. expf( argument )
305 //
306 // rsq := r * r;
307 // r4 := rsq*rsq
308 // poly_lo := P_3 + r*(P_4 + r*(P_5 + r*P_6))
309 // poly_hi := r + rsq*(P_1 + r*P_2)
310 // Y_lo := poly_hi + r4 * poly_lo
311 // set lsb(Y_lo) to 1
312 // Y_hi := 1.0
313 // Scale := 1.0
314 //
315 // Else ...i.e. expf( argument ) - 1
316 //
317 // rsq := r * r
318 // r4 := rsq * rsq
319 // r6 := rsq * r4
320 // poly_lo := r6*(Q_5 + r*(Q_6 + r*Q_7))
321 // poly_hi := Q_1 + r*(Q_2 + r*(Q_3 + r*Q_4))
322 // Y_lo := rsq*poly_hi + poly_lo
323 // set lsb(Y_lo) to 1
324 // Y_hi := X
325 // Scale := 1.0
326 //
327 // Endif
328 //
329 // Case exp_regular:
330 //
331 // The previous description contain enough information except the
332 // computation of poly and the final Y_hi and Y_lo in the case for
333 // expf(X)-1.
334 //
335 // The computation of poly for Step 2:
336 //
337 // rsq := r*r
338 // poly := r + rsq*(A_1 + r*(A_2 + r*A_3))
339 //
340 // For the case expf(X) - 1, we need to incorporate 2^(-K) into
341 // Y_hi and Y_lo at the end of Step 4.
342 //
343 // If K > 10 then
344 // Y_lo := Y_lo - 2^(-K)
345 // Else
346 // If K < -10 then
347 // Y_lo := Y_hi + Y_lo
348 // Y_hi := -2^(-K)
349 // Else
350 // Y_hi := Y_hi - 2^(-K)
351 // End If
352 // End If
353 //
354
355 #include "libm_support.h"
356
357
358 GR_SAVE_B0 = r60
359 GR_SAVE_PFS = r59
360 GR_SAVE_GP = r61
361
362 GR_Parameter_X = r62
363 GR_Parameter_Y = r63
364 GR_Parameter_RESULT = r64
365 GR_Parameter_TAG = r65
366
367 FR_X = f9
368 FR_Y = f1
369 FR_RESULT = f99
370
371
372 #ifdef _LIBC
373 .rodata
374 #else
375 .data
376 #endif
377
378 .align 64
379 Constants_exp_64_Arg:
380 ASM_TYPE_DIRECTIVE(Constants_exp_64_Arg,@object)
381 data4 0x5C17F0BC,0xB8AA3B29,0x0000400B,0x00000000
382 data4 0x00000000,0xB17217F4,0x00003FF2,0x00000000
383 data4 0xF278ECE6,0xF473DE6A,0x00003FD4,0x00000000
384 // /* Inv_L, L_hi, L_lo */
385 ASM_SIZE_DIRECTIVE(Constants_exp_64_Arg)
386
387 .align 64
388 Constants_exp_64_Exponents:
389 ASM_TYPE_DIRECTIVE(Constants_exp_64_Exponents,@object)
390 data4 0x0000007E,0x00000000,0xFFFFFF83,0xFFFFFFFF
391 data4 0x000003FE,0x00000000,0xFFFFFC03,0xFFFFFFFF
392 data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF
393 data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF
394 data4 0xFFFFFFE2,0xFFFFFFFF,0xFFFFFFC4,0xFFFFFFFF
395 data4 0xFFFFFFBA,0xFFFFFFFF,0xFFFFFFBA,0xFFFFFFFF
396 ASM_SIZE_DIRECTIVE(Constants_exp_64_Exponents)
397
398 .align 64
399 Constants_exp_64_A:
400 ASM_TYPE_DIRECTIVE(Constants_exp_64_A,@object)
401 data4 0xB1B736A0,0xAAAAAAAB,0x00003FFA,0x00000000
402 data4 0x90CD6327,0xAAAAAAAB,0x00003FFC,0x00000000
403 data4 0xFFFFFFFF,0xFFFFFFFF,0x00003FFD,0x00000000
404 // /* Reversed */
405 ASM_SIZE_DIRECTIVE(Constants_exp_64_A)
406
407 .align 64
408 Constants_exp_64_P:
409 ASM_TYPE_DIRECTIVE(Constants_exp_64_P,@object)
410 data4 0x43914A8A,0xD00D6C81,0x00003FF2,0x00000000
411 data4 0x30304B30,0xB60BC4AC,0x00003FF5,0x00000000
412 data4 0x7474C518,0x88888888,0x00003FF8,0x00000000
413 data4 0x8DAE729D,0xAAAAAAAA,0x00003FFA,0x00000000
414 data4 0xAAAAAF61,0xAAAAAAAA,0x00003FFC,0x00000000
415 data4 0x000004C7,0x80000000,0x00003FFE,0x00000000
416 // /* Reversed */
417 ASM_SIZE_DIRECTIVE(Constants_exp_64_P)
418
419 .align 64
420 Constants_exp_64_Q:
421 ASM_TYPE_DIRECTIVE(Constants_exp_64_Q,@object)
422 data4 0xA49EF6CA,0xD00D56F7,0x00003FEF,0x00000000
423 data4 0x1C63493D,0xD00D59AB,0x00003FF2,0x00000000
424 data4 0xFB50CDD2,0xB60B60B5,0x00003FF5,0x00000000
425 data4 0x7BA68DC8,0x88888888,0x00003FF8,0x00000000
426 data4 0xAAAAAC8D,0xAAAAAAAA,0x00003FFA,0x00000000
427 data4 0xAAAAACCA,0xAAAAAAAA,0x00003FFC,0x00000000
428 data4 0x00000000,0x80000000,0x00003FFE,0x00000000
429 // /* Reversed */
430 ASM_SIZE_DIRECTIVE(Constants_exp_64_Q)
431
432 .align 64
433 Constants_exp_64_T1:
434 ASM_TYPE_DIRECTIVE(Constants_exp_64_T1,@object)
435 data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29
436 data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5
437 data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC
438 data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D
439 data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA
440 data4 0x3F9EF532,0x3FA0B051,0x3FA27043,0x3FA43516
441 data4 0x3FA5FED7,0x3FA7CD94,0x3FA9A15B,0x3FAB7A3A
442 data4 0x3FAD583F,0x3FAF3B79,0x3FB123F6,0x3FB311C4
443 data4 0x3FB504F3,0x3FB6FD92,0x3FB8FBAF,0x3FBAFF5B
444 data4 0x3FBD08A4,0x3FBF179A,0x3FC12C4D,0x3FC346CD
445 data4 0x3FC5672A,0x3FC78D75,0x3FC9B9BE,0x3FCBEC15
446 data4 0x3FCE248C,0x3FD06334,0x3FD2A81E,0x3FD4F35B
447 data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5
448 data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A
449 data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177
450 data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C
451 ASM_SIZE_DIRECTIVE(Constants_exp_64_T1)
452
453 .align 64
454 Constants_exp_64_T2:
455 ASM_TYPE_DIRECTIVE(Constants_exp_64_T2,@object)
456 data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4
457 data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7
458 data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E
459 data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349
460 data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987
461 data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA
462 data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610
463 data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A
464 data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8
465 data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA
466 data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50
467 data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA
468 data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07
469 data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269
470 data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE
471 data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37
472 ASM_SIZE_DIRECTIVE(Constants_exp_64_T2)
473
474 .align 64
475 Constants_exp_64_W1:
476 ASM_TYPE_DIRECTIVE(Constants_exp_64_W1,@object)
477 data4 0x00000000,0x00000000,0x171EC4B4,0xBE384454
478 data4 0x4AA72766,0xBE694741,0xD42518F8,0xBE5D32B6
479 data4 0x3A319149,0x3E68D96D,0x62415F36,0xBE68F4DA
480 data4 0xC9C86A3B,0xBE6DDA2F,0xF49228FE,0x3E6B2E50
481 data4 0x1188B886,0xBE49C0C2,0x1A4C2F1F,0x3E64BFC2
482 data4 0x2CB98B54,0xBE6A2FBB,0x9A55D329,0x3E5DC5DE
483 data4 0x39A7AACE,0x3E696490,0x5C66DBA5,0x3E54728B
484 data4 0xBA1C7D7D,0xBE62B0DB,0x09F1AF5F,0x3E576E04
485 data4 0x1A0DD6A1,0x3E612500,0x795FBDEF,0xBE66A419
486 data4 0xE1BD41FC,0xBE5CDE8C,0xEA54964F,0xBE621376
487 data4 0x476E76EE,0x3E6370BE,0x3427EB92,0x3E390D1A
488 data4 0x2BF82BF8,0x3E1336DE,0xD0F7BD9E,0xBE5FF1CB
489 data4 0x0CEB09DD,0xBE60A355,0x0980F30D,0xBE5CA37E
490 data4 0x4C082D25,0xBE5C541B,0x3B467D29,0xBE5BBECA
491 data4 0xB9D946C5,0xBE400D8A,0x07ED374A,0xBE5E2A08
492 data4 0x365C8B0A,0xBE66CB28,0xD3403BCA,0x3E3AAD5B
493 data4 0xC7EA21E0,0x3E526055,0xE72880D6,0xBE442C75
494 data4 0x85222A43,0x3E58B2BB,0x522C42BF,0xBE5AAB79
495 data4 0x469DC2BC,0xBE605CB4,0xA48C40DC,0xBE589FA7
496 data4 0x1AA42614,0xBE51C214,0xC37293F4,0xBE48D087
497 data4 0xA2D673E0,0x3E367A1C,0x114F7A38,0xBE51BEBB
498 data4 0x661A4B48,0xBE6348E5,0x1D3B9962,0xBDF52643
499 data4 0x35A78A53,0x3E3A3B5E,0x1CECD788,0xBE46C46C
500 data4 0x7857D689,0xBE60B7EC,0xD14F1AD7,0xBE594D3D
501 data4 0x4C9A8F60,0xBE4F9C30,0x02DFF9D2,0xBE521873
502 data4 0x55E6D68F,0xBE5E4C88,0x667F3DC4,0xBE62140F
503 data4 0x3BF88747,0xBE36961B,0xC96EC6AA,0x3E602861
504 data4 0xD57FD718,0xBE3B5151,0xFC4A627B,0x3E561CD0
505 data4 0xCA913FEA,0xBE3A5217,0x9A5D193A,0x3E40A3CC
506 data4 0x10A9C312,0xBE5AB713,0xC5F57719,0x3E4FDADB
507 data4 0xDBDF59D5,0x3E361428,0x61B4180D,0x3E5DB5DB
508 data4 0x7408D856,0xBE42AD5F,0x31B2B707,0x3E2A3148
509 ASM_SIZE_DIRECTIVE(Constants_exp_64_W1)
510
511 .align 64
512 Constants_exp_64_W2:
513 ASM_TYPE_DIRECTIVE(Constants_exp_64_W2,@object)
514 data4 0x00000000,0x00000000,0x37A3D7A2,0xBE641F25
515 data4 0xAD028C40,0xBE68DD57,0xF212B1B6,0xBE5C77D8
516 data4 0x1BA5B070,0x3E57878F,0x2ECAE6FE,0xBE55A36A
517 data4 0x569DFA3B,0xBE620608,0xA6D300A3,0xBE53B50E
518 data4 0x223F8F2C,0x3E5B5EF2,0xD6DE0DF4,0xBE56A0D9
519 data4 0xEAE28F51,0xBE64EEF3,0x367EA80B,0xBE5E5AE2
520 data4 0x5FCBC02D,0x3E47CB1A,0x9BDAFEB7,0xBE656BA0
521 data4 0x805AFEE7,0x3E6E70C6,0xA3415EBA,0xBE6E0509
522 data4 0x49BFF529,0xBE56856B,0x00508651,0x3E66DD33
523 data4 0xC114BC13,0x3E51165F,0xC453290F,0x3E53333D
524 data4 0x05539FDA,0x3E6A072B,0x7C0A7696,0xBE47CD87
525 data4 0xEB05C6D9,0xBE668BF4,0x6AE86C93,0xBE67C3E3
526 data4 0xD0B3E84B,0xBE533904,0x556B53CE,0x3E63E8D9
527 data4 0x63A98DC8,0x3E212C89,0x032A7A22,0xBE33138F
528 data4 0xBC584008,0x3E530FA9,0xCCB93C97,0xBE6ADF82
529 data4 0x8370EA39,0x3E5F9113,0xFB6A05D8,0x3E5443A4
530 data4 0x181FEE7A,0x3E63DACD,0xF0F67DEC,0xBE62B29D
531 data4 0x3DDE6307,0x3E65C483,0xD40A24C1,0x3E5BF030
532 data4 0x14E437BE,0x3E658B8F,0xED98B6C7,0xBE631C29
533 data4 0x04CF7C71,0x3E6335D2,0xE954A79D,0x3E529EED
534 data4 0xF64A2FB8,0x3E5D9257,0x854ED06C,0xBE6BED1B
535 data4 0xD71405CB,0x3E5096F6,0xACB9FDF5,0xBE3D4893
536 data4 0x01B68349,0xBDFEB158,0xC6A463B9,0x3E628D35
537 data4 0xADE45917,0xBE559725,0x042FC476,0xBE68C29C
538 data4 0x01E511FA,0xBE67593B,0x398801ED,0xBE4A4313
539 data4 0xDA7C3300,0x3E699571,0x08062A9E,0x3E5349BE
540 data4 0x755BB28E,0x3E5229C4,0x77A1F80D,0x3E67E426
541 data4 0x6B69C352,0xBE52B33F,0x084DA57F,0xBE6B3550
542 data4 0xD1D09A20,0xBE6DB03F,0x2161B2C1,0xBE60CBC4
543 data4 0x78A2B771,0x3E56ED9C,0x9D0FA795,0xBE508E31
544 data4 0xFD1A54E9,0xBE59482A,0xB07FD23E,0xBE2A17CE
545 data4 0x17365712,0x3E68BF5C,0xB3785569,0x3E3956F9
546 ASM_SIZE_DIRECTIVE(Constants_exp_64_W2)
547
548 .section .text
549 .proc expm1f#
550 .global expm1f#
551 .align 64
552
553 expm1f:
554 #ifdef _LIBC
555 .global __expm1f#
556 __expm1f:
557 #endif
558
559
560 { .mii
561 alloc r32 = ar.pfs,0,30,4,0
562 (p0) add r33 = 1, r0
563 (p0) cmp.eq.unc p7, p0 = r0, r0
564 }
565 ;;
566
567 //
568 // Set p7 true for expm1
569 // Set Flag = r33 = 1 for expm1
570 // These are really no longer necesary, but are a remnant
571 // when this file had multiple entry points.
572 // They should be carefully removed
573
574
575 { .mfi
576 (p0) add r32 = 0,r0
577 (p0) fnorm.s1 f9 = f8
578 nop.i 0
579 }
580
581 { .mfi
582 nop.m 0
583 //
584 // Set p7 false for exp
585 // Set Flag = r33 = 0 for exp
586 //
587 (p0) fclass.m.unc p6, p8 = f8, 0x1E7
588 nop.i 0 ;;
589 }
590
591 { .mfi
592 nop.m 999
593 (p0) fclass.nm.unc p9, p0 = f8, 0x1FF
594 nop.i 0
595 }
596
597 { .mfi
598 nop.m 999
599 (p0) mov f36 = f1
600 nop.i 999 ;;
601 }
602
603 //
604 // Identify NatVals, NaNs, Infs, and Zeros.
605 // Identify EM unsupporteds.
606 // Save special input registers
607 //
608 // Create FR_X_cor = 0.0
609 // GR_Flag = 0
610 // GR_Expo_Range = 0 (r32) for single precision
611 // FR_Scale = 1.0
612 //
613
614 { .mfb
615 nop.m 999
616 (p0) mov f32 = f0
617 (p6) br.cond.spnt EXPF_64_SPECIAL ;;
618 }
619
620 { .mib
621 nop.m 999
622 nop.i 999
623 (p9) br.cond.spnt EXPF_64_UNSUPPORTED ;;
624 }
625
626 //
627 // Branch out for special input values
628 //
629
630 { .mfi
631 (p0) cmp.ne.unc p12, p13 = 0x01, r33
632 (p0) fcmp.lt.unc.s0 p9,p0 = f8, f0
633 (p0) cmp.eq.unc p15, p0 = r0, r0
634 }
635
636 //
637 // Raise possible denormal operand exception
638 // Normalize x
639 //
640 // This function computes expf( x + x_cor)
641 // Input FR 1: FR_X
642 // Input FR 2: FR_X_cor
643 // Input GR 1: GR_Flag
644 // Input GR 2: GR_Expo_Range
645 // Output FR 3: FR_Y_hi
646 // Output FR 4: FR_Y_lo
647 // Output FR 5: FR_Scale
648 // Output PR 1: PR_Safe
649
650 //
651 // Prepare to load constants
652 // Set Safe = True
653 //
654
655 { .mmi
656 (p0) addl r34 = @ltoff(Constants_exp_64_Arg#),gp
657 (p0) addl r40 = @ltoff(Constants_exp_64_W1#),gp
658 (p0) addl r41 = @ltoff(Constants_exp_64_W2#),gp
659 };;
660
661 { .mmi
662 ld8 r34 = [r34]
663 ld8 r40 = [r40]
664 (p0) addl r50 = @ltoff(Constants_exp_64_T1#), gp
665 }
666 ;;
667 { .mmi
668 ld8 r41 = [r41]
669 (p0) ldfe f37 = [r34],16
670 (p0) addl r51 = @ltoff(Constants_exp_64_T2#), gp
671 }
672 ;;
673 //
674 // N = fcvt.fx(float_N)
675 // Set p14 if -6 > expo_X
676 //
677 //
678 // Bias = 0x0FFFF
679 // expo_X = expo_X and Mask
680 //
681
682 { .mmi
683 ld8 r50 = [r50]
684 (p0) ldfe f40 = [r34],16
685 nop.i 999
686 }
687 ;;
688
689 { .mlx
690 nop.m 999
691 (p0) movl r58 = 0x0FFFF
692 };;
693
694 //
695 // Load W2_ptr
696 // Branch to SMALL is expo_X < -6
697 //
698 //
699 // float_N = X * L_Inv
700 // expo_X = exponent of X
701 // Mask = 0x1FFFF
702 //
703
704 { .mmi
705 ld8 r51 = [r51]
706 (p0) ldfe f41 = [r34],16
707 //
708 // float_N = X * L_Inv
709 // expo_X = exponent of X
710 // Mask = 0x1FFFF
711 //
712 nop.i 0
713 };;
714
715 { .mlx
716 (p0) addl r34 = @ltoff(Constants_exp_64_Exponents#), gp
717 (p0) movl r39 = 0x1FFFF
718 }
719 ;;
720
721 { .mmi
722 ld8 r34 = [r34]
723 (p0) getf.exp r37 = f9
724 nop.i 999
725 }
726 ;;
727
728 { .mii
729 nop.m 999
730 nop.i 999
731 (p0) and r37 = r37, r39 ;;
732 }
733
734 { .mmi
735 (p0) sub r37 = r37, r58 ;;
736 (p0) cmp.gt.unc p14, p0 = -6, r37
737 (p0) cmp.lt.unc p10, p0 = 14, r37 ;;
738 }
739
740 { .mfi
741 nop.m 999
742 //
743 // Load L_inv
744 // Set p12 true for Flag = 0 (exp)
745 // Set p13 true for Flag = 1 (expm1)
746 //
747 (p0) fmpy.s1 f38 = f9, f37
748 nop.i 999 ;;
749 }
750
751 { .mfb
752 nop.m 999
753 //
754 // Load L_hi
755 // expo_X = expo_X - Bias
756 // get W1_ptr
757 //
758 (p0) fcvt.fx.s1 f39 = f38
759 (p14) br.cond.spnt EXPF_SMALL ;;
760 }
761
762 { .mib
763 nop.m 999
764 nop.i 999
765 (p10) br.cond.spnt EXPF_HUGE ;;
766 }
767
768 { .mmi
769 (p0) shladd r34 = r32,4,r34
770 (p0) addl r35 = @ltoff(Constants_exp_64_A#),gp
771 nop.i 999
772 }
773 ;;
774
775 { .mmi
776 ld8 r35 = [r35]
777 nop.m 999
778 nop.i 999
779 }
780 ;;
781
782 //
783 // Load T_1,T_2
784 //
785
786 { .mmb
787 (p0) ldfe f51 = [r35],16
788 (p0) ld8 r45 = [r34],8
789 nop.b 999 ;;
790 }
791 //
792 // Set Safe = True if k >= big_expo_neg
793 // Set Safe = False if k < big_expo_neg
794 //
795
796 { .mmb
797 (p0) ldfe f49 = [r35],16
798 (p0) ld8 r48 = [r34],0
799 nop.b 999 ;;
800 }
801
802 { .mfi
803 nop.m 999
804 //
805 // Branch to HUGE is expo_X > 14
806 //
807 (p0) fcvt.xf f38 = f39
808 nop.i 999 ;;
809 }
810
811 { .mfi
812 (p0) getf.sig r52 = f39
813 nop.f 999
814 nop.i 999 ;;
815 }
816
817 { .mii
818 nop.m 999
819 (p0) extr.u r43 = r52, 6, 6 ;;
820 //
821 // r = r - float_N * L_lo
822 // K = extr(N_fix,12,52)
823 //
824 (p0) shladd r40 = r43,3,r40 ;;
825 }
826
827 { .mfi
828 (p0) shladd r50 = r43,2,r50
829 (p0) fnma.s1 f42 = f40, f38, f9
830 //
831 // float_N = float(N)
832 // N_fix = signficand N
833 //
834 (p0) extr.u r42 = r52, 0, 6
835 }
836
837 { .mmi
838 (p0) ldfd f43 = [r40],0 ;;
839 (p0) shladd r41 = r42,3,r41
840 (p0) shladd r51 = r42,2,r51
841 }
842 //
843 // W_1_p1 = 1 + W_1
844 //
845
846 { .mmi
847 (p0) ldfs f44 = [r50],0 ;;
848 (p0) ldfd f45 = [r41],0
849 //
850 // M_2 = extr(N_fix,0,6)
851 // M_1 = extr(N_fix,6,6)
852 // r = X - float_N * L_hi
853 //
854 (p0) extr r44 = r52, 12, 52
855 }
856
857 { .mmi
858 (p0) ldfs f46 = [r51],0 ;;
859 (p0) sub r46 = r58, r44
860 (p0) cmp.gt.unc p8, p15 = r44, r45
861 }
862 //
863 // W = W_1 + W_1_p1*W_2
864 // Load A_2
865 // Bias_m_K = Bias - K
866 //
867
868 { .mii
869 (p0) ldfe f40 = [r35],16
870 //
871 // load A_1
872 // poly = A_2 + r*A_3
873 // rsq = r * r
874 // neg_2_mK = exponent of Bias_m_k
875 //
876 (p0) add r47 = r58, r44 ;;
877 //
878 // Set Safe = True if k <= big_expo_pos
879 // Set Safe = False if k > big_expo_pos
880 // Load A_3
881 //
882 (p15) cmp.lt p8,p15 = r44,r48 ;;
883 }
884
885 { .mmf
886 (p0) setf.exp f61 = r46
887 //
888 // Bias_p + K = Bias + K
889 // T = T_1 * T_2
890 //
891 (p0) setf.exp f36 = r47
892 (p0) fnma.s1 f42 = f41, f38, f42 ;;
893 }
894
895 { .mfi
896 nop.m 999
897 //
898 // Load W_1,W_2
899 // Load big_exp_pos, load big_exp_neg
900 //
901 (p0) fadd.s1 f47 = f43, f1
902 nop.i 999 ;;
903 }
904
905 { .mfi
906 nop.m 999
907 (p0) fma.s1 f52 = f42, f51, f49
908 nop.i 999
909 }
910
911 { .mfi
912 nop.m 999
913 (p0) fmpy.s1 f48 = f42, f42
914 nop.i 999 ;;
915 }
916
917 { .mfi
918 nop.m 999
919 (p0) fmpy.s1 f53 = f44, f46
920 nop.i 999 ;;
921 }
922
923 { .mfi
924 nop.m 999
925 (p0) fma.s1 f54 = f45, f47, f43
926 nop.i 999
927 }
928
929 { .mfi
930 nop.m 999
931 (p0) fneg f61 = f61
932 nop.i 999 ;;
933 }
934
935 { .mfi
936 nop.m 999
937 (p0) fma.s1 f52 = f42, f52, f40
938 nop.i 999 ;;
939 }
940
941 { .mfi
942 nop.m 999
943 (p0) fadd.s1 f55 = f54, f1
944 nop.i 999
945 }
946
947 { .mfi
948 nop.m 999
949 //
950 // W + Wp1 * poly
951 //
952 (p0) mov f34 = f53
953 nop.i 999 ;;
954 }
955
956 { .mfi
957 nop.m 999
958 //
959 // A_1 + r * poly
960 // Scale = setf_expf(Bias_p_k)
961 //
962 (p0) fma.s1 f52 = f48, f52, f42
963 nop.i 999 ;;
964 }
965
966 { .mfi
967 nop.m 999
968 //
969 // poly = r + rsq(A_1 + r*poly)
970 // Wp1 = 1 + W
971 // neg_2_mK = -neg_2_mK
972 //
973 (p0) fma.s1 f35 = f55, f52, f54
974 nop.i 999 ;;
975 }
976
977 { .mfb
978 nop.m 999
979 (p0) fmpy.s1 f35 = f35, f53
980 //
981 // Y_hi = T
982 // Y_lo = T * (W + Wp1*poly)
983 //
984 (p12) br.cond.sptk EXPF_MAIN ;;
985 }
986 //
987 // Branch if expf(x)
988 // Continue for expf(x-1)
989 //
990
991 { .mii
992 (p0) cmp.lt.unc p12, p13 = 10, r44
993 nop.i 999 ;;
994 //
995 // Set p12 if 10 < K, Else p13
996 //
997 (p13) cmp.gt.unc p13, p14 = -10, r44 ;;
998 }
999 //
1000 // K > 10: Y_lo = Y_lo + neg_2_mK
1001 // K <=10: Set p13 if -10 > K, Else set p14
1002 //
1003
1004 { .mfi
1005 (p13) cmp.eq p15, p0 = r0, r0
1006 (p14) fadd.s1 f34 = f61, f34
1007 nop.i 999 ;;
1008 }
1009
1010 { .mfi
1011 nop.m 999
1012 (p12) fadd.s1 f35 = f35, f61
1013 nop.i 999 ;;
1014 }
1015
1016 { .mfi
1017 nop.m 999
1018 (p13) fadd.s1 f35 = f35, f34
1019 nop.i 999
1020 }
1021
1022 { .mfb
1023 nop.m 999
1024 //
1025 // K <= 10 and K < -10, Set Safe = True
1026 // K <= 10 and K < 10, Y_lo = Y_hi + Y_lo
1027 // K <= 10 and K > =-10, Y_hi = Y_hi + neg_2_mk
1028 //
1029 (p13) mov f34 = f61
1030 (p0) br.cond.sptk EXPF_MAIN ;;
1031 }
1032 EXPF_SMALL:
1033 { .mmi
1034 (p12) addl r35 = @ltoff(Constants_exp_64_P#), gp
1035 (p0) addl r34 = @ltoff(Constants_exp_64_Exponents#), gp
1036 nop.i 999
1037 }
1038 ;;
1039
1040 { .mmi
1041 (p12) ld8 r35 = [r35]
1042 ld8 r34 = [r34]
1043 nop.i 999
1044 }
1045 ;;
1046
1047
1048 { .mmi
1049 (p13) addl r35 = @ltoff(Constants_exp_64_Q#), gp
1050 nop.m 999
1051 nop.i 999
1052 }
1053 ;;
1054
1055
1056 //
1057 // Return
1058 // K <= 10 and K < 10, Y_hi = neg_2_mk
1059 //
1060 // /*******************************************************/
1061 // /*********** Branch EXP_SMALL *************************/
1062 // /*******************************************************/
1063
1064 { .mfi
1065 (p13) ld8 r35 = [r35]
1066 (p0) mov f42 = f9
1067 (p0) add r34 = 0x48,r34
1068 }
1069 ;;
1070
1071 //
1072 // Flag = 0
1073 // r4 = rsq * rsq
1074 //
1075
1076 { .mfi
1077 (p0) ld8 r49 =[r34],0
1078 nop.f 999
1079 nop.i 999 ;;
1080 }
1081
1082 { .mii
1083 nop.m 999
1084 nop.i 999 ;;
1085 //
1086 // Flag = 1
1087 //
1088 (p0) cmp.lt.unc p14, p0 = r37, r49 ;;
1089 }
1090
1091 { .mfi
1092 nop.m 999
1093 //
1094 // r = X
1095 //
1096 (p0) fmpy.s1 f48 = f42, f42
1097 nop.i 999 ;;
1098 }
1099
1100 { .mfb
1101 nop.m 999
1102 //
1103 // rsq = r * r
1104 //
1105 (p0) fmpy.s1 f50 = f48, f48
1106 //
1107 // Is input very small?
1108 //
1109 (p14) br.cond.spnt EXPF_VERY_SMALL ;;
1110 }
1111 //
1112 // Flag_not1: Y_hi = 1.0
1113 // Flag is 1: r6 = rsq * r4
1114 //
1115
1116 { .mfi
1117 (p12) ldfe f52 = [r35],16
1118 (p12) mov f34 = f1
1119 (p0) add r53 = 0x1,r0 ;;
1120 }
1121
1122 { .mfi
1123 (p13) ldfe f51 = [r35],16
1124 //
1125 // Flag_not_1: Y_lo = poly_hi + r4 * poly_lo
1126 //
1127 (p13) mov f34 = f9
1128 nop.i 999 ;;
1129 }
1130
1131 { .mmf
1132 (p12) ldfe f53 = [r35],16
1133 //
1134 // For Flag_not_1, Y_hi = X
1135 // Scale = 1
1136 // Create 0x000...01
1137 //
1138 (p0) setf.sig f37 = r53
1139 (p0) mov f36 = f1 ;;
1140 }
1141
1142 { .mmi
1143 (p13) ldfe f52 = [r35],16 ;;
1144 (p12) ldfe f54 = [r35],16
1145 nop.i 999 ;;
1146 }
1147
1148 { .mfi
1149 (p13) ldfe f53 = [r35],16
1150 (p13) fmpy.s1 f58 = f48, f50
1151 nop.i 999 ;;
1152 }
1153 //
1154 // Flag_not1: poly_lo = P_5 + r*P_6
1155 // Flag_1: poly_lo = Q_6 + r*Q_7
1156 //
1157
1158 { .mmi
1159 (p13) ldfe f54 = [r35],16 ;;
1160 (p12) ldfe f55 = [r35],16
1161 nop.i 999 ;;
1162 }
1163
1164 { .mmi
1165 (p12) ldfe f56 = [r35],16 ;;
1166 (p13) ldfe f55 = [r35],16
1167 nop.i 999 ;;
1168 }
1169
1170 { .mmi
1171 (p12) ldfe f57 = [r35],0 ;;
1172 (p13) ldfe f56 = [r35],16
1173 nop.i 999 ;;
1174 }
1175
1176 { .mfi
1177 (p13) ldfe f57 = [r35],0
1178 nop.f 999
1179 nop.i 999 ;;
1180 }
1181
1182 { .mfi
1183 nop.m 999
1184 //
1185 // For Flag_not_1, load p5,p6,p1,p2
1186 // Else load p5,p6,p1,p2
1187 //
1188 (p12) fma.s1 f60 = f52, f42, f53
1189 nop.i 999 ;;
1190 }
1191
1192 { .mfi
1193 nop.m 999
1194 (p13) fma.s1 f60 = f51, f42, f52
1195 nop.i 999 ;;
1196 }
1197
1198 { .mfi
1199 nop.m 999
1200 (p12) fma.s1 f60 = f60, f42, f54
1201 nop.i 999 ;;
1202 }
1203
1204 { .mfi
1205 nop.m 999
1206 (p12) fma.s1 f59 = f56, f42, f57
1207 nop.i 999 ;;
1208 }
1209
1210 { .mfi
1211 nop.m 999
1212 (p13) fma.s1 f60 = f42, f60, f53
1213 nop.i 999 ;;
1214 }
1215
1216 { .mfi
1217 nop.m 999
1218 (p12) fma.s1 f59 = f59, f48, f42
1219 nop.i 999 ;;
1220 }
1221
1222 { .mfi
1223 nop.m 999
1224 //
1225 // Flag_1: poly_lo = Q_5 + r*(Q_6 + r*Q_7)
1226 // Flag_not1: poly_lo = P_4 + r*(P_5 + r*P_6)
1227 // Flag_not1: poly_hi = (P_1 + r*P_2)
1228 //
1229 (p13) fmpy.s1 f60 = f60, f58
1230 nop.i 999 ;;
1231 }
1232
1233 { .mfi
1234 nop.m 999
1235 (p12) fma.s1 f60 = f60, f42, f55
1236 nop.i 999 ;;
1237 }
1238
1239 { .mfi
1240 nop.m 999
1241 //
1242 // Flag_1: poly_lo = r6 *(Q_5 + ....)
1243 // Flag_not1: poly_hi = r + rsq *(P_1 + r*P_2)
1244 //
1245 (p12) fma.s1 f35 = f60, f50, f59
1246 nop.i 999
1247 }
1248
1249 { .mfi
1250 nop.m 999
1251 (p13) fma.s1 f59 = f54, f42, f55
1252 nop.i 999 ;;
1253 }
1254
1255 { .mfi
1256 nop.m 999
1257 //
1258 // Flag_not1: Y_lo = rsq* poly_hi + poly_lo
1259 // Flag_1: poly_lo = rsq* poly_hi + poly_lo
1260 //
1261 (p13) fma.s1 f59 = f59, f42, f56
1262 nop.i 999 ;;
1263 }
1264
1265 { .mfi
1266 nop.m 999
1267 //
1268 // Flag_not_1: (P_1 + r*P_2)
1269 //
1270 (p13) fma.s1 f59 = f59, f42, f57
1271 nop.i 999 ;;
1272 }
1273
1274 { .mfi
1275 nop.m 999
1276 //
1277 // Flag_not_1: poly_hi = r + rsq * (P_1 + r*P_2)
1278 //
1279 (p13) fma.s1 f35 = f59, f48, f60
1280 nop.i 999 ;;
1281 }
1282
1283 { .mfi
1284 nop.m 999
1285 //
1286 // Create 0.000...01
1287 //
1288 (p0) for f37 = f35, f37
1289 nop.i 999 ;;
1290 }
1291
1292 { .mfb
1293 nop.m 999
1294 //
1295 // Set lsb of Y_lo to 1
1296 //
1297 (p0) fmerge.se f35 = f35,f37
1298 (p0) br.cond.sptk EXPF_MAIN ;;
1299 }
1300 EXPF_VERY_SMALL:
1301
1302 { .mmi
1303 nop.m 999
1304 (p13) addl r34 = @ltoff(Constants_exp_64_Exponents#),gp
1305 nop.i 999;;
1306 }
1307
1308 { .mfi
1309 (p13) ld8 r34 = [r34];
1310 (p12) mov f35 = f9
1311 nop.i 999 ;;
1312 }
1313
1314 { .mfb
1315 nop.m 999
1316 (p12) mov f34 = f1
1317 (p12) br.cond.sptk EXPF_MAIN ;;
1318 }
1319
1320 { .mlx
1321 (p13) add r34 = 8,r34
1322 (p13) movl r39 = 0x0FFFE ;;
1323 }
1324 //
1325 // Load big_exp_neg
1326 // Create 1/2's exponent
1327 //
1328
1329 { .mii
1330 (p13) setf.exp f56 = r39
1331 (p13) shladd r34 = r32,4,r34 ;;
1332 nop.i 999
1333 }
1334 //
1335 // Negative exponents are stored after positive
1336 //
1337
1338 { .mfi
1339 (p13) ld8 r45 = [r34],0
1340 //
1341 // Y_hi = x
1342 // Scale = 1
1343 //
1344 (p13) fmpy.s1 f35 = f9, f9
1345 nop.i 999 ;;
1346 }
1347
1348 { .mfi
1349 nop.m 999
1350 //
1351 // Reset Safe if necessary
1352 // Create 1/2
1353 //
1354 (p13) mov f34 = f9
1355 nop.i 999 ;;
1356 }
1357
1358 { .mfi
1359 (p13) cmp.lt.unc p0, p15 = r37, r45
1360 (p13) mov f36 = f1
1361 nop.i 999 ;;
1362 }
1363
1364 { .mfb
1365 nop.m 999
1366 //
1367 // Y_lo = x * x
1368 //
1369 (p13) fmpy.s1 f35 = f35, f56
1370 //
1371 // Y_lo = x*x/2
1372 //
1373 (p13) br.cond.sptk EXPF_MAIN ;;
1374 }
1375 EXPF_HUGE:
1376
1377 { .mfi
1378 nop.m 999
1379 (p0) fcmp.gt.unc.s1 p14, p0 = f9, f0
1380 nop.i 999
1381 }
1382
1383 { .mlx
1384 nop.m 999
1385 (p0) movl r39 = 0x15DC0 ;;
1386 }
1387
1388 { .mfi
1389 (p14) setf.exp f34 = r39
1390 (p14) mov f35 = f1
1391 (p14) cmp.eq p0, p15 = r0, r0 ;;
1392 }
1393
1394 { .mfb
1395 nop.m 999
1396 (p14) mov f36 = f34
1397 //
1398 // If x > 0, Set Safe = False
1399 // If x > 0, Y_hi = 2**(24,000)
1400 // If x > 0, Y_lo = 1.0
1401 // If x > 0, Scale = 2**(24,000)
1402 //
1403 (p14) br.cond.sptk EXPF_MAIN ;;
1404 }
1405
1406 { .mlx
1407 nop.m 999
1408 (p12) movl r39 = 0xA240
1409 }
1410
1411 { .mlx
1412 nop.m 999
1413 (p12) movl r38 = 0xA1DC ;;
1414 }
1415
1416 { .mmb
1417 (p13) cmp.eq p15, p14 = r0, r0
1418 (p12) setf.exp f34 = r39
1419 nop.b 999 ;;
1420 }
1421
1422 { .mlx
1423 (p12) setf.exp f35 = r38
1424 (p13) movl r39 = 0xFF9C
1425 }
1426
1427 { .mfi
1428 nop.m 999
1429 (p13) fsub.s1 f34 = f0, f1
1430 nop.i 999 ;;
1431 }
1432
1433 { .mfi
1434 nop.m 999
1435 (p12) mov f36 = f34
1436 (p12) cmp.eq p0, p15 = r0, r0 ;;
1437 }
1438
1439 { .mfi
1440 (p13) setf.exp f35 = r39
1441 (p13) mov f36 = f1
1442 nop.i 999 ;;
1443 }
1444 EXPF_MAIN:
1445
1446 { .mfi
1447 (p0) cmp.ne.unc p12, p0 = 0x01, r33
1448 (p0) fmpy.s1 f101 = f36, f35
1449 nop.i 999 ;;
1450 }
1451
1452 { .mfb
1453 nop.m 999
1454 (p0) fma.s.s0 f99 = f34, f36, f101
1455 (p15) br.cond.sptk EXPF_64_RETURN ;;
1456 }
1457
1458 { .mfi
1459 nop.m 999
1460 (p0) fsetc.s3 0x7F,0x01
1461 nop.i 999
1462 }
1463
1464 { .mlx
1465 nop.m 999
1466 (p0) movl r50 = 0x0000000001007F ;;
1467 }
1468 //
1469 // S0 user supplied status
1470 // S2 user supplied status + WRE + TD (Overflows)
1471 // S3 user supplied status + RZ + TD (Underflows)
1472 //
1473 //
1474 // If (Safe) is true, then
1475 // Compute result using user supplied status field.
1476 // No overflow or underflow here, but perhaps inexact.
1477 // Return
1478 // Else
1479 // Determine if overflow or underflow was raised.
1480 // Fetch +/- overflow threshold for IEEE single, double,
1481 // double extended
1482 //
1483
1484 { .mfi
1485 (p0) setf.exp f60 = r50
1486 (p0) fma.s.s3 f102 = f34, f36, f101
1487 nop.i 999
1488 }
1489
1490 { .mfi
1491 nop.m 999
1492 (p0) fsetc.s3 0x7F,0x40
1493 nop.i 999 ;;
1494 }
1495
1496 { .mfi
1497 nop.m 999
1498 //
1499 // For Safe, no need to check for over/under.
1500 // For expm1, handle errors like exp.
1501 //
1502 (p0) fsetc.s2 0x7F,0x42
1503 nop.i 999;;
1504 }
1505
1506 { .mfi
1507 nop.m 999
1508 (p0) fma.s.s2 f100 = f34, f36, f101
1509 nop.i 999 ;;
1510 }
1511
1512 { .mfi
1513 nop.m 999
1514 (p0) fsetc.s2 0x7F,0x40
1515 nop.i 999 ;;
1516 }
1517
1518 { .mfi
1519 nop.m 999
1520 (p7) fclass.m.unc p12, p0 = f102, 0x00F
1521 nop.i 999
1522 }
1523
1524 { .mfi
1525 nop.m 999
1526 (p0) fclass.m.unc p11, p0 = f102, 0x00F
1527 nop.i 999 ;;
1528 }
1529
1530 { .mfi
1531 nop.m 999
1532 (p7) fcmp.ge.unc.s1 p10, p0 = f100, f60
1533 nop.i 999
1534 }
1535
1536 { .mfi
1537 nop.m 999
1538 //
1539 // Create largest double exponent + 1.
1540 // Create smallest double exponent - 1.
1541 //
1542 (p0) fcmp.ge.unc.s1 p8, p0 = f100, f60
1543 nop.i 999 ;;
1544 }
1545 //
1546 // fcmp: resultS2 >= + overflow threshold -> set (a) if true
1547 // fcmp: resultS2 <= - overflow threshold -> set (b) if true
1548 // fclass: resultS3 is denorm/unorm/0 -> set (d) if true
1549 //
1550
1551 { .mib
1552 (p10) mov GR_Parameter_TAG = 43
1553 nop.i 999
1554 (p10) br.cond.sptk __libm_error_region ;;
1555 }
1556
1557 { .mib
1558 (p8) mov GR_Parameter_TAG = 16
1559 nop.i 999
1560 (p8) br.cond.sptk __libm_error_region ;;
1561 }
1562 //
1563 // Report that exp overflowed
1564 //
1565
1566 { .mib
1567 (p12) mov GR_Parameter_TAG = 44
1568 nop.i 999
1569 (p12) br.cond.sptk __libm_error_region ;;
1570 }
1571
1572 { .mib
1573 (p11) mov GR_Parameter_TAG = 17
1574 nop.i 999
1575 (p11) br.cond.sptk __libm_error_region ;;
1576 }
1577
1578 { .mib
1579 nop.m 999
1580 nop.i 999
1581 //
1582 // Report that exp underflowed
1583 //
1584 (p0) br.cond.sptk EXPF_64_RETURN ;;
1585 }
1586 EXPF_64_SPECIAL:
1587
1588 { .mfi
1589 nop.m 999
1590 (p0) fclass.m.unc p6, p0 = f8, 0x0c3
1591 nop.i 999
1592 }
1593
1594 { .mfi
1595 nop.m 999
1596 (p0) fclass.m.unc p13, p8 = f8, 0x007
1597 nop.i 999 ;;
1598 }
1599
1600 { .mfi
1601 nop.m 999
1602 (p7) fclass.m.unc p14, p0 = f8, 0x007
1603 nop.i 999
1604 }
1605
1606 { .mfi
1607 nop.m 999
1608 (p0) fclass.m.unc p12, p9 = f8, 0x021
1609 nop.i 999 ;;
1610 }
1611
1612 { .mfi
1613 nop.m 999
1614 (p0) fclass.m.unc p11, p0 = f8, 0x022
1615 nop.i 999
1616 }
1617
1618 { .mfi
1619 nop.m 999
1620 (p7) fclass.m.unc p10, p0 = f8, 0x022
1621 nop.i 999 ;;
1622 }
1623
1624 { .mfi
1625 nop.m 999
1626 //
1627 // Identify +/- 0, Inf, or -Inf
1628 // Generate the right kind of NaN.
1629 //
1630 (p13) fadd.s.s0 f99 = f0, f1
1631 nop.i 999 ;;
1632 }
1633
1634 { .mfi
1635 nop.m 999
1636 (p14) mov f99 = f8
1637 nop.i 999 ;;
1638 }
1639
1640 { .mfb
1641 nop.m 999
1642 (p6) fadd.s.s0 f99 = f8, f1
1643 //
1644 // expf(+/-0) = 1
1645 // expm1f(+/-0) = +/-0
1646 // No exceptions raised
1647 //
1648 (p6) br.cond.sptk EXPF_64_RETURN ;;
1649 }
1650
1651 { .mib
1652 nop.m 999
1653 nop.i 999
1654 (p14) br.cond.sptk EXPF_64_RETURN ;;
1655 }
1656
1657 { .mfi
1658 nop.m 999
1659 (p11) mov f99 = f0
1660 nop.i 999 ;;
1661 }
1662
1663 { .mfb
1664 nop.m 999
1665 (p10) fsub.s.s1 f99 = f0, f1
1666 //
1667 // expf(-Inf) = 0
1668 // expm1f(-Inf) = -1
1669 // No exceptions raised.
1670 //
1671 (p10) br.cond.sptk EXPF_64_RETURN ;;
1672 }
1673
1674 { .mfb
1675 nop.m 999
1676 (p12) fmpy.s.s1 f99 = f8, f1
1677 //
1678 // expf(+Inf) = Inf
1679 // No exceptions raised.
1680 //
1681 (p0) br.cond.sptk EXPF_64_RETURN ;;
1682 }
1683 EXPF_64_UNSUPPORTED:
1684
1685 { .mfb
1686 nop.m 999
1687 (p0) fmpy.s.s0 f99 = f8, f0
1688 nop.b 0;;
1689 }
1690
1691 EXPF_64_RETURN:
1692 { .mfb
1693 nop.m 999
1694 (p0) mov f8 = f99
1695 (p0) br.ret.sptk b0
1696 }
1697 .endp expm1f
1698 ASM_SIZE_DIRECTIVE(expm1f)
1699
1700
1701 .proc __libm_error_region
1702 __libm_error_region:
1703 .prologue
1704 { .mfi
1705 add GR_Parameter_Y=-32,sp // Parameter 2 value
1706 nop.f 0
1707 .save ar.pfs,GR_SAVE_PFS
1708 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
1709 }
1710 { .mfi
1711 .fframe 64
1712 add sp=-64,sp // Create new stack
1713 nop.f 0
1714 mov GR_SAVE_GP=gp // Save gp
1715 };;
1716 { .mmi
1717 stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack
1718 add GR_Parameter_X = 16,sp // Parameter 1 address
1719 .save b0, GR_SAVE_B0
1720 mov GR_SAVE_B0=b0 // Save b0
1721 };;
1722 .body
1723 { .mib
1724 stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
1725 add GR_Parameter_RESULT = 0,GR_Parameter_Y
1726 nop.b 0 // Parameter 3 address
1727 }
1728 { .mib
1729 stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
1730 add GR_Parameter_Y = -16,GR_Parameter_Y
1731 br.call.sptk b0=__libm_error_support# // Call error handling function
1732 };;
1733 { .mmi
1734 nop.m 0
1735 nop.m 0
1736 add GR_Parameter_RESULT = 48,sp
1737 };;
1738 { .mmi
1739 ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
1740 .restore sp
1741 add sp = 64,sp // Restore stack pointer
1742 mov b0 = GR_SAVE_B0 // Restore return address
1743 };;
1744 { .mib
1745 mov gp = GR_SAVE_GP // Restore gp
1746 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
1747 br.ret.sptk b0 // Return
1748 };;
1749
1750 .endp __libm_error_region
1751 ASM_SIZE_DIRECTIVE(__libm_error_region)
1752
1753
1754 .type __libm_error_support#,@function
1755 .global __libm_error_support#