]>
Commit | Line | Data |
---|---|---|
d5efd131 MF |
1 | .file "sinh.s" |
2 | ||
3 | ||
4 | // Copyright (c) 2000 - 2005, Intel Corporation | |
5 | // All rights reserved. | |
6 | // | |
d5efd131 MF |
7 | // |
8 | // Redistribution and use in source and binary forms, with or without | |
9 | // modification, are permitted provided that the following conditions are | |
10 | // met: | |
11 | // | |
12 | // * Redistributions of source code must retain the above copyright | |
13 | // notice, this list of conditions and the following disclaimer. | |
14 | // | |
15 | // * Redistributions in binary form must reproduce the above copyright | |
16 | // notice, this list of conditions and the following disclaimer in the | |
17 | // documentation and/or other materials provided with the distribution. | |
18 | // | |
19 | // * The name of Intel Corporation may not be used to endorse or promote | |
20 | // products derived from this software without specific prior written | |
21 | // permission. | |
22 | ||
23 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
24 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
25 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
26 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS | |
27 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
28 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
29 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
30 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
31 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING | |
32 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
33 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
34 | // | |
35 | // Intel Corporation is the author of this code, and requests that all | |
36 | // problem reports or change requests be submitted to it directly at | |
37 | // http://www.intel.com/software/products/opensource/libraries/num.htm. | |
38 | // | |
39 | // History | |
40 | //============================================================== | |
41 | // 02/02/00 Initial version | |
42 | // 04/04/00 Unwind support added | |
43 | // 08/15/00 Bundle added after call to __libm_error_support to properly | |
44 | // set [the previously overwritten] GR_Parameter_RESULT. | |
45 | // 10/12/00 Update to set denormal operand and underflow flags | |
46 | // 01/22/01 Fixed to set inexact flag for small args. | |
47 | // 05/02/01 Reworked to improve speed of all paths | |
48 | // 05/20/02 Cleaned up namespace and sf0 syntax | |
49 | // 11/20/02 Improved speed with new algorithm | |
50 | // 03/31/05 Reformatted delimiters between data tables | |
51 | ||
52 | // API | |
53 | //============================================================== | |
54 | // double sinh(double) | |
55 | ||
56 | // Overview of operation | |
57 | //============================================================== | |
58 | // Case 1: 0 < |x| < 2^-60 | |
59 | // Result = x, computed by x+sgn(x)*x^2) to handle flags and rounding | |
60 | // | |
61 | // Case 2: 2^-60 < |x| < 0.25 | |
62 | // Evaluate sinh(x) by a 13th order polynomial | |
63 | // Care is take for the order of multiplication; and A1 is not exactly 1/3!, | |
64 | // A2 is not exactly 1/5!, etc. | |
65 | // sinh(x) = x + (A1*x^3 + A2*x^5 + A3*x^7 + A4*x^9 + A5*x^11 + A6*x^13) | |
66 | // | |
67 | // Case 3: 0.25 < |x| < 710.47586 | |
68 | // Algorithm is based on the identity sinh(x) = ( exp(x) - exp(-x) ) / 2. | |
69 | // The algorithm for exp is described as below. There are a number of | |
70 | // economies from evaluating both exp(x) and exp(-x). Although we | |
71 | // are evaluating both quantities, only where the quantities diverge do we | |
72 | // duplicate the computations. The basic algorithm for exp(x) is described | |
73 | // below. | |
74 | // | |
75 | // Take the input x. w is "how many log2/128 in x?" | |
76 | // w = x * 128/log2 | |
77 | // n = int(w) | |
78 | // x = n log2/128 + r + delta | |
79 | ||
80 | // n = 128M + index_1 + 2^4 index_2 | |
81 | // x = M log2 + (log2/128) index_1 + (log2/8) index_2 + r + delta | |
82 | ||
83 | // exp(x) = 2^M 2^(index_1/128) 2^(index_2/8) exp(r) exp(delta) | |
84 | // Construct 2^M | |
85 | // Get 2^(index_1/128) from table_1; | |
86 | // Get 2^(index_2/8) from table_2; | |
87 | // Calculate exp(r) by 5th order polynomial | |
88 | // r = x - n (log2/128)_high | |
89 | // delta = - n (log2/128)_low | |
90 | // Calculate exp(delta) as 1 + delta | |
91 | ||
92 | ||
93 | // Special values | |
94 | //============================================================== | |
95 | // sinh(+0) = +0 | |
96 | // sinh(-0) = -0 | |
97 | ||
98 | // sinh(+qnan) = +qnan | |
99 | // sinh(-qnan) = -qnan | |
100 | // sinh(+snan) = +qnan | |
101 | // sinh(-snan) = -qnan | |
102 | ||
103 | // sinh(-inf) = -inf | |
104 | // sinh(+inf) = +inf | |
105 | ||
106 | // Overflow and Underflow | |
107 | //======================= | |
108 | // sinh(x) = largest double normal when | |
109 | // |x| = 710.47586 = 0x408633ce8fb9f87d | |
110 | // | |
111 | // Underflow is handled as described in case 1 above | |
112 | ||
113 | // Registers used | |
114 | //============================================================== | |
115 | // Floating Point registers used: | |
116 | // f8, input, output | |
117 | // f6 -> f15, f32 -> f61 | |
118 | ||
119 | // General registers used: | |
120 | // r14 -> r40 | |
121 | ||
122 | // Predicate registers used: | |
123 | // p6 -> p15 | |
124 | ||
125 | // Assembly macros | |
126 | //============================================================== | |
127 | ||
128 | rRshf = r14 | |
129 | rN_neg = r14 | |
130 | rAD_TB1 = r15 | |
131 | rAD_TB2 = r16 | |
132 | rAD_P = r17 | |
133 | rN = r18 | |
134 | rIndex_1 = r19 | |
135 | rIndex_2_16 = r20 | |
136 | rM = r21 | |
137 | rBiased_M = r21 | |
138 | rSig_inv_ln2 = r22 | |
139 | rIndex_1_neg = r22 | |
140 | rExp_bias = r23 | |
141 | rExp_bias_minus_1 = r23 | |
142 | rExp_mask = r24 | |
143 | rTmp = r24 | |
144 | rGt_ln = r24 | |
145 | rIndex_2_16_neg = r24 | |
146 | rM_neg = r25 | |
147 | rBiased_M_neg = r25 | |
148 | rRshf_2to56 = r26 | |
149 | rAD_T1_neg = r26 | |
150 | rExp_2tom56 = r28 | |
151 | rAD_T2_neg = r28 | |
152 | rAD_T1 = r29 | |
153 | rAD_T2 = r30 | |
154 | rSignexp_x = r31 | |
155 | rExp_x = r31 | |
156 | ||
157 | GR_SAVE_B0 = r33 | |
158 | GR_SAVE_PFS = r34 | |
159 | GR_SAVE_GP = r35 | |
160 | ||
161 | GR_Parameter_X = r37 | |
162 | GR_Parameter_Y = r38 | |
163 | GR_Parameter_RESULT = r39 | |
164 | GR_Parameter_TAG = r40 | |
165 | ||
166 | ||
167 | FR_X = f10 | |
168 | FR_Y = f1 | |
169 | FR_RESULT = f8 | |
170 | ||
171 | fRSHF_2TO56 = f6 | |
172 | fINV_LN2_2TO63 = f7 | |
173 | fW_2TO56_RSH = f9 | |
174 | f2TOM56 = f11 | |
175 | fP5 = f12 | |
176 | fP4 = f13 | |
177 | fP3 = f14 | |
178 | fP2 = f15 | |
179 | ||
180 | fLn2_by_128_hi = f33 | |
181 | fLn2_by_128_lo = f34 | |
182 | ||
183 | fRSHF = f35 | |
184 | fNfloat = f36 | |
185 | fNormX = f37 | |
186 | fR = f38 | |
187 | fF = f39 | |
188 | ||
189 | fRsq = f40 | |
190 | f2M = f41 | |
191 | fS1 = f42 | |
192 | fT1 = f42 | |
193 | fS2 = f43 | |
194 | fT2 = f43 | |
195 | fS = f43 | |
196 | fWre_urm_f8 = f44 | |
197 | fAbsX = f44 | |
198 | ||
199 | fMIN_DBL_OFLOW_ARG = f45 | |
200 | fMAX_DBL_NORM_ARG = f46 | |
201 | fXsq = f47 | |
202 | fX4 = f48 | |
203 | fGt_pln = f49 | |
204 | fTmp = f49 | |
205 | ||
206 | fP54 = f50 | |
207 | fP5432 = f50 | |
208 | fP32 = f51 | |
209 | fP = f52 | |
210 | fP54_neg = f53 | |
211 | fP5432_neg = f53 | |
212 | fP32_neg = f54 | |
213 | fP_neg = f55 | |
214 | fF_neg = f56 | |
215 | ||
216 | f2M_neg = f57 | |
217 | fS1_neg = f58 | |
218 | fT1_neg = f58 | |
219 | fS2_neg = f59 | |
220 | fT2_neg = f59 | |
221 | fS_neg = f59 | |
222 | fExp = f60 | |
223 | fExp_neg = f61 | |
224 | ||
225 | fA6 = f50 | |
226 | fA65 = f50 | |
227 | fA6543 = f50 | |
228 | fA654321 = f50 | |
229 | fA5 = f51 | |
230 | fA4 = f52 | |
231 | fA43 = f52 | |
232 | fA3 = f53 | |
233 | fA2 = f54 | |
234 | fA21 = f54 | |
235 | fA1 = f55 | |
236 | fX3 = f56 | |
237 | ||
238 | // Data tables | |
239 | //============================================================== | |
240 | ||
241 | RODATA | |
242 | .align 16 | |
243 | ||
244 | // ************* DO NOT CHANGE ORDER OF THESE TABLES ******************** | |
245 | ||
246 | // double-extended 1/ln(2) | |
247 | // 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88 | |
248 | // 3fff b8aa 3b29 5c17 f0bc | |
249 | // For speed the significand will be loaded directly with a movl and setf.sig | |
250 | // and the exponent will be bias+63 instead of bias+0. Thus subsequent | |
251 | // computations need to scale appropriately. | |
252 | // The constant 128/ln(2) is needed for the computation of w. This is also | |
253 | // obtained by scaling the computations. | |
254 | // | |
255 | // Two shifting constants are loaded directly with movl and setf.d. | |
256 | // 1. fRSHF_2TO56 = 1.1000..00 * 2^(63-7) | |
257 | // This constant is added to x*1/ln2 to shift the integer part of | |
258 | // x*128/ln2 into the rightmost bits of the significand. | |
259 | // The result of this fma is fW_2TO56_RSH. | |
260 | // 2. fRSHF = 1.1000..00 * 2^(63) | |
261 | // This constant is subtracted from fW_2TO56_RSH * 2^(-56) to give | |
262 | // the integer part of w, n, as a floating-point number. | |
263 | // The result of this fms is fNfloat. | |
264 | ||
265 | ||
266 | LOCAL_OBJECT_START(exp_table_1) | |
267 | data8 0x408633ce8fb9f87e // smallest dbl overflow arg | |
268 | data8 0x408633ce8fb9f87d // largest dbl arg to give normal dbl result | |
269 | data8 0xb17217f7d1cf79ab , 0x00003ff7 // ln2/128 hi | |
270 | data8 0xc9e3b39803f2f6af , 0x00003fb7 // ln2/128 lo | |
271 | // | |
272 | // Table 1 is 2^(index_1/128) where | |
273 | // index_1 goes from 0 to 15 | |
274 | // | |
275 | data8 0x8000000000000000 , 0x00003FFF | |
276 | data8 0x80B1ED4FD999AB6C , 0x00003FFF | |
277 | data8 0x8164D1F3BC030773 , 0x00003FFF | |
278 | data8 0x8218AF4373FC25EC , 0x00003FFF | |
279 | data8 0x82CD8698AC2BA1D7 , 0x00003FFF | |
280 | data8 0x8383594EEFB6EE37 , 0x00003FFF | |
281 | data8 0x843A28C3ACDE4046 , 0x00003FFF | |
282 | data8 0x84F1F656379C1A29 , 0x00003FFF | |
283 | data8 0x85AAC367CC487B15 , 0x00003FFF | |
284 | data8 0x8664915B923FBA04 , 0x00003FFF | |
285 | data8 0x871F61969E8D1010 , 0x00003FFF | |
286 | data8 0x87DB357FF698D792 , 0x00003FFF | |
287 | data8 0x88980E8092DA8527 , 0x00003FFF | |
288 | data8 0x8955EE03618E5FDD , 0x00003FFF | |
289 | data8 0x8A14D575496EFD9A , 0x00003FFF | |
290 | data8 0x8AD4C6452C728924 , 0x00003FFF | |
291 | LOCAL_OBJECT_END(exp_table_1) | |
292 | ||
293 | // Table 2 is 2^(index_1/8) where | |
294 | // index_2 goes from 0 to 7 | |
295 | LOCAL_OBJECT_START(exp_table_2) | |
296 | data8 0x8000000000000000 , 0x00003FFF | |
297 | data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF | |
298 | data8 0x9837F0518DB8A96F , 0x00003FFF | |
299 | data8 0xA5FED6A9B15138EA , 0x00003FFF | |
300 | data8 0xB504F333F9DE6484 , 0x00003FFF | |
301 | data8 0xC5672A115506DADD , 0x00003FFF | |
302 | data8 0xD744FCCAD69D6AF4 , 0x00003FFF | |
303 | data8 0xEAC0C6E7DD24392F , 0x00003FFF | |
304 | LOCAL_OBJECT_END(exp_table_2) | |
305 | ||
306 | ||
307 | LOCAL_OBJECT_START(exp_p_table) | |
308 | data8 0x3f8111116da21757 //P5 | |
309 | data8 0x3fa55555d787761c //P4 | |
310 | data8 0x3fc5555555555414 //P3 | |
311 | data8 0x3fdffffffffffd6a //P2 | |
312 | LOCAL_OBJECT_END(exp_p_table) | |
313 | ||
314 | LOCAL_OBJECT_START(sinh_p_table) | |
315 | data8 0xB08AF9AE78C1239F, 0x00003FDE // A6 | |
316 | data8 0xB8EF1D28926D8891, 0x00003FEC // A4 | |
317 | data8 0x8888888888888412, 0x00003FF8 // A2 | |
318 | data8 0xD732377688025BE9, 0x00003FE5 // A5 | |
319 | data8 0xD00D00D00D4D39F2, 0x00003FF2 // A3 | |
320 | data8 0xAAAAAAAAAAAAAAAB, 0x00003FFC // A1 | |
321 | LOCAL_OBJECT_END(sinh_p_table) | |
322 | ||
323 | ||
324 | .section .text | |
325 | GLOBAL_IEEE754_ENTRY(sinh) | |
326 | ||
327 | { .mlx | |
328 | getf.exp rSignexp_x = f8 // Must recompute if x unorm | |
329 | movl rSig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2 | |
330 | } | |
331 | { .mlx | |
332 | addl rAD_TB1 = @ltoff(exp_table_1), gp | |
333 | movl rRshf_2to56 = 0x4768000000000000 // 1.10000 2^(63+56) | |
334 | } | |
335 | ;; | |
336 | ||
337 | { .mfi | |
338 | ld8 rAD_TB1 = [rAD_TB1] | |
339 | fclass.m p6,p0 = f8,0x0b // Test for x=unorm | |
340 | mov rExp_mask = 0x1ffff | |
341 | } | |
342 | { .mfi | |
343 | mov rExp_bias = 0xffff | |
344 | fnorm.s1 fNormX = f8 | |
345 | mov rExp_2tom56 = 0xffff-56 | |
346 | } | |
347 | ;; | |
348 | ||
349 | // Form two constants we need | |
350 | // 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128 | |
351 | // 1.1000..000 * 2^(63+63-7) to right shift int(w) into the significand | |
352 | ||
353 | { .mfi | |
354 | setf.sig fINV_LN2_2TO63 = rSig_inv_ln2 // form 1/ln2 * 2^63 | |
355 | fclass.m p8,p0 = f8,0x07 // Test for x=0 | |
356 | nop.i 999 | |
357 | } | |
358 | { .mlx | |
359 | setf.d fRSHF_2TO56 = rRshf_2to56 // Form const 1.100 * 2^(63+56) | |
360 | movl rRshf = 0x43e8000000000000 // 1.10000 2^63 for right shift | |
361 | } | |
362 | ;; | |
363 | ||
364 | { .mfi | |
365 | ldfpd fMIN_DBL_OFLOW_ARG, fMAX_DBL_NORM_ARG = [rAD_TB1],16 | |
366 | fclass.m p10,p0 = f8,0x1e3 // Test for x=inf, nan, NaT | |
367 | nop.i 0 | |
368 | } | |
369 | { .mfb | |
370 | setf.exp f2TOM56 = rExp_2tom56 // form 2^-56 for scaling Nfloat | |
371 | nop.f 0 | |
372 | (p6) br.cond.spnt SINH_UNORM // Branch if x=unorm | |
373 | } | |
374 | ;; | |
375 | ||
376 | SINH_COMMON: | |
377 | { .mfi | |
378 | ldfe fLn2_by_128_hi = [rAD_TB1],16 | |
379 | nop.f 0 | |
380 | nop.i 0 | |
381 | } | |
382 | { .mfb | |
383 | setf.d fRSHF = rRshf // Form right shift const 1.100 * 2^63 | |
384 | nop.f 0 | |
385 | (p8) br.ret.spnt b0 // Exit for x=0, result=x | |
386 | } | |
387 | ;; | |
388 | ||
389 | { .mfi | |
390 | ldfe fLn2_by_128_lo = [rAD_TB1],16 | |
391 | nop.f 0 | |
392 | nop.i 0 | |
393 | } | |
394 | { .mfb | |
395 | and rExp_x = rExp_mask, rSignexp_x // Biased exponent of x | |
396 | (p10) fma.d.s0 f8 = f8,f1,f0 // Result if x=inf, nan, NaT | |
397 | (p10) br.ret.spnt b0 // quick exit for x=inf, nan, NaT | |
398 | } | |
399 | ;; | |
400 | ||
401 | // After that last load rAD_TB1 points to the beginning of table 1 | |
402 | { .mfi | |
403 | nop.m 0 | |
404 | fcmp.eq.s0 p6,p0 = f8, f0 // Dummy to set D | |
405 | sub rExp_x = rExp_x, rExp_bias // True exponent of x | |
406 | } | |
407 | ;; | |
408 | ||
409 | { .mfi | |
410 | nop.m 0 | |
411 | fmerge.s fAbsX = f0, fNormX // Form |x| | |
412 | nop.i 0 | |
413 | } | |
414 | { .mfb | |
415 | cmp.gt p7, p0 = -2, rExp_x // Test |x| < 2^(-2) | |
416 | fma.s1 fXsq = fNormX, fNormX, f0 // x*x for small path | |
417 | (p7) br.cond.spnt SINH_SMALL // Branch if 0 < |x| < 2^-2 | |
418 | } | |
419 | ;; | |
420 | ||
421 | // W = X * Inv_log2_by_128 | |
422 | // By adding 1.10...0*2^63 we shift and get round_int(W) in significand. | |
423 | // We actually add 1.10...0*2^56 to X * Inv_log2 to do the same thing. | |
424 | ||
425 | { .mfi | |
426 | add rAD_P = 0x180, rAD_TB1 | |
427 | fma.s1 fW_2TO56_RSH = fNormX, fINV_LN2_2TO63, fRSHF_2TO56 | |
428 | add rAD_TB2 = 0x100, rAD_TB1 | |
429 | } | |
430 | ;; | |
431 | ||
432 | // Divide arguments into the following categories: | |
433 | // Certain Safe - 0.25 <= |x| <= MAX_DBL_NORM_ARG | |
434 | // Possible Overflow p14 - MAX_DBL_NORM_ARG < |x| < MIN_DBL_OFLOW_ARG | |
435 | // Certain Overflow p15 - MIN_DBL_OFLOW_ARG <= |x| < +inf | |
436 | // | |
437 | // If the input is really a double arg, then there will never be | |
438 | // "Possible Overflow" arguments. | |
439 | // | |
440 | ||
441 | { .mfi | |
442 | ldfpd fP5, fP4 = [rAD_P] ,16 | |
443 | fcmp.ge.s1 p15,p14 = fAbsX,fMIN_DBL_OFLOW_ARG | |
444 | nop.i 0 | |
445 | } | |
446 | ;; | |
447 | ||
448 | // Nfloat = round_int(W) | |
449 | // The signficand of fW_2TO56_RSH contains the rounded integer part of W, | |
450 | // as a twos complement number in the lower bits (that is, it may be negative). | |
451 | // That twos complement number (called N) is put into rN. | |
452 | ||
453 | // Since fW_2TO56_RSH is scaled by 2^56, it must be multiplied by 2^-56 | |
454 | // before the shift constant 1.10000 * 2^63 is subtracted to yield fNfloat. | |
455 | // Thus, fNfloat contains the floating point version of N | |
456 | ||
457 | { .mfi | |
458 | ldfpd fP3, fP2 = [rAD_P] | |
459 | (p14) fcmp.gt.unc.s1 p14,p0 = fAbsX,fMAX_DBL_NORM_ARG | |
460 | nop.i 0 | |
461 | } | |
462 | { .mfb | |
463 | nop.m 0 | |
464 | fms.s1 fNfloat = fW_2TO56_RSH, f2TOM56, fRSHF | |
465 | (p15) br.cond.spnt SINH_CERTAIN_OVERFLOW | |
466 | } | |
467 | ;; | |
468 | ||
469 | { .mfi | |
470 | getf.sig rN = fW_2TO56_RSH | |
471 | nop.f 0 | |
472 | mov rExp_bias_minus_1 = 0xfffe | |
473 | } | |
474 | ;; | |
475 | ||
476 | // rIndex_1 has index_1 | |
477 | // rIndex_2_16 has index_2 * 16 | |
478 | // rBiased_M has M | |
479 | ||
480 | // rM has true M | |
481 | // r = x - Nfloat * ln2_by_128_hi | |
482 | // f = 1 - Nfloat * ln2_by_128_lo | |
483 | { .mfi | |
484 | and rIndex_1 = 0x0f, rN | |
485 | fnma.s1 fR = fNfloat, fLn2_by_128_hi, fNormX | |
486 | shr rM = rN, 0x7 | |
487 | } | |
488 | { .mfi | |
489 | and rIndex_2_16 = 0x70, rN | |
490 | fnma.s1 fF = fNfloat, fLn2_by_128_lo, f1 | |
491 | sub rN_neg = r0, rN | |
492 | } | |
493 | ;; | |
494 | ||
495 | { .mmi | |
496 | and rIndex_1_neg = 0x0f, rN_neg | |
497 | add rBiased_M = rExp_bias_minus_1, rM | |
498 | shr rM_neg = rN_neg, 0x7 | |
499 | } | |
500 | { .mmi | |
501 | and rIndex_2_16_neg = 0x70, rN_neg | |
502 | add rAD_T2 = rAD_TB2, rIndex_2_16 | |
503 | shladd rAD_T1 = rIndex_1, 4, rAD_TB1 | |
504 | } | |
505 | ;; | |
506 | ||
507 | // rAD_T1 has address of T1 | |
508 | // rAD_T2 has address if T2 | |
509 | ||
510 | { .mmi | |
511 | setf.exp f2M = rBiased_M | |
512 | ldfe fT2 = [rAD_T2] | |
513 | nop.i 0 | |
514 | } | |
515 | { .mmi | |
516 | add rBiased_M_neg = rExp_bias_minus_1, rM_neg | |
517 | add rAD_T2_neg = rAD_TB2, rIndex_2_16_neg | |
518 | shladd rAD_T1_neg = rIndex_1_neg, 4, rAD_TB1 | |
519 | } | |
520 | ;; | |
521 | ||
522 | // Create Scale = 2^M | |
523 | // Load T1 and T2 | |
524 | { .mmi | |
525 | ldfe fT1 = [rAD_T1] | |
526 | nop.m 0 | |
527 | nop.i 0 | |
528 | } | |
529 | { .mmf | |
530 | setf.exp f2M_neg = rBiased_M_neg | |
531 | ldfe fT2_neg = [rAD_T2_neg] | |
532 | fma.s1 fF_neg = fNfloat, fLn2_by_128_lo, f1 | |
533 | } | |
534 | ;; | |
535 | ||
536 | { .mfi | |
537 | nop.m 0 | |
538 | fma.s1 fRsq = fR, fR, f0 | |
539 | nop.i 0 | |
540 | } | |
541 | { .mfi | |
542 | ldfe fT1_neg = [rAD_T1_neg] | |
543 | fma.s1 fP54 = fR, fP5, fP4 | |
544 | nop.i 0 | |
545 | } | |
546 | ;; | |
547 | ||
548 | { .mfi | |
549 | nop.m 0 | |
550 | fma.s1 fP32 = fR, fP3, fP2 | |
551 | nop.i 0 | |
552 | } | |
553 | { .mfi | |
554 | nop.m 0 | |
555 | fnma.s1 fP54_neg = fR, fP5, fP4 | |
556 | nop.i 0 | |
557 | } | |
558 | ;; | |
559 | ||
560 | { .mfi | |
561 | nop.m 0 | |
562 | fnma.s1 fP32_neg = fR, fP3, fP2 | |
563 | nop.i 0 | |
564 | } | |
565 | ;; | |
566 | ||
567 | { .mfi | |
568 | nop.m 0 | |
569 | fma.s1 fP5432 = fRsq, fP54, fP32 | |
570 | nop.i 0 | |
571 | } | |
572 | { .mfi | |
573 | nop.m 0 | |
574 | fma.s1 fS2 = fF,fT2,f0 | |
575 | nop.i 0 | |
576 | } | |
577 | ;; | |
578 | ||
579 | { .mfi | |
580 | nop.m 0 | |
581 | fma.s1 fS1 = f2M,fT1,f0 | |
582 | nop.i 0 | |
583 | } | |
584 | { .mfi | |
585 | nop.m 0 | |
586 | fma.s1 fP5432_neg = fRsq, fP54_neg, fP32_neg | |
587 | nop.i 0 | |
588 | } | |
589 | ;; | |
590 | ||
591 | { .mfi | |
592 | nop.m 0 | |
593 | fma.s1 fS1_neg = f2M_neg,fT1_neg,f0 | |
594 | nop.i 0 | |
595 | } | |
596 | { .mfi | |
597 | nop.m 0 | |
598 | fma.s1 fS2_neg = fF_neg,fT2_neg,f0 | |
599 | nop.i 0 | |
600 | } | |
601 | ;; | |
602 | ||
603 | { .mfi | |
604 | nop.m 0 | |
605 | fma.s1 fP = fRsq, fP5432, fR | |
606 | nop.i 0 | |
607 | } | |
608 | { .mfi | |
609 | nop.m 0 | |
610 | fma.s1 fS = fS1,fS2,f0 | |
611 | nop.i 0 | |
612 | } | |
613 | ;; | |
614 | ||
615 | { .mfi | |
616 | nop.m 0 | |
617 | fms.s1 fP_neg = fRsq, fP5432_neg, fR | |
618 | nop.i 0 | |
619 | } | |
620 | { .mfi | |
621 | nop.m 0 | |
622 | fma.s1 fS_neg = fS1_neg,fS2_neg,f0 | |
623 | nop.i 0 | |
624 | } | |
625 | ;; | |
626 | ||
627 | { .mfb | |
628 | nop.m 0 | |
629 | fmpy.s0 fTmp = fLn2_by_128_lo, fLn2_by_128_lo // Force inexact | |
630 | (p14) br.cond.spnt SINH_POSSIBLE_OVERFLOW | |
631 | } | |
632 | ;; | |
633 | ||
634 | { .mfi | |
635 | nop.m 0 | |
636 | fma.s1 fExp = fS, fP, fS | |
637 | nop.i 0 | |
638 | } | |
639 | { .mfi | |
640 | nop.m 0 | |
641 | fma.s1 fExp_neg = fS_neg, fP_neg, fS_neg | |
642 | nop.i 0 | |
643 | } | |
644 | ;; | |
645 | ||
646 | { .mfb | |
647 | nop.m 0 | |
648 | fms.d.s0 f8 = fExp, f1, fExp_neg | |
649 | br.ret.sptk b0 // Normal path exit | |
650 | } | |
651 | ;; | |
652 | ||
653 | // Here if 0 < |x| < 0.25 | |
654 | SINH_SMALL: | |
655 | { .mfi | |
656 | add rAD_T1 = 0x1a0, rAD_TB1 | |
657 | fcmp.lt.s1 p7, p8 = fNormX, f0 // Test sign of x | |
658 | cmp.gt p6, p0 = -60, rExp_x // Test |x| < 2^(-60) | |
659 | } | |
660 | { .mfi | |
661 | add rAD_T2 = 0x1d0, rAD_TB1 | |
662 | nop.f 0 | |
663 | nop.i 0 | |
664 | } | |
665 | ;; | |
666 | ||
667 | { .mmb | |
668 | ldfe fA6 = [rAD_T1],16 | |
669 | ldfe fA5 = [rAD_T2],16 | |
670 | (p6) br.cond.spnt SINH_VERY_SMALL // Branch if |x| < 2^(-60) | |
671 | } | |
672 | ;; | |
673 | ||
674 | { .mmi | |
675 | ldfe fA4 = [rAD_T1],16 | |
676 | ldfe fA3 = [rAD_T2],16 | |
677 | nop.i 0 | |
678 | } | |
679 | ;; | |
680 | ||
681 | { .mmi | |
682 | ldfe fA2 = [rAD_T1] | |
683 | ldfe fA1 = [rAD_T2] | |
684 | nop.i 0 | |
685 | } | |
686 | ;; | |
687 | ||
688 | { .mfi | |
689 | nop.m 0 | |
690 | fma.s1 fX3 = fNormX, fXsq, f0 | |
691 | nop.i 0 | |
692 | } | |
693 | { .mfi | |
694 | nop.m 0 | |
695 | fma.s1 fX4 = fXsq, fXsq, f0 | |
696 | nop.i 0 | |
697 | } | |
698 | ;; | |
699 | ||
700 | { .mfi | |
701 | nop.m 0 | |
702 | fma.s1 fA65 = fXsq, fA6, fA5 | |
703 | nop.i 0 | |
704 | } | |
705 | { .mfi | |
706 | nop.m 0 | |
707 | fma.s1 fA43 = fXsq, fA4, fA3 | |
708 | nop.i 0 | |
709 | } | |
710 | ;; | |
711 | ||
712 | { .mfi | |
713 | nop.m 0 | |
714 | fma.s1 fA21 = fXsq, fA2, fA1 | |
715 | nop.i 0 | |
716 | } | |
717 | ;; | |
718 | ||
719 | { .mfi | |
720 | nop.m 0 | |
721 | fma.s1 fA6543 = fX4, fA65, fA43 | |
722 | nop.i 0 | |
723 | } | |
724 | ;; | |
725 | ||
726 | { .mfi | |
727 | nop.m 0 | |
728 | fma.s1 fA654321 = fX4, fA6543, fA21 | |
729 | nop.i 0 | |
730 | } | |
731 | ;; | |
732 | ||
733 | // Dummy multiply to generate inexact | |
734 | { .mfi | |
735 | nop.m 0 | |
736 | fmpy.s0 fTmp = fA6, fA6 | |
737 | nop.i 0 | |
738 | } | |
739 | { .mfb | |
740 | nop.m 0 | |
741 | fma.d.s0 f8 = fA654321, fX3, fNormX | |
742 | br.ret.sptk b0 // Exit if 2^-60 < |x| < 0.25 | |
743 | } | |
744 | ;; | |
745 | ||
746 | SINH_VERY_SMALL: | |
747 | // Here if 0 < |x| < 2^-60 | |
748 | // Compute result by x + sgn(x)*x^2 to get properly rounded result | |
749 | .pred.rel "mutex",p7,p8 | |
750 | { .mfi | |
751 | nop.m 0 | |
752 | (p7) fnma.d.s0 f8 = fNormX, fNormX, fNormX // If x<0 result ~ x-x^2 | |
753 | nop.i 0 | |
754 | } | |
755 | { .mfb | |
756 | nop.m 0 | |
757 | (p8) fma.d.s0 f8 = fNormX, fNormX, fNormX // If x>0 result ~ x+x^2 | |
758 | br.ret.sptk b0 // Exit if |x| < 2^-60 | |
759 | } | |
760 | ;; | |
761 | ||
762 | ||
763 | SINH_POSSIBLE_OVERFLOW: | |
764 | ||
765 | // Here if fMAX_DBL_NORM_ARG < |x| < fMIN_DBL_OFLOW_ARG | |
766 | // This cannot happen if input is a double, only if input higher precision. | |
767 | // Overflow is a possibility, not a certainty. | |
768 | ||
769 | // Recompute result using status field 2 with user's rounding mode, | |
770 | // and wre set. If result is larger than largest double, then we have | |
771 | // overflow | |
772 | ||
773 | { .mfi | |
774 | mov rGt_ln = 0x103ff // Exponent for largest dbl + 1 ulp | |
775 | fsetc.s2 0x7F,0x42 // Get user's round mode, set wre | |
776 | nop.i 0 | |
777 | } | |
778 | ;; | |
779 | ||
780 | { .mfi | |
781 | setf.exp fGt_pln = rGt_ln // Create largest double + 1 ulp | |
782 | fma.d.s2 fWre_urm_f8 = fS, fP, fS // Result with wre set | |
783 | nop.i 0 | |
784 | } | |
785 | ;; | |
786 | ||
787 | { .mfi | |
788 | nop.m 0 | |
789 | fsetc.s2 0x7F,0x40 // Turn off wre in sf2 | |
790 | nop.i 0 | |
791 | } | |
792 | ;; | |
793 | ||
794 | { .mfi | |
795 | nop.m 0 | |
796 | fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow | |
797 | nop.i 0 | |
798 | } | |
799 | ;; | |
800 | ||
801 | { .mfb | |
802 | nop.m 0 | |
803 | nop.f 0 | |
804 | (p6) br.cond.spnt SINH_CERTAIN_OVERFLOW // Branch if overflow | |
805 | } | |
806 | ;; | |
807 | ||
808 | { .mfb | |
809 | nop.m 0 | |
810 | fma.d.s0 f8 = fS, fP, fS | |
811 | br.ret.sptk b0 // Exit if really no overflow | |
812 | } | |
813 | ;; | |
814 | ||
815 | SINH_CERTAIN_OVERFLOW: | |
816 | { .mfi | |
817 | sub rTmp = rExp_mask, r0, 1 | |
818 | fcmp.lt.s1 p6, p7 = fNormX, f0 // Test for x < 0 | |
819 | nop.i 0 | |
820 | } | |
821 | ;; | |
822 | ||
823 | { .mmf | |
824 | alloc r32=ar.pfs,1,4,4,0 | |
825 | setf.exp fTmp = rTmp | |
826 | fmerge.s FR_X = f8,f8 | |
827 | } | |
828 | ;; | |
829 | ||
830 | { .mfi | |
831 | mov GR_Parameter_TAG = 127 | |
832 | (p6) fnma.d.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and -INF result | |
833 | nop.i 0 | |
834 | } | |
835 | { .mfb | |
836 | nop.m 0 | |
837 | (p7) fma.d.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and +INF result | |
838 | br.cond.sptk __libm_error_region | |
839 | } | |
840 | ;; | |
841 | ||
842 | // Here if x unorm | |
843 | SINH_UNORM: | |
844 | { .mfb | |
845 | getf.exp rSignexp_x = fNormX // Must recompute if x unorm | |
846 | fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag | |
847 | br.cond.sptk SINH_COMMON | |
848 | } | |
849 | ;; | |
850 | ||
851 | GLOBAL_IEEE754_END(sinh) | |
0609ec0a | 852 | libm_alias_double_other (__sinh, sinh) |
d5efd131 MF |
853 | |
854 | ||
855 | LOCAL_LIBM_ENTRY(__libm_error_region) | |
856 | .prologue | |
857 | { .mfi | |
858 | add GR_Parameter_Y=-32,sp // Parameter 2 value | |
859 | nop.f 0 | |
860 | .save ar.pfs,GR_SAVE_PFS | |
861 | mov GR_SAVE_PFS=ar.pfs // Save ar.pfs | |
862 | } | |
863 | { .mfi | |
864 | .fframe 64 | |
865 | add sp=-64,sp // Create new stack | |
866 | nop.f 0 | |
867 | mov GR_SAVE_GP=gp // Save gp | |
868 | };; | |
869 | { .mmi | |
870 | stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack | |
871 | add GR_Parameter_X = 16,sp // Parameter 1 address | |
872 | .save b0, GR_SAVE_B0 | |
873 | mov GR_SAVE_B0=b0 // Save b0 | |
874 | };; | |
875 | .body | |
876 | { .mib | |
877 | stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack | |
878 | add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address | |
879 | nop.b 0 | |
880 | } | |
881 | { .mib | |
882 | stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack | |
883 | add GR_Parameter_Y = -16,GR_Parameter_Y | |
884 | br.call.sptk b0=__libm_error_support# // Call error handling function | |
885 | };; | |
886 | { .mmi | |
887 | add GR_Parameter_RESULT = 48,sp | |
888 | nop.m 0 | |
889 | nop.i 0 | |
890 | };; | |
891 | { .mmi | |
892 | ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack | |
893 | .restore sp | |
894 | add sp = 64,sp // Restore stack pointer | |
895 | mov b0 = GR_SAVE_B0 // Restore return address | |
896 | };; | |
897 | { .mib | |
898 | mov gp = GR_SAVE_GP // Restore gp | |
899 | mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs | |
900 | br.ret.sptk b0 // Return | |
901 | };; | |
902 | ||
903 | LOCAL_LIBM_END(__libm_error_region) | |
904 | .type __libm_error_support#,@function | |
905 | .global __libm_error_support# |