]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/ia64/fpu/e_atanhl.S
2.5-18.1
[thirdparty/glibc.git] / sysdeps / ia64 / fpu / e_atanhl.S
1 .file "atanhl.s"
2
3
4 // Copyright (c) 2001 - 2003, Intel Corporation
5 // All rights reserved.
6 //
7 // Contributed 2001 by the Intel Numerics Group, Intel Corporation
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
11 // met:
12 //
13 // * Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
15 //
16 // * Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
19 //
20 // * The name of Intel Corporation may not be used to endorse or promote
21 // products derived from this software without specific prior written
22 // permission.
23
24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT
26 // LIMITED TO,THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,
29 // EXEMPLARY,OR CONSEQUENTIAL DAMAGES (INCLUDING,BUT NOT LIMITED TO,
30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,DATA,OR
31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
32 // OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY OR TORT (INCLUDING
33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34 // SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 //
36 // Intel Corporation is the author of this code,and requests that all
37 // problem reports or change requests be submitted to it directly at
38 // http://www.intel.com/software/products/opensource/libraries/num.htm.
39 //
40 //*********************************************************************
41 //
42 // History:
43 // 09/10/01 Initial version
44 // 12/11/01 Corrected .restore syntax
45 // 05/20/02 Cleaned up namespace and sf0 syntax
46 // 02/10/03 Reordered header: .section, .global, .proc, .align;
47 // used data8 for long double table values
48 //
49 //*********************************************************************
50 //
51 //*********************************************************************
52 //
53 // Function: atanhl(x) computes the principle value of the inverse
54 // hyperbolic tangent of x.
55 //
56 //*********************************************************************
57 //
58 // Resources Used:
59 //
60 // Floating-Point Registers: f8 (Input and Return Value)
61 // f33-f73
62 //
63 // General Purpose Registers:
64 // r32-r52
65 // r49-r52 (Used to pass arguments to error handling routine)
66 //
67 // Predicate Registers: p6-p15
68 //
69 //*********************************************************************
70 //
71 // IEEE Special Conditions:
72 //
73 // atanhl(inf) = QNaN
74 // atanhl(-inf) = QNaN
75 // atanhl(+/-0) = +/-0
76 // atanhl(1) = +inf
77 // atanhl(-1) = -inf
78 // atanhl(|x|>1) = QNaN
79 // atanhl(SNaN) = QNaN
80 // atanhl(QNaN) = QNaN
81 //
82 //*********************************************************************
83 //
84 // Overview
85 //
86 // The method consists of two cases.
87 //
88 // If |x| < 1/32 use case atanhl_near_zero;
89 // else use case atanhl_regular;
90 //
91 // Case atanhl_near_zero:
92 //
93 // atanhl(x) can be approximated by the Taylor series expansion
94 // up to order 17.
95 //
96 // Case atanhl_regular:
97 //
98 // Here we use formula atanhl(x) = sign(x)*log1pl(2*|x|/(1-|x|))/2 and
99 // calculation is subdivided into two stages. The first stage is
100 // calculating of X = 2*|x|/(1-|x|). The second one is calculating of
101 // sign(x)*log1pl(X)/2. To obtain required accuracy we use precise division
102 // algorythm output of which is a pair of two extended precision values those
103 // approximate result of division with accuracy higher than working
104 // precision. This pair is passed to modified log1pl function.
105 //
106 //
107 // 1. calculating of X = 2*|x|/(1-|x|)
108 // ( based on Peter Markstein's "IA-64 and Elementary Functions" book )
109 // ********************************************************************
110 //
111 // a = 2*|x|
112 // b = 1 - |x|
113 // b_lo = |x| - (1 - b)
114 //
115 // y = frcpa(b) initial approximation of 1/b
116 // q = a*y initial approximation of a/b
117 //
118 // e = 1 - b*y
119 // e2 = e + e^2
120 // e1 = e^2
121 // y1 = y + y*e2 = y + y*(e+e^2)
122 //
123 // e3 = e + e1^2
124 // y2 = y + y1*e3 = y + y*(e+e^2+..+e^6)
125 //
126 // r = a - b*q
127 // e = 1 - b*y2
128 // X = q + r*y2 high part of a/b
129 //
130 // y3 = y2 + y2*e4
131 // r1 = a - b*X
132 // r1 = r1 - b_lo*X
133 // X_lo = r1*y3 low part of a/b
134 //
135 // 2. special log1p algorithm overview
136 // ***********************************
137 //
138 // Here we use a table lookup method. The basic idea is that in
139 // order to compute logl(Arg) = log1pl (Arg-1) for an argument Arg in [1,2),
140 // we construct a value G such that G*Arg is close to 1 and that
141 // logl(1/G) is obtainable easily from a table of values calculated
142 // beforehand. Thus
143 //
144 // logl(Arg) = logl(1/G) + logl(G*Arg)
145 // = logl(1/G) + logl(1 + (G*Arg - 1))
146 //
147 // Because |G*Arg - 1| is small, the second term on the right hand
148 // side can be approximated by a short polynomial. We elaborate
149 // this method in several steps.
150 //
151 // Step 0: Initialization
152 // ------
153 // We need to calculate logl(X + X_lo + 1). Obtain N, S_hi such that
154 //
155 // X + X_lo + 1 = 2^N * ( S_hi + S_lo ) exactly
156 //
157 // where S_hi in [1,2) and S_lo is a correction to S_hi in the sense
158 // that |S_lo| <= ulp(S_hi).
159 //
160 // For the special version of log1p we add X_lo to S_lo (S_lo = S_lo + X_lo)
161 // !-----------------------------------------------------------------------!
162 //
163 // Step 1: Argument Reduction
164 // ------
165 // Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
166 //
167 // G := G_1 * G_2 * G_3
168 // r := (G * S_hi - 1) + G * S_lo
169 //
170 // These G_j's have the property that the product is exactly
171 // representable and that |r| < 2^(-12) as a result.
172 //
173 // Step 2: Approximation
174 // ------
175 // logl(1 + r) is approximated by a short polynomial poly(r).
176 //
177 // Step 3: Reconstruction
178 // ------
179 // Finally, log1pl(X + X_lo) = logl(X + X_lo + 1) is given by
180 //
181 // logl(X + X_lo + 1) = logl(2^N * (S_hi + S_lo))
182 // ~=~ N*logl(2) + logl(1/G) + logl(1 + r)
183 // ~=~ N*logl(2) + logl(1/G) + poly(r).
184 //
185 // For detailed description see log1p1 function, regular path.
186 //
187 //*********************************************************************
188
189 RODATA
190 .align 64
191
192 // ************* DO NOT CHANGE THE ORDER OF THESE TABLES *************
193
194 LOCAL_OBJECT_START(Constants_TaylorSeries)
195 data8 0xF0F0F0F0F0F0F0F1,0x00003FFA // C17
196 data8 0x8888888888888889,0x00003FFB // C15
197 data8 0x9D89D89D89D89D8A,0x00003FFB // C13
198 data8 0xBA2E8BA2E8BA2E8C,0x00003FFB // C11
199 data8 0xE38E38E38E38E38E,0x00003FFB // C9
200 data8 0x9249249249249249,0x00003FFC // C7
201 data8 0xCCCCCCCCCCCCCCCD,0x00003FFC // C5
202 data8 0xAAAAAAAAAAAAAAAA,0x00003FFD // C3
203 data4 0x3f000000 // 1/2
204 data4 0x00000000 // pad
205 data4 0x00000000
206 data4 0x00000000
207 LOCAL_OBJECT_END(Constants_TaylorSeries)
208
209 LOCAL_OBJECT_START(Constants_Q)
210 data4 0x00000000,0xB1721800,0x00003FFE,0x00000000 // log2_hi
211 data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000 // log2_lo
212 data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000 // Q4
213 data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000 // Q3
214 data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000 // Q2
215 data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000 // Q1
216 LOCAL_OBJECT_END(Constants_Q)
217
218
219 // Z1 - 16 bit fixed
220 LOCAL_OBJECT_START(Constants_Z_1)
221 data4 0x00008000
222 data4 0x00007879
223 data4 0x000071C8
224 data4 0x00006BCB
225 data4 0x00006667
226 data4 0x00006187
227 data4 0x00005D18
228 data4 0x0000590C
229 data4 0x00005556
230 data4 0x000051EC
231 data4 0x00004EC5
232 data4 0x00004BDB
233 data4 0x00004925
234 data4 0x0000469F
235 data4 0x00004445
236 data4 0x00004211
237 LOCAL_OBJECT_END(Constants_Z_1)
238
239 // G1 and H1 - IEEE single and h1 - IEEE double
240 LOCAL_OBJECT_START(Constants_G_H_h1)
241 data4 0x3F800000,0x00000000
242 data8 0x0000000000000000
243 data4 0x3F70F0F0,0x3D785196
244 data8 0x3DA163A6617D741C
245 data4 0x3F638E38,0x3DF13843
246 data8 0x3E2C55E6CBD3D5BB
247 data4 0x3F579430,0x3E2FF9A0
248 data8 0xBE3EB0BFD86EA5E7
249 data4 0x3F4CCCC8,0x3E647FD6
250 data8 0x3E2E6A8C86B12760
251 data4 0x3F430C30,0x3E8B3AE7
252 data8 0x3E47574C5C0739BA
253 data4 0x3F3A2E88,0x3EA30C68
254 data8 0x3E20E30F13E8AF2F
255 data4 0x3F321640,0x3EB9CEC8
256 data8 0xBE42885BF2C630BD
257 data4 0x3F2AAAA8,0x3ECF9927
258 data8 0x3E497F3497E577C6
259 data4 0x3F23D708,0x3EE47FC5
260 data8 0x3E3E6A6EA6B0A5AB
261 data4 0x3F1D89D8,0x3EF8947D
262 data8 0xBDF43E3CD328D9BE
263 data4 0x3F17B420,0x3F05F3A1
264 data8 0x3E4094C30ADB090A
265 data4 0x3F124920,0x3F0F4303
266 data8 0xBE28FBB2FC1FE510
267 data4 0x3F0D3DC8,0x3F183EBF
268 data8 0x3E3A789510FDE3FA
269 data4 0x3F088888,0x3F20EC80
270 data8 0x3E508CE57CC8C98F
271 data4 0x3F042108,0x3F29516A
272 data8 0xBE534874A223106C
273 LOCAL_OBJECT_END(Constants_G_H_h1)
274
275 // Z2 - 16 bit fixed
276 LOCAL_OBJECT_START(Constants_Z_2)
277 data4 0x00008000
278 data4 0x00007F81
279 data4 0x00007F02
280 data4 0x00007E85
281 data4 0x00007E08
282 data4 0x00007D8D
283 data4 0x00007D12
284 data4 0x00007C98
285 data4 0x00007C20
286 data4 0x00007BA8
287 data4 0x00007B31
288 data4 0x00007ABB
289 data4 0x00007A45
290 data4 0x000079D1
291 data4 0x0000795D
292 data4 0x000078EB
293 LOCAL_OBJECT_END(Constants_Z_2)
294
295 // G2 and H2 - IEEE single and h2 - IEEE double
296 LOCAL_OBJECT_START(Constants_G_H_h2)
297 data4 0x3F800000,0x00000000
298 data8 0x0000000000000000
299 data4 0x3F7F00F8,0x3B7F875D
300 data8 0x3DB5A11622C42273
301 data4 0x3F7E03F8,0x3BFF015B
302 data8 0x3DE620CF21F86ED3
303 data4 0x3F7D08E0,0x3C3EE393
304 data8 0xBDAFA07E484F34ED
305 data4 0x3F7C0FC0,0x3C7E0586
306 data8 0xBDFE07F03860BCF6
307 data4 0x3F7B1880,0x3C9E75D2
308 data8 0x3DEA370FA78093D6
309 data4 0x3F7A2328,0x3CBDC97A
310 data8 0x3DFF579172A753D0
311 data4 0x3F792FB0,0x3CDCFE47
312 data8 0x3DFEBE6CA7EF896B
313 data4 0x3F783E08,0x3CFC15D0
314 data8 0x3E0CF156409ECB43
315 data4 0x3F774E38,0x3D0D874D
316 data8 0xBE0B6F97FFEF71DF
317 data4 0x3F766038,0x3D1CF49B
318 data8 0xBE0804835D59EEE8
319 data4 0x3F757400,0x3D2C531D
320 data8 0x3E1F91E9A9192A74
321 data4 0x3F748988,0x3D3BA322
322 data8 0xBE139A06BF72A8CD
323 data4 0x3F73A0D0,0x3D4AE46F
324 data8 0x3E1D9202F8FBA6CF
325 data4 0x3F72B9D0,0x3D5A1756
326 data8 0xBE1DCCC4BA796223
327 data4 0x3F71D488,0x3D693B9D
328 data8 0xBE049391B6B7C239
329 LOCAL_OBJECT_END(Constants_G_H_h2)
330
331 // G3 and H3 - IEEE single and h3 - IEEE double
332 LOCAL_OBJECT_START(Constants_G_H_h3)
333 data4 0x3F7FFC00,0x38800100
334 data8 0x3D355595562224CD
335 data4 0x3F7FF400,0x39400480
336 data8 0x3D8200A206136FF6
337 data4 0x3F7FEC00,0x39A00640
338 data8 0x3DA4D68DE8DE9AF0
339 data4 0x3F7FE400,0x39E00C41
340 data8 0xBD8B4291B10238DC
341 data4 0x3F7FDC00,0x3A100A21
342 data8 0xBD89CCB83B1952CA
343 data4 0x3F7FD400,0x3A300F22
344 data8 0xBDB107071DC46826
345 data4 0x3F7FCC08,0x3A4FF51C
346 data8 0x3DB6FCB9F43307DB
347 data4 0x3F7FC408,0x3A6FFC1D
348 data8 0xBD9B7C4762DC7872
349 data4 0x3F7FBC10,0x3A87F20B
350 data8 0xBDC3725E3F89154A
351 data4 0x3F7FB410,0x3A97F68B
352 data8 0xBD93519D62B9D392
353 data4 0x3F7FAC18,0x3AA7EB86
354 data8 0x3DC184410F21BD9D
355 data4 0x3F7FA420,0x3AB7E101
356 data8 0xBDA64B952245E0A6
357 data4 0x3F7F9C20,0x3AC7E701
358 data8 0x3DB4B0ECAABB34B8
359 data4 0x3F7F9428,0x3AD7DD7B
360 data8 0x3D9923376DC40A7E
361 data4 0x3F7F8C30,0x3AE7D474
362 data8 0x3DC6E17B4F2083D3
363 data4 0x3F7F8438,0x3AF7CBED
364 data8 0x3DAE314B811D4394
365 data4 0x3F7F7C40,0x3B03E1F3
366 data8 0xBDD46F21B08F2DB1
367 data4 0x3F7F7448,0x3B0BDE2F
368 data8 0xBDDC30A46D34522B
369 data4 0x3F7F6C50,0x3B13DAAA
370 data8 0x3DCB0070B1F473DB
371 data4 0x3F7F6458,0x3B1BD766
372 data8 0xBDD65DDC6AD282FD
373 data4 0x3F7F5C68,0x3B23CC5C
374 data8 0xBDCDAB83F153761A
375 data4 0x3F7F5470,0x3B2BC997
376 data8 0xBDDADA40341D0F8F
377 data4 0x3F7F4C78,0x3B33C711
378 data8 0x3DCD1BD7EBC394E8
379 data4 0x3F7F4488,0x3B3BBCC6
380 data8 0xBDC3532B52E3E695
381 data4 0x3F7F3C90,0x3B43BAC0
382 data8 0xBDA3961EE846B3DE
383 data4 0x3F7F34A0,0x3B4BB0F4
384 data8 0xBDDADF06785778D4
385 data4 0x3F7F2CA8,0x3B53AF6D
386 data8 0x3DCC3ED1E55CE212
387 data4 0x3F7F24B8,0x3B5BA620
388 data8 0xBDBA31039E382C15
389 data4 0x3F7F1CC8,0x3B639D12
390 data8 0x3D635A0B5C5AF197
391 data4 0x3F7F14D8,0x3B6B9444
392 data8 0xBDDCCB1971D34EFC
393 data4 0x3F7F0CE0,0x3B7393BC
394 data8 0x3DC7450252CD7ADA
395 data4 0x3F7F04F0,0x3B7B8B6D
396 data8 0xBDB68F177D7F2A42
397 LOCAL_OBJECT_END(Constants_G_H_h3)
398
399
400
401 // Floating Point Registers
402
403 FR_C17 = f50
404 FR_C15 = f51
405 FR_C13 = f52
406 FR_C11 = f53
407 FR_C9 = f54
408 FR_C7 = f55
409 FR_C5 = f56
410 FR_C3 = f57
411 FR_x2 = f58
412 FR_x3 = f59
413 FR_x4 = f60
414 FR_x8 = f61
415
416 FR_Rcp = f61
417
418 FR_A = f33
419 FR_R1 = f33
420
421 FR_E1 = f34
422 FR_E3 = f34
423 FR_Y2 = f34
424 FR_Y3 = f34
425
426 FR_E2 = f35
427 FR_Y1 = f35
428
429 FR_B = f36
430 FR_Y0 = f37
431 FR_E0 = f38
432 FR_E4 = f39
433 FR_Q0 = f40
434 FR_R0 = f41
435 FR_B_lo = f42
436
437 FR_abs_x = f43
438 FR_Bp = f44
439 FR_Bn = f45
440 FR_Yp = f46
441 FR_Yn = f47
442
443 FR_X = f48
444 FR_BB = f48
445 FR_X_lo = f49
446
447 FR_G = f50
448 FR_Y_hi = f51
449 FR_H = f51
450 FR_h = f52
451 FR_G2 = f53
452 FR_H2 = f54
453 FR_h2 = f55
454 FR_G3 = f56
455 FR_H3 = f57
456 FR_h3 = f58
457
458 FR_Q4 = f59
459 FR_poly_lo = f59
460 FR_Y_lo = f59
461
462 FR_Q3 = f60
463 FR_Q2 = f61
464
465 FR_Q1 = f62
466 FR_poly_hi = f62
467
468 FR_float_N = f63
469
470 FR_AA = f64
471 FR_S_lo = f64
472
473 FR_S_hi = f65
474 FR_r = f65
475
476 FR_log2_hi = f66
477 FR_log2_lo = f67
478 FR_Z = f68
479 FR_2_to_minus_N = f69
480 FR_rcub = f70
481 FR_rsq = f71
482 FR_05r = f72
483 FR_Half = f73
484
485 FR_Arg_X = f50
486 FR_Arg_Y = f0
487 FR_RESULT = f8
488
489
490
491 // General Purpose Registers
492
493 GR_ad_05 = r33
494 GR_Index1 = r34
495 GR_ArgExp = r34
496 GR_Index2 = r35
497 GR_ExpMask = r35
498 GR_NearZeroBound = r36
499 GR_signif = r36
500 GR_X_0 = r37
501 GR_X_1 = r37
502 GR_X_2 = r38
503 GR_Index3 = r38
504 GR_minus_N = r39
505 GR_Z_1 = r40
506 GR_Z_2 = r40
507 GR_N = r41
508 GR_Bias = r42
509 GR_M = r43
510 GR_ad_taylor = r44
511 GR_ad_taylor_2 = r45
512 GR_ad2_tbl_3 = r45
513 GR_ad_tbl_1 = r46
514 GR_ad_tbl_2 = r47
515 GR_ad_tbl_3 = r48
516 GR_ad_q = r49
517 GR_ad_z_1 = r50
518 GR_ad_z_2 = r51
519 GR_ad_z_3 = r52
520
521 //
522 // Added for unwind support
523 //
524 GR_SAVE_PFS = r46
525 GR_SAVE_B0 = r47
526 GR_SAVE_GP = r48
527 GR_Parameter_X = r49
528 GR_Parameter_Y = r50
529 GR_Parameter_RESULT = r51
530 GR_Parameter_TAG = r52
531
532
533
534 .section .text
535 GLOBAL_LIBM_ENTRY(atanhl)
536
537 { .mfi
538 alloc r32 = ar.pfs,0,17,4,0
539 fnma.s1 FR_Bp = f8,f1,f1 // b = 1 - |arg| (for x>0)
540 mov GR_ExpMask = 0x1ffff
541 }
542 { .mfi
543 addl GR_ad_taylor = @ltoff(Constants_TaylorSeries),gp
544 fma.s1 FR_Bn = f8,f1,f1 // b = 1 - |arg| (for x<0)
545 mov GR_NearZeroBound = 0xfffa // biased exp of 1/32
546 };;
547 { .mfi
548 getf.exp GR_ArgExp = f8
549 fcmp.lt.s1 p6,p7 = f8,f0 // is negative?
550 nop.i 0
551 }
552 { .mfi
553 ld8 GR_ad_taylor = [GR_ad_taylor]
554 fmerge.s FR_abs_x = f1,f8
555 nop.i 0
556 };;
557 { .mfi
558 nop.m 0
559 fclass.m p8,p0 = f8,0x1C7 // is arg NaT,Q/SNaN or +/-0 ?
560 nop.i 0
561 }
562 { .mfi
563 nop.m 0
564 fma.s1 FR_x2 = f8,f8,f0
565 nop.i 0
566 };;
567 { .mfi
568 add GR_ad_z_1 = 0x0F0,GR_ad_taylor
569 fclass.m p9,p0 = f8,0x0a // is arg -denormal ?
570 add GR_ad_taylor_2 = 0x010,GR_ad_taylor
571 }
572 { .mfi
573 add GR_ad_05 = 0x080,GR_ad_taylor
574 nop.f 0
575 nop.i 0
576 };;
577 { .mfi
578 ldfe FR_C17 = [GR_ad_taylor],32
579 fclass.m p10,p0 = f8,0x09 // is arg +denormal ?
580 add GR_ad_tbl_1 = 0x040,GR_ad_z_1 // point to Constants_G_H_h1
581 }
582 { .mfb
583 add GR_ad_z_2 = 0x140,GR_ad_z_1 // point to Constants_Z_2
584 (p8) fma.s0 f8 = f8,f1,f0 // NaN or +/-0
585 (p8) br.ret.spnt b0 // exit for Nan or +/-0
586 };;
587 { .mfi
588 ldfe FR_C15 = [GR_ad_taylor_2],32
589 fclass.m p15,p0 = f8,0x23 // is +/-INF ?
590 add GR_ad_tbl_2 = 0x180,GR_ad_z_1 // point to Constants_G_H_h2
591 }
592 { .mfb
593 ldfe FR_C13 = [GR_ad_taylor],32
594 (p9) fnma.s0 f8 = f8,f8,f8 // -denormal
595 (p9) br.ret.spnt b0 // exit for -denormal
596 };;
597 { .mfi
598 ldfe FR_C11 = [GR_ad_taylor_2],32
599 fcmp.eq.s0 p13,p0 = FR_abs_x,f1 // is |arg| = 1?
600 nop.i 0
601 }
602 { .mfb
603 ldfe FR_C9 = [GR_ad_taylor],32
604 (p10) fma.s0 f8 = f8,f8,f8 // +denormal
605 (p10) br.ret.spnt b0 // exit for +denormal
606 };;
607 { .mfi
608 ldfe FR_C7 = [GR_ad_taylor_2],32
609 (p6) frcpa.s1 FR_Yn,p11 = f1,FR_Bn // y = frcpa(b)
610 and GR_ArgExp = GR_ArgExp,GR_ExpMask // biased exponent
611 }
612 { .mfb
613 ldfe FR_C5 = [GR_ad_taylor],32
614 fnma.s1 FR_B = FR_abs_x,f1,f1 // b = 1 - |arg|
615 (p15) br.cond.spnt atanhl_gt_one // |arg| > 1
616 };;
617 { .mfb
618 cmp.gt p14,p0 = GR_NearZeroBound,GR_ArgExp
619 (p7) frcpa.s1 FR_Yp,p12 = f1,FR_Bp // y = frcpa(b)
620 (p13) br.cond.spnt atanhl_eq_one // |arg| = 1/32
621 }
622 { .mfb
623 ldfe FR_C3 = [GR_ad_taylor_2],32
624 fma.s1 FR_A = FR_abs_x,f1,FR_abs_x // a = 2 * |arg|
625 (p14) br.cond.spnt atanhl_near_zero // |arg| < 1/32
626 };;
627 { .mfi
628 nop.m 0
629 fcmp.gt.s0 p8,p0 = FR_abs_x,f1 // is |arg| > 1 ?
630 nop.i 0
631 };;
632 .pred.rel "mutex",p6,p7
633 { .mfi
634 nop.m 0
635 (p6) fnma.s1 FR_B_lo = FR_Bn,f1,f1 // argt = 1 - (1 - |arg|)
636 nop.i 0
637 }
638 { .mfi
639 ldfs FR_Half = [GR_ad_05]
640 (p7) fnma.s1 FR_B_lo = FR_Bp,f1,f1
641 nop.i 0
642 };;
643 { .mfi
644 nop.m 0
645 (p6) fnma.s1 FR_E0 = FR_Yn,FR_Bn,f1 // e = 1-b*y
646 nop.i 0
647 }
648 { .mfb
649 nop.m 0
650 (p6) fma.s1 FR_Y0 = FR_Yn,f1,f0
651 (p8) br.cond.spnt atanhl_gt_one // |arg| > 1
652 };;
653 { .mfi
654 nop.m 0
655 (p7) fnma.s1 FR_E0 = FR_Yp,FR_Bp,f1
656 nop.i 0
657 }
658 { .mfi
659 nop.m 0
660 (p6) fma.s1 FR_Q0 = FR_A,FR_Yn,f0 // q = a*y
661 nop.i 0
662 };;
663 { .mfi
664 nop.m 0
665 (p7) fma.s1 FR_Q0 = FR_A,FR_Yp,f0
666 nop.i 0
667 }
668 { .mfi
669 nop.m 0
670 (p7) fma.s1 FR_Y0 = FR_Yp,f1,f0
671 nop.i 0
672 };;
673 { .mfi
674 nop.m 0
675 fclass.nm p10,p0 = f8,0x1FF // test for unsupported
676 nop.i 0
677 };;
678 { .mfi
679 nop.m 0
680 fma.s1 FR_E2 = FR_E0,FR_E0,FR_E0 // e2 = e+e^2
681 nop.i 0
682 }
683 { .mfi
684 nop.m 0
685 fma.s1 FR_E1 = FR_E0,FR_E0,f0 // e1 = e^2
686 nop.i 0
687 };;
688 { .mfb
689 nop.m 0
690 // Return generated NaN or other value for unsupported values.
691 (p10) fma.s0 f8 = f8, f0, f0
692 (p10) br.ret.spnt b0
693 };;
694 { .mfi
695 nop.m 0
696 fma.s1 FR_Y1 = FR_Y0,FR_E2,FR_Y0 // y1 = y+y*e2
697 nop.i 0
698 }
699 { .mfi
700 nop.m 0
701 fma.s1 FR_E3 = FR_E1,FR_E1,FR_E0 // e3 = e+e1^2
702 nop.i 0
703 };;
704 { .mfi
705 nop.m 0
706 fnma.s1 FR_B_lo = FR_abs_x,f1,FR_B_lo // b_lo = argt-|arg|
707 nop.i 0
708 };;
709 { .mfi
710 nop.m 0
711 fma.s1 FR_Y2 = FR_Y1,FR_E3,FR_Y0 // y2 = y+y1*e3
712 nop.i 0
713 }
714 { .mfi
715 nop.m 0
716 fnma.s1 FR_R0 = FR_B,FR_Q0,FR_A // r = a-b*q
717 nop.i 0
718 };;
719 { .mfi
720 nop.m 0
721 fnma.s1 FR_E4 = FR_B,FR_Y2,f1 // e4 = 1-b*y2
722 nop.i 0
723 }
724 { .mfi
725 nop.m 0
726 fma.s1 FR_X = FR_R0,FR_Y2,FR_Q0 // x = q+r*y2
727 nop.i 0
728 };;
729 { .mfi
730 nop.m 0
731 fma.s1 FR_Z = FR_X,f1,f1 // x+1
732 nop.i 0
733 };;
734 { .mfi
735 nop.m 0
736 (p6) fnma.s1 FR_Half = FR_Half,f1,f0 // sign(arg)/2
737 nop.i 0
738 };;
739 { .mfi
740 nop.m 0
741 fma.s1 FR_Y3 = FR_Y2,FR_E4,FR_Y2 // y3 = y2+y2*e4
742 nop.i 0
743 }
744 { .mfi
745 nop.m 0
746 fnma.s1 FR_R1 = FR_B,FR_X,FR_A // r1 = a-b*x
747 nop.i 0
748 };;
749 { .mfi
750 getf.sig GR_signif = FR_Z // get significand of x+1
751 nop.f 0
752 nop.i 0
753 };;
754
755
756 { .mfi
757 add GR_ad_q = -0x060,GR_ad_z_1
758 nop.f 0
759 extr.u GR_Index1 = GR_signif,59,4 // get high 4 bits of signif
760 }
761 { .mfi
762 add GR_ad_tbl_3 = 0x280,GR_ad_z_1 // point to Constants_G_H_h3
763 nop.f 0
764 nop.i 0
765 };;
766 { .mfi
767 shladd GR_ad_z_1 = GR_Index1,2,GR_ad_z_1 // point to Z_1
768 nop.f 0
769 extr.u GR_X_0 = GR_signif,49,15 // get high 15 bits of significand
770 };;
771 { .mfi
772 ld4 GR_Z_1 = [GR_ad_z_1] // load Z_1
773 fmax.s1 FR_AA = FR_X,f1 // for S_lo,form AA = max(X,1.0)
774 nop.i 0
775 }
776 { .mfi
777 shladd GR_ad_tbl_1 = GR_Index1,4,GR_ad_tbl_1 // point to G_1
778 nop.f 0
779 mov GR_Bias = 0x0FFFF // exponent bias
780 };;
781 { .mfi
782 ldfps FR_G,FR_H = [GR_ad_tbl_1],8 // load G_1,H_1
783 fmerge.se FR_S_hi = f1,FR_Z // form |x+1|
784 nop.i 0
785 };;
786 { .mfi
787 getf.exp GR_N = FR_Z // get N = exponent of x+1
788 nop.f 0
789 nop.i 0
790 }
791 { .mfi
792 ldfd FR_h = [GR_ad_tbl_1] // load h_1
793 fnma.s1 FR_R1 = FR_B_lo,FR_X,FR_R1 // r1 = r1-b_lo*x
794 nop.i 0
795 };;
796 { .mfi
797 ldfe FR_log2_hi = [GR_ad_q],16 // load log2_hi
798 nop.f 0
799 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // get bits 30-15 of X_0 * Z_1
800 };;
801 //
802 // For performance,don't use result of pmpyshr2.u for 4 cycles.
803 //
804 { .mfi
805 ldfe FR_log2_lo = [GR_ad_q],16 // load log2_lo
806 nop.f 0
807 sub GR_N = GR_N,GR_Bias
808 };;
809 { .mfi
810 ldfe FR_Q4 = [GR_ad_q],16 // load Q4
811 fms.s1 FR_S_lo = FR_AA,f1,FR_Z // form S_lo = AA - Z
812 sub GR_minus_N = GR_Bias,GR_N // form exponent of 2^(-N)
813 };;
814 { .mmf
815 ldfe FR_Q3 = [GR_ad_q],16 // load Q3
816 // put integer N into rightmost significand
817 setf.sig FR_float_N = GR_N
818 fmin.s1 FR_BB = FR_X,f1 // for S_lo,form BB = min(X,1.0)
819 };;
820 { .mfi
821 ldfe FR_Q2 = [GR_ad_q],16 // load Q2
822 nop.f 0
823 extr.u GR_Index2 = GR_X_1,6,4 // extract bits 6-9 of X_1
824 };;
825 { .mmi
826 ldfe FR_Q1 = [GR_ad_q] // load Q1
827 shladd GR_ad_z_2 = GR_Index2,2,GR_ad_z_2 // point to Z_2
828 nop.i 0
829 };;
830 { .mmi
831 ld4 GR_Z_2 = [GR_ad_z_2] // load Z_2
832 shladd GR_ad_tbl_2 = GR_Index2,4,GR_ad_tbl_2 // point to G_2
833 nop.i 0
834 };;
835 { .mfi
836 ldfps FR_G2,FR_H2 = [GR_ad_tbl_2],8 // load G_2,H_2
837 nop.f 0
838 nop.i 0
839 };;
840 { .mfi
841 ldfd FR_h2 = [GR_ad_tbl_2] // load h_2
842 fma.s1 FR_S_lo = FR_S_lo,f1,FR_BB // S_lo = S_lo + BB
843 nop.i 0
844 }
845 { .mfi
846 setf.exp FR_2_to_minus_N = GR_minus_N // form 2^(-N)
847 fma.s1 FR_X_lo = FR_R1,FR_Y3,f0 // x_lo = r1*y3
848 nop.i 0
849 };;
850 { .mfi
851 nop.m 0
852 nop.f 0
853 pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // get bits 30-15 of X_1 * Z_2
854 };;
855 //
856 // For performance,don't use result of pmpyshr2.u for 4 cycles
857 //
858 { .mfi
859 add GR_ad2_tbl_3 = 8,GR_ad_tbl_3
860 nop.f 0
861 nop.i 0
862 }
863 { .mfi
864 nop.m 0
865 nop.f 0
866 nop.i 0
867 };;
868 { .mfi
869 nop.m 0
870 nop.f 0
871 nop.i 0
872 };;
873 { .mfi
874 nop.m 0
875 nop.f 0
876 nop.i 0
877 };;
878
879 //
880 // Now GR_X_2 can be used
881 //
882 { .mfi
883 nop.m 0
884 nop.f 0
885 extr.u GR_Index3 = GR_X_2,1,5 // extract bits 1-5 of X_2
886 }
887 { .mfi
888 nop.m 0
889 fma.s1 FR_S_lo = FR_S_lo,f1,FR_X_lo // S_lo = S_lo + Arg_lo
890 nop.i 0
891 };;
892
893 { .mfi
894 shladd GR_ad_tbl_3 = GR_Index3,4,GR_ad_tbl_3 // point to G_3
895 fcvt.xf FR_float_N = FR_float_N
896 nop.i 0
897 }
898 { .mfi
899 shladd GR_ad2_tbl_3 = GR_Index3,4,GR_ad2_tbl_3 // point to h_3
900 fma.s1 FR_Q1 = FR_Q1,FR_Half,f0 // sign(arg)*Q1/2
901 nop.i 0
902 };;
903 { .mmi
904 ldfps FR_G3,FR_H3 = [GR_ad_tbl_3],8 // load G_3,H_3
905 ldfd FR_h3 = [GR_ad2_tbl_3] // load h_3
906 nop.i 0
907 };;
908 { .mfi
909 nop.m 0
910 fmpy.s1 FR_G = FR_G,FR_G2 // G = G_1 * G_2
911 nop.i 0
912 }
913 { .mfi
914 nop.m 0
915 fadd.s1 FR_H = FR_H,FR_H2 // H = H_1 + H_2
916 nop.i 0
917 };;
918 { .mfi
919 nop.m 0
920 fadd.s1 FR_h = FR_h,FR_h2 // h = h_1 + h_2
921 nop.i 0
922 };;
923 { .mfi
924 nop.m 0
925 // S_lo = S_lo * 2^(-N)
926 fma.s1 FR_S_lo = FR_S_lo,FR_2_to_minus_N,f0
927 nop.i 0
928 };;
929 { .mfi
930 nop.m 0
931 fmpy.s1 FR_G = FR_G,FR_G3 // G = (G_1 * G_2) * G_3
932 nop.i 0
933 }
934 { .mfi
935 nop.m 0
936 fadd.s1 FR_H = FR_H,FR_H3 // H = (H_1 + H_2) + H_3
937 nop.i 0
938 };;
939 { .mfi
940 nop.m 0
941 fadd.s1 FR_h = FR_h,FR_h3 // h = (h_1 + h_2) + h_3
942 nop.i 0
943 };;
944 { .mfi
945 nop.m 0
946 fms.s1 FR_r = FR_G,FR_S_hi,f1 // r = G * S_hi - 1
947 nop.i 0
948 }
949 { .mfi
950 nop.m 0
951 // Y_hi = N * log2_hi + H
952 fma.s1 FR_Y_hi = FR_float_N,FR_log2_hi,FR_H
953 nop.i 0
954 };;
955 { .mfi
956 nop.m 0
957 fma.s1 FR_h = FR_float_N,FR_log2_lo,FR_h // h = N * log2_lo + h
958 nop.i 0
959 };;
960 { .mfi
961 nop.m 0
962 fma.s1 FR_r = FR_G,FR_S_lo,FR_r // r = G * S_lo + (G * S_hi - 1)
963 nop.i 0
964 };;
965 { .mfi
966 nop.m 0
967 fma.s1 FR_poly_lo = FR_r,FR_Q4,FR_Q3 // poly_lo = r * Q4 + Q3
968 nop.i 0
969 }
970 { .mfi
971 nop.m 0
972 fmpy.s1 FR_rsq = FR_r,FR_r // rsq = r * r
973 nop.i 0
974 };;
975 { .mfi
976 nop.m 0
977 fma.s1 FR_05r = FR_r,FR_Half,f0 // sign(arg)*r/2
978 nop.i 0
979 };;
980 { .mfi
981 nop.m 0
982 // poly_lo = poly_lo * r + Q2
983 fma.s1 FR_poly_lo = FR_poly_lo,FR_r,FR_Q2
984 nop.i 0
985 }
986 { .mfi
987 nop.m 0
988 fma.s1 FR_rcub = FR_rsq,FR_r,f0 // rcub = r^3
989 nop.i 0
990 };;
991 { .mfi
992 nop.m 0
993 // poly_hi = sing(arg)*(Q1*r^2 + r)/2
994 fma.s1 FR_poly_hi = FR_Q1,FR_rsq,FR_05r
995 nop.i 0
996 };;
997 { .mfi
998 nop.m 0
999 // poly_lo = poly_lo*r^3 + h
1000 fma.s1 FR_poly_lo = FR_poly_lo,FR_rcub,FR_h
1001 nop.i 0
1002 };;
1003 { .mfi
1004 nop.m 0
1005 // Y_lo = poly_hi + poly_lo/2
1006 fma.s0 FR_Y_lo = FR_poly_lo,FR_Half,FR_poly_hi
1007 nop.i 0
1008 };;
1009 { .mfb
1010 nop.m 0
1011 // Result = arctanh(x) = Y_hi/2 + Y_lo
1012 fma.s0 f8 = FR_Y_hi,FR_Half,FR_Y_lo
1013 br.ret.sptk b0
1014 };;
1015
1016 // Taylor's series
1017 atanhl_near_zero:
1018 { .mfi
1019 nop.m 0
1020 fma.s1 FR_x3 = FR_x2,f8,f0
1021 nop.i 0
1022 }
1023 { .mfi
1024 nop.m 0
1025 fma.s1 FR_x4 = FR_x2,FR_x2,f0
1026 nop.i 0
1027 };;
1028 { .mfi
1029 nop.m 0
1030 fma.s1 FR_C17 = FR_C17,FR_x2,FR_C15
1031 nop.i 0
1032 }
1033 { .mfi
1034 nop.m 0
1035 fma.s1 FR_C13 = FR_C13,FR_x2,FR_C11
1036 nop.i 0
1037 };;
1038 { .mfi
1039 nop.m 0
1040 fma.s1 FR_C9 = FR_C9,FR_x2,FR_C7
1041 nop.i 0
1042 }
1043 { .mfi
1044 nop.m 0
1045 fma.s1 FR_C5 = FR_C5,FR_x2,FR_C3
1046 nop.i 0
1047 };;
1048 { .mfi
1049 nop.m 0
1050 fma.s1 FR_x8 = FR_x4,FR_x4,f0
1051 nop.i 0
1052 };;
1053 { .mfi
1054 nop.m 0
1055 fma.s1 FR_C17 = FR_C17,FR_x4,FR_C13
1056 nop.i 0
1057 };;
1058 { .mfi
1059 nop.m 0
1060 fma.s1 FR_C9 = FR_C9,FR_x4,FR_C5
1061 nop.i 0
1062 };;
1063 { .mfi
1064 nop.m 0
1065 fma.s1 FR_C17 = FR_C17,FR_x8,FR_C9
1066 nop.i 0
1067 };;
1068 { .mfb
1069 nop.m 0
1070 fma.s0 f8 = FR_C17,FR_x3,f8
1071 br.ret.sptk b0
1072 };;
1073
1074 atanhl_eq_one:
1075 { .mfi
1076 nop.m 0
1077 frcpa.s0 FR_Rcp,p0 = f1,f0 // get inf,and raise Z flag
1078 nop.i 0
1079 }
1080 { .mfi
1081 nop.m 0
1082 fmerge.s FR_Arg_X = f8, f8
1083 nop.i 0
1084 };;
1085 { .mfb
1086 mov GR_Parameter_TAG = 130
1087 fmerge.s FR_RESULT = f8,FR_Rcp // result is +-inf
1088 br.cond.sptk __libm_error_region // exit if |x| = 1.0
1089 };;
1090
1091 atanhl_gt_one:
1092 { .mfi
1093 nop.m 0
1094 fmerge.s FR_Arg_X = f8, f8
1095 nop.i 0
1096 };;
1097 { .mfb
1098 mov GR_Parameter_TAG = 129
1099 frcpa.s0 FR_RESULT,p0 = f0,f0 // get QNaN,and raise invalid
1100 br.cond.sptk __libm_error_region // exit if |x| > 1.0
1101 };;
1102
1103 GLOBAL_LIBM_END(atanhl)
1104
1105 LOCAL_LIBM_ENTRY(__libm_error_region)
1106 .prologue
1107 { .mfi
1108 add GR_Parameter_Y=-32,sp // Parameter 2 value
1109 nop.f 0
1110 .save ar.pfs,GR_SAVE_PFS
1111 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
1112 }
1113 { .mfi
1114 .fframe 64
1115 add sp=-64,sp // Create new stack
1116 nop.f 0
1117 mov GR_SAVE_GP=gp // Save gp
1118 };;
1119 { .mmi
1120 stfe [GR_Parameter_Y] = FR_Arg_Y,16 // Save Parameter 2 on stack
1121 add GR_Parameter_X = 16,sp // Parameter 1 address
1122 .save b0,GR_SAVE_B0
1123 mov GR_SAVE_B0=b0 // Save b0
1124 };;
1125 .body
1126 { .mib
1127 stfe [GR_Parameter_X] = FR_Arg_X // Store Parameter 1 on stack
1128 add GR_Parameter_RESULT = 0,GR_Parameter_Y
1129 nop.b 0 // Parameter 3 address
1130 }
1131 { .mib
1132 stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
1133 add GR_Parameter_Y = -16,GR_Parameter_Y
1134 br.call.sptk b0=__libm_error_support# // Call error handling function
1135 };;
1136 { .mmi
1137 nop.m 0
1138 nop.m 0
1139 add GR_Parameter_RESULT = 48,sp
1140 };;
1141 { .mmi
1142 ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
1143 .restore sp
1144 add sp = 64,sp // Restore stack pointer
1145 mov b0 = GR_SAVE_B0 // Restore return address
1146 };;
1147 { .mib
1148 mov gp = GR_SAVE_GP // Restore gp
1149 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
1150 br.ret.sptk b0 // Return
1151 };;
1152
1153 LOCAL_LIBM_END(__libm_error_region#)
1154
1155 .type __libm_error_support#,@function
1156 .global __libm_error_support#