]> git.ipfire.org Git - thirdparty/glibc.git/blame - ports/sysdeps/ia64/fpu/e_acoshl.S
Fix typos.
[thirdparty/glibc.git] / ports / sysdeps / ia64 / fpu / e_acoshl.S
CommitLineData
d5efd131
MF
1.file "acoshl.s"
2
3
4// Copyright (c) 2000 - 2005, Intel Corporation
5// All rights reserved.
6//
7// Contributed 2000 by the Intel Numerics Group, Intel Corporation
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// * Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// * Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// * The name of Intel Corporation may not be used to endorse or promote
21// products derived from this software without specific prior written
22// permission.
23
0347518d
MF
24// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
d5efd131 26// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
0347518d 27// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
d5efd131 28// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
0347518d
MF
29// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
30// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
31// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
d5efd131 32// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
0347518d
MF
33// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35//
d5efd131 36// Intel Corporation is the author of this code, and requests that all
0347518d 37// problem reports or change requests be submitted to it directly at
d5efd131
MF
38// http://www.intel.com/software/products/opensource/libraries/num.htm.
39//
40//*********************************************************************
41//
0347518d 42// History:
d5efd131
MF
43// 10/01/01 Initial version
44// 10/10/01 Performance improved
45// 12/11/01 Changed huges_logp to not be global
46// 01/02/02 Corrected .restore syntax
47// 05/20/02 Cleaned up namespace and sf0 syntax
48// 08/14/02 Changed mli templates to mlx
49// 02/06/03 Reorganized data tables
50// 03/31/05 Reformatted delimiters between data tables
51//
52//*********************************************************************
53//
54// API
55//==============================================================
56// long double acoshl(long double);
57//
58// Overview of operation
59//==============================================================
0347518d 60//
d5efd131
MF
61// There are 6 paths:
62// 1. x = 1
63// Return acoshl(x) = 0;
64//
65// 2. x < 1
66// Return acoshl(x) = Nan (Domain error, error handler call with tag 135);
67//
68// 3. x = [S,Q]Nan or +INF
69// Return acoshl(x) = x + x;
0347518d 70//
d5efd131 71// 4. 'Near 1': 1 < x < 1+1/8
0347518d 72// Return acoshl(x) = sqrtl(2*y)*(1-P(y)/Q(y)),
d5efd131
MF
73// where y = x - 1, P(y)/Q(y) - rational approximation
74//
75// 5. 'Huges': x > 0.5*2^64
76// Return acoshl(x) = (logl(2*x-1));
0347518d 77//
d5efd131
MF
78// 6. 'Main path': 1+1/8 < x < 0.5*2^64
79// b_hi + b_lo = x + sqrt(x^2 - 1);
80// acoshl(x) = logl_special(b_hi, b_lo);
0347518d
MF
81//
82// Algorithm description
d5efd131
MF
83//==============================================================
84//
85// I. Near 1 path algorithm
86// **************************************************************
0347518d 87// The formula is acoshl(x) = sqrtl(2*y)*(1-P(y)/Q(y)),
d5efd131
MF
88// where y = x - 1, P(y)/Q(y) - rational approximation
89//
90// 1) y = x - 1, y2 = 2 * y
91//
92// 2) Compute in parallel sqrtl(2*y) and P(y)/Q(y)
93// a) sqrtl computation method described below (main path algorithm, item 2))
0347518d 94// As result we obtain (gg+gl) - multiprecision result
d5efd131
MF
95// as pair of double extended values
96// b) P(y) and Q(y) calculated without any extra precision manipulations
97// c) P/Q division:
98// y = frcpa(Q) initial approximation of 1/Q
99// z = P*y initial approximation of P/Q
0347518d 100//
d5efd131
MF
101// e = 1 - b*y
102// e2 = e + e^2
103// e1 = e^2
104// y1 = y + y*e2 = y + y*(e+e^2)
105//
106// e3 = e + e1^2
107// y2 = y + y1*e3 = y + y*(e+e^2+..+e^6)
108//
109// r = P - Q*z
110// e4 = 1 - Q*y2
111// xx = z + r*y2 high part of a/b
112//
113// y3 = y2 + y2*e4
114// r1 = P - Q*xx
115// xl = r1*y3 low part of a/b
116//
117// 3) res = sqrt(2*y) - sqrt(2*y)*(P(y)/Q(y)) =
118// = (gg+gl) - (gg + gl)*(xx+xl);
119//
120// a) hh = gg*xx; hl = gg*xl; lh = gl*xx; ll = gl*xl;
121// b) res = ((((gl + ll) + lh) + hl) + hh) + gg;
122// (exactly in this order)
123//
0347518d 124// II. Main path algorithm
d5efd131
MF
125// ( thanks to Peter Markstein for the idea of sqrt(x^2+1) computation! )
126// **********************************************************************
127//
128// There are 3 parts of x+sqrt(x^2-1) computation:
129//
130// 1) m2 = (m2_hi+m2_lo) = x^2-1 obtaining
131// ------------------------------------
132// m2_hi = x2_hi - 1, where x2_hi = x * x;
0347518d
MF
133// m2_lo = x2_lo + p1_lo, where
134// x2_lo = FMS(x*x-x2_hi),
d5efd131
MF
135// p1_lo = x2_hi - (1 + m2_hi);
136//
137// 2) g = (g_hi+g_lo) = sqrt(m2) = sqrt(m2_hi+m2_lo)
138// ----------------------------------------------
139// r = invsqrt(m2_hi) (8-bit reciprocal square root approximation);
140// g = m2_hi * r (first 8 bit-approximation of sqrt);
0347518d 141//
d5efd131
MF
142// h = 0.5 * r;
143// e = 0.5 - g * h;
144// g = g * e + g (second 16 bit-approximation of sqrt);
0347518d 145//
d5efd131
MF
146// h = h * e + h;
147// e = 0.5 - g * h;
148// g = g * e + g (third 32 bit-approximation of sqrt);
149//
150// h = h * e + h;
151// e = 0.5 - g * h;
152// g_hi = g * e + g (fourth 64 bit-approximation of sqrt);
0347518d 153//
d5efd131
MF
154// Remainder computation:
155// h = h * e + h;
156// d = (m2_hi - g_hi * g_hi) + m2_lo;
157// g_lo = d * h;
158//
159// 3) b = (b_hi + b_lo) = x + g, where g = (g_hi + g_lo) = sqrt(x^2-1)
160// -------------------------------------------------------------------
161// b_hi = (g_hi + x) + gl;
162// b_lo = (x - b_hi) + g_hi + gl;
0347518d 163//
d5efd131
MF
164// Now we pass b presented as sum b_hi + b_lo to special version
165// of logl function which accepts a pair of arguments as
0347518d
MF
166// multiprecision value.
167//
d5efd131
MF
168// Special log algorithm overview
169// ================================
170// Here we use a table lookup method. The basic idea is that in
0347518d 171// order to compute logl(Arg) for an argument Arg in [1,2),
d5efd131
MF
172// we construct a value G such that G*Arg is close to 1 and that
173// logl(1/G) is obtainable easily from a table of values calculated
174// beforehand. Thus
175//
176// logl(Arg) = logl(1/G) + logl(1 + (G*Arg - 1))
177//
178// Because |G*Arg - 1| is small, the second term on the right hand
179// side can be approximated by a short polynomial. We elaborate
180// this method in four steps.
181//
182// Step 0: Initialization
183//
184// We need to calculate logl( X ). Obtain N, S_hi such that
185//
186// X = 2^N * ( S_hi + S_lo ) exactly
187//
188// where S_hi in [1,2) and S_lo is a correction to S_hi in the sense
189// that |S_lo| <= ulp(S_hi).
190//
191// For the special version of logl: S_lo = b_lo
192// !-----------------------------------------------!
193//
194// Step 1: Argument Reduction
195//
196// Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
197//
198// G := G_1 * G_2 * G_3
199// r := (G * S_hi - 1) + G * S_lo
200//
0347518d 201// These G_j's have the property that the product is exactly
d5efd131
MF
202// representable and that |r| < 2^(-12) as a result.
203//
204// Step 2: Approximation
205//
206// logl(1 + r) is approximated by a short polynomial poly(r).
207//
208// Step 3: Reconstruction
209//
210// Finally, logl( X ) is given by
211//
212// logl( X ) = logl( 2^N * (S_hi + S_lo) )
213// ~=~ N*logl(2) + logl(1/G) + logl(1 + r)
214// ~=~ N*logl(2) + logl(1/G) + poly(r).
215//
216// For detailed description see logl or log1pl function, regular path.
217//
218// Registers used
219//==============================================================
0347518d 220// Floating Point registers used:
d5efd131
MF
221// f8, input
222// f32 -> f95 (64 registers)
223
0347518d 224// General registers used:
d5efd131
MF
225// r32 -> r67 (36 registers)
226
227// Predicate registers used:
228// p7 -> p11
229// p7 for 'NaNs, Inf' path
230// p8 for 'near 1' path
231// p9 for 'huges' path
0347518d 232// p10 for x = 1
d5efd131
MF
233// p11 for x < 1
234//
235//*********************************************************************
236// IEEE Special Conditions:
237//
238// acoshl(+inf) = +inf
0347518d
MF
239// acoshl(-inf) = QNaN
240// acoshl(1) = 0
d5efd131
MF
241// acoshl(x<1) = QNaN
242// acoshl(SNaN) = QNaN
243// acoshl(QNaN) = QNaN
244//
245
246// Data tables
247//==============================================================
0347518d 248
d5efd131
MF
249RODATA
250.align 64
251
6f65e668 252// Near 1 path rational approximation coefficients
// Poly_P: numerator coefficients of the near-1 rational approximation P(y)/Q(y).
// Each entry is two data8 words (64-bit significand, then sign/exponent word),
// i.e. 16 bytes; the decimal value is given in the trailing comment.
// Entry order is P5, P4, P3, P2, P1, P0: acoshl reads the table front to back
// with ldfe loads post-incremented by 16, starting with FR_PP5.
d5efd131 253LOCAL_OBJECT_START(Poly_P)
0347518d
MF
254data8 0xB0978143F695D40F, 0x3FF1 // .84205539791447100108478906277453574946e-4
255data8 0xB9800D841A8CAD29, 0x3FF6 // .28305085180397409672905983082168721069e-2
256data8 0xC889F455758C1725, 0x3FF9 // .24479844297887530847660233111267222945e-1
257data8 0x9BE1DFF006F45F12, 0x3FFB // .76114415657565879842941751209926938306e-1
258data8 0x9E34AF4D372861E0, 0x3FFB // .77248925727776366270605984806795850504e-1
259data8 0xF3DC502AEE14C4AE, 0x3FA6 // .3077953476682583606615438814166025592e-26
d5efd131
MF
260LOCAL_OBJECT_END(Poly_P)
261
262//
// Poly_Q: denominator coefficients of the near-1 rational approximation P(y)/Q(y).
// Same entry format as Poly_P (two data8 words = 16 bytes per coefficient).
// Entry order is Q5, Q4, Q3, Q2, Q1, Q0: acoshl reads the table front to back
// with ldfe loads post-incremented by 16, starting with FR_QQ5.
263LOCAL_OBJECT_START(Poly_Q)
0347518d
MF
264data8 0xF76E3FD3C7680357, 0x3FF1 // .11798413344703621030038719253730708525e-3
265data8 0xD107D2E7273263AE, 0x3FF7 // .63791065024872525660782716786703188820e-2
266data8 0xB609BE5CDE206AEF, 0x3FFB // .88885771950814004376363335821980079985e-1
267data8 0xF7DEACAC28067C8A, 0x3FFD // .48412074662702495416825113623936037072302
268data8 0x8F9BE5890CEC7E38, 0x3FFF // 1.1219450873557867470217771071068369729526
269data8 0xED4F06F3D2BC92D1, 0x3FFE // .92698710873331639524734537734804056798748
d5efd131
MF
270LOCAL_OBJECT_END(Poly_Q)
271
0347518d 272// Q coeffs
// Constants_Q: constants for the special logl code, 16 bytes per entry
// (data4 order: low significand word, high significand word, sign/exponent, pad).
// The code loads them front to back as log2_hi, log2_lo, Q4, Q3, Q2, Q1.
// The table is not addressed through its own label: both the main path and
// huges_logl form the pointer as Constants_Z_1 - 0x60, so this 0x60-byte table
// is expected to sit immediately before Constants_Z_1. Do not resize or move it.
d5efd131 273LOCAL_OBJECT_START(Constants_Q)
0347518d 274data4 0x00000000,0xB1721800,0x00003FFE,0x00000000
d5efd131
MF
275data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
276data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000
277data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000
278data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000
0347518d 279data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
d5efd131
MF
280LOCAL_OBJECT_END(Constants_Q)
281
282// Z1 - 16 bit fixed
// Constants_Z_1: 16 entries of 4 bytes each.
// The logl code selects an entry with GR_Index1 (a 4-bit field taken from
// bit 59 upward of the argument's significand, scaled by 4) and multiplies
// it with X_0 using pmpyshr2.u with a shift of 15.
// This label is also the base for fixed byte offsets to the neighbouring
// tables (-0x60 Constants_Q, +0x040 Constants_G_H_h1, +0x140 Constants_Z_2,
// +0x180 Constants_G_H_h2, +0x280 Constants_G_H_h3), so the order and size
// of all of these tables must stay as they are.
283LOCAL_OBJECT_START(Constants_Z_1)
284data4 0x00008000
285data4 0x00007879
286data4 0x000071C8
287data4 0x00006BCB
288data4 0x00006667
289data4 0x00006187
290data4 0x00005D18
291data4 0x0000590C
292data4 0x00005556
293data4 0x000051EC
294data4 0x00004EC5
295data4 0x00004BDB
296data4 0x00004925
297data4 0x0000469F
298data4 0x00004445
299data4 0x00004211
300LOCAL_OBJECT_END(Constants_Z_1)
301
302// G1 and H1 - IEEE single and h1 - IEEE double
// Constants_G_H_h1: 16 entries of 16 bytes each.
// Entry layout: data4 pair = G_1, H_1 (loaded together with ldfps),
// followed by data8 = h_1 (loaded with ldfd from offset 8).
// Addressed as Constants_Z_1 + 0x040 and indexed by GR_Index1 scaled by 16,
// i.e. the same index that selects the Constants_Z_1 entry.
303LOCAL_OBJECT_START(Constants_G_H_h1)
304data4 0x3F800000,0x00000000
305data8 0x0000000000000000
306data4 0x3F70F0F0,0x3D785196
307data8 0x3DA163A6617D741C
308data4 0x3F638E38,0x3DF13843
309data8 0x3E2C55E6CBD3D5BB
310data4 0x3F579430,0x3E2FF9A0
311data8 0xBE3EB0BFD86EA5E7
312data4 0x3F4CCCC8,0x3E647FD6
313data8 0x3E2E6A8C86B12760
314data4 0x3F430C30,0x3E8B3AE7
315data8 0x3E47574C5C0739BA
316data4 0x3F3A2E88,0x3EA30C68
317data8 0x3E20E30F13E8AF2F
318data4 0x3F321640,0x3EB9CEC8
319data8 0xBE42885BF2C630BD
320data4 0x3F2AAAA8,0x3ECF9927
321data8 0x3E497F3497E577C6
322data4 0x3F23D708,0x3EE47FC5
323data8 0x3E3E6A6EA6B0A5AB
324data4 0x3F1D89D8,0x3EF8947D
325data8 0xBDF43E3CD328D9BE
326data4 0x3F17B420,0x3F05F3A1
327data8 0x3E4094C30ADB090A
328data4 0x3F124920,0x3F0F4303
329data8 0xBE28FBB2FC1FE510
330data4 0x3F0D3DC8,0x3F183EBF
331data8 0x3E3A789510FDE3FA
332data4 0x3F088888,0x3F20EC80
333data8 0x3E508CE57CC8C98F
334data4 0x3F042108,0x3F29516A
335data8 0xBE534874A223106C
336LOCAL_OBJECT_END(Constants_G_H_h1)
337
338// Z2 - 16 bit fixed
// Constants_Z_2: 16 entries of 4 bytes each.
// Addressed as Constants_Z_1 + 0x140 and indexed by GR_Index2 (bits 6-9 of
// X_1, scaled by 4); the selected value is multiplied with X_1 using
// pmpyshr2.u with a shift of 15 to form X_2.
339LOCAL_OBJECT_START(Constants_Z_2)
340data4 0x00008000
341data4 0x00007F81
342data4 0x00007F02
343data4 0x00007E85
344data4 0x00007E08
345data4 0x00007D8D
346data4 0x00007D12
347data4 0x00007C98
348data4 0x00007C20
349data4 0x00007BA8
350data4 0x00007B31
351data4 0x00007ABB
352data4 0x00007A45
353data4 0x000079D1
354data4 0x0000795D
355data4 0x000078EB
356LOCAL_OBJECT_END(Constants_Z_2)
357
358// G2 and H2 - IEEE single and h2 - IEEE double
// Constants_G_H_h2: 16 entries of 16 bytes each.
// Entry layout: data4 pair = G_2, H_2 (loaded together with ldfps),
// followed by data8 = h_2 (loaded with ldfd from offset 8).
// Addressed as Constants_Z_1 + 0x180 and indexed by GR_Index2 scaled by 16,
// i.e. the same index that selects the Constants_Z_2 entry.
359LOCAL_OBJECT_START(Constants_G_H_h2)
360data4 0x3F800000,0x00000000
361data8 0x0000000000000000
362data4 0x3F7F00F8,0x3B7F875D
363data8 0x3DB5A11622C42273
364data4 0x3F7E03F8,0x3BFF015B
365data8 0x3DE620CF21F86ED3
366data4 0x3F7D08E0,0x3C3EE393
367data8 0xBDAFA07E484F34ED
368data4 0x3F7C0FC0,0x3C7E0586
369data8 0xBDFE07F03860BCF6
370data4 0x3F7B1880,0x3C9E75D2
371data8 0x3DEA370FA78093D6
372data4 0x3F7A2328,0x3CBDC97A
373data8 0x3DFF579172A753D0
374data4 0x3F792FB0,0x3CDCFE47
375data8 0x3DFEBE6CA7EF896B
376data4 0x3F783E08,0x3CFC15D0
377data8 0x3E0CF156409ECB43
378data4 0x3F774E38,0x3D0D874D
379data8 0xBE0B6F97FFEF71DF
380data4 0x3F766038,0x3D1CF49B
381data8 0xBE0804835D59EEE8
382data4 0x3F757400,0x3D2C531D
383data8 0x3E1F91E9A9192A74
384data4 0x3F748988,0x3D3BA322
385data8 0xBE139A06BF72A8CD
386data4 0x3F73A0D0,0x3D4AE46F
387data8 0x3E1D9202F8FBA6CF
388data4 0x3F72B9D0,0x3D5A1756
389data8 0xBE1DCCC4BA796223
390data4 0x3F71D488,0x3D693B9D
391data8 0xBE049391B6B7C239
392LOCAL_OBJECT_END(Constants_G_H_h2)
393
0347518d 394// G3 and H3 - IEEE single and h3 - IEEE double
d5efd131
MF
// Constants_G_H_h3: 32 entries of 16 bytes each.
// Entry layout: data4 pair = G_3, H_3 (loaded together with ldfps),
// followed by data8 = h_3 (loaded with ldfd from offset 8).
// Addressed as Constants_Z_1 + 0x280 and indexed by GR_Index3
// (bits 1-5 of X_2, scaled by 16).
395LOCAL_OBJECT_START(Constants_G_H_h3)
396data4 0x3F7FFC00,0x38800100
397data8 0x3D355595562224CD
398data4 0x3F7FF400,0x39400480
399data8 0x3D8200A206136FF6
400data4 0x3F7FEC00,0x39A00640
401data8 0x3DA4D68DE8DE9AF0
402data4 0x3F7FE400,0x39E00C41
403data8 0xBD8B4291B10238DC
404data4 0x3F7FDC00,0x3A100A21
405data8 0xBD89CCB83B1952CA
406data4 0x3F7FD400,0x3A300F22
407data8 0xBDB107071DC46826
408data4 0x3F7FCC08,0x3A4FF51C
409data8 0x3DB6FCB9F43307DB
410data4 0x3F7FC408,0x3A6FFC1D
411data8 0xBD9B7C4762DC7872
412data4 0x3F7FBC10,0x3A87F20B
413data8 0xBDC3725E3F89154A
414data4 0x3F7FB410,0x3A97F68B
415data8 0xBD93519D62B9D392
416data4 0x3F7FAC18,0x3AA7EB86
417data8 0x3DC184410F21BD9D
418data4 0x3F7FA420,0x3AB7E101
419data8 0xBDA64B952245E0A6
420data4 0x3F7F9C20,0x3AC7E701
421data8 0x3DB4B0ECAABB34B8
422data4 0x3F7F9428,0x3AD7DD7B
423data8 0x3D9923376DC40A7E
424data4 0x3F7F8C30,0x3AE7D474
425data8 0x3DC6E17B4F2083D3
426data4 0x3F7F8438,0x3AF7CBED
427data8 0x3DAE314B811D4394
428data4 0x3F7F7C40,0x3B03E1F3
429data8 0xBDD46F21B08F2DB1
430data4 0x3F7F7448,0x3B0BDE2F
431data8 0xBDDC30A46D34522B
432data4 0x3F7F6C50,0x3B13DAAA
433data8 0x3DCB0070B1F473DB
434data4 0x3F7F6458,0x3B1BD766
435data8 0xBDD65DDC6AD282FD
436data4 0x3F7F5C68,0x3B23CC5C
437data8 0xBDCDAB83F153761A
438data4 0x3F7F5470,0x3B2BC997
439data8 0xBDDADA40341D0F8F
440data4 0x3F7F4C78,0x3B33C711
441data8 0x3DCD1BD7EBC394E8
442data4 0x3F7F4488,0x3B3BBCC6
443data8 0xBDC3532B52E3E695
444data4 0x3F7F3C90,0x3B43BAC0
445data8 0xBDA3961EE846B3DE
446data4 0x3F7F34A0,0x3B4BB0F4
447data8 0xBDDADF06785778D4
448data4 0x3F7F2CA8,0x3B53AF6D
449data8 0x3DCC3ED1E55CE212
450data4 0x3F7F24B8,0x3B5BA620
451data8 0xBDBA31039E382C15
452data4 0x3F7F1CC8,0x3B639D12
453data8 0x3D635A0B5C5AF197
454data4 0x3F7F14D8,0x3B6B9444
455data8 0xBDDCCB1971D34EFC
456data4 0x3F7F0CE0,0x3B7393BC
457data8 0x3DC7450252CD7ADA
458data4 0x3F7F04F0,0x3B7B8B6D
459data8 0xBDB68F177D7F2A42
460LOCAL_OBJECT_END(Constants_G_H_h3)
461
462// Assembly macros
463//==============================================================
464
465// Floating Point Registers
466
467FR_Arg = f8
468FR_Res = f8
469
470
471FR_PP0 = f32
472FR_PP1 = f33
473FR_PP2 = f34
474FR_PP3 = f35
475FR_PP4 = f36
476FR_PP5 = f37
477FR_QQ0 = f38
478FR_QQ1 = f39
479FR_QQ2 = f40
480FR_QQ3 = f41
481FR_QQ4 = f42
482FR_QQ5 = f43
483
0347518d
MF
484FR_Q1 = f44
485FR_Q2 = f45
486FR_Q3 = f46
487FR_Q4 = f47
d5efd131
MF
488
489FR_Half = f48
490FR_Two = f49
491
0347518d
MF
492FR_log2_hi = f50
493FR_log2_lo = f51
d5efd131
MF
494
495
496FR_X2 = f52
497FR_M2 = f53
498FR_M2L = f54
499FR_Rcp = f55
500FR_GG = f56
501FR_HH = f57
502FR_EE = f58
503FR_DD = f59
504FR_GL = f60
505FR_Tmp = f61
506
507
508FR_XM1 = f62
509FR_2XM1 = f63
510FR_XM12 = f64
511
512
513
514 // Special logl registers
0347518d
MF
515FR_XLog_Hi = f65
516FR_XLog_Lo = f66
d5efd131 517
0347518d 518FR_Y_hi = f67
d5efd131
MF
519FR_Y_lo = f68
520
0347518d
MF
521FR_S_hi = f69
522FR_S_lo = f70
d5efd131
MF
523
524FR_poly_lo = f71
525FR_poly_hi = f72
526
527FR_G = f73
528FR_H = f74
529FR_h = f75
530
531FR_G2 = f76
532FR_H2 = f77
0347518d 533FR_h2 = f78
d5efd131 534
0347518d
MF
535FR_r = f79
536FR_rsq = f80
537FR_rcub = f81
d5efd131 538
0347518d 539FR_float_N = f82
d5efd131 540
0347518d
MF
541FR_G3 = f83
542FR_H3 = f84
543FR_h3 = f85
d5efd131 544
0347518d 545FR_2_to_minus_N = f86
d5efd131
MF
546
547
548 // Near 1 registers
// These names reuse f65-f86, which this file also assigns to the
// "Special logl registers" names (FR_XLog_Hi ... FR_2_to_minus_N).
// NOTE(review): presumably safe because the near-1 path and the logl path
// never run for the same argument - confirm before adding new uses.
549FR_PP = f65
550FR_QQ = f66
551
552
553FR_PV6 = f69
554FR_PV4 = f70
555FR_PV3 = f71
556FR_PV2 = f72
557
558FR_QV6 = f73
559FR_QV4 = f74
560FR_QV3 = f75
561FR_QV2 = f76
562
563FR_Y0 = f77
0347518d 564FR_Q0 = f78
d5efd131
MF
565FR_E0 = f79
566FR_E2 = f80
567FR_E1 = f81
568FR_Y1 = f82
569FR_E3 = f83
570FR_Y2 = f84
571FR_R0 = f85
572FR_E4 = f86
573FR_Y3 = f87
574FR_R1 = f88
575FR_X_Hi = f89
576FR_X_lo = f90
577
// Partial products of (gg+gl)*(xx+xl) named in the header comment
// (hh, ll, hl, lh).
// NOTE(review): FR_HH was already assigned f57 earlier in this file (the
// sqrt iteration's h); the assignment below rebinds the name to f91.
// Confirm which register each use of FR_HH is meant to refer to.
578FR_HH = f91
579FR_LL = f92
580FR_HL = f93
581FR_LH = f94
582
583
584
585 // Error handler registers
586FR_Arg_X = f95
587FR_Arg_Y = f0
588
589
590// General Purpose Registers
591
592 // General prolog registers
593GR_PFS = r32
594GR_OneP125 = r33
595GR_TwoP63 = r34
596GR_Arg = r35
597GR_Half = r36
598
599 // Near 1 path registers
600GR_Poly_P = r37
601GR_Poly_Q = r38
602
603 // Special logl registers
0347518d
MF
604GR_Index1 = r39
605GR_Index2 = r40
606GR_signif = r41
607GR_X_0 = r42
608GR_X_1 = r43
609GR_X_2 = r44
d5efd131 610GR_minus_N = r45
0347518d
MF
611GR_Z_1 = r46
612GR_Z_2 = r47
613GR_N = r48
614GR_Bias = r49
615GR_M = r50
616GR_Index3 = r51
617GR_exp_2tom80 = r52
618GR_exp_mask = r53
619GR_exp_2tom7 = r54
620GR_ad_ln10 = r55
d5efd131
MF
621GR_ad_tbl_1 = r56
622GR_ad_tbl_2 = r57
623GR_ad_tbl_3 = r58
624GR_ad_q = r59
625GR_ad_z_1 = r60
626GR_ad_z_2 = r61
627GR_ad_z_3 = r62
628
629//
630// Added for unwind support
631//
// r32-r34 are the same registers this file assigns to GR_PFS, GR_OneP125
// and GR_TwoP63 in the "General prolog registers" group.
// NOTE(review): presumably the save names are only used on the error path,
// after the two threshold constants are no longer needed - confirm.
632GR_SAVE_PFS = r32
633GR_SAVE_B0 = r33
634GR_SAVE_GP = r34
635
// r64-r67 follow the 32 local registers (r32-r63) requested by the entry
// "alloc GR_PFS = ar.pfs,0,32,4,0", i.e. they are its 4 output registers.
// NOTE(review): presumably the arguments handed to the error handler -
// confirm against the call site, which is outside this view.
636GR_Parameter_X = r64
637GR_Parameter_Y = r65
638GR_Parameter_RESULT = r66
639GR_Parameter_TAG = r67
640
641
642
643.section .text
644GLOBAL_LIBM_ENTRY(acoshl)
645
646{ .mfi
647 alloc GR_PFS = ar.pfs,0,32,4,0 // Local frame allocation
648 fcmp.lt.s1 p11, p0 = FR_Arg, f1 // if arg is less than 1
649 mov GR_Half = 0xfffe // 0.5's exp
650}
651{ .mfi
652 addl GR_Poly_Q = @ltoff(Poly_Q), gp // Address of Q-coeff table
653 fma.s1 FR_X2 = FR_Arg, FR_Arg, f0 // Obtain x^2
654 addl GR_Poly_P = @ltoff(Poly_P), gp // Address of P-coeff table
0347518d 655};;
d5efd131 656
0347518d 657{ .mfi
6f65e668 658 getf.d GR_Arg = FR_Arg // get argument as double (int64)
d5efd131
MF
659 fma.s0 FR_Two = f1, f1, f1 // construct 2.0
660 addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp // logl tables
661}
0347518d
MF
662{ .mlx
663 nop.m 0
d5efd131 664 movl GR_TwoP63 = 0x43E8000000000000 // 0.5*2^63 (huge arguments)
0347518d 665};;
d5efd131 666
0347518d 667{ .mfi
d5efd131
MF
668 ld8 GR_Poly_P = [GR_Poly_P] // get actual P-coeff table address
669 fcmp.eq.s1 p10, p0 = FR_Arg, f1 // if arg == 1 (return 0)
670 nop.i 0
671}
0347518d 672{ .mlx
d5efd131
MF
673 ld8 GR_Poly_Q = [GR_Poly_Q] // get actual Q-coeff table address
674 movl GR_OneP125 = 0x3FF2000000000000 // 1.125 (near 1 path bound)
675};;
676
0347518d 677{ .mfi
d5efd131
MF
678 ld8 GR_ad_z_1 = [GR_ad_z_1] // Get pointer to Constants_Z_1
679 fclass.m p7,p0 = FR_Arg, 0xe3 // if arg NaN inf
680 cmp.le p9, p0 = GR_TwoP63, GR_Arg // if arg > 0.5*2^63 ('huges')
681}
682{ .mfb
683 cmp.ge p8, p0 = GR_OneP125, GR_Arg // if arg<1.125 -near 1 path
684 fms.s1 FR_XM1 = FR_Arg, f1, f1 // X0 = X-1 (for near 1 path)
685(p11) br.cond.spnt acoshl_lt_pone // error branch (less than 1)
0347518d 686};;
d5efd131 687
0347518d 688{ .mmi
d5efd131
MF
689 setf.exp FR_Half = GR_Half // construct 0.5
690(p9) setf.s FR_XLog_Lo = r0 // Low of logl arg=0 (Huges path)
691 mov GR_exp_mask = 0x1FFFF // Create exponent mask
0347518d 692};;
d5efd131 693
0347518d 694{ .mmf
d5efd131
MF
695(p8) ldfe FR_PP5 = [GR_Poly_P],16 // Load P5
696(p8) ldfe FR_QQ5 = [GR_Poly_Q],16 // Load Q5
697 fms.s1 FR_M2 = FR_X2, f1, f1 // m2 = x^2 - 1
698};;
699
0347518d 700{ .mfi
d5efd131 701(p8) ldfe FR_QQ4 = [GR_Poly_Q],16 // Load Q4
0347518d 702 fms.s1 FR_M2L = FR_Arg, FR_Arg, FR_X2 // low part of
d5efd131
MF
703 // m2 = fma(X*X - m2)
704 add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1
705}
706{ .mfb
0347518d 707(p8) ldfe FR_PP4 = [GR_Poly_P],16 // Load P4
d5efd131
MF
708(p7) fma.s0 FR_Res = FR_Arg,f1,FR_Arg // r = a + a (Nan, Inf)
709(p7) br.ret.spnt b0 // return (Nan, Inf)
0347518d 710};;
d5efd131
MF
711
712{ .mfi
713(p8) ldfe FR_PP3 = [GR_Poly_P],16 // Load P3
714 nop.f 0
715 add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_P
716}
717{ .mfb
718(p8) ldfe FR_QQ3 = [GR_Poly_Q],16 // Load Q3
719(p9) fms.s1 FR_XLog_Hi = FR_Two, FR_Arg, f1 // Hi of log arg = 2*X-1
720(p9) br.cond.spnt huges_logl // special version of log
721}
0347518d 722;;
d5efd131 723
0347518d 724{ .mfi
d5efd131
MF
725(p8) ldfe FR_PP2 = [GR_Poly_P],16 // Load P2
726(p8) fma.s1 FR_2XM1 = FR_Two, FR_XM1, f0 // 2X0 = 2 * X0
727 add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2
728}
729{ .mfb
730(p8) ldfe FR_QQ2 = [GR_Poly_Q],16 // Load Q2
731(p10) fma.s0 FR_Res = f0,f1,f0 // r = 0 (arg = 1)
0347518d
MF
732(p10) br.ret.spnt b0 // return (arg = 1)
733};;
d5efd131 734
0347518d 735{ .mmi
d5efd131
MF
736(p8) ldfe FR_PP1 = [GR_Poly_P],16 // Load P1
737(p8) ldfe FR_QQ1 = [GR_Poly_Q],16 // Load Q1
738 add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2
739}
740;;
741
0347518d
MF
742{ .mfi
743(p8) ldfe FR_PP0 = [GR_Poly_P] // Load P0
d5efd131
MF
744 fma.s1 FR_Tmp = f1, f1, FR_M2 // Tmp = 1 + m2
745 add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3
746}
747{ .mfb
748(p8) ldfe FR_QQ0 = [GR_Poly_Q]
749 nop.f 0
750(p8) br.cond.spnt near_1 // near 1 path
0347518d
MF
751};;
752{ .mfi
d5efd131
MF
753 ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
754 nop.f 0
755 mov GR_Bias = 0x0FFFF // Create exponent bias
756};;
0347518d 757{ .mfi
d5efd131
MF
758 nop.m 0
759 frsqrta.s1 FR_Rcp, p0 = FR_M2 // Rcp = 1/m2 reciprocal appr.
760 nop.i 0
0347518d 761};;
d5efd131
MF
762
763{ .mfi
764 ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
765 fms.s1 FR_Tmp = FR_X2, f1, FR_Tmp // Tmp = x^2 - Tmp
766 nop.i 0
767};;
768
769{ .mfi
770 ldfe FR_Q4 = [GR_ad_q],16 // Load Q4
771 fma.s1 FR_GG = FR_Rcp, FR_M2, f0 // g = Rcp * m2
772 // 8 bit Newton Raphson iteration
773 nop.i 0
774}
775{ .mfi
0347518d 776 nop.m 0
d5efd131
MF
777 fma.s1 FR_HH = FR_Half, FR_Rcp, f0 // h = 0.5 * Rcp
778 nop.i 0
779};;
780{ .mfi
781 ldfe FR_Q3 = [GR_ad_q],16 // Load Q3
782 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h
783 nop.i 0
784}
785{ .mfi
0347518d 786 nop.m 0
d5efd131
MF
787 fma.s1 FR_M2L = FR_Tmp, f1, FR_M2L // low part of m2 = Tmp+m2l
788 nop.i 0
789};;
790
791{ .mfi
792 ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
0347518d 793 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
d5efd131
MF
794 // 16 bit Newton Raphson iteration
795 nop.i 0
796}
797{ .mfi
798 nop.m 0
799 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h
800 nop.i 0
801};;
802
803{ .mfi
804 ldfe FR_Q1 = [GR_ad_q] // Load Q1
805 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h
806 nop.i 0
807};;
808{ .mfi
809 nop.m 0
0347518d 810 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
d5efd131
MF
811 // 32 bit Newton Raphson iteration
812 nop.i 0
813}
814{ .mfi
815 nop.m 0
816 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h
817 nop.i 0
818};;
819
820{ .mfi
821 nop.m 0
822 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h
823 nop.i 0
824};;
825
826{ .mfi
827 nop.m 0
0347518d 828 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
d5efd131
MF
829 // 64 bit Newton Raphson iteration
830 nop.i 0
831}
832{ .mfi
833 nop.m 0
834 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h
835 nop.i 0
836};;
837
838{ .mfi
839 nop.m 0
840 fnma.s1 FR_DD = FR_GG, FR_GG, FR_M2 // Remainder d = g * g - p2
841 nop.i 0
842}
843{ .mfi
844 nop.m 0
845 fma.s1 FR_XLog_Hi = FR_Arg, f1, FR_GG // bh = z + gh
846 nop.i 0
847};;
848
849{ .mfi
850 nop.m 0
851 fma.s1 FR_DD = FR_DD, f1, FR_M2L // add p2l: d = d + p2l
852 nop.i 0
853};;
854
855{ .mfi
856 getf.sig GR_signif = FR_XLog_Hi // Get significand of x+1
857 nop.f 0
858 mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7
859};;
860
861{ .mfi
862 nop.m 0
863 fma.s1 FR_GL = FR_DD, FR_HH, f0 // gl = d * h
864 extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif
865}
866{ .mfi
867 nop.m 0
868 fma.s1 FR_XLog_Hi = FR_DD, FR_HH, FR_XLog_Hi // bh = bh + gl
869 nop.i 0
870};;
871
872
873
874{ .mmi
875 shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1
876 shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1
877 extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of signif.
878};;
879
880{ .mmi
881 ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
882 nop.m 0
883 nop.i 0
884};;
885
886{ .mmi
887 ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
888 nop.m 0
889 nop.i 0
890};;
891
892{ .mfi
893 nop.m 0
894 fms.s1 FR_XLog_Lo = FR_Arg, f1, FR_XLog_Hi // bl = x - bh
895 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1
896};;
897
898// WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
899// "DEAD" ZONE!
900
901{ .mfi
902 nop.m 0
903 nop.f 0
904 nop.i 0
905};;
906
907{ .mfi
908 nop.m 0
909 fmerge.se FR_S_hi = f1,FR_XLog_Hi // Form |x+1|
910 nop.i 0
911};;
912
913
914{ .mmi
915 getf.exp GR_N = FR_XLog_Hi // Get N = exponent of x+1
916 ldfd FR_h = [GR_ad_tbl_1] // Load h_1
917 nop.i 0
918};;
919
920{ .mfi
921 nop.m 0
922 nop.f 0
0347518d 923 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
d5efd131
MF
924};;
925
926{ .mfi
927 shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2
928 fma.s1 FR_XLog_Lo = FR_XLog_Lo, f1, FR_GG // bl = bl + gg
929 mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80
930}
931{ .mfi
932 shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2
933 nop.f 0
934 sub GR_N = GR_N, GR_Bias // sub bias from exp
935};;
936
937{ .mmi
938 ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
939 ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
940 sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N)
941};;
942
943{ .mmi
944 ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2
945 nop.m 0
946 nop.i 0
947};;
948
949{ .mmi
950 setf.sig FR_float_N = GR_N // Put integer N into rightmost sign
951 setf.exp FR_2_to_minus_N = GR_minus_N // Form 2^(-N)
952 pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
953};;
954
0347518d 955// WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!)
d5efd131
MF
956// BECAUSE OF POSSIBLE 10 CLOCKS STALL!
957// (Just nops added - nothing to do here)
958
959{ .mfi
960 nop.m 0
961 fma.s1 FR_XLog_Lo = FR_XLog_Lo, f1, FR_GL // bl = bl + gl
962 nop.i 0
963};;
964{ .mfi
965 nop.m 0
966 nop.f 0
967 nop.i 0
968};;
969{ .mfi
970 nop.m 0
971 nop.f 0
972 nop.i 0
973};;
974
975{ .mfi
976 nop.m 0
977 nop.f 0
978 extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
979};;
980
981{ .mfi
982 shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3
983 nop.f 0
984 nop.i 0
985};;
986
987{ .mfi
988 ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3
989 nop.f 0
990 nop.i 0
991};;
992
993{ .mfi
994 ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
995 fcvt.xf FR_float_N = FR_float_N
996 nop.i 0
997};;
998
999{ .mfi
1000 nop.m 0
1001 fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
1002 nop.i 0
1003}
1004{ .mfi
1005 nop.m 0
1006 fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
1007 nop.i 0
1008};;
1009
1010{ .mfi
1011 nop.m 0
1012 fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
1013 nop.i 0
1014}
1015{ .mfi
1016 nop.m 0
1017 fma.s1 FR_S_lo = FR_XLog_Lo, FR_2_to_minus_N, f0 //S_lo=S_lo*2^(-N)
1018 nop.i 0
1019};;
1020
1021{ .mfi
1022 nop.m 0
1023 fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
1024 nop.i 0
1025}
1026{ .mfi
1027 nop.m 0
1028 fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
1029 nop.i 0
1030};;
1031
1032{ .mfi
1033 nop.m 0
1034 fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
1035 nop.i 0
1036};;
1037
1038{ .mfi
1039 nop.m 0
1040 fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1
1041 nop.i 0
1042}
1043{ .mfi
1044 nop.m 0
1045 fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
1046 nop.i 0
1047};;
1048
1049{ .mfi
1050 nop.m 0
1051 fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h=N*log2_lo+h
1052 nop.i 0
1053}
1054{ .mfi
1055 nop.m 0
1056 fma.s1 FR_r = FR_G, FR_S_lo, FR_r // r=G*S_lo+(G*S_hi-1)
1057 nop.i 0
1058};;
1059
1060{ .mfi
1061 nop.m 0
1062 fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3
1063 nop.i 0
1064}
1065{ .mfi
1066 nop.m 0
1067 fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
1068 nop.i 0
1069};;
1070
1071{ .mfi
1072 nop.m 0
1073 fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
1074 nop.i 0
1075}
1076{ .mfi
1077 nop.m 0
1078 fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
1079 nop.i 0
1080};;
1081
1082{ .mfi
1083 nop.m 0
1084 fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
1085 nop.i 0
1086};;
1087
1088{ .mfi
1089 nop.m 0
1090 fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
1091 nop.i 0
1092};;
1093
1094{ .mfi
1095 nop.m 0
0347518d 1096 fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo
d5efd131
MF
1097 // Y_lo=poly_hi+poly_lo
1098 nop.i 0
1099};;
1100
1101{ .mfb
1102 nop.m 0
1103 fadd.s0 FR_Res = FR_Y_lo,FR_Y_hi // Result=Y_lo+Y_hi
1104 br.ret.sptk b0 // Common exit for 2^-7 < x < inf
1105};;
1106
1107
1108huges_logl:
// Table-driven logarithm of FR_XLog_Hi, returned in FR_Res.
// Outline, as established by the instructions below:
//   1. Take the significand and exponent N of FR_XLog_Hi.
//   2. Three successive table lookups (indices Index1/2/3 derived from the
//      significand via pmpyshr2.u) yield G_i, H_i, h_i, i = 1..3.
//   3. G = G_1*G_2*G_3, H = H_1+H_2+H_3, h = h_1+h_2+h_3.
//   4. r = G*S_hi - 1, where S_hi carries the significand of FR_XLog_Hi.
//   5. Y_hi = N*log2_hi + H; Y_lo = (Q1*r^2 + r) + (poly(r)*r^3 + N*log2_lo + h).
//   6. Result = Y_lo + Y_hi (both final adds use .s0).
// All table pointers are formed relative to GR_ad_z_1, which is set up
// before this label (outside the lines shown here).
// NOTE(review): several comments below say "x+1" although the operand is
// FR_XLog_Hi; its definition is outside this section - confirm wording.
1109{ .mmi
1110 getf.sig GR_signif = FR_XLog_Hi // Get significand of x+1
1111 mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7
1112 nop.i 0
1113};;
1114
1115{ .mfi
1116 add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1
1117 nop.f 0
1118 add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_P
1119}
1120{ .mfi
1121 add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2
1122 nop.f 0
1123 add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2
1124};;
1125
1126{ .mfi
1127 add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3
1128 nop.f 0
1129 extr.u GR_Index1 = GR_signif, 59, 4 // Index1 = signif bits 59-62 (4 bits)
1130};;
1131
1132{ .mfi
1133 shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1
1134 nop.f 0
1135 extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of signif.
1136};;
1137
1138{ .mfi
1139 ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
1140 nop.f 0
1141 mov GR_exp_mask = 0x1FFFF // Create exponent mask
1142}
1143{ .mfi
1144 shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1
1145 nop.f 0
1146 mov GR_Bias = 0x0FFFF // Create exponent bias
1147};;
1148
1149{ .mfi
1150 ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
1151 fmerge.se FR_S_hi = f1,FR_XLog_Hi // S_hi = sign/exp of 1.0, signif of XLog_Hi
1152 nop.i 0
1153};;
1154
1155{ .mmi
1156 getf.exp GR_N = FR_XLog_Hi // Get N = exponent of x+1
1157 ldfd FR_h = [GR_ad_tbl_1] // Load h_1
1158 nop.i 0
1159};;
1160
1161{ .mfi
1162 ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
1163 nop.f 0
1164 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1
1165};;
1166
1167{ .mmi
1168 ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
0347518d 1169 sub GR_N = GR_N, GR_Bias // Subtract the exponent bias from N
d5efd131
MF
1170 mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80
1171};;
1172
1173{ .mfi
1174 ldfe FR_Q4 = [GR_ad_q],16 // Load Q4
1175 nop.f 0
1176 sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N)
1177};;
1178
1179{ .mmf
1180 ldfe FR_Q3 = [GR_ad_q],16 // Load Q3
1181 setf.sig FR_float_N = GR_N // Put integer N into significand (fcvt.xf below)
1182 nop.f 0
1183};;
1184
1185{ .mmi
1186 ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
1187 nop.m 0
0347518d 1188 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
d5efd131
MF
1189};;
1190
1191{ .mmi
1192 ldfe FR_Q1 = [GR_ad_q] // Load Q1
1193 shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2
1194 nop.i 0
1195};;
1196
1197{ .mmi
1198 ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
1199 shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2
1200 nop.i 0
1201};;
1202
1203{ .mmi
1204 ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
1205 nop.m 0
1206 nop.i 0
1207};;
1208
1209{ .mmf
1210 ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2
1211 setf.exp FR_2_to_minus_N = GR_minus_N // Form 2^(-N)
1212 nop.f 0
1213};;
1214
1215{ .mfi
1216 nop.m 0
1217 nop.f 0
1218 pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1*Z_2
1219};;
1220
0347518d 1221// WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!)
d5efd131
MF
1222// BECAUSE OF POSSIBLE 10 CLOCKS STALL!
1223// (Just nops added - nothing to do here)
1224
1225{ .mfi
1226 nop.m 0
1227 nop.f 0
1228 nop.i 0
1229};;
1230
1231{ .mfi
1232 nop.m 0
1233 nop.f 0
1234 nop.i 0
1235};;
1236
1237{ .mfi
1238 nop.m 0
1239 nop.f 0
1240 nop.i 0
1241};;
1242
1243{ .mfi
1244 nop.m 0
1245 nop.f 0
1246 extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
1247};;
1248
1249{ .mfi
1250 shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3
1251 fcvt.xf FR_float_N = FR_float_N // Convert integer N to floating point
1252 nop.i 0
1253};;
1254
1255{ .mfi
1256 ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3
1257 nop.f 0
1258 nop.i 0
1259};;
1260
1261{ .mfi
1262 ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
1263 fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
1264 nop.i 0
1265}
1266{ .mfi
1267 nop.m 0
1268 fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
1269 nop.i 0
1270};;
1271
1272{ .mmf
1273 nop.m 0
1274 nop.m 0
1275 fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
1276};;
1277
1278{ .mfi
1279 nop.m 0
1280 fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2)*G_3
1281 nop.i 0
1282}
1283{ .mfi
1284 nop.m 0
1285 fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2)+H_3
1286 nop.i 0
1287};;
1288
1289{ .mfi
1290 nop.m 0
1291 fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
1292 nop.i 0
1293};;
1294
1295{ .mfi
1296 nop.m 0
1297 fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1
1298 nop.i 0
1299}
1300{ .mfi
1301 nop.m 0
1302 fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
1303 nop.i 0
1304};;
1305
1306{ .mfi
1307 nop.m 0
1308 fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h = N*log2_lo+h
1309 nop.i 0
1310};;
1311
1312{ .mfi
1313 nop.m 0
1314 fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3
1315 nop.i 0
1316}
1317{ .mfi
1318 nop.m 0
1319 fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
1320 nop.i 0
1321};;
1322
1323{ .mfi
1324 nop.m 0
1325 fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
1326 nop.i 0
1327}
1328{ .mfi
1329 nop.m 0
1330 fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
1331 nop.i 0
1332};;
1333
1334{ .mfi
1335 nop.m 0
1336 fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
1337 nop.i 0
1338};;
1339
1340{ .mfi
1341 nop.m 0
1342 fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
1343 nop.i 0
1344};;
1345{ .mfi
1346 nop.m 0
0347518d 1347 fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo // Y_lo=poly_hi+poly_lo
d5efd131
MF
1348 nop.i 0
1349};;
1350{ .mfb
1351 nop.m 0
1352 fadd.s0 FR_Res = FR_Y_lo,FR_Y_hi // Result=Y_lo+Y_hi
1353 br.ret.sptk b0 // Common exit
1354};;
1355
1356
1357// NEAR ONE INTERVAL
// Three dependency chains are interleaved below and tagged in the comments:
//   $POLY$ - two polynomials in FR_XM1: FR_PP from FR_PP0..FR_PP5 and
//            FR_QQ from FR_QQ0..FR_QQ5, built from pairwise terms that are
//            combined using xm1^2 (FR_XM12).
//   &SQRT& - frsqrta seed plus refinement steps. In these comments "x"
//            denotes FR_2XM1; g (FR_GG) approximates sqrt(x), h (FR_HH)
//            approximates 0.5/sqrt(x), and gl (FR_GL) = d*h is a further
//            correction to g.
//   #DIV#  - frcpa seed plus refinement steps. In these comments a = FR_PP
//            and b = FR_QQ; the quotient a/b is produced as
//            FR_X_Hi + FR_X_lo.
// The tail combines the pieces as
//   result = gg + (gl - gl*x_lo - gl*x_hi - gg*x_lo - gg*x_hi)
// which equals (gg + gl) * (1 - (x_hi + x_lo)); only the last fma uses .s0.
// FR_XM1, FR_2XM1, FR_Half and the FR_PPn/FR_QQn coefficients are set up
// before this label (outside the lines shown here).
1358near_1:
0347518d
MF
1359{ .mfi
1360 nop.m 0
d5efd131 1361 frsqrta.s1 FR_Rcp, p0 = FR_2XM1 // Rcp = 1/sqrt(x) approx. (x = FR_2XM1) &SQRT&
0347518d 1362 nop.i 0
d5efd131
MF
1363};;
1364
0347518d
MF
1365{ .mfi
1366 nop.m 0
d5efd131 1367 fma.s1 FR_PV6 = FR_PP5, FR_XM1, FR_PP4 // pv6 = P5*xm1+P4 $POLY$
0347518d 1368 nop.i 0
d5efd131
MF
1369}
1370{ .mfi
0347518d 1371 nop.m 0
d5efd131 1372 fma.s1 FR_QV6 = FR_QQ5, FR_XM1, FR_QQ4 // qv6 = Q5*xm1+Q4 $POLY$
0347518d 1373 nop.i 0
d5efd131
MF
1374};;
1375
0347518d
MF
1376{ .mfi
1377 nop.m 0
d5efd131 1378 fma.s1 FR_PV4 = FR_PP3, FR_XM1, FR_PP2 // pv4 = P3*xm1+P2 $POLY$
0347518d 1379 nop.i 0
d5efd131
MF
1380}
1381{ .mfi
0347518d 1382 nop.m 0
d5efd131 1383 fma.s1 FR_QV4 = FR_QQ3, FR_XM1, FR_QQ2 // qv4 = Q3*xm1+Q2 $POLY$
0347518d 1384 nop.i 0
d5efd131
MF
1385};;
1386
0347518d
MF
1387{ .mfi
1388 nop.m 0
d5efd131 1389 fma.s1 FR_XM12 = FR_XM1, FR_XM1, f0 // xm1^2 = xm1 * xm1 $POLY$
0347518d 1390 nop.i 0
d5efd131
MF
1391};;
1392
0347518d
MF
1393{ .mfi
1394 nop.m 0
d5efd131 1395 fma.s1 FR_PV2 = FR_PP1, FR_XM1, FR_PP0 // pv2 = P1*xm1+P0 $POLY$
0347518d 1396 nop.i 0
d5efd131
MF
1397}
1398{ .mfi
0347518d 1399 nop.m 0
d5efd131 1400 fma.s1 FR_QV2 = FR_QQ1, FR_XM1, FR_QQ0 // qv2 = Q1*xm1+Q0 $POLY$
0347518d 1401 nop.i 0
d5efd131
MF
1402};;
1403
0347518d
MF
1404{ .mfi
1405 nop.m 0
1406 fma.s1 FR_GG = FR_Rcp, FR_2XM1, f0 // g = Rcp * x &SQRT&
1407 nop.i 0
d5efd131
MF
1408}
1409{ .mfi
0347518d 1410 nop.m 0
d5efd131 1411 fma.s1 FR_HH = FR_Half, FR_Rcp, f0 // h = 0.5 * Rcp &SQRT&
0347518d 1412 nop.i 0
d5efd131
MF
1413};;
1414
1415
0347518d
MF
1416{ .mfi
1417 nop.m 0
d5efd131 1418 fma.s1 FR_PV3 = FR_XM12, FR_PV6, FR_PV4//pv3=pv6*xm1^2+pv4 $POLY$
0347518d 1419 nop.i 0
d5efd131
MF
1420}
1421{ .mfi
0347518d 1422 nop.m 0
d5efd131 1423 fma.s1 FR_QV3 = FR_XM12, FR_QV6, FR_QV4//qv3=qv6*xm1^2+qv4 $POLY$
0347518d 1424 nop.i 0
d5efd131
MF
1425};;
1426
1427
0347518d
MF
1428{ .mfi
1429 nop.m 0
d5efd131 1430 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h &SQRT&
0347518d 1431 nop.i 0
d5efd131
MF
1432};;
1433
0347518d
MF
1434{ .mfi
1435 nop.m 0
d5efd131 1436 fma.s1 FR_PP = FR_XM12, FR_PV3, FR_PV2 //pp=pv3*xm1^2+pv2 $POLY$
0347518d 1437 nop.i 0
d5efd131
MF
1438}
1439{ .mfi
0347518d 1440 nop.m 0
d5efd131 1441 fma.s1 FR_QQ = FR_XM12, FR_QV3, FR_QV2 //qq=qv3*xm1^2+qv2 $POLY$
0347518d 1442 nop.i 0
d5efd131
MF
1443};;
1444
1445{ .mfi
0347518d 1446 nop.m 0
d5efd131 1447 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g &SQRT&
0347518d 1448 nop.i 0
d5efd131
MF
1449}
1450{ .mfi
0347518d 1451 nop.m 0
d5efd131 1452 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h &SQRT&
0347518d 1453 nop.i 0
d5efd131
MF
1454};;
1455
1456{ .mfi
0347518d 1457 nop.m 0
d5efd131 1458 frcpa.s1 FR_Y0,p0 = f1,FR_QQ // y = frcpa(b) #DIV#
0347518d 1459 nop.i 0
d5efd131
MF
1460}
1461{ .mfi
0347518d 1462 nop.m 0
d5efd131 1463 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g*h &SQRT&
0347518d 1464 nop.i 0
d5efd131
MF
1465};;
1466
1467{ .mfi
0347518d 1468 nop.m 0
d5efd131 1469 fma.s1 FR_Q0 = FR_PP,FR_Y0,f0 // q = a*y #DIV#
0347518d 1470 nop.i 0
d5efd131
MF
1471}
1472{ .mfi
0347518d 1473 nop.m 0
d5efd131 1474 fnma.s1 FR_E0 = FR_Y0,FR_QQ,f1 // e = 1 - b*y #DIV#
0347518d 1475 nop.i 0
d5efd131
MF
1476};;
1477
1478{ .mfi
0347518d
MF
1479 nop.m 0
1480 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g &SQRT&
1481 nop.i 0
d5efd131
MF
1482}
1483{ .mfi
0347518d 1484 nop.m 0
d5efd131 1485 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h &SQRT&
0347518d 1486 nop.i 0
d5efd131
MF
1487};;
1488
1489{ .mfi
0347518d 1490 nop.m 0
d5efd131 1491 fma.s1 FR_E2 = FR_E0,FR_E0,FR_E0 // e2 = e+e^2 #DIV#
0347518d 1492 nop.i 0
d5efd131
MF
1493}
1494{ .mfi
0347518d 1495 nop.m 0
d5efd131 1496 fma.s1 FR_E1 = FR_E0,FR_E0,f0 // e1 = e^2 #DIV#
0347518d 1497 nop.i 0
d5efd131
MF
1498};;
1499
1500{ .mfi
0347518d 1501 nop.m 0
d5efd131 1502 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h &SQRT&
0347518d 1503 nop.i 0
d5efd131
MF
1504}
1505{ .mfi
0347518d 1506 nop.m 0
d5efd131 1507 fnma.s1 FR_DD = FR_GG, FR_GG, FR_2XM1 // d = x - g * g &SQRT&
0347518d 1508 nop.i 0
d5efd131
MF
1509};;
1510
1511{ .mfi
0347518d 1512 nop.m 0
d5efd131 1513 fma.s1 FR_Y1 = FR_Y0,FR_E2,FR_Y0 // y1 = y+y*e2 #DIV#
0347518d 1514 nop.i 0
d5efd131
MF
1515}
1516{ .mfi
0347518d 1517 nop.m 0
d5efd131 1518 fma.s1 FR_E3 = FR_E1,FR_E1,FR_E0 // e3 = e+e1^2 #DIV#
0347518d 1519 nop.i 0
d5efd131
MF
1520};;
1521
1522{ .mfi
0347518d 1523 nop.m 0
d5efd131 1524 fma.s1 FR_GG = FR_DD, FR_HH, FR_GG // g = d * h + g &SQRT&
0347518d 1525 nop.i 0
d5efd131
MF
1526}
1527{ .mfi
0347518d 1528 nop.m 0
d5efd131 1529 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h &SQRT&
0347518d 1530 nop.i 0
d5efd131
MF
1531};;
1532
1533{ .mfi
0347518d 1534 nop.m 0
d5efd131 1535 fma.s1 FR_Y2 = FR_Y1,FR_E3,FR_Y0 // y2 = y+y1*e3 #DIV#
0347518d 1536 nop.i 0
d5efd131
MF
1537}
1538{ .mfi
0347518d 1539 nop.m 0
d5efd131 1540 fnma.s1 FR_R0 = FR_QQ,FR_Q0,FR_PP // r = a-b*q #DIV#
0347518d 1541 nop.i 0
d5efd131
MF
1542};;
1543
1544{ .mfi
0347518d
MF
1545 nop.m 0
1546 fnma.s1 FR_DD = FR_GG, FR_GG, FR_2XM1 // d = x - g * g &SQRT&
1547 nop.i 0
d5efd131
MF
1548};;
1549
1550{ .mfi
0347518d 1551 nop.m 0
d5efd131 1552 fnma.s1 FR_E4 = FR_QQ,FR_Y2,f1 // e4 = 1-b*y2 #DIV#
0347518d 1553 nop.i 0
d5efd131
MF
1554}
1555{ .mfi
0347518d 1556 nop.m 0
d5efd131 1557 fma.s1 FR_X_Hi = FR_R0,FR_Y2,FR_Q0 // x = q+r*y2 #DIV#
0347518d 1558 nop.i 0
d5efd131
MF
1559};;
1560
1561{ .mfi
0347518d 1562 nop.m 0
d5efd131 1563 fma.s1 FR_GL = FR_DD, FR_HH, f0 // gl = d * h &SQRT&
0347518d 1564 nop.i 0
d5efd131
MF
1565};;
1566
1567{ .mfi
0347518d 1568 nop.m 0
d5efd131 1569 fma.s1 FR_Y3 = FR_Y2,FR_E4,FR_Y2 // y3 = y2+y2*e4 #DIV#
0347518d 1570 nop.i 0
d5efd131
MF
1571}
1572{ .mfi
0347518d 1573 nop.m 0
d5efd131 1574 fnma.s1 FR_R1 = FR_QQ,FR_X_Hi,FR_PP // r1 = a-b*x #DIV#
0347518d 1575 nop.i 0
d5efd131
MF
1576};;
1577
1578{ .mfi
0347518d 1579 nop.m 0
d5efd131 1580 fma.s1 FR_HH = FR_GG, FR_X_Hi, f0 // hh = gg * x_hi (FR_HH reused; h was last read for gl)
0347518d 1581 nop.i 0
d5efd131
MF
1582}
1583{ .mfi
0347518d 1584 nop.m 0
d5efd131 1585 fma.s1 FR_LH = FR_GL, FR_X_Hi, f0 // lh = gl * x_hi
0347518d 1586 nop.i 0
d5efd131
MF
1587};;
1588
1589{ .mfi
0347518d 1590 nop.m 0
d5efd131 1591 fma.s1 FR_X_lo = FR_R1,FR_Y3,f0 // x_lo = r1*y3 #DIV#
0347518d 1592 nop.i 0
d5efd131
MF
1593};;
1594
1595{ .mfi
0347518d 1596 nop.m 0
d5efd131 1597 fma.s1 FR_LL = FR_GL, FR_X_lo, f0 // ll = gl*x_lo
0347518d 1598 nop.i 0
d5efd131
MF
1599}
1600{ .mfi
0347518d 1601 nop.m 0
d5efd131 1602 fma.s1 FR_HL = FR_GG, FR_X_lo, f0 // hl = gg * x_lo
0347518d 1603 nop.i 0
d5efd131
MF
1604};;
1605
1606{ .mfi
0347518d 1607 nop.m 0
d5efd131 1608 fms.s1 FR_Res = FR_GL, f1, FR_LL // res = gl - ll (fms: a*b - c)
0347518d 1609 nop.i 0
d5efd131
MF
1610};;
1611
1612{ .mfi
0347518d 1613 nop.m 0
d5efd131 1614 fms.s1 FR_Res = FR_Res, f1, FR_LH // res = res - lh
0347518d 1615 nop.i 0
d5efd131
MF
1616};;
1617
1618{ .mfi
0347518d 1619 nop.m 0
d5efd131 1620 fms.s1 FR_Res = FR_Res, f1, FR_HL // res = res - hl
0347518d 1621 nop.i 0
d5efd131
MF
1622};;
1623
1624{ .mfi
0347518d 1625 nop.m 0
d5efd131 1626 fms.s1 FR_Res = FR_Res, f1, FR_HH // res = res - hh
0347518d 1627 nop.i 0
d5efd131
MF
1628};;
1629
1630{ .mfb
0347518d 1631 nop.m 0
d5efd131
MF
1632 fma.s0 FR_Res = FR_Res, f1, FR_GG // result = res + gg
1633 br.ret.sptk b0 // Exit for near 1 path
1634};;
1635// NEAR ONE INTERVAL END
1636
1637
1638
1639
1640acoshl_lt_pone:
// Domain-error exit (per the branch comment below: x < 1.0).
// Copies the argument into FR_Arg_X, sets the error tag, produces a QNaN
// result in FR_Res while raising invalid, and branches to
// __libm_error_region, which stores FR_Arg_X and FR_Res to the stack
// before calling __libm_error_support.
1641{ .mfi
0347518d 1642 nop.m 0
d5efd131 1643 fmerge.s FR_Arg_X = FR_Arg, FR_Arg // Copy the argument for the error handler
0347518d 1644 nop.i 0
d5efd131
MF
1645};;
1646{ .mfb
// NOTE(review): 135 is presumably the acoshl domain-error tag understood by
// __libm_error_support - confirm against the error-support tag table.
1647 mov GR_Parameter_TAG = 135
1648 frcpa.s0 FR_Res,p0 = f0,f0 // get QNaN,and raise invalid
1649 br.cond.sptk __libm_error_region // exit if x < 1.0
1650};;
1651
1652GLOBAL_LIBM_END(acoshl)
1653
1654
1655
// __libm_error_region: local stub that hands an error case to
// __libm_error_support and returns the (possibly replaced) result in f8.
//
// On entry the caller has set FR_Arg_X, FR_Arg_Y, FR_Res and
// GR_Parameter_TAG. The stub allocates a 64-byte frame and lays it out as
// (offsets relative to sp after the adjustment):
//   sp+16 : FR_Arg_X  (parameter 1)  - address in GR_Parameter_X
//   sp+32 : FR_Arg_Y  (parameter 2)  - address in GR_Parameter_Y
//   sp+48 : FR_Res    (parameter 3)  - address in GR_Parameter_RESULT
// ar.pfs, gp and b0 are saved in GR_SAVE_PFS / GR_SAVE_GP / GR_SAVE_B0
// around the call and restored afterwards.
// NOTE(review): the GR_Parameter_* names are presumably aliases of the
// output registers read by __libm_error_support - their definitions are
// outside this section.
1656LOCAL_LIBM_ENTRY(__libm_error_region)
1657.prologue
1658{ .mfi
1659 add GR_Parameter_Y = -32,sp // Parameter 2 value
1660 nop.f 0
1661.save ar.pfs,GR_SAVE_PFS
1662 mov GR_SAVE_PFS = ar.pfs // Save ar.pfs
1663}
1664{ .mfi
1665.fframe 64
1666 add sp = -64,sp // Create new stack
1667 nop.f 0
1668 mov GR_SAVE_GP = gp // Save gp
1669};;
1670
1671{ .mmi
1672 stfe [GR_Parameter_Y] = FR_Arg_Y,16 // Parameter 2 to stack
1673 add GR_Parameter_X = 16,sp // Parameter 1 address
1674.save b0,GR_SAVE_B0
1675 mov GR_SAVE_B0 = b0 // Save b0
1676};;
1677
1678.body
// After the post-incremented store above, GR_Parameter_Y points at the
// result slot (sp+48); it is copied to GR_Parameter_RESULT, used to store
// FR_Res, and then moved back by 16 to address parameter 2 again.
1679{ .mib
1680 stfe [GR_Parameter_X] = FR_Arg_X // Parameter 1 to stack
1681 add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
0347518d 1682 nop.b 0
d5efd131
MF
1683}
1684{ .mib
1685 stfe [GR_Parameter_Y] = FR_Res // Parameter 3 to stack
1686 add GR_Parameter_Y = -16,GR_Parameter_Y
1687 br.call.sptk b0 = __libm_error_support# // Error handling function
1688};;
1689
// Recompute the result-slot address from sp rather than reusing the value
// held in GR_Parameter_RESULT before the call.
1690{ .mmi
1691 nop.m 0
1692 nop.m 0
1693 add GR_Parameter_RESULT = 48,sp
1694};;
1695
1696{ .mmi
1697 ldfe f8 = [GR_Parameter_RESULT] // Get return res
1698.restore sp
1699 add sp = 64,sp // Restore stack pointer
1700 mov b0 = GR_SAVE_B0 // Restore return address
1701};;
1702
1703{ .mib
1704 mov gp = GR_SAVE_GP // Restore gp
1705 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
1706 br.ret.sptk b0 // Return
1707};;
1708
1709LOCAL_LIBM_END(__libm_error_region#)
1710
1711.type __libm_error_support#,@function
1712.global __libm_error_support#
1713
1714
1715
1716