]> git.ipfire.org Git - thirdparty/glibc.git/blame - ports/sysdeps/ia64/fpu/s_atanl.S
Move all files into ports/ subdirectory in preparation for merge with glibc
[thirdparty/glibc.git] / ports / sysdeps / ia64 / fpu / s_atanl.S
CommitLineData
d5efd131
MF
1.file "atanl.s"
2
3
4// Copyright (c) 2000 - 2005, Intel Corporation
5// All rights reserved.
6//
7// Contributed 2000 by the Intel Numerics Group, Intel Corporation
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// * Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// * Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// * The name of Intel Corporation may not be used to endorse or promote
21// products derived from this software without specific prior written
22// permission.
23
0347518d
MF
24// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
d5efd131 26// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
0347518d 27// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
d5efd131 28// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
0347518d
MF
29// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
30// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
31// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
d5efd131 32// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
0347518d
MF
33// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35//
d5efd131 36// Intel Corporation is the author of this code, and requests that all
0347518d 37// problem reports or change requests be submitted to it directly at
d5efd131
MF
38// http://www.intel.com/software/products/opensource/libraries/num.htm.
39//
40//
41//*********************************************************************
42//
43// History
44// 02/02/00 (hand-optimized)
45// 04/04/00 Unwind support added
46// 08/15/00 Bundle added after call to __libm_error_support to properly
47// set [the previously overwritten] GR_Parameter_RESULT.
48// 03/13/01 Fixed flags when denormal raised on intermediate result
49// 01/08/02 Improved speed.
50// 02/06/02 Corrected .section statement
51// 05/20/02 Cleaned up namespace and sf0 syntax
52// 02/10/03 Reordered header: .section, .global, .proc, .align;
53// used data8 for long double table values
54// 03/31/05 Reformatted delimiters between data tables
55//
56//*********************************************************************
57//
58// Function: atanl(x) = inverse tangent(x), for double extended x values
59// Function: atan2l(y,x) = atan(y/x), for double extended y, x values
60//
61// API
62//
63// long double atanl (long double x)
64// long double atan2l (long double y, long double x)
65//
66//*********************************************************************
67//
68// Resources Used:
69//
70// Floating-Point Registers: f8 (Input and Return Value)
71// f9 (Input for atan2l)
72// f10-f15, f32-f83
73//
74// General Purpose Registers:
75// r32-r51
76// r49-r52 (Arguments to error support for 0,0 case)
77//
78// Predicate Registers: p6-p15
79//
80//*********************************************************************
81//
82// IEEE Special Conditions:
83//
84// Denormal fault raised on denormal inputs
0347518d 85// Underflow exceptions may occur
d5efd131
MF
86// Special error handling for the y=0 and x=0 case
87// Inexact raised when appropriate by algorithm
88//
89// atanl(SNaN) = QNaN
90// atanl(QNaN) = QNaN
91// atanl(+/-0) = +/- 0
0347518d 92// atanl(+/-Inf) = +/-pi/2
d5efd131
MF
93//
94// atan2l(Any NaN for x or y) = QNaN
0347518d
MF
95// atan2l(+/-0,x) = +/-0 for x > 0
96// atan2l(+/-0,x) = +/-pi for x < 0
97// atan2l(+/-0,+0) = +/-0
98// atan2l(+/-0,-0) = +/-pi
d5efd131
MF
99// atan2l(y,+/-0) = pi/2 y > 0
100// atan2l(y,+/-0) = -pi/2 y < 0
101// atan2l(+/-y, Inf) = +/-0 for finite y > 0
0347518d
MF
102// atan2l(+/-Inf, x) = +/-pi/2 for finite x
103// atan2l(+/-y, -Inf) = +/-pi for finite y > 0
d5efd131
MF
104// atan2l(+/-Inf, Inf) = +/-pi/4
105// atan2l(+/-Inf, -Inf) = +/-3pi/4
106//
107//*********************************************************************
108//
109// Mathematical Description
110// ---------------------------
111//
112// The function ATANL( Arg_Y, Arg_X ) returns the "argument"
113// or the "phase" of the complex number
114//
115// Arg_X + i Arg_Y
116//
117// or equivalently, the angle in radians from the positive
118// x-axis to the line joining the origin and the point
119// (Arg_X,Arg_Y)
120//
121//
122// (Arg_X, Arg_Y) x
123// \
124// \
125// \
126// \
127// \ angle between is ATANL(Arg_Y,Arg_X)
128
129
130
131
132// \
133// ------------------> X-axis
134
135// Origin
136//
137// Moreover, this angle is reported in the range [-pi,pi] thus
138//
139// -pi <= ATANL( Arg_Y, Arg_X ) <= pi.
140//
141// From the geometry, it is easy to define ATANL when one of
142// Arg_X or Arg_Y is +-0 or +-inf:
143//
144//
145// \ Y |
146// X \ | +0 | -0 | +inf | -inf | finite non-zero
147// \ | | | | |
148// ______________________________________________________
149// | | | |
150// +-0 | Invalid/ | pi/2 | -pi/2 | sign(Y)*pi/2
151// | qNaN | | |
152// --------------------------------------------------------
153// | | | | |
154// +inf | +0 | -0 | pi/4 | -pi/4 | sign(Y)*0
155// --------------------------------------------------------
156// | | | | |
157// -inf | +pi | -pi | 3pi/4 | -3pi/4 | sign(Y)*pi
158// --------------------------------------------------------
159// finite | X>0? | pi/2 | -pi/2 | normal case
160// non-zero| sign(Y)*0: | | |
161// | sign(Y)*pi | | |
162//
163//
164// One must take note that ATANL is NOT the arctangent of the
165// value Arg_Y/Arg_X; but rather ATANL and arctan are related
166// in a slightly more complicated way as follows:
167//
168// Let U := max(|Arg_X|, |Arg_Y|); V := min(|Arg_X|, |Arg_Y|);
169// sign_X be the sign bit of Arg_X, i.e., sign_X is 0 or 1;
170// s_X be the sign of Arg_X, i.e., s_X = (-1)^sign_X;
171//
172// sign_Y be the sign bit of Arg_Y, i.e., sign_Y is 0 or 1;
173// s_Y be the sign of Arg_Y, i.e., s_Y = (-1)^sign_Y;
174//
175// swap be 0 if |Arg_X| >= |Arg_Y| and 1 otherwise.
176//
177// Then, ATANL(Arg_Y, Arg_X) =
178//
179// / arctan(V/U) \ sign_X = 0 & swap = 0
180// | pi/2 - arctan(V/U) | sign_X = 0 & swap = 1
181// s_Y * | |
182// | pi - arctan(V/U) | sign_X = 1 & swap = 0
183// \ pi/2 + arctan(V/U) / sign_X = 1 & swap = 1
184//
185//
186// This relationship also suggest that the algorithm's major
187// task is to calculate arctan(V/U) for 0 < V <= U; and the
188// final Result is given by
189//
190// s_Y * { (P_hi + P_lo) + sigma * arctan(V/U) }
191//
192// where
193//
194// (P_hi,P_lo) represents M(sign_X,swap)*(pi/2) accurately
195//
196// M(sign_X,swap) = 0 for sign_X = 0 and swap = 0
197// 1 for swap = 1
198// 2 for sign_X = 1 and swap = 0
199//
200// and
201//
202// sigma = { (sign_X XOR swap) : -1.0 : 1.0 }
203//
204// = (-1) ^ ( sign_X XOR swap )
205//
206// Both (P_hi,P_lo) and sigma can be stored in a table and fetched
207// using (sign_X,swap) as an index. (P_hi, P_lo) can be stored as a
208// double-precision, and single-precision pair; and sigma can
209// obviously be just a single-precision number.
210//
211// In the algorithm we propose, arctan(V/U) is calculated to high accuracy
212// as A_hi + A_lo. Consequently, the Result ATANL( Arg_Y, Arg_X ) is
213// given by
214//
215// s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo)
216//
217// We now discuss the calculation of arctan(V/U) for 0 < V <= U.
218//
219// For (V/U) < 2^(-3), we use a simple polynomial of the form
220//
221// z + z^3*(P_1 + z^2*(P_2 + z^2*(P_3 + ... + P_8)))
222//
223// where z = V/U.
224//
225// For the sake of accuracy, the first term "z" must approximate V/U to
226// extra precision. For z^3 and higher power, a working precision
227// approximation to V/U suffices. Thus, we obtain:
228//
229// z_hi + z_lo = V/U to extra precision and
230// z = V/U to working precision
231//
232// The value arctan(V/U) is delivered as two pieces (A_hi, A_lo)
233//
234// (A_hi,A_lo) = (z_hi, z^3*(P_1 + ... + P_8) + z_lo).
235//
236//
237// For 2^(-3) <= (V/U) <= 1, we use a table-driven approach.
238// Consider
239//
240// (V/U) = 2^k * 1.b_1 b_2 .... b_63 b_64 b_65 ....
241//
242// Define
243//
244// z_hi = 2^k * 1.b_1 b_2 b_3 b_4 1
245//
246// then
247// / \
248// | (V/U) - z_hi |
249
250// arctan(V/U) = arctan(z_hi) + acrtan| -------------- |
251// | 1 + (V/U)*z_hi |
252// \ /
253//
254// / \
255// | V - z_hi*U |
256
257// = arctan(z_hi) + acrtan| -------------- |
258// | U + V*z_hi |
259// \ /
260//
261// = arctan(z_hi) + acrtan( V' / U' )
262//
263//
264// where
265//
266// V' = V - U*z_hi; U' = U + V*z_hi.
267//
268// Let
269//
270// w_hi + w_lo = V'/U' to extra precision and
271// w = V'/U' to working precision
272//
273// then we can approximate arctan(V'/U') by
274//
275// arctan(V'/U') = w_hi + w_lo
276// + w^3*(Q_1 + w^2*(Q_2 + w^2*(Q_3 + w^2*Q_4)))
277//
278// = w_hi + w_lo + poly
279//
280// Finally, arctan(z_hi) is calculated beforehand and stored in a table
281// as Tbl_hi, Tbl_lo. Thus,
282//
283// (A_hi, A_lo) = (Tbl_hi, w_hi+(poly+(w_lo+Tbl_lo)))
284//
285// This completes the mathematical description.
286//
287//
288// Algorithm
289// -------------
290//
291// Step 0. Check for unsupported format.
292//
293// If
294// ( expo(Arg_X) not zero AND msb(Arg_X) = 0 ) OR
295// ( expo(Arg_Y) not zero AND msb(Arg_Y) = 0 )
296//
297// then one of the arguments is unsupported. Generate an
298// invalid and return qNaN.
299//
300// Step 1. Initialize
301//
302// Normalize Arg_X and Arg_Y and set the following
303//
304// sign_X := sign_bit(Arg_X)
305// s_Y := (sign_bit(Arg_Y)==0? 1.0 : -1.0)
306// swap := (|Arg_X| >= |Arg_Y|? 0 : 1 )
307// U := max( |Arg_X|, |Arg_Y| )
308// V := min( |Arg_X|, |Arg_Y| )
309//
310// execute: frcpa E, pred, V, U
311// If pred is 0, go to Step 5 for special cases handling.
312//
313// Step 2. Decide on branch.
314//
315// Q := E * V
316// If Q < 2^(-3) go to Step 4 for simple polynomial case.
317//
318// Step 3. Table-driven algorithm.
319//
320// Q is represented as
321//
322// 2^(-k) * 1.b_1 b_2 b_3 ... b_63; k = 0,-1,-2,-3
323//
324// and that if k = 0, b_1 = b_2 = b_3 = b_4 = 0.
325//
326// Define
327//
328// z_hi := 2^(-k) * 1.b_1 b_2 b_3 b_4 1
329//
330// (note that there are 49 possible values of z_hi).
331//
332// ...We now calculate V' and U'. While V' is representable
333// ...as a 64-bit number because of cancellation, U' is
334// ...not in general a 64-bit number. Obtaining U' accurately
335// ...requires two working precision numbers
336//
337// U_prime_hi := U + V * z_hi ...WP approx. to U'
338// U_prime_lo := ( U - U_prime_hi ) + V*z_hi ...observe order
339// V_prime := V - U * z_hi ...this is exact
340//
341// C_hi := frcpa (1.0, U_prime_hi) ...C_hi approx 1/U'_hi
342//
343// loop 3 times
344// C_hi := C_hi + C_hi*(1.0 - C_hi*U_prime_hi)
345//
346// ...at this point C_hi is (1/U_prime_hi) to roughly 64 bits
347//
348// w_hi := V_prime * C_hi ...w_hi is V_prime/U_prime to
349// ...roughly working precision
350//
351// ...note that we want w_hi + w_lo to approximate
352// ...V_prime/(U_prime_hi + U_prime_lo) to extra precision
353// ...but for now, w_hi is good enough for the polynomial
354// ...calculation.
355//
356// wsq := w_hi*w_hi
357// poly := w_hi*wsq*(Q_1 + wsq*(Q_2 + wsq*(Q_3 + wsq*Q_4)))
358//
359// Fetch
360// (Tbl_hi, Tbl_lo) = atan(z_hi) indexed by (k,b_1,b_2,b_3,b_4)
361// ...Tbl_hi is a double-precision number
362// ...Tbl_lo is a single-precision number
363//
364// (P_hi, P_lo) := M(sign_X,swap)*(Pi_by_2_hi, Pi_by_2_lo)
365// ...as discussed previous. Again; the implementation can
366// ...chose to fetch P_hi and P_lo from a table indexed by
367// ...(sign_X, swap).
368// ...P_hi is a double-precision number;
369// ...P_lo is a single-precision number.
370//
371// ...calculate w_lo so that w_hi + w_lo is V'/U' accurately
372// w_lo := ((V_prime - w_hi*U_prime_hi) -
373// w_hi*U_prime_lo) * C_hi ...observe order
374//
375//
376// ...Ready to deliver arctan(V'/U') as A_hi, A_lo
377// A_hi := Tbl_hi
378// A_lo := w_hi + (poly + (Tbl_lo + w_lo)) ...observe order
379//
380// ...Deliver final Result
381// ...s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo)
382//
383// sigma := ( (sign_X XOR swap) ? -1.0 : 1.0 )
384// ...sigma can be obtained by a table lookup using
385// ...(sign_X,swap) as index and stored as single precision
386// ...sigma should be calculated earlier
387//
388// P_hi := s_Y*P_hi
389// A_hi := s_Y*A_hi
390//
391// Res_hi := P_hi + sigma*A_hi ...this is exact because
392// ...both P_hi and Tbl_hi
393// ...are double-precision
394// ...and |Tbl_hi| > 2^(-4)
395// ...P_hi is either 0 or
396// ...between (1,4)
397//
398// Res_lo := sigma*A_lo + P_lo
399//
400// Return Res_hi + s_Y*Res_lo in user-defined rounding control
401//
402// Step 4. Simple polynomial case.
403//
404// ...E and Q are inherited from Step 2.
405//
406// A_hi := Q ...Q is inherited from Step 2 Q approx V/U
407//
408// loop 3 times
409// E := E + E2(1.0 - E*U1
410// ...at this point E approximates 1/U to roughly working precision
411//
412// z := V * E ...z approximates V/U to roughly working precision
413// zsq := z * z
414// z4 := zsq * zsq; z8 := z4 * z4
415//
416// poly1 := P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8)))
417// poly2 := zsq*(P_1 + zsq*(P_2 + zsq*P_3))
418//
419// poly := poly1 + z8*poly2
420//
421// z_lo := (V - A_hi*U)*E
422//
423// A_lo := z*poly + z_lo
424// ...A_hi, A_lo approximate arctan(V/U) accurately
425//
426// (P_hi, P_lo) := M(sign_X,swap)*(Pi_by_2_hi, Pi_by_2_lo)
427// ...one can store the M(sign_X,swap) as single precision
428// ...values
429//
430// ...Deliver final Result
431// ...s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo)
432//
433// sigma := ( (sign_X XOR swap) ? -1.0 : 1.0 )
434// ...sigma can be obtained by a table lookup using
435// ...(sign_X,swap) as index and stored as single precision
436// ...sigma should be calculated earlier
437//
438// P_hi := s_Y*P_hi
439// A_hi := s_Y*A_hi
440//
441// Res_hi := P_hi + sigma*A_hi ...need to compute
442// ...P_hi + sigma*A_hi
443// ...exactly
444//
445// tmp := (P_hi - Res_hi) + sigma*A_hi
446//
447// Res_lo := s_Y*(sigma*A_lo + P_lo) + tmp
448//
449// Return Res_hi + Res_lo in user-defined rounding control
450//
451// Step 5. Special Cases
452//
453// These are detected early in the function by fclass instructions.
454//
455// We are in one of those special cases when X or Y is 0,+-inf or NaN
456//
457// If one of X and Y is NaN, return X+Y (which will generate
458// invalid in case one is a signaling NaN). Otherwise,
459// return the Result as described in the table
460//
461//
462//
463// \ Y |
464// X \ | +0 | -0 | +inf | -inf | finite non-zero
465// \ | | | | |
466// ______________________________________________________
467// | | | |
468// +-0 | Invalid/ | pi/2 | -pi/2 | sign(Y)*pi/2
469// | qNaN | | |
470// --------------------------------------------------------
471// | | | | |
472// +inf | +0 | -0 | pi/4 | -pi/4 | sign(Y)*0
473// --------------------------------------------------------
474// | | | | |
475// -inf | +pi | -pi | 3pi/4 | -3pi/4 | sign(Y)*pi
476// --------------------------------------------------------
477// finite | X>0? | pi/2 | -pi/2 |
478// non-zero| sign(Y)*0: | | | N/A
479// | sign(Y)*pi | | |
480//
481//
482
483ArgY_orig = f8
484Result = f8
485FR_RESULT = f8
486ArgX_orig = f9
487ArgX = f10
488FR_X = f10
489ArgY = f11
490FR_Y = f11
491s_Y = f12
492U = f13
493V = f14
494E = f15
495Q = f32
496z_hi = f33
497U_prime_hi = f34
498U_prime_lo = f35
499V_prime = f36
500C_hi = f37
501w_hi = f38
502w_lo = f39
503wsq = f40
504poly = f41
505Tbl_hi = f42
506Tbl_lo = f43
507P_hi = f44
508P_lo = f45
509A_hi = f46
510A_lo = f47
511sigma = f48
512Res_hi = f49
513Res_lo = f50
514Z = f52
515zsq = f53
516z4 = f54
517z8 = f54
518poly1 = f55
519poly2 = f56
520z_lo = f57
521tmp = f58
522P_1 = f59
523Q_1 = f60
524P_2 = f61
525Q_2 = f62
526P_3 = f63
527Q_3 = f64
528P_4 = f65
529Q_4 = f66
530P_5 = f67
531P_6 = f68
532P_7 = f69
533P_8 = f70
534U_hold = f71
535TWO_TO_NEG3 = f72
536C_hi_hold = f73
537E_hold = f74
538M = f75
539ArgX_abs = f76
540ArgY_abs = f77
541Result_lo = f78
542A_temp = f79
543FR_temp = f80
544Xsq = f81
545Ysq = f82
546tmp_small = f83
547
548GR_SAVE_PFS = r33
549GR_SAVE_B0 = r34
550GR_SAVE_GP = r35
551sign_X = r36
0347518d
MF
552sign_Y = r37
553swap = r38
554table_ptr1 = r39
555table_ptr2 = r40
556k = r41
557lookup = r42
558exp_ArgX = r43
559exp_ArgY = r44
560exponent_Q = r45
561significand_Q = r46
562special = r47
563sp_exp_Q = r48
564sp_exp_4sig_Q = r49
565table_base = r50
d5efd131
MF
566int_temp = r51
567
568GR_Parameter_X = r49
569GR_Parameter_Y = r50
570GR_Parameter_RESULT = r51
571GR_Parameter_TAG = r52
572GR_temp = r52
573
574RODATA
0347518d 575.align 16
d5efd131
MF
576
577LOCAL_OBJECT_START(Constants_atan)
578// double pi/2
579data8 0x3FF921FB54442D18
580// single lo_pi/2, two**(-3)
581data4 0x248D3132, 0x3E000000
582data8 0xAAAAAAAAAAAAAAA3, 0xBFFD // P_1
583data8 0xCCCCCCCCCCCC54B2, 0x3FFC // P_2
584data8 0x9249249247E4D0C2, 0xBFFC // P_3
585data8 0xE38E38E058870889, 0x3FFB // P_4
586data8 0xBA2E895B290149F8, 0xBFFB // P_5
587data8 0x9D88E6D4250F733D, 0x3FFB // P_6
588data8 0x884E51FFFB8745A0, 0xBFFB // P_7
589data8 0xE1C7412B394396BD, 0x3FFA // P_8
590data8 0xAAAAAAAAAAAAA52F, 0xBFFD // Q_1
591data8 0xCCCCCCCCC75B60D3, 0x3FFC // Q_2
592data8 0x924923AD011F1940, 0xBFFC // Q_3
593data8 0xE36F716D2A5F89BD, 0x3FFB // Q_4
594//
595// Entries Tbl_hi (double precision)
596// B = 1+Index/16+1/32 Index = 0
597// Entries Tbl_lo (single precision)
598// B = 1+Index/16+1/32 Index = 0
599//
0347518d 600data8 0x3FE9A000A935BD8E
d5efd131
MF
601data4 0x23ACA08F, 0x00000000
602//
603// Entries Tbl_hi (double precision) Index = 0,1,...,15
604// B = 2^(-1)*(1+Index/16+1/32)
605// Entries Tbl_lo (single precision)
606// Index = 0,1,...,15 B = 2^(-1)*(1+Index/16+1/32)
607//
0347518d 608data8 0x3FDE77EB7F175A34
d5efd131 609data4 0x238729EE, 0x00000000
0347518d 610data8 0x3FE0039C73C1A40B
d5efd131 611data4 0x249334DB, 0x00000000
0347518d 612data8 0x3FE0C6145B5B43DA
d5efd131 613data4 0x22CBA7D1, 0x00000000
0347518d 614data8 0x3FE1835A88BE7C13
d5efd131 615data4 0x246310E7, 0x00000000
0347518d 616data8 0x3FE23B71E2CC9E6A
d5efd131 617data4 0x236210E5, 0x00000000
0347518d 618data8 0x3FE2EE628406CBCA
d5efd131 619data4 0x2462EAF5, 0x00000000
0347518d 620data8 0x3FE39C391CD41719
d5efd131 621data4 0x24B73EF3, 0x00000000
0347518d 622data8 0x3FE445065B795B55
d5efd131 623data4 0x24C11260, 0x00000000
0347518d 624data8 0x3FE4E8DE5BB6EC04
d5efd131 625data4 0x242519EE, 0x00000000
0347518d 626data8 0x3FE587D81F732FBA
d5efd131 627data4 0x24D4346C, 0x00000000
0347518d 628data8 0x3FE6220D115D7B8D
d5efd131 629data4 0x24ED487B, 0x00000000
0347518d 630data8 0x3FE6B798920B3D98
d5efd131 631data4 0x2495FF1E, 0x00000000
0347518d 632data8 0x3FE748978FBA8E0F
d5efd131 633data4 0x223D9531, 0x00000000
0347518d 634data8 0x3FE7D528289FA093
d5efd131 635data4 0x242B0411, 0x00000000
0347518d 636data8 0x3FE85D69576CC2C5
d5efd131 637data4 0x2335B374, 0x00000000
0347518d 638data8 0x3FE8E17AA99CC05D
d5efd131
MF
639data4 0x24C27CFB, 0x00000000
640//
641// Entries Tbl_hi (double precision) Index = 0,1,...,15
642// B = 2^(-2)*(1+Index/16+1/32)
643// Entries Tbl_lo (single precision)
644// Index = 0,1,...,15 B = 2^(-2)*(1+Index/16+1/32)
645//
0347518d 646data8 0x3FD025FA510665B5
d5efd131
MF
647data4 0x24263482, 0x00000000
648data8 0x3FD1151A362431C9
649data4 0x242C8DC9, 0x00000000
650data8 0x3FD2025567E47C95
651data4 0x245CF9BA, 0x00000000
652data8 0x3FD2ED987A823CFE
653data4 0x235C892C, 0x00000000
654data8 0x3FD3D6D129271134
655data4 0x2389BE52, 0x00000000
656data8 0x3FD4BDEE586890E6
657data4 0x24436471, 0x00000000
658data8 0x3FD5A2E0175E0F4E
659data4 0x2389DBD4, 0x00000000
660data8 0x3FD685979F5FA6FD
661data4 0x2476D43F, 0x00000000
662data8 0x3FD7660752817501
663data4 0x24711774, 0x00000000
664data8 0x3FD84422B8DF95D7
665data4 0x23EBB501, 0x00000000
666data8 0x3FD91FDE7CD0C662
667data4 0x23883A0C, 0x00000000
668data8 0x3FD9F93066168001
669data4 0x240DF63F, 0x00000000
670data8 0x3FDAD00F5422058B
671data4 0x23FE261A, 0x00000000
672data8 0x3FDBA473378624A5
673data4 0x23A8CD0E, 0x00000000
674data8 0x3FDC76550AAD71F8
675data4 0x2422D1D0, 0x00000000
676data8 0x3FDD45AEC9EC862B
677data4 0x2344A109, 0x00000000
678//
679// Entries Tbl_hi (double precision) Index = 0,1,...,15
680// B = 2^(-3)*(1+Index/16+1/32)
681// Entries Tbl_lo (single precision)
682// Index = 0,1,...,15 B = 2^(-3)*(1+Index/16+1/32)
683//
684data8 0x3FC068D584212B3D
685data4 0x239874B6, 0x00000000
686data8 0x3FC1646541060850
687data4 0x2335E774, 0x00000000
688data8 0x3FC25F6E171A535C
689data4 0x233E36BE, 0x00000000
690data8 0x3FC359E8EDEB99A3
691data4 0x239680A3, 0x00000000
692data8 0x3FC453CEC6092A9E
693data4 0x230FB29E, 0x00000000
694data8 0x3FC54D18BA11570A
695data4 0x230C1418, 0x00000000
696data8 0x3FC645BFFFB3AA73
697data4 0x23F0564A, 0x00000000
698data8 0x3FC73DBDE8A7D201
699data4 0x23D4A5E1, 0x00000000
700data8 0x3FC8350BE398EBC7
701data4 0x23D4ADDA, 0x00000000
702data8 0x3FC92BA37D050271
703data4 0x23BCB085, 0x00000000
704data8 0x3FCA217E601081A5
705data4 0x23BC841D, 0x00000000
706data8 0x3FCB1696574D780B
707data4 0x23CF4A8E, 0x00000000
708data8 0x3FCC0AE54D768466
709data4 0x23BECC90, 0x00000000
710data8 0x3FCCFE654E1D5395
711data4 0x2323DCD2, 0x00000000
712data8 0x3FCDF110864C9D9D
713data4 0x23F53F3A, 0x00000000
714data8 0x3FCEE2E1451D980C
715data4 0x23CCB11F, 0x00000000
716//
717data8 0x400921FB54442D18, 0x3CA1A62633145C07 // PI two doubles
718data8 0x3FF921FB54442D18, 0x3C91A62633145C07 // PI_by_2 two dbles
719data8 0x3FE921FB54442D18, 0x3C81A62633145C07 // PI_by_4 two dbles
720data8 0x4002D97C7F3321D2, 0x3C9A79394C9E8A0A // 3PI_by_4 two dbles
721LOCAL_OBJECT_END(Constants_atan)
722
723
724.section .text
725GLOBAL_IEEE754_ENTRY(atanl)
726
727// Use common code with atan2l after setting x=1.0
728{ .mfi
729 alloc r32 = ar.pfs, 0, 17, 4, 0
730 fma.s1 Ysq = ArgY_orig, ArgY_orig, f0 // Form y*y
731 nop.i 999
732}
733{ .mfi
734 addl table_ptr1 = @ltoff(Constants_atan#), gp // Address of table pointer
735 fma.s1 Xsq = f1, f1, f0 // Form x*x
736 nop.i 999
737}
738;;
739
740{ .mfi
741 ld8 table_ptr1 = [table_ptr1] // Get table pointer
742 fnorm.s1 ArgY = ArgY_orig
743 nop.i 999
744}
745{ .mfi
746 nop.m 999
747 fnorm.s1 ArgX = f1
748 nop.i 999
749}
750;;
751
752{ .mfi
753 getf.exp sign_X = f1 // Get signexp of x
754 fmerge.s ArgX_abs = f0, f1 // Form |x|
755 nop.i 999
756}
757{ .mfi
758 nop.m 999
759 fnorm.s1 ArgX_orig = f1
760 nop.i 999
761}
762;;
763
764{ .mfi
765 getf.exp sign_Y = ArgY_orig // Get signexp of y
766 fmerge.s ArgY_abs = f0, ArgY_orig // Form |y|
767 mov table_base = table_ptr1 // Save base pointer to tables
768}
769;;
770
771{ .mfi
772 ldfd P_hi = [table_ptr1],8 // Load double precision hi part of pi
773 fclass.m p8,p0 = ArgY_orig, 0x1e7 // Test y natval, nan, inf, zero
0347518d 774 nop.i 999
d5efd131
MF
775}
776;;
777
778{ .mfi
779 ldfps P_lo, TWO_TO_NEG3 = [table_ptr1], 8 // Load P_lo and constant 2^-3
0347518d
MF
780 nop.f 999
781 nop.i 999
d5efd131
MF
782}
783{ .mfi
784 nop.m 999
785 fma.s1 M = f1, f1, f0 // Set M = 1.0
0347518d 786 nop.i 999
d5efd131
MF
787}
788;;
789
790//
791// Check for everything - if false, then must be pseudo-zero
792// or pseudo-nan (IA unsupporteds).
793//
794{ .mfb
795 nop.m 999
796 fclass.m p0,p12 = f1, 0x1FF // Test x unsupported
797(p8) br.cond.spnt ATANL_Y_SPECIAL // Branch if y natval, nan, inf, zero
798}
799;;
800
801// U = max(ArgX_abs,ArgY_abs)
802// V = min(ArgX_abs,ArgY_abs)
803{ .mfi
804 nop.m 999
805 fcmp.ge.s1 p6,p7 = Xsq, Ysq // Test for |x| >= |y| using squares
0347518d 806 nop.i 999
d5efd131
MF
807}
808{ .mfb
809 nop.m 999
810 fma.s1 V = ArgX_abs, f1, f0 // Set V assuming |x| < |y|
811 br.cond.sptk ATANL_COMMON // Branch to common code
812}
813;;
814
815GLOBAL_IEEE754_END(atanl)
816
817GLOBAL_IEEE754_ENTRY(atan2l)
818
819{ .mfi
820 alloc r32 = ar.pfs, 0, 17, 4, 0
821 fma.s1 Ysq = ArgY_orig, ArgY_orig, f0 // Form y*y
822 nop.i 999
823}
824{ .mfi
825 addl table_ptr1 = @ltoff(Constants_atan#), gp // Address of table pointer
826 fma.s1 Xsq = ArgX_orig, ArgX_orig, f0 // Form x*x
827 nop.i 999
828}
829;;
830
831{ .mfi
832 ld8 table_ptr1 = [table_ptr1] // Get table pointer
833 fnorm.s1 ArgY = ArgY_orig
834 nop.i 999
835}
836{ .mfi
837 nop.m 999
838 fnorm.s1 ArgX = ArgX_orig
839 nop.i 999
840}
841;;
842
843{ .mfi
844 getf.exp sign_X = ArgX_orig // Get signexp of x
845 fmerge.s ArgX_abs = f0, ArgX_orig // Form |x|
846 nop.i 999
847}
848;;
849
850{ .mfi
851 getf.exp sign_Y = ArgY_orig // Get signexp of y
852 fmerge.s ArgY_abs = f0, ArgY_orig // Form |y|
853 mov table_base = table_ptr1 // Save base pointer to tables
854}
855;;
856
857{ .mfi
858 ldfd P_hi = [table_ptr1],8 // Load double precision hi part of pi
859 fclass.m p8,p0 = ArgY_orig, 0x1e7 // Test y natval, nan, inf, zero
0347518d 860 nop.i 999
d5efd131
MF
861}
862;;
863
864{ .mfi
865 ldfps P_lo, TWO_TO_NEG3 = [table_ptr1], 8 // Load P_lo and constant 2^-3
866 fclass.m p9,p0 = ArgX_orig, 0x1e7 // Test x natval, nan, inf, zero
0347518d 867 nop.i 999
d5efd131
MF
868}
869{ .mfi
870 nop.m 999
871 fma.s1 M = f1, f1, f0 // Set M = 1.0
0347518d 872 nop.i 999
d5efd131
MF
873}
874;;
875
876//
877// Check for everything - if false, then must be pseudo-zero
878// or pseudo-nan (IA unsupporteds).
879//
880{ .mfb
881 nop.m 999
882 fclass.m p0,p12 = ArgX_orig, 0x1FF // Test x unsupported
883(p8) br.cond.spnt ATANL_Y_SPECIAL // Branch if y natval, nan, inf, zero
884}
885;;
886
887// U = max(ArgX_abs,ArgY_abs)
888// V = min(ArgX_abs,ArgY_abs)
889{ .mfi
890 nop.m 999
891 fcmp.ge.s1 p6,p7 = Xsq, Ysq // Test for |x| >= |y| using squares
0347518d 892 nop.i 999
d5efd131
MF
893}
894{ .mfb
895 nop.m 999
896 fma.s1 V = ArgX_abs, f1, f0 // Set V assuming |x| < |y|
897(p9) br.cond.spnt ATANL_X_SPECIAL // Branch if x natval, nan, inf, zero
898}
899;;
900
901// Now common code for atanl and atan2l
902ATANL_COMMON:
903{ .mfi
904 nop.m 999
905 fclass.m p0,p13 = ArgY_orig, 0x1FF // Test y unsupported
906 shr sign_X = sign_X, 17 // Get sign bit of x
907}
908{ .mfi
909 nop.m 999
910 fma.s1 U = ArgY_abs, f1, f0 // Set U assuming |x| < |y|
911 adds table_ptr1 = 176, table_ptr1 // Point to Q4
912}
913;;
914
915{ .mfi
916(p6) add swap = r0, r0 // Set swap=0 if |x| >= |y|
917(p6) frcpa.s1 E, p0 = ArgY_abs, ArgX_abs // Compute E if |x| >= |y|
918 shr sign_Y = sign_Y, 17 // Get sign bit of y
919}
920{ .mfb
921 nop.m 999
922(p6) fma.s1 V = ArgY_abs, f1, f0 // Set V if |x| >= |y|
923(p12) br.cond.spnt ATANL_UNSUPPORTED // Branch if x unsupported
924}
925;;
926
927// Set p8 if y >=0
928// Set p9 if y < 0
929// Set p10 if |x| >= |y| and x >=0
930// Set p11 if |x| >= |y| and x < 0
931{ .mfi
932 cmp.eq p8, p9 = 0, sign_Y // Test for y >= 0
933(p7) frcpa.s1 E, p0 = ArgX_abs, ArgY_abs // Compute E if |x| < |y|
934(p7) add swap = 1, r0 // Set swap=1 if |x| < |y|
935}
936{ .mfb
937(p6) cmp.eq.unc p10, p11 = 0, sign_X // If |x| >= |y|, test for x >= 0
938(p6) fma.s1 U = ArgX_abs, f1, f0 // Set U if |x| >= |y|
939(p13) br.cond.spnt ATANL_UNSUPPORTED // Branch if y unsupported
940}
941;;
942
943//
944// if p8, s_Y = 1.0
945// if p9, s_Y = -1.0
946//
947.pred.rel "mutex",p8,p9
948{ .mfi
949 nop.m 999
950(p8) fadd.s1 s_Y = f0, f1 // If y >= 0 set s_Y = 1.0
951 nop.i 999
952}
953{ .mfi
954 nop.m 999
955(p9) fsub.s1 s_Y = f0, f1 // If y < 0 set s_Y = -1.0
956 nop.i 999
957}
958;;
959
960.pred.rel "mutex",p10,p11
961{ .mfi
962 nop.m 999
963(p10) fsub.s1 M = M, f1 // If |x| >= |y| and x >=0, set M=0
964 nop.i 999
965}
966{ .mfi
967 nop.m 999
968(p11) fadd.s1 M = M, f1 // If |x| >= |y| and x < 0, set M=2.0
969 nop.i 999
970}
971;;
972
973{ .mfi
974 nop.m 999
975 fcmp.eq.s0 p0, p9 = ArgX_orig, ArgY_orig // Dummy to set denormal flag
976 nop.i 999
977}
978// *************************************************
979// ********************* STEP2 *********************
980// *************************************************
981//
982// Q = E * V
983//
984{ .mfi
985 nop.m 999
986 fmpy.s1 Q = E, V
987 nop.i 999
988}
989;;
990
991{ .mfi
992 nop.m 999
993 fnma.s1 E_hold = E, U, f1 // E_hold = 1.0 - E*U (1) if POLY path
994 nop.i 999
995}
996;;
997
0347518d 998// Create a single precision representation of the signexp of Q with the
d5efd131
MF
999// 4 most significant bits of the significand followed by a 1 and then 18 0's
1000{ .mfi
1001 nop.m 999
1002 fmpy.s1 P_hi = M, P_hi
1003 dep.z special = 0x1, 18, 1 // Form 0x0000000000040000
1004}
1005{ .mfi
1006 nop.m 999
1007 fmpy.s1 P_lo = M, P_lo
1008 add table_ptr2 = 32, table_ptr1
1009}
1010;;
1011
1012{ .mfi
1013 nop.m 999
1014 fma.s1 A_temp = Q, f1, f0 // Set A_temp if POLY path
1015 nop.i 999
1016}
1017{ .mfi
1018 nop.m 999
1019 fma.s1 E = E, E_hold, E // E = E + E*E_hold (1) if POLY path
1020 nop.i 999
1021}
1022;;
1023
1024//
1025// Is Q < 2**(-3)?
1026// swap = xor(swap,sign_X)
1027//
1028{ .mfi
1029 nop.m 999
1030 fcmp.lt.s1 p9, p0 = Q, TWO_TO_NEG3 // Test Q < 2^-3
1031 xor swap = sign_X, swap
1032}
1033;;
1034
1035// P_hi = s_Y * P_hi
1036{ .mmf
1037 getf.exp exponent_Q = Q // Get signexp of Q
1038 cmp.eq.unc p7, p6 = 0x00000, swap
1039 fmpy.s1 P_hi = s_Y, P_hi
1040}
1041;;
1042
1043//
1044// if (PR_1) sigma = -1.0
1045// if (PR_2) sigma = 1.0
1046//
1047{ .mfi
1048 getf.sig significand_Q = Q // Get significand of Q
1049(p6) fsub.s1 sigma = f0, f1
1050 nop.i 999
1051}
1052{ .mfb
1053(p9) add table_ptr1 = 128, table_base // Point to P8 if POLY path
1054(p7) fadd.s1 sigma = f0, f1
1055(p9) br.cond.spnt ATANL_POLY // Branch to POLY if 0 < Q < 2^-3
1056}
1057;;
1058
1059//
1060// *************************************************
1061// ******************** STEP3 **********************
1062// *************************************************
1063//
1064// lookup = b_1 b_2 b_3 B_4
1065//
1066{ .mmi
1067 nop.m 999
1068 nop.m 999
1069 andcm k = 0x0003, exponent_Q // k=0,1,2,3 for exp_Q=0,-1,-2,-3
1070}
1071;;
1072
1073//
0347518d 1074// Generate sign_exp_Q b_1 b_2 b_3 b_4 1 0 0 0 ... 0 in single precision
d5efd131
MF
1075// representation. Note sign of Q is always 0.
1076//
1077{ .mfi
1078 cmp.eq p8, p9 = 0x0000, k // Test k=0
1079 nop.f 999
1080 extr.u lookup = significand_Q, 59, 4 // Extract b_1 b_2 b_3 b_4 for index
1081}
1082{ .mfi
1083 sub sp_exp_Q = 0x7f, k // Form single prec biased exp of Q
1084 nop.f 999
1085 sub k = k, r0, 1 // Decrement k
1086}
1087;;
1088
1089// Form pointer to B index table
1090{ .mfi
1091 ldfe Q_4 = [table_ptr1], -16 // Load Q_4
1092 nop.f 999
1093(p9) shl k = k, 8 // k = 0, 256, or 512
1094}
1095{ .mfi
1096(p9) shladd table_ptr2 = lookup, 4, table_ptr2
1097 nop.f 999
1098 shladd sp_exp_4sig_Q = sp_exp_Q, 4, lookup // Shift and add in 4 high bits
1099}
1100;;
1101
1102{ .mmi
1103(p8) add table_ptr2 = -16, table_ptr2 // Pointer if original k was 0
1104(p9) add table_ptr2 = k, table_ptr2 // Pointer if k was 1, 2, 3
1105 dep special = sp_exp_4sig_Q, special, 19, 13 // Form z_hi as single prec
1106}
1107;;
1108
1109// z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0
1110{ .mmi
1111 ldfd Tbl_hi = [table_ptr2], 8 // Load Tbl_hi from index table
1112;;
1113 setf.s z_hi = special // Form z_hi
1114 nop.i 999
1115}
1116{ .mmi
1117 ldfs Tbl_lo = [table_ptr2], 8 // Load Tbl_lo from index table
1118;;
1119 ldfe Q_3 = [table_ptr1], -16 // Load Q_3
1120 nop.i 999
1121}
1122;;
1123
1124{ .mmi
1125 ldfe Q_2 = [table_ptr1], -16 // Load Q_2
1126 nop.m 999
1127 nop.i 999
1128}
1129;;
1130
1131{ .mmf
1132 ldfe Q_1 = [table_ptr1], -16 // Load Q_1
1133 nop.m 999
1134 nop.f 999
1135}
1136;;
1137
1138{ .mfi
1139 nop.m 999
1140 fma.s1 U_prime_hi = V, z_hi, U // U_prime_hi = U + V * z_hi
1141 nop.i 999
1142}
1143{ .mfi
1144 nop.m 999
1145 fnma.s1 V_prime = U, z_hi, V // V_prime = V - U * z_hi
1146 nop.i 999
1147}
1148;;
1149
1150{ .mfi
1151 nop.m 999
1152 mov A_hi = Tbl_hi // Start with A_hi = Tbl_hi
1153 nop.i 999
1154}
1155;;
1156
1157{ .mfi
1158 nop.m 999
1159 fsub.s1 U_hold = U, U_prime_hi // U_hold = U - U_prime_hi
1160 nop.i 999
1161}
1162;;
1163
1164{ .mfi
1165 nop.m 999
1166 frcpa.s1 C_hi, p0 = f1, U_prime_hi // C_hi = frcpa(1,U_prime_hi)
1167 nop.i 999
1168}
1169;;
1170
1171{ .mfi
1172 nop.m 999
1173 fmpy.s1 A_hi = s_Y, A_hi // A_hi = s_Y * A_hi
1174 nop.i 999
1175}
1176;;
1177
1178{ .mfi
1179 nop.m 999
1180 fma.s1 U_prime_lo = z_hi, V, U_hold // U_prime_lo = U_hold + V * z_hi
1181 nop.i 999
1182}
1183;;
1184
1185// C_hi_hold = 1 - C_hi * U_prime_hi (1)
1186{ .mfi
1187 nop.m 999
0347518d 1188 fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
d5efd131
MF
1189 nop.i 999
1190}
1191;;
1192
1193{ .mfi
1194 nop.m 999
1195 fma.s1 Res_hi = sigma, A_hi, P_hi // Res_hi = P_hi + sigma * A_hi
1196 nop.i 999
1197}
1198;;
1199
1200{ .mfi
1201 nop.m 999
1202 fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (1)
1203 nop.i 999
1204}
1205;;
1206
1207// C_hi_hold = 1 - C_hi * U_prime_hi (2)
1208{ .mfi
1209 nop.m 999
1210 fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
1211 nop.i 999
1212}
1213;;
1214
1215{ .mfi
1216 nop.m 999
1217 fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (2)
1218 nop.i 999
1219}
1220;;
1221
1222// C_hi_hold = 1 - C_hi * U_prime_hi (3)
1223{ .mfi
1224 nop.m 999
0347518d 1225 fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
d5efd131
MF
1226 nop.i 999
1227}
1228;;
1229
1230{ .mfi
1231 nop.m 999
1232 fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (3)
1233 nop.i 999
1234}
1235;;
1236
1237{ .mfi
1238 nop.m 999
1239 fmpy.s1 w_hi = V_prime, C_hi // w_hi = V_prime * C_hi
1240 nop.i 999
1241}
1242;;
1243
1244{ .mfi
1245 nop.m 999
1246 fmpy.s1 wsq = w_hi, w_hi // wsq = w_hi * w_hi
1247 nop.i 999
1248}
1249{ .mfi
1250 nop.m 999
1251 fnma.s1 w_lo = w_hi, U_prime_hi, V_prime // w_lo = V_prime-w_hi*U_prime_hi
1252 nop.i 999
1253}
1254;;
1255
1256{ .mfi
1257 nop.m 999
1258 fma.s1 poly = wsq, Q_4, Q_3 // poly = Q_3 + wsq * Q_4
1259 nop.i 999
1260}
1261{ .mfi
1262 nop.m 999
1263 fnma.s1 w_lo = w_hi, U_prime_lo, w_lo // w_lo = w_lo - w_hi * U_prime_lo
1264 nop.i 999
1265}
1266;;
1267
1268{ .mfi
1269 nop.m 999
1270 fma.s1 poly = wsq, poly, Q_2 // poly = Q_2 + wsq * poly
1271 nop.i 999
1272}
1273{ .mfi
1274 nop.m 999
1275 fmpy.s1 w_lo = C_hi, w_lo // w_lo = = w_lo * C_hi
1276 nop.i 999
1277}
1278;;
1279
1280{ .mfi
1281 nop.m 999
1282 fma.s1 poly = wsq, poly, Q_1 // poly = Q_1 + wsq * poly
1283 nop.i 999
1284}
1285{ .mfi
1286 nop.m 999
1287 fadd.s1 A_lo = Tbl_lo, w_lo // A_lo = Tbl_lo + w_lo
1288 nop.i 999
1289}
1290;;
1291
1292{ .mfi
1293 nop.m 999
1294 fmpy.s0 Q_1 = Q_1, Q_1 // Dummy operation to raise inexact
1295 nop.i 999
1296}
1297;;
1298
1299{ .mfi
1300 nop.m 999
1301 fmpy.s1 poly = wsq, poly // poly = wsq * poly
1302 nop.i 999
1303}
1304;;
1305
1306{ .mfi
1307 nop.m 999
1308 fmpy.s1 poly = w_hi, poly // poly = w_hi * poly
1309 nop.i 999
1310}
1311;;
1312
1313{ .mfi
1314 nop.m 999
1315 fadd.s1 A_lo = A_lo, poly // A_lo = A_lo + poly
1316 nop.i 999
1317}
1318;;
1319
1320{ .mfi
1321 nop.m 999
1322 fadd.s1 A_lo = A_lo, w_hi // A_lo = A_lo + w_hi
1323 nop.i 999
1324}
1325;;
1326
1327{ .mfi
1328 nop.m 999
1329 fma.s1 Res_lo = sigma, A_lo, P_lo // Res_lo = P_lo + sigma * A_lo
1330 nop.i 999
1331}
1332;;
1333
1334//
1335// Result = Res_hi + Res_lo * s_Y (User Supplied Rounding Mode)
1336//
1337{ .mfb
1338 nop.m 999
1339 fma.s0 Result = Res_lo, s_Y, Res_hi
1340 br.ret.sptk b0 // Exit table path 2^-3 <= V/U < 1
1341}
1342;;
1343
1344
0347518d 1345ATANL_POLY:
d5efd131
MF
1346// Here if 0 < V/U < 2^-3
1347//
1348// ***********************************************
1349// ******************** STEP4 ********************
1350// ***********************************************
1351
1352//
1353// Following:
1354// Iterate 3 times E = E + E*(1.0 - E*U)
1355// Also load P_8, P_7, P_6, P_5, P_4
1356//
1357{ .mfi
1358 ldfe P_8 = [table_ptr1], -16 // Load P_8
1359 fnma.s1 z_lo = A_temp, U, V // z_lo = V - A_temp * U
1360 nop.i 999
1361}
1362{ .mfi
1363 nop.m 999
1364 fnma.s1 E_hold = E, U, f1 // E_hold = 1.0 - E*U (2)
1365 nop.i 999
1366}
1367;;
1368
1369{ .mmi
1370 ldfe P_7 = [table_ptr1], -16 // Load P_7
1371;;
1372 ldfe P_6 = [table_ptr1], -16 // Load P_6
1373 nop.i 999
1374}
1375;;
1376
1377{ .mfi
1378 ldfe P_5 = [table_ptr1], -16 // Load P_5
1379 fma.s1 E = E, E_hold, E // E = E + E_hold*E (2)
1380 nop.i 999
1381}
1382;;
1383
1384{ .mmi
1385 ldfe P_4 = [table_ptr1], -16 // Load P_4
1386;;
1387 ldfe P_3 = [table_ptr1], -16 // Load P_3
1388 nop.i 999
1389}
1390;;
1391
1392{ .mfi
1393 ldfe P_2 = [table_ptr1], -16 // Load P_2
1394 fnma.s1 E_hold = E, U, f1 // E_hold = 1.0 - E*U (3)
1395 nop.i 999
1396}
1397{ .mlx
1398 nop.m 999
1399 movl int_temp = 0x24005 // Signexp for small neg number
1400}
1401;;
1402
1403{ .mmf
1404 ldfe P_1 = [table_ptr1], -16 // Load P_1
1405 setf.exp tmp_small = int_temp // Form small neg number
1406 fma.s1 E = E, E_hold, E // E = E + E_hold*E (3)
1407}
1408;;
1409
1410//
1411//
1412// At this point E approximates 1/U to roughly working precision
1413// Z = V*E approximates V/U
1414//
1415{ .mfi
1416 nop.m 999
1417 fmpy.s1 Z = V, E // Z = V * E
1418 nop.i 999
1419}
1420{ .mfi
1421 nop.m 999
1422 fmpy.s1 z_lo = z_lo, E // z_lo = z_lo * E
1423 nop.i 999
1424}
1425;;
1426
1427//
1428// Now what we want to do is
1429// poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8)))
1430// poly2 = zsq*(P_1 + zsq*(P_2 + zsq*P_3))
1431//
1432//
1433// Fixup added to force inexact later -
1434// A_hi = A_temp + z_lo
1435// z_lo = (A_temp - A_hi) + z_lo
1436//
1437{ .mfi
1438 nop.m 999
1439 fmpy.s1 zsq = Z, Z // zsq = Z * Z
1440 nop.i 999
1441}
1442{ .mfi
1443 nop.m 999
1444 fadd.s1 A_hi = A_temp, z_lo // A_hi = A_temp + z_lo
1445 nop.i 999
1446}
1447;;
1448
1449{ .mfi
1450 nop.m 999
1451 fma.s1 poly1 = zsq, P_8, P_7 // poly1 = P_7 + zsq * P_8
1452 nop.i 999
1453}
1454{ .mfi
1455 nop.m 999
1456 fma.s1 poly2 = zsq, P_3, P_2 // poly2 = P_2 + zsq * P_3
1457 nop.i 999
1458}
1459;;
1460
1461{ .mfi
1462 nop.m 999
1463 fmpy.s1 z4 = zsq, zsq // z4 = zsq * zsq
1464 nop.i 999
1465}
1466{ .mfi
1467 nop.m 999
1468 fsub.s1 A_temp = A_temp, A_hi // A_temp = A_temp - A_hi
1469 nop.i 999
1470}
1471;;
1472
1473{ .mfi
1474 nop.m 999
1475 fmerge.s tmp = A_hi, A_hi // Copy tmp = A_hi
1476 nop.i 999
1477}
1478;;
1479
1480{ .mfi
1481 nop.m 999
1482 fma.s1 poly1 = zsq, poly1, P_6 // poly1 = P_6 + zsq * poly1
1483 nop.i 999
1484}
1485{ .mfi
1486 nop.m 999
1487 fma.s1 poly2 = zsq, poly2, P_1 // poly2 = P_2 + zsq * poly2
1488 nop.i 999
1489}
1490;;
1491
1492{ .mfi
1493 nop.m 999
1494 fmpy.s1 z8 = z4, z4 // z8 = z4 * z4
1495 nop.i 999
1496}
1497{ .mfi
1498 nop.m 999
1499 fadd.s1 z_lo = A_temp, z_lo // z_lo = (A_temp - A_hi) + z_lo
1500 nop.i 999
1501}
1502;;
1503
1504{ .mfi
1505 nop.m 999
1506 fma.s1 poly1 = zsq, poly1, P_5 // poly1 = P_5 + zsq * poly1
1507 nop.i 999
1508}
1509{ .mfi
1510 nop.m 999
1511 fmpy.s1 poly2 = poly2, zsq // poly2 = zsq * poly2
1512 nop.i 999
1513}
1514;;
1515
1516// Create small GR double in case need to raise underflow
1517{ .mfi
1518 nop.m 999
1519 fma.s1 poly1 = zsq, poly1, P_4 // poly1 = P_4 + zsq * poly1
1520 dep GR_temp = -1,r0,0,53
1521}
1522;;
1523
1524// Create small double in case need to raise underflow
1525{ .mfi
0347518d 1526 setf.d FR_temp = GR_temp
d5efd131
MF
1527 fma.s1 poly = z8, poly1, poly2 // poly = poly2 + z8 * poly1
1528 nop.i 999
1529}
1530;;
1531
1532{ .mfi
1533 nop.m 999
1534 fma.s1 A_lo = Z, poly, z_lo // A_lo = z_lo + Z * poly
1535 nop.i 999
1536}
1537;;
1538
1539{ .mfi
1540 nop.m 999
1541 fadd.s1 A_hi = tmp, A_lo // A_hi = tmp + A_lo
1542 nop.i 999
1543}
1544;;
1545
1546{ .mfi
1547 nop.m 999
1548 fsub.s1 tmp = tmp, A_hi // tmp = tmp - A_hi
1549 nop.i 999
1550}
1551{ .mfi
1552 nop.m 999
1553 fmpy.s1 A_hi = s_Y, A_hi // A_hi = s_Y * A_hi
1554 nop.i 999
1555}
1556;;
1557
1558{ .mfi
1559 nop.m 999
1560 fadd.s1 A_lo = tmp, A_lo // A_lo = tmp + A_lo
1561 nop.i 999
1562}
1563{ .mfi
1564 nop.m 999
1565 fma.s1 Res_hi = sigma, A_hi, P_hi // Res_hi = P_hi + sigma * A_hi
1566 nop.i 999
1567}
1568;;
1569
1570{ .mfi
1571 nop.m 999
1572 fsub.s1 tmp = P_hi, Res_hi // tmp = P_hi - Res_hi
1573 nop.i 999
1574}
1575;;
1576
1577//
1578// Test if A_lo is zero
1579//
1580{ .mfi
1581 nop.m 999
1582 fclass.m p6,p0 = A_lo, 0x007 // Test A_lo = 0
1583 nop.i 999
1584}
1585;;
1586
1587{ .mfi
1588 nop.m 999
1589(p6) mov A_lo = tmp_small // If A_lo zero, make very small
1590 nop.i 999
1591}
1592;;
1593
1594{ .mfi
1595 nop.m 999
1596 fma.s1 tmp = A_hi, sigma, tmp // tmp = sigma * A_hi + tmp
1597 nop.i 999
1598}
1599{ .mfi
1600 nop.m 999
1601 fma.s1 sigma = A_lo, sigma, P_lo // sigma = A_lo * sigma + P_lo
1602 nop.i 999
1603}
1604;;
1605
1606{ .mfi
1607 nop.m 999
1608 fma.s1 Res_lo = s_Y, sigma, tmp // Res_lo = s_Y * sigma + tmp
1609 nop.i 999
1610}
1611;;
1612
1613//
1614// Test if Res_lo is denormal
1615//
1616{ .mfi
1617 nop.m 999
1618 fclass.m p14, p15 = Res_lo, 0x0b
1619 nop.i 999
1620}
1621;;
1622
1623//
1624// Compute Result = Res_lo + Res_hi. Use s3 if Res_lo is denormal.
1625//
1626{ .mfi
1627 nop.m 999
1628(p14) fadd.s3 Result = Res_lo, Res_hi // Result for Res_lo denormal
1629 nop.i 999
1630}
1631{ .mfi
1632 nop.m 999
1633(p15) fadd.s0 Result = Res_lo, Res_hi // Result for Res_lo normal
1634 nop.i 999
1635}
1636;;
1637
0347518d 1638//
d5efd131 1639// If Res_lo is denormal test if Result equals zero
0347518d 1640//
d5efd131
MF
1641{ .mfi
1642 nop.m 999
1643(p14) fclass.m.unc p14, p0 = Result, 0x07
1644 nop.i 999
1645}
1646;;
1647
1648//
1649// If Res_lo is denormal and Result equals zero, raise inexact, underflow
1650// by squaring small double
1651//
1652{ .mfb
1653 nop.m 999
1654(p14) fmpy.d.s0 FR_temp = FR_temp, FR_temp
1655 br.ret.sptk b0 // Exit POLY path, 0 < Q < 2^-3
1656}
1657;;
1658
1659
0347518d 1660ATANL_UNSUPPORTED:
d5efd131
MF
1661{ .mfb
1662 nop.m 999
0347518d 1663 fmpy.s0 Result = ArgX,ArgY
d5efd131
MF
1664 br.ret.sptk b0
1665}
1666;;
1667
1668// Here if y natval, nan, inf, zero
1669ATANL_Y_SPECIAL:
1670// Here if x natval, nan, inf, zero
1671ATANL_X_SPECIAL:
1672{ .mfi
1673 nop.m 999
1674 fclass.m p13,p12 = ArgY_orig, 0x0c3 // Test y nan
1675 nop.i 999
1676}
1677;;
1678
1679{ .mfi
1680 nop.m 999
1681 fclass.m p15,p14 = ArgY_orig, 0x103 // Test y natval
1682 nop.i 999
1683}
1684;;
1685
1686{ .mfi
1687 nop.m 999
1688(p12) fclass.m p13,p0 = ArgX_orig, 0x0c3 // Test x nan
1689 nop.i 999
1690}
1691;;
1692
1693{ .mfi
1694 nop.m 999
1695(p14) fclass.m p15,p0 = ArgX_orig, 0x103 // Test x natval
1696 nop.i 999
1697}
1698;;
1699
1700{ .mfb
1701 nop.m 999
1702(p13) fmpy.s0 Result = ArgX_orig, ArgY_orig // Result nan if x or y nan
1703(p13) br.ret.spnt b0 // Exit if x or y nan
1704}
1705;;
1706
1707{ .mfb
1708 nop.m 999
1709(p15) fmpy.s0 Result = ArgX_orig, ArgY_orig // Result natval if x or y natval
1710(p15) br.ret.spnt b0 // Exit if x or y natval
1711}
1712;;
1713
1714
1715// Here if x or y inf or zero
0347518d 1716ATANL_SPECIAL_HANDLING:
d5efd131
MF
1717{ .mfi
1718 nop.m 999
1719 fclass.m p6, p7 = ArgY_orig, 0x007 // Test y zero
1720 mov special = 992 // Offset to table
1721}
1722;;
1723
1724{ .mfb
1725 add table_ptr1 = table_base, special // Point to 3pi/4
1726 fcmp.eq.s0 p0, p9 = ArgX_orig, ArgY_orig // Dummy to set denormal flag
1727(p7) br.cond.spnt ATANL_ArgY_Not_ZERO // Branch if y not zero
1728}
1729;;
1730
1731// Here if y zero
1732{ .mmf
1733 ldfd Result = [table_ptr1], 8 // Get pi high
1734 nop.m 999
1735 fclass.m p14, p0 = ArgX, 0x035 // Test for x>=+0
1736}
1737;;
1738
1739{ .mmf
1740 nop.m 999
1741 ldfd Result_lo = [table_ptr1], -8 // Get pi lo
1742 fclass.m p15, p0 = ArgX, 0x036 // Test for x<=-0
1743}
1744;;
1745
1746//
1747// Return sign_Y * 0 when ArgX > +0
1748//
1749{ .mfi
1750 nop.m 999
1751(p14) fmerge.s Result = ArgY, f0 // If x>=+0, y=0, hi sgn(y)*0
1752 nop.i 999
1753}
1754;;
1755
1756{ .mfi
1757 nop.m 999
1758 fclass.m p13, p0 = ArgX, 0x007 // Test for x=0
1759 nop.i 999
1760}
1761;;
1762
1763{ .mfi
1764 nop.m 999
1765(p14) fmerge.s Result_lo = ArgY, f0 // If x>=+0, y=0, lo sgn(y)*0
1766 nop.i 999
1767}
1768;;
1769
1770{ .mfi
1771(p13) mov GR_Parameter_TAG = 36 // Error tag for x=0, y=0
1772 nop.f 999
1773 nop.i 999
1774}
1775;;
1776
1777//
1778// Return sign_Y * pi when ArgX < -0
1779//
1780{ .mfi
1781 nop.m 999
1782(p15) fmerge.s Result = ArgY, Result // If x<0, y=0, hi=sgn(y)*pi
1783 nop.i 999
1784}
1785;;
1786
1787{ .mfi
1788 nop.m 999
1789(p15) fmerge.s Result_lo = ArgY, Result_lo // If x<0, y=0, lo=sgn(y)*pi
1790 nop.i 999
1791}
1792;;
1793
1794//
1795// Call error support function for atan(0,0)
1796//
1797{ .mfb
1798 nop.m 999
1799 fadd.s0 Result = Result, Result_lo
1800(p13) br.cond.spnt __libm_error_region // Branch if atan(0,0)
1801}
1802;;
1803
1804{ .mib
1805 nop.m 999
1806 nop.i 999
1807 br.ret.sptk b0 // Exit for y=0, x not 0
1808}
1809;;
1810
1811// Here if y not zero
0347518d 1812ATANL_ArgY_Not_ZERO:
d5efd131
MF
1813{ .mfi
1814 nop.m 999
1815 fclass.m p0, p10 = ArgY, 0x023 // Test y inf
1816 nop.i 999
1817}
1818;;
1819
1820{ .mfb
1821 nop.m 999
1822 fclass.m p6, p0 = ArgX, 0x017 // Test for 0 <= |x| < inf
1823(p10) br.cond.spnt ATANL_ArgY_Not_INF // Branch if 0 < |y| < inf
1824}
1825;;
1826
1827// Here if y=inf
1828//
1829// Return +PI/2 when ArgY = +Inf and ArgX = +/-0 or normal
1830// Return -PI/2 when ArgY = -Inf and ArgX = +/-0 or normal
1831// Return +PI/4 when ArgY = +Inf and ArgX = +Inf
1832// Return -PI/4 when ArgY = -Inf and ArgX = +Inf
1833// Return +3PI/4 when ArgY = +Inf and ArgX = -Inf
1834// Return -3PI/4 when ArgY = -Inf and ArgX = -Inf
1835//
1836{ .mfi
1837 nop.m 999
1838 fclass.m p7, p0 = ArgX, 0x021 // Test for x=+inf
1839 nop.i 999
1840}
1841;;
1842
1843{ .mfi
0347518d 1844(p6) add table_ptr1 = 16, table_ptr1 // Point to pi/2, if x finite
d5efd131
MF
1845 fclass.m p8, p0 = ArgX, 0x022 // Test for x=-inf
1846 nop.i 999
1847}
1848;;
1849
1850{ .mmi
1851(p7) add table_ptr1 = 32, table_ptr1 // Point to pi/4 if x=+inf
1852;;
1853(p8) add table_ptr1 = 48, table_ptr1 // Point to 3pi/4 if x=-inf
1854
1855 nop.i 999
1856}
1857;;
1858
1859{ .mmi
1860 ldfd Result = [table_ptr1], 8 // Load pi/2, pi/4, or 3pi/4 hi
1861;;
1862 ldfd Result_lo = [table_ptr1], -8 // Load pi/2, pi/4, or 3pi/4 lo
1863 nop.i 999
1864}
1865;;
1866
1867{ .mfi
1868 nop.m 999
1869 fmerge.s Result = ArgY, Result // Merge sgn(y) in hi
1870 nop.i 999
1871}
1872;;
1873
1874{ .mfi
1875 nop.m 999
1876 fmerge.s Result_lo = ArgY, Result_lo // Merge sgn(y) in lo
1877 nop.i 999
1878}
1879;;
1880
1881{ .mfb
1882 nop.m 999
1883 fadd.s0 Result = Result, Result_lo // Compute complete result
1884 br.ret.sptk b0 // Exit for y=inf
1885}
1886;;
1887
1888// Here if y not INF, and x=0 or INF
0347518d 1889ATANL_ArgY_Not_INF:
d5efd131
MF
1890//
1891// Return +PI/2 when ArgY NOT Inf, ArgY > 0 and ArgX = +/-0
1892// Return -PI/2 when ArgY NOT Inf, ArgY < 0 and ArgX = +/-0
1893// Return +0 when ArgY NOT Inf, ArgY > 0 and ArgX = +Inf
1894// Return -0 when ArgY NOT Inf, ArgY > 0 and ArgX = +Inf
1895// Return +PI when ArgY NOT Inf, ArgY > 0 and ArgX = -Inf
1896// Return -PI when ArgY NOT Inf, ArgY > 0 and ArgX = -Inf
1897//
1898{ .mfi
1899 nop.m 999
1900 fclass.m p7, p9 = ArgX, 0x021 // Test for x=+inf
1901 nop.i 999
1902}
1903;;
1904
1905{ .mfi
1906 nop.m 999
1907 fclass.m p6, p0 = ArgX, 0x007 // Test for x=0
1908 nop.i 999
1909}
1910;;
1911
1912{ .mfi
1913(p6) add table_ptr1 = 16, table_ptr1 // Point to pi/2
1914 fclass.m p8, p0 = ArgX, 0x022 // Test for x=-inf
1915 nop.i 999
1916}
1917;;
1918
1919.pred.rel "mutex",p7,p9
1920{ .mfi
1921(p9) ldfd Result = [table_ptr1], 8 // Load pi or pi/2 hi
1922(p7) fmerge.s Result = ArgY, f0 // If y not inf, x=+inf, sgn(y)*0
1923 nop.i 999
1924}
1925;;
1926
1927{ .mfi
1928(p9) ldfd Result_lo = [table_ptr1], -8 // Load pi or pi/2 lo
1929(p7) fnorm.s0 Result = Result // If y not inf, x=+inf normalize
1930 nop.i 999
1931}
1932;;
1933
1934{ .mfi
1935 nop.m 999
1936(p9) fmerge.s Result = ArgY, Result // Merge sgn(y) in hi
1937 nop.i 999
1938}
1939;;
1940
1941{ .mfi
1942 nop.m 999
1943(p9) fmerge.s Result_lo = ArgY, Result_lo // Merge sgn(y) in lo
1944 nop.i 999
1945}
1946;;
1947
1948{ .mfb
1949 nop.m 999
1950(p9) fadd.s0 Result = Result, Result_lo // Compute complete result
1951 br.ret.spnt b0 // Exit for y not inf, x=0,inf
1952}
1953;;
1954
1955GLOBAL_IEEE754_END(atan2l)
0347518d 1956
d5efd131
MF
1957LOCAL_LIBM_ENTRY(__libm_error_region)
1958.prologue
1959{ .mfi
1960 add GR_Parameter_Y=-32,sp // Parameter 2 value
1961 nop.f 0
1962.save ar.pfs,GR_SAVE_PFS
1963 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
1964}
1965{ .mfi
1966.fframe 64
1967 add sp=-64,sp // Create new stack
1968 nop.f 0
1969 mov GR_SAVE_GP=gp // Save gp
1970};;
1971{ .mmi
1972 stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
1973 add GR_Parameter_X = 16,sp // Parameter 1 address
1974.save b0, GR_SAVE_B0
1975 mov GR_SAVE_B0=b0 // Save b0
1976};;
1977.body
1978{ .mib
1979 stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
1980 add GR_Parameter_RESULT = 0,GR_Parameter_Y
1981 nop.b 0 // Parameter 3 address
1982}
1983{ .mib
1984 stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
1985 add GR_Parameter_Y = -16,GR_Parameter_Y
1986 br.call.sptk b0=__libm_error_support# // Call error handling function
1987};;
1988{ .mmi
1989 nop.m 0
1990 nop.m 0
1991 add GR_Parameter_RESULT = 48,sp
1992};;
1993{ .mmi
1994 ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
1995.restore sp
1996 add sp = 64,sp // Restore stack pointer
1997 mov b0 = GR_SAVE_B0 // Restore return address
1998};;
1999{ .mib
2000 mov gp = GR_SAVE_GP // Restore gp
2001 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
2002 br.ret.sptk b0 // Return
2003};;
2004
2005LOCAL_LIBM_END(__libm_error_region#)
2006.type __libm_error_support#,@function
2007.global __libm_error_support#