]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/ia64/fpu/s_tanl.S
ia64: strip trailing whitespace
[thirdparty/glibc.git] / sysdeps / ia64 / fpu / s_tanl.S
CommitLineData
d5efd131
MF
1.file "tancotl.s"
2
3
4// Copyright (c) 2000 - 2004, Intel Corporation
5// All rights reserved.
6//
7// Contributed 2000 by the Intel Numerics Group, Intel Corporation
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// * Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// * Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// * The name of Intel Corporation may not be used to endorse or promote
21// products derived from this software without specific prior written
22// permission.
23
0347518d
MF
24// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
d5efd131 26// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
0347518d 27// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
d5efd131 28// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
0347518d
MF
29// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
30// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
31// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
d5efd131 32// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
0347518d
MF
33// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35//
d5efd131 36// Intel Corporation is the author of this code, and requests that all
0347518d 37// problem reports or change requests be submitted to it directly at
d5efd131
MF
38// http://www.intel.com/software/products/opensource/libraries/num.htm.
39//
40//*********************************************************************
41//
0347518d 42// History:
d5efd131
MF
43//
44// 02/02/00 (hand-optimized)
45// 04/04/00 Unwind support added
46// 12/28/00 Fixed false invalid flags
47// 02/06/02 Improved speed
48// 05/07/02 Changed interface to __libm_pi_by_2_reduce
49// 05/30/02 Added cotl
50// 02/10/03 Reordered header: .section, .global, .proc, .align;
51// used data8 for long double table values
52// 05/15/03 Reformatted data tables
53// 10/26/04 Avoided using r14-31 as scratch so not clobbered by dynamic loader
54//
55//*********************************************************************
56//
57// Functions: tanl(x) = tangent(x), for double-extended precision x values
58// cotl(x) = cotangent(x), for double-extended precision x values
59//
60//*********************************************************************
61//
62// Resources Used:
63//
64// Floating-Point Registers: f8 (Input and Return Value)
65// f9-f15
66// f32-f121
67//
68// General Purpose Registers:
69// r32-r70
70//
71// Predicate Registers: p6-p15
72//
73//*********************************************************************
74//
75// IEEE Special Conditions for tanl:
76//
77// Denormal fault raised on denormal inputs
78// Overflow exceptions do not occur
79// Underflow exceptions raised when appropriate for tan
80// (No specialized error handling for this routine)
81// Inexact raised when appropriate by algorithm
82//
83// tanl(SNaN) = QNaN
84// tanl(QNaN) = QNaN
85// tanl(inf) = QNaN
86// tanl(+/-0) = +/-0
87//
88//*********************************************************************
89//
90// IEEE Special Conditions for cotl:
91//
92// Denormal fault raised on denormal inputs
93// Overflow exceptions occur at zero and near zero
94// Underflow exceptions do not occur
95// Inexact raised when appropriate by algorithm
96//
97// cotl(SNaN) = QNaN
98// cotl(QNaN) = QNaN
99// cotl(inf) = QNaN
100// cotl(+/-0) = +/-Inf and error handling is called
101//
102//*********************************************************************
103//
104// Below are mathematical and algorithmic descriptions for tanl.
105// For cotl we use next identity cot(x) = -tan(x + Pi/2).
106// So, to compute cot(x) we just need to increment N (N = N + 1)
107// and invert sign of the computed result.
108//
109//*********************************************************************
110//
111// Mathematical Description
112//
113// We consider the computation of FPTANL of Arg. Now, given
114//
115// Arg = N pi/2 + alpha, |alpha| <= pi/4,
116//
117// basic mathematical relationship shows that
118//
119// tan( Arg ) = tan( alpha ) if N is even;
120// = -cot( alpha ) otherwise.
121//
122// The value of alpha is obtained by argument reduction and
123// represented by two working precision numbers r and c where
124//
125// alpha = r + c accurately.
126//
127// The reduction method is described in a previous write up.
128// The argument reduction scheme identifies 4 cases. For Cases 2
129// and 4, because |alpha| is small, tan(r+c) and -cot(r+c) can be
130// computed very easily by 2 or 3 terms of the Taylor series
131// expansion as follows:
132//
133// Case 2:
134// -------
135//
136// tan(r + c) = r + c + r^3/3 ...accurately
137// -cot(r + c) = -1/(r+c) + r/3 ...accurately
138//
139// Case 4:
140// -------
141//
142// tan(r + c) = r + c + r^3/3 + 2r^5/15 ...accurately
143// -cot(r + c) = -1/(r+c) + r/3 + r^3/45 ...accurately
144//
145//
146// The only cases left are Cases 1 and 3 of the argument reduction
147// procedure. These two cases will be merged since after the
148// argument is reduced in either cases, we have the reduced argument
149// represented as r + c and that the magnitude |r + c| is not small
150// enough to allow the usage of a very short approximation.
151//
152// The greatest challenge of this task is that the second terms of
153// the Taylor series for tan(r) and -cot(r)
154//
155// r + r^3/3 + 2 r^5/15 + ...
156//
157// and
158//
159// -1/r + r/3 + r^3/45 + ...
160//
161// are not very small when |r| is close to pi/4 and the rounding
162// errors will be a concern if simple polynomial accumulation is
163// used. When |r| < 2^(-2), however, the second terms will be small
164// enough (5 bits or so of right shift) that a normal Horner
165// recurrence suffices. Hence there are two cases that we consider
166// in the accurate computation of tan(r) and cot(r), |r| <= pi/4.
167//
168// Case small_r: |r| < 2^(-2)
169// --------------------------
170//
171// Since Arg = N pi/4 + r + c accurately, we have
172//
173// tan(Arg) = tan(r+c) for N even,
174// = -cot(r+c) otherwise.
175//
176// Here for this case, both tan(r) and -cot(r) can be approximated
177// by simple polynomials:
178//
179// tan(r) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
180// -cot(r) = -1/r + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13
181//
182// accurately. Since |r| is relatively small, tan(r+c) and
183// -cot(r+c) can be accurately approximated by replacing r with
184// r+c only in the first two terms of the corresponding polynomials.
185//
186// Note that P1_1 (and Q1_1 for that matter) approximates 1/3 to
187// almost 64 sig. bits, thus
188//
189// P1_1 (r+c)^3 = P1_1 r^3 + c * r^2 accurately.
190//
191// Hence,
192//
193// tan(r+c) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
194// + c*(1 + r^2)
195//
196// -cot(r+c) = -1/(r+c) + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13
197// + Q1_1*c
198//
199//
200// Case normal_r: 2^(-2) <= |r| <= pi/4
201// ------------------------------------
202//
203// This case is more likely than the previous one if one considers
204// r to be uniformly distributed in [-pi/4 pi/4].
205//
206// The required calculation is either
207//
208// tan(r + c) = tan(r) + correction, or
209// -cot(r + c) = -cot(r) + correction.
210//
211// Specifically,
212//
213// tan(r + c) = tan(r) + c tan'(r) + O(c^2)
214// = tan(r) + c sec^2(r) + O(c^2)
215// = tan(r) + c SEC_sq ...accurately
216// as long as SEC_sq approximates sec^2(r)
217// to, say, 5 bits or so.
218//
219// Similarly,
220//
221// -cot(r + c) = -cot(r) - c cot'(r) + O(c^2)
222// = -cot(r) + c csc^2(r) + O(c^2)
223// = -cot(r) + c CSC_sq ...accurately
224// as long as CSC_sq approximates csc^2(r)
225// to, say, 5 bits or so.
226//
227// We therefore concentrate on accurately calculating tan(r) and
228// cot(r) for a working-precision number r, |r| <= pi/4 to within
229// 0.1% or so.
230//
231// We will employ a table-driven approach. Let
232//
233// r = sgn_r * 2^k * 1.b_1 b_2 ... b_5 ... b_63
234// = sgn_r * ( B + x )
235//
236// where
237//
238// B = 2^k * 1.b_1 b_2 ... b_5 1
239// x = |r| - B
240//
241// Now,
242// tan(B) + tan(x)
243// tan( B + x ) = ------------------------
244// 1 - tan(B)*tan(x)
245//
246// / \
247// | tan(B) + tan(x) |
248
249// = tan(B) + | ------------------------ - tan(B) |
250// | 1 - tan(B)*tan(x) |
251// \ /
252//
253// sec^2(B) * tan(x)
254// = tan(B) + ------------------------
255// 1 - tan(B)*tan(x)
256//
257// (1/[sin(B)*cos(B)]) * tan(x)
258// = tan(B) + --------------------------------
259// cot(B) - tan(x)
260//
261//
262// Clearly, the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
263// calculated beforehand and stored in a table. Since
264//
265// |x| <= 2^k * 2^(-6) <= 2^(-7) (because k = -1, -2)
266//
267// a very short polynomial will be sufficient to approximate tan(x)
268// accurately. The details involved in computing the last expression
269// will be given in the next section on algorithm description.
270//
271//
272// Now, we turn to the case where cot( B + x ) is needed.
273//
274//
275// 1 - tan(B)*tan(x)
276// cot( B + x ) = ------------------------
277// tan(B) + tan(x)
278//
279// / \
280// | 1 - tan(B)*tan(x) |
281
282// = cot(B) + | ----------------------- - cot(B) |
283// | tan(B) + tan(x) |
284// \ /
285//
286// [tan(B) + cot(B)] * tan(x)
287// = cot(B) - ----------------------------
288// tan(B) + tan(x)
289//
290// (1/[sin(B)*cos(B)]) * tan(x)
291// = cot(B) - --------------------------------
292// tan(B) + tan(x)
293//
294//
295// Note that the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) that
296// are needed are the same set of values needed in the previous
297// case.
298//
299// Finally, we can put all the ingredients together as follows:
300//
301// Arg = N * pi/2 + r + c ...accurately
302//
303// tan(Arg) = tan(r) + correction if N is even;
304// = -cot(r) + correction otherwise.
305//
306// For Cases 2 and 4,
307//
308// Case 2:
309// tan(Arg) = tan(r + c) = r + c + r^3/3 N even
310// = -cot(r + c) = -1/(r+c) + r/3 N odd
311// Case 4:
312// tan(Arg) = tan(r + c) = r + c + r^3/3 + 2r^5/15 N even
313// = -cot(r + c) = -1/(r+c) + r/3 + r^3/45 N odd
314//
315//
316// For Cases 1 and 3,
317//
318// Case small_r: |r| < 2^(-2)
319//
320// tan(Arg) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
321// + c*(1 + r^2) N even
322//
323// = -1/(r+c) + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13
324// + Q1_1*c N odd
325//
326// Case normal_r: 2^(-2) <= |r| <= pi/4
327//
328// tan(Arg) = tan(r) + c * sec^2(r) N even
329// = -cot(r) + c * csc^2(r) otherwise
330//
331// For N even,
332//
333// tan(Arg) = tan(r) + c*sec^2(r)
334// = tan( sgn_r * (B+x) ) + c * sec^2(|r|)
335// = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(|r|) )
336// = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(B) )
337//
338// since B approximates |r| to 2^(-6) in relative accuracy.
339//
340// / (1/[sin(B)*cos(B)]) * tan(x)
341// tan(Arg) = sgn_r * | tan(B) + --------------------------------
342// \ cot(B) - tan(x)
343// \
344// + CORR |
345
346// /
347// where
348//
349// CORR = sgn_r*c*tan(B)*SC_inv(B); SC_inv(B) = 1/(sin(B)*cos(B)).
350//
351// For N odd,
352//
353// tan(Arg) = -cot(r) + c*csc^2(r)
354// = -cot( sgn_r * (B+x) ) + c * csc^2(|r|)
355// = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(|r|) )
356// = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(B) )
357//
358// since B approximates |r| to 2^(-6) in relative accuracy.
359//
360// / (1/[sin(B)*cos(B)]) * tan(x)
361// tan(Arg) = sgn_r * | -cot(B) + --------------------------------
362// \ tan(B) + tan(x)
363// \
364// + CORR |
365
366// /
367// where
368//
369// CORR = sgn_r*c*cot(B)*SC_inv(B); SC_inv(B) = 1/(sin(B)*cos(B)).
370//
371//
372// The actual algorithm prescribes how all the mathematical formulas
373// are calculated.
374//
375//
376// 2. Algorithmic Description
377// ==========================
378//
379// 2.1 Computation for Cases 2 and 4.
380// ----------------------------------
381//
382// For Case 2, we use two-term polynomials.
383//
384// For N even,
385//
386// rsq := r * r
387// Poly := c + r * rsq * P1_1
388// Result := r + Poly ...in user-defined rounding
389//
390// For N odd,
391// S_hi := -frcpa(r) ...8 bits
392// S_hi := S_hi + S_hi*(1 + S_hi*r) ...16 bits
393// S_hi := S_hi + S_hi*(1 + S_hi*r) ...32 bits
394// S_hi := S_hi + S_hi*(1 + S_hi*r) ...64 bits
395// S_lo := S_hi*( (1 + S_hi*r) + S_hi*c )
396// ...S_hi + S_lo is -1/(r+c) to extra precision
397// S_lo := S_lo + Q1_1*r
398//
399// Result := S_hi + S_lo ...in user-defined rounding
400//
401// For Case 4, we use three-term polynomials
402//
403// For N even,
404//
405// rsq := r * r
406// Poly := c + r * rsq * (P1_1 + rsq * P1_2)
407// Result := r + Poly ...in user-defined rounding
408//
409// For N odd,
410// S_hi := -frcpa(r) ...8 bits
411// S_hi := S_hi + S_hi*(1 + S_hi*r) ...16 bits
412// S_hi := S_hi + S_hi*(1 + S_hi*r) ...32 bits
413// S_hi := S_hi + S_hi*(1 + S_hi*r) ...64 bits
414// S_lo := S_hi*( (1 + S_hi*r) + S_hi*c )
415// ...S_hi + S_lo is -1/(r+c) to extra precision
416// rsq := r * r
417// P := Q1_1 + rsq*Q1_2
418// S_lo := S_lo + r*P
419//
420// Result := S_hi + S_lo ...in user-defined rounding
421//
422//
423// Note that the coefficients P1_1, P1_2, Q1_1, and Q1_2 are
424// the same as those used in the small_r case of Cases 1 and 3
425// below.
426//
427//
428// 2.2 Computation for Cases 1 and 3.
429// ----------------------------------
430// This is further divided into the case of small_r,
431// where |r| < 2^(-2), and the case of normal_r, where |r| lies between
432// 2^(-2) and pi/4.
433//
434// Algorithm for the case of small_r
435// ---------------------------------
436//
437// For N even,
438// rsq := r * r
439// Poly1 := rsq*(P1_1 + rsq*(P1_2 + rsq*P1_3))
440// r_to_the_8 := rsq * rsq
441// r_to_the_8 := r_to_the_8 * r_to_the_8
442// Poly2 := P1_4 + rsq*(P1_5 + rsq*(P1_6 + ... rsq*P1_9))
443// CORR := c * ( 1 + rsq )
444// Poly := Poly1 + r_to_the_8*Poly2
445// Poly := r*Poly + CORR
446// Result := r + Poly ...in user-defined rounding
447// ...note that Poly1 and r_to_the_8 can be computed in parallel
448// ...with Poly2 (Poly1 is intentionally set to be much
449// ...shorter than Poly2 so that r_to_the_8 and CORR can be hidden)
450//
451// For N odd,
452// S_hi := -frcpa(r) ...8 bits
453// S_hi := S_hi + S_hi*(1 + S_hi*r) ...16 bits
454// S_hi := S_hi + S_hi*(1 + S_hi*r) ...32 bits
455// S_hi := S_hi + S_hi*(1 + S_hi*r) ...64 bits
456// S_lo := S_hi*( (1 + S_hi*r) + S_hi*c )
457// ...S_hi + S_lo is -1/(r+c) to extra precision
458// S_lo := S_lo + Q1_1*c
459//
460// ...S_hi and S_lo are computed in parallel with
461// ...the following
462// rsq := r*r
463// P := Q1_1 + rsq*(Q1_2 + rsq*(Q1_3 + ... + rsq*Q1_7))
464//
465// Poly := r*P + S_lo
466// Result := S_hi + Poly ...in user-defined rounding
467//
468//
469// Algorithm for the case of normal_r
470// ----------------------------------
471//
472// Here, we first consider the computation of tan( r + c ). As
473// presented in the previous section,
474//
475// tan( r + c ) = tan(r) + c * sec^2(r)
476// = sgn_r * [ tan(B+x) + CORR ]
477// CORR = sgn_r * c * tan(B) * 1/[sin(B)*cos(B)]
478//
479// because sec^2(r) = sec^(|r|), and B approximate |r| to 6.5 bits.
480//
481// tan( r + c ) =
482// / (1/[sin(B)*cos(B)]) * tan(x)
483// sgn_r * | tan(B) + -------------------------------- +
484// \ cot(B) - tan(x)
485// \
486// CORR |
487
488// /
489//
490// The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
491// calculated beforehand and stored in a table. Specifically,
492// the table values are
493//
494// tan(B) as T_hi + T_lo;
495// cot(B) as C_hi + C_lo;
496// 1/[sin(B)*cos(B)] as SC_inv
497//
498// T_hi, C_hi are in double-precision memory format;
499// T_lo, C_lo are in single-precision memory format;
500// SC_inv is in extended-precision memory format.
501//
502// The value of tan(x) will be approximated by a short polynomial of
503// the form
504//
505// tan(x) as x + x * P, where
506// P = x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3))
507//
508// Because |x| <= 2^(-7), cot(B) - x approximates cot(B) - tan(x)
509// to a relative accuracy better than 2^(-20). Thus, a good
510// initial guess of 1/( cot(B) - tan(x) ) to initiate the iterative
511// division is:
512//
513// 1/(cot(B) - tan(x)) is approximately
514// 1/(cot(B) - x) is
515// tan(B)/(1 - x*tan(B)) is approximately
516// T_hi / ( 1 - T_hi * x ) is approximately
517//
518// T_hi * [ 1 + (Thi * x) + (T_hi * x)^2 ]
519//
520// The calculation of tan(r+c) therefore proceed as follows:
521//
522// Tx := T_hi * x
523// xsq := x * x
524//
525// V_hi := T_hi*(1 + Tx*(1 + Tx))
526// P := xsq * (P1_1 + xsq*(P1_2 + xsq*P1_3))
527// ...V_hi serves as an initial guess of 1/(cot(B) - tan(x))
528// ...good to about 20 bits of accuracy
529//
530// tanx := x + x*P
531// D := C_hi - tanx
532// ...D is a double precision denominator: cot(B) - tan(x)
533//
534// V_hi := V_hi + V_hi*(1 - V_hi*D)
535// ....V_hi approximates 1/(cot(B)-tan(x)) to 40 bits
536//
537// V_lo := V_hi * ( [ (1 - V_hi*C_hi) + V_hi*tanx ]
538// - V_hi*C_lo ) ...observe all order
539// ...V_hi + V_lo approximates 1/(cot(B) - tan(x))
540// ...to extra accuracy
541//
542// ... SC_inv(B) * (x + x*P)
543// ... tan(B) + ------------------------- + CORR
544// ... cot(B) - (x + x*P)
545// ...
546// ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
547// ...
548//
549// Sx := SC_inv * x
550// CORR := sgn_r * c * SC_inv * T_hi
551//
552// ...put the ingredients together to compute
553// ... SC_inv(B) * (x + x*P)
554// ... tan(B) + ------------------------- + CORR
555// ... cot(B) - (x + x*P)
556// ...
557// ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
558// ...
559// ... = T_hi + T_lo + CORR +
560// ... Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo)
561//
562// CORR := CORR + T_lo
563// tail := V_lo + P*(V_hi + V_lo)
564// tail := Sx * tail + CORR
565// tail := Sx * V_hi + tail
566// T_hi := sgn_r * T_hi
567//
568// ...T_hi + sgn_r*tail now approximate
569// ...sgn_r*(tan(B+x) + CORR) accurately
570//
571// Result := T_hi + sgn_r*tail ...in user-defined
572// ...rounding control
573// ...It is crucial that independent paths be fully
574// ...exploited for performance's sake.
575//
576//
577// Next, we consider the computation of -cot( r + c ). As
578// presented in the previous section,
579//
580// -cot( r + c ) = -cot(r) + c * csc^2(r)
581// = sgn_r * [ -cot(B+x) + CORR ]
582// CORR = sgn_r * c * cot(B) * 1/[sin(B)*cos(B)]
583//
584// because csc^2(r) = csc^(|r|), and B approximate |r| to 6.5 bits.
585//
586// -cot( r + c ) =
587// / (1/[sin(B)*cos(B)]) * tan(x)
588// sgn_r * | -cot(B) + -------------------------------- +
589// \ tan(B) + tan(x)
590// \
591// CORR |
592
593// /
594//
595// The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
596// calculated beforehand and stored in a table. Specifically,
597// the table values are
598//
599// tan(B) as T_hi + T_lo;
600// cot(B) as C_hi + C_lo;
601// 1/[sin(B)*cos(B)] as SC_inv
602//
603// T_hi, C_hi are in double-precision memory format;
604// T_lo, C_lo are in single-precision memory format;
605// SC_inv is in extended-precision memory format.
606//
607// The value of tan(x) will be approximated by a short polynomial of
608// the form
609//
610// tan(x) as x + x * P, where
611// P = x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3))
612//
613// Because |x| <= 2^(-7), tan(B) + x approximates tan(B) + tan(x)
614// to a relative accuracy better than 2^(-18). Thus, a good
615// initial guess of 1/( tan(B) + tan(x) ) to initiate the iterative
616// division is:
617//
618// 1/(tan(B) + tan(x)) is approximately
619// 1/(tan(B) + x) is
620// cot(B)/(1 + x*cot(B)) is approximately
621// C_hi / ( 1 + C_hi * x ) is approximately
622//
623// C_hi * [ 1 - (C_hi * x) + (C_hi * x)^2 ]
624//
625// The calculation of -cot(r+c) therefore proceed as follows:
626//
627// Cx := C_hi * x
628// xsq := x * x
629//
630// V_hi := C_hi*(1 - Cx*(1 - Cx))
631// P := xsq * (P1_1 + xsq*(P1_2 + xsq*P1_3))
632// ...V_hi serves as an initial guess of 1/(tan(B) + tan(x))
633// ...good to about 18 bits of accuracy
634//
635// tanx := x + x*P
636// D := T_hi + tanx
637// ...D is a double precision denominator: tan(B) + tan(x)
638//
639// V_hi := V_hi + V_hi*(1 - V_hi*D)
640// ....V_hi approximates 1/(tan(B)+tan(x)) to 40 bits
641//
642// V_lo := V_hi * ( [ (1 - V_hi*T_hi) - V_hi*tanx ]
643// - V_hi*T_lo ) ...observe all order
644// ...V_hi + V_lo approximates 1/(tan(B) + tan(x))
645// ...to extra accuracy
646//
647// ... SC_inv(B) * (x + x*P)
648// ... -cot(B) + ------------------------- + CORR
649// ... tan(B) + (x + x*P)
650// ...
651// ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
652// ...
653//
654// Sx := SC_inv * x
655// CORR := sgn_r * c * SC_inv * C_hi
656//
657// ...put the ingredients together to compute
658// ... SC_inv(B) * (x + x*P)
659// ... -cot(B) + ------------------------- + CORR
660// ... tan(B) + (x + x*P)
661// ...
662// ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
663// ...
664// ... =-C_hi - C_lo + CORR +
665// ... Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo)
666//
667// CORR := CORR - C_lo
668// tail := V_lo + P*(V_hi + V_lo)
669// tail := Sx * tail + CORR
670// tail := Sx * V_hi + tail
671// C_hi := -sgn_r * C_hi
672//
673// ...C_hi + sgn_r*tail now approximates
674// ...sgn_r*(-cot(B+x) + CORR) accurately
675//
676// Result := C_hi + sgn_r*tail in user-defined rounding control
677// ...It is crucial that independent paths be fully
678// ...exploited for performance's sake.
679//
680// 3. Implementation Notes
681// =======================
682//
683// Table entries T_hi, T_lo; C_hi, C_lo; SC_inv
684//
685// Recall that 2^(-2) <= |r| <= pi/4;
686//
687// r = sgn_r * 2^k * 1.b_1 b_2 ... b_63
688//
689// and
690//
691// B = 2^k * 1.b_1 b_2 b_3 b_4 b_5 1
692//
693// Thus, for k = -2, possible values of B are
694//
695// B = 2^(-2) * ( 1 + index/32 + 1/64 ),
696// index ranges from 0 to 31
697//
698// For k = -1, however, since |r| <= pi/4 = 0.78...
699// possible values of B are
700//
701// B = 2^(-1) * ( 1 + index/32 + 1/64 )
702// index ranges from 0 to 19.
703//
704//
705
706RODATA
707.align 16
708
709LOCAL_OBJECT_START(TANL_BASE_CONSTANTS)
710
711tanl_table_1:
712data8 0xA2F9836E4E44152A, 0x00003FFE // two_by_pi
713data8 0xC84D32B0CE81B9F1, 0x00004016 // P_0
714data8 0xC90FDAA22168C235, 0x00003FFF // P_1
715data8 0xECE675D1FC8F8CBB, 0x0000BFBD // P_2
716data8 0xB7ED8FBBACC19C60, 0x0000BF7C // P_3
717LOCAL_OBJECT_END(TANL_BASE_CONSTANTS)
718
719LOCAL_OBJECT_START(tanl_table_2)
720data8 0xC90FDAA22168C234, 0x00003FFE // PI_BY_4
721data8 0xA397E5046EC6B45A, 0x00003FE7 // Inv_P_0
722data8 0x8D848E89DBD171A1, 0x0000BFBF // d_1
723data8 0xD5394C3618A66F8E, 0x0000BF7C // d_2
724data4 0x3E800000 // two**-2
725data4 0xBE800000 // -two**-2
726data4 0x00000000 // pad
727data4 0x00000000 // pad
728LOCAL_OBJECT_END(tanl_table_2)
729
730LOCAL_OBJECT_START(tanl_table_p1)
731data8 0xAAAAAAAAAAAAAABD, 0x00003FFD // P1_1
732data8 0x8888888888882E6A, 0x00003FFC // P1_2
733data8 0xDD0DD0DD0F0177B6, 0x00003FFA // P1_3
734data8 0xB327A440646B8C6D, 0x00003FF9 // P1_4
735data8 0x91371B251D5F7D20, 0x00003FF8 // P1_5
736data8 0xEB69A5F161C67914, 0x00003FF6 // P1_6
737data8 0xBEDD37BE019318D2, 0x00003FF5 // P1_7
738data8 0x9979B1463C794015, 0x00003FF4 // P1_8
739data8 0x8EBD21A38C6EB58A, 0x00003FF3 // P1_9
740LOCAL_OBJECT_END(tanl_table_p1)
741
742LOCAL_OBJECT_START(tanl_table_q1)
743data8 0xAAAAAAAAAAAAAAB4, 0x00003FFD // Q1_1
744data8 0xB60B60B60B5FC93E, 0x00003FF9 // Q1_2
745data8 0x8AB355E00C9BBFBF, 0x00003FF6 // Q1_3
746data8 0xDDEBBC89CBEE3D4C, 0x00003FF2 // Q1_4
747data8 0xB3548A685F80BBB6, 0x00003FEF // Q1_5
748data8 0x913625604CED5BF1, 0x00003FEC // Q1_6
749data8 0xF189D95A8EE92A83, 0x00003FE8 // Q1_7
750LOCAL_OBJECT_END(tanl_table_q1)
751
752LOCAL_OBJECT_START(tanl_table_p2)
753data8 0xAAAAAAAAAAAB362F, 0x00003FFD // P2_1
754data8 0x88888886E97A6097, 0x00003FFC // P2_2
755data8 0xDD108EE025E716A1, 0x00003FFA // P2_3
756LOCAL_OBJECT_END(tanl_table_p2)
757
758LOCAL_OBJECT_START(tanl_table_tm2)
759//
760// Entries T_hi double-precision memory format
761// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
762// Entries T_lo single-precision memory format
763// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
764//
765data8 0x3FD09BC362400794
766data4 0x23A05C32, 0x00000000
767data8 0x3FD124A9DFFBC074
768data4 0x240078B2, 0x00000000
769data8 0x3FD1AE235BD4920F
770data4 0x23826B8E, 0x00000000
771data8 0x3FD2383515E2701D
772data4 0x22D31154, 0x00000000
773data8 0x3FD2C2E463739C2D
774data4 0x2265C9E2, 0x00000000
775data8 0x3FD34E36AFEEA48B
776data4 0x245C05EB, 0x00000000
777data8 0x3FD3DA317DBB35D1
778data4 0x24749F2D, 0x00000000
779data8 0x3FD466DA67321619
780data4 0x2462CECE, 0x00000000
781data8 0x3FD4F4371F94A4D5
782data4 0x246D0DF1, 0x00000000
783data8 0x3FD5824D740C3E6D
784data4 0x240A85B5, 0x00000000
785data8 0x3FD611234CB1E73D
786data4 0x23F96E33, 0x00000000
787data8 0x3FD6A0BEAD9EA64B
788data4 0x247C5393, 0x00000000
789data8 0x3FD73125B804FD01
790data4 0x241F3B29, 0x00000000
791data8 0x3FD7C25EAB53EE83
792data4 0x2479989B, 0x00000000
793data8 0x3FD8546FE6640EED
794data4 0x23B343BC, 0x00000000
795data8 0x3FD8E75FE8AF1892
796data4 0x241454D1, 0x00000000
797data8 0x3FD97B3553928BDA
798data4 0x238613D9, 0x00000000
799data8 0x3FDA0FF6EB9DE4DE
800data4 0x22859FA7, 0x00000000
801data8 0x3FDAA5AB99ECF92D
802data4 0x237A6D06, 0x00000000
803data8 0x3FDB3C5A6D8F1796
804data4 0x23952F6C, 0x00000000
805data8 0x3FDBD40A9CFB8BE4
806data4 0x2280FC95, 0x00000000
807data8 0x3FDC6CC387943100
808data4 0x245D2EC0, 0x00000000
809data8 0x3FDD068CB736C500
810data4 0x23C4AD7D, 0x00000000
811data8 0x3FDDA16DE1DDBC31
812data4 0x23D076E6, 0x00000000
813data8 0x3FDE3D6EEB515A93
814data4 0x244809A6, 0x00000000
815data8 0x3FDEDA97E6E9E5F1
816data4 0x220856C8, 0x00000000
817data8 0x3FDF78F11963CE69
818data4 0x244BE993, 0x00000000
819data8 0x3FE00C417D635BCE
820data4 0x23D21799, 0x00000000
821data8 0x3FE05CAB1C302CD3
822data4 0x248A1B1D, 0x00000000
823data8 0x3FE0ADB9DB6A1FA0
824data4 0x23D53E33, 0x00000000
825data8 0x3FE0FF724A20BA81
826data4 0x24DB9ED5, 0x00000000
827data8 0x3FE151D9153FA6F5
828data4 0x24E9E451, 0x00000000
829LOCAL_OBJECT_END(tanl_table_tm2)
830
831LOCAL_OBJECT_START(tanl_table_tm1)
832//
833// Entries T_hi double-precision memory format
834// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
835// Entries T_lo single-precision memory format
836// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
837//
838data8 0x3FE1CEC4BA1BE39E
839data4 0x24B60F9E, 0x00000000
840data8 0x3FE277E45ABD9B2D
841data4 0x248C2474, 0x00000000
842data8 0x3FE324180272B110
843data4 0x247B8311, 0x00000000
844data8 0x3FE3D38B890E2DF0
845data4 0x24C55751, 0x00000000
846data8 0x3FE4866D46236871
847data4 0x24E5BC34, 0x00000000
848data8 0x3FE53CEE45E044B0
849data4 0x24001BA4, 0x00000000
850data8 0x3FE5F74282EC06E4
851data4 0x24B973DC, 0x00000000
852data8 0x3FE6B5A125DF43F9
853data4 0x24895440, 0x00000000
854data8 0x3FE77844CAFD348C
855data4 0x240021CA, 0x00000000
856data8 0x3FE83F6BCEED6B92
857data4 0x24C45372, 0x00000000
858data8 0x3FE90B58A34F3665
859data4 0x240DAD33, 0x00000000
860data8 0x3FE9DC522C1E56B4
861data4 0x24F846CE, 0x00000000
862data8 0x3FEAB2A427041578
863data4 0x2323FB6E, 0x00000000
864data8 0x3FEB8E9F9DD8C373
865data4 0x24B3090B, 0x00000000
866data8 0x3FEC709B65C9AA7B
867data4 0x2449F611, 0x00000000
868data8 0x3FED58F4ACCF8435
869data4 0x23616A7E, 0x00000000
870data8 0x3FEE480F97635082
871data4 0x24C2FEAE, 0x00000000
872data8 0x3FEF3E57F0ACC544
873data4 0x242CE964, 0x00000000
874data8 0x3FF01E20F7E06E4B
875data4 0x2480D3EE, 0x00000000
876data8 0x3FF0A1258A798A69
877data4 0x24DB8967, 0x00000000
878LOCAL_OBJECT_END(tanl_table_tm1)
879
880LOCAL_OBJECT_START(tanl_table_cm2)
881//
882// Entries C_hi double-precision memory format
883// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
884// Entries C_lo single-precision memory format
885// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
886//
887data8 0x400ED3E2E63EFBD0
888data4 0x259D94D4, 0x00000000
889data8 0x400DDDB4C515DAB5
890data4 0x245F0537, 0x00000000
891data8 0x400CF57ABE19A79F
892data4 0x25D4EA9F, 0x00000000
893data8 0x400C1A06D15298ED
894data4 0x24AE40A0, 0x00000000
895data8 0x400B4A4C164B2708
896data4 0x25A5AAB6, 0x00000000
897data8 0x400A855A5285B068
898data4 0x25524F18, 0x00000000
899data8 0x4009CA5A3FFA549F
900data4 0x24C999C0, 0x00000000
901data8 0x4009188A646AF623
902data4 0x254FD801, 0x00000000
903data8 0x40086F3C6084D0E7
904data4 0x2560F5FD, 0x00000000
905data8 0x4007CDD2A29A76EE
906data4 0x255B9D19, 0x00000000
907data8 0x400733BE6C8ECA95
908data4 0x25CB021B, 0x00000000
909data8 0x4006A07E1F8DDC52
910data4 0x24AB4722, 0x00000000
911data8 0x4006139BC298AD58
912data4 0x252764E2, 0x00000000
913data8 0x40058CABBAD7164B
914data4 0x24DAF5DB, 0x00000000
915data8 0x40050B4BAE31A5D3
916data4 0x25EA20F4, 0x00000000
917data8 0x40048F2189F85A8A
918data4 0x2583A3E8, 0x00000000
919data8 0x400417DAA862380D
920data4 0x25DCC4CC, 0x00000000
921data8 0x4003A52B1088FCFE
922data4 0x2430A492, 0x00000000
923data8 0x400336CCCD3527D5
924data4 0x255F77CF, 0x00000000
925data8 0x4002CC7F5760766D
926data4 0x25DA0BDA, 0x00000000
927data8 0x4002660711CE02E3
928data4 0x256FF4A2, 0x00000000
929data8 0x4002032CD37BBE04
930data4 0x25208AED, 0x00000000
931data8 0x4001A3BD7F050775
932data4 0x24B72DD6, 0x00000000
933data8 0x40014789A554848A
934data4 0x24AB4DAA, 0x00000000
935data8 0x4000EE65323E81B7
936data4 0x2584C440, 0x00000000
937data8 0x4000982721CF1293
938data4 0x25C9428D, 0x00000000
939data8 0x400044A93D415EEB
940data4 0x25DC8482, 0x00000000
941data8 0x3FFFE78FBD72C577
942data4 0x257F5070, 0x00000000
943data8 0x3FFF4AC375EFD28E
944data4 0x23EBBF7A, 0x00000000
945data8 0x3FFEB2AF60B52DDE
946data4 0x22EECA07, 0x00000000
947data8 0x3FFE1F1935204180
948data4 0x24191079, 0x00000000
949data8 0x3FFD8FCA54F7E60A
950data4 0x248D3058, 0x00000000
951LOCAL_OBJECT_END(tanl_table_cm2)
952
953LOCAL_OBJECT_START(tanl_table_cm1)
954//
955// Entries C_hi double-precision memory format
956// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
957// Entries C_lo single-precision memory format
958// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
959//
960data8 0x3FFCC06A79F6FADE
961data4 0x239C7886, 0x00000000
962data8 0x3FFBB91F891662A6
963data4 0x250BD191, 0x00000000
964data8 0x3FFABFB6529F155D
965data4 0x256CC3E6, 0x00000000
966data8 0x3FF9D3002E964AE9
967data4 0x250843E3, 0x00000000
968data8 0x3FF8F1EF89DCB383
969data4 0x2277C87E, 0x00000000
970data8 0x3FF81B937C87DBD6
971data4 0x256DA6CF, 0x00000000
972data8 0x3FF74F141042EDE4
973data4 0x2573D28A, 0x00000000
974data8 0x3FF68BAF1784B360
975data4 0x242E489A, 0x00000000
976data8 0x3FF5D0B57C923C4C
977data4 0x2532D940, 0x00000000
978data8 0x3FF51D88F418EF20
979data4 0x253C7DD6, 0x00000000
980data8 0x3FF4719A02F88DAE
981data4 0x23DB59BF, 0x00000000
982data8 0x3FF3CC6649DA0788
983data4 0x252B4756, 0x00000000
984data8 0x3FF32D770B980DB8
985data4 0x23FE585F, 0x00000000
986data8 0x3FF2945FE56C987A
987data4 0x25378A63, 0x00000000
988data8 0x3FF200BDB16523F6
989data4 0x247BB2E0, 0x00000000
990data8 0x3FF172358CE27778
991data4 0x24446538, 0x00000000
992data8 0x3FF0E873FDEFE692
993data4 0x2514638F, 0x00000000
994data8 0x3FF0632C33154062
995data4 0x24A7FC27, 0x00000000
996data8 0x3FEFC42EB3EF115F
997data4 0x248FD0FE, 0x00000000
998data8 0x3FEEC9E8135D26F6
999data4 0x2385C719, 0x00000000
1000LOCAL_OBJECT_END(tanl_table_cm1)
1001
1002LOCAL_OBJECT_START(tanl_table_scim2)
1003//
1004// Entries SC_inv in Swapped IEEE format (extended)
1005// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
1006//
1007data8 0x839D6D4A1BF30C9E, 0x00004001
1008data8 0x80092804554B0EB0, 0x00004001
1009data8 0xF959F94CA1CF0DE9, 0x00004000
1010data8 0xF3086BA077378677, 0x00004000
1011data8 0xED154515CCD4723C, 0x00004000
1012data8 0xE77909441C27CF25, 0x00004000
1013data8 0xE22D037D8DDACB88, 0x00004000
1014data8 0xDD2B2D8A89C73522, 0x00004000
1015data8 0xD86E1A23BB2C1171, 0x00004000
1016data8 0xD3F0E288DFF5E0F9, 0x00004000
1017data8 0xCFAF16B1283BEBD5, 0x00004000
1018data8 0xCBA4AFAA0D88DD53, 0x00004000
1019data8 0xC7CE03CCCA67C43D, 0x00004000
1020data8 0xC427BC820CA0DDB0, 0x00004000
1021data8 0xC0AECD57F13D8CAB, 0x00004000
1022data8 0xBD606C3871ECE6B1, 0x00004000
1023data8 0xBA3A0A96A44C4929, 0x00004000
1024data8 0xB7394F6FE5CCCEC1, 0x00004000
1025data8 0xB45C12039637D8BC, 0x00004000
1026data8 0xB1A0552892CB051B, 0x00004000
1027data8 0xAF04432B6BA2FFD0, 0x00004000
1028data8 0xAC862A237221235F, 0x00004000
1029data8 0xAA2478AF5F00A9D1, 0x00004000
1030data8 0xA7DDBB0C81E082BF, 0x00004000
1031data8 0xA5B0987D45684FEE, 0x00004000
1032data8 0xA39BD0F5627A8F53, 0x00004000
1033data8 0xA19E3B036EC5C8B0, 0x00004000
1034data8 0x9FB6C1F091CD7C66, 0x00004000
1035data8 0x9DE464101FA3DF8A, 0x00004000
1036data8 0x9C263139A8F6B888, 0x00004000
1037data8 0x9A7B4968C27B0450, 0x00004000
1038data8 0x98E2DB7E5EE614EE, 0x00004000
1039LOCAL_OBJECT_END(tanl_table_scim2)
1040
1041LOCAL_OBJECT_START(tanl_table_scim1)
1042//
1043// Entries SC_inv in Swapped IEEE format (extended)
1044// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
1045//
1046data8 0x969F335C13B2B5BA, 0x00004000
1047data8 0x93D446D9D4C0F548, 0x00004000
1048data8 0x9147094F61B798AF, 0x00004000
1049data8 0x8EF317CC758787AC, 0x00004000
1050data8 0x8CD498B3B99EEFDB, 0x00004000
1051data8 0x8AE82A7DDFF8BC37, 0x00004000
1052data8 0x892AD546E3C55D42, 0x00004000
1053data8 0x8799FEA9D15573C1, 0x00004000
1054data8 0x86335F88435A4B4C, 0x00004000
1055data8 0x84F4FB6E3E93A87B, 0x00004000
1056data8 0x83DD195280A382FB, 0x00004000
1057data8 0x82EA3D7FA4CB8C9E, 0x00004000
1058data8 0x821B247C6861D0A8, 0x00004000
1059data8 0x816EBED163E8D244, 0x00004000
1060data8 0x80E42D9127E4CFC6, 0x00004000
1061data8 0x807ABF8D28E64AFD, 0x00004000
1062data8 0x8031EF26863B4FD8, 0x00004000
1063data8 0x800960ADAE8C11FD, 0x00004000
1064data8 0x8000E1475FDBEC21, 0x00004000
1065data8 0x80186650A07791FA, 0x00004000
1066LOCAL_OBJECT_END(tanl_table_scim1)
1067
1068Arg = f8
1069Save_Norm_Arg = f8 // For input to reduction routine
1070Result = f8
1071r = f8 // For output from reduction routine
1072c = f9 // For output from reduction routine
1073U_2 = f10
1074rsq = f11
1075C_hi = f12
1076C_lo = f13
1077T_hi = f14
1078T_lo = f15
1079
1080d_1 = f33
1081N_0 = f34
1082tail = f35
1083tanx = f36
1084Cx = f37
1085Sx = f38
1086sgn_r = f39
1087CORR = f40
1088P = f41
1089D = f42
1090ArgPrime = f43
1091P_0 = f44
1092
1093P2_1 = f45
1094P2_2 = f46
1095P2_3 = f47
1096
1097P1_1 = f45
1098P1_2 = f46
1099P1_3 = f47
1100
1101P1_4 = f48
1102P1_5 = f49
1103P1_6 = f50
1104P1_7 = f51
1105P1_8 = f52
1106P1_9 = f53
1107
1108x = f56
1109xsq = f57
1110Tx = f58
1111Tx1 = f59
1112Set = f60
1113poly1 = f61
1114poly2 = f62
1115Poly = f63
1116Poly1 = f64
1117Poly2 = f65
1118r_to_the_8 = f66
1119B = f67
1120SC_inv = f68
1121Pos_r = f69
1122N_0_fix = f70
1123d_2 = f71
1124PI_BY_4 = f72
1125TWO_TO_NEG14 = f74
1126TWO_TO_NEG33 = f75
1127NEGTWO_TO_NEG14 = f76
1128NEGTWO_TO_NEG33 = f77
1129two_by_PI = f78
1130N = f79
1131N_fix = f80
1132P_1 = f81
1133P_2 = f82
1134P_3 = f83
1135s_val = f84
1136w = f85
1137B_mask1 = f86
1138B_mask2 = f87
1139w2 = f88
1140A = f89
1141a = f90
1142t = f91
1143U_1 = f92
1144NEGTWO_TO_NEG2 = f93
1145TWO_TO_NEG2 = f94
1146Q1_1 = f95
1147Q1_2 = f96
1148Q1_3 = f97
1149Q1_4 = f98
1150Q1_5 = f99
1151Q1_6 = f100
1152Q1_7 = f101
1153Q1_8 = f102
1154S_hi = f103
1155S_lo = f104
1156V_hi = f105
1157V_lo = f106
1158U_hi = f107
1159U_lo = f108
1160U_hiabs = f109
1161V_hiabs = f110
1162V = f111
1163Inv_P_0 = f112
1164
1165FR_inv_pi_2to63 = f113
1166FR_rshf_2to64 = f114
1167FR_2tom64 = f115
1168FR_rshf = f116
1169Norm_Arg = f117
1170Abs_Arg = f118
1171TWO_TO_NEG65 = f119
1172fp_tmp = f120
1173mOne = f121
1174
1175GR_SAVE_B0 = r33
1176GR_SAVE_GP = r34
1177GR_SAVE_PFS = r35
1178table_base = r36
1179table_ptr1 = r37
1180table_ptr2 = r38
1181table_ptr3 = r39
1182lookup = r40
1183N_fix_gr = r41
1184GR_exp_2tom2 = r42
1185GR_exp_2tom65 = r43
1186exp_r = r44
1187sig_r = r45
1188bmask1 = r46
1189table_offset = r47
1190bmask2 = r48
1191gr_tmp = r49
1192cot_flag = r50
1193
1194GR_sig_inv_pi = r51
1195GR_rshf_2to64 = r52
1196GR_exp_2tom64 = r53
1197GR_rshf = r54
1198GR_exp_2_to_63 = r55
1199GR_exp_2_to_24 = r56
1200GR_signexp_x = r57
1201GR_exp_x = r58
1202GR_exp_mask = r59
1203GR_exp_2tom14 = r60
1204GR_exp_m2tom14 = r61
1205GR_exp_2tom33 = r62
1206GR_exp_m2tom33 = r63
1207
1208GR_SAVE_B0 = r64
1209GR_SAVE_PFS = r65
1210GR_SAVE_GP = r66
1211
1212GR_Parameter_X = r67
1213GR_Parameter_Y = r68
1214GR_Parameter_RESULT = r69
1215GR_Parameter_Tag = r70
1216
1217
1218.section .text
1219.global __libm_tanl#
1220.global __libm_cotl#
1221
1222.proc __libm_cotl#
1223__libm_cotl:
1224.endp __libm_cotl#
1225LOCAL_LIBM_ENTRY(cotl)
1226
1227{ .mlx
1228 alloc r32 = ar.pfs, 0,35,4,0
1229 movl GR_sig_inv_pi = 0xa2f9836e4e44152a // significand of 1/pi
1230}
1231{ .mlx
1232 mov GR_exp_mask = 0x1ffff // Exponent mask
1233 movl GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+64)
1234}
1235;;
1236
1237// Check for NatVals, Infs , NaNs, and Zeros
1238{ .mfi
1239 getf.exp GR_signexp_x = Arg // Get sign and exponent of x
1240 fclass.m p6,p0 = Arg, 0x1E7 // Test for natval, nan, inf, zero
1241 mov cot_flag = 0x1
1242}
1243{ .mfb
1244 addl table_base = @ltoff(TANL_BASE_CONSTANTS), gp // Pointer to table ptr
1245 fnorm.s1 Norm_Arg = Arg // Normalize x
1246 br.cond.sptk COMMON_PATH
1247};;
1248
1249LOCAL_LIBM_END(cotl)
1250
1251
1252.proc __libm_tanl#
1253__libm_tanl:
1254.endp __libm_tanl#
1255GLOBAL_IEEE754_ENTRY(tanl)
1256
1257{ .mlx
1258 alloc r32 = ar.pfs, 0,35,4,0
1259 movl GR_sig_inv_pi = 0xa2f9836e4e44152a // significand of 1/pi
1260}
1261{ .mlx
1262 mov GR_exp_mask = 0x1ffff // Exponent mask
1263 movl GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+64)
1264}
1265;;
1266
1267// Check for NatVals, Infs , NaNs, and Zeros
1268{ .mfi
1269 getf.exp GR_signexp_x = Arg // Get sign and exponent of x
1270 fclass.m p6,p0 = Arg, 0x1E7 // Test for natval, nan, inf, zero
1271 mov cot_flag = 0x0
1272}
1273{ .mfi
1274 addl table_base = @ltoff(TANL_BASE_CONSTANTS), gp // Pointer to table ptr
1275 fnorm.s1 Norm_Arg = Arg // Normalize x
1276 nop.i 0
1277};;
1278
1279// Common path for both tanl and cotl
1280COMMON_PATH:
1281{ .mfi
1282 setf.sig FR_inv_pi_2to63 = GR_sig_inv_pi // Form 1/pi * 2^63
1283 fclass.m p9, p0 = Arg, 0x0b // Test x denormal
1284 mov GR_exp_2tom64 = 0xffff - 64 // Scaling constant to compute N
1285}
1286{ .mlx
1287 setf.d FR_rshf_2to64 = GR_rshf_2to64 // Form const 1.1000 * 2^(63+64)
1288 movl GR_rshf = 0x43e8000000000000 // Form const 1.1000 * 2^63
1289}
1290;;
1291
1292// Check for everything - if false, then must be pseudo-zero or pseudo-nan.
1293// Branch out to deal with special values.
1294{ .mfi
1295 addl gr_tmp = -1,r0
1296 fclass.nm p7,p0 = Arg, 0x1FF // Test x unsupported
1297 mov GR_exp_2_to_63 = 0xffff + 63 // Exponent of 2^63
1298}
1299{ .mfb
1300 ld8 table_base = [table_base] // Get pointer to constant table
1301 fms.s1 mOne = f0, f0, f1
1302(p6) br.cond.spnt TANL_SPECIAL // Branch if x natval, nan, inf, zero
1303}
1304;;
1305
1306{ .mmb
1307 setf.sig fp_tmp = gr_tmp // Make a constant so fmpy produces inexact
1308 mov GR_exp_2_to_24 = 0xffff + 24 // Exponent of 2^24
1309(p9) br.cond.spnt TANL_DENORMAL // Branch if x denormal
1310}
1311;;
1312
1313TANL_COMMON:
1314// Return to here if x denormal
1315//
1316// Do fcmp to generate Denormal exception
1317// - can't do FNORM (will generate Underflow when U is unmasked!)
1318// Branch out to deal with unsupporteds values.
1319{ .mfi
1320 setf.exp FR_2tom64 = GR_exp_2tom64 // Form 2^-64 for scaling N_float
1321 fcmp.eq.s0 p0, p6 = Arg, f1 // Dummy to flag denormals
1322 add table_ptr1 = 0, table_base // Point to tanl_table_1
1323}
1324{ .mib
1325 setf.d FR_rshf = GR_rshf // Form right shift const 1.1000 * 2^63
1326 add table_ptr2 = 80, table_base // Point to tanl_table_2
1327(p7) br.cond.spnt TANL_UNSUPPORTED // Branch if x unsupported type
1328}
1329;;
1330
1331{ .mfi
1332 and GR_exp_x = GR_exp_mask, GR_signexp_x // Get exponent of x
1333 fmpy.s1 Save_Norm_Arg = Norm_Arg, f1 // Save x if large arg reduction
1334 dep.z bmask1 = 0x7c, 56, 8 // Form mask to get 5 msb of r
1335 // bmask1 = 0x7c00000000000000
1336}
1337;;
1338
1339//
1340// Decide about the paths to take:
1341// Set PR_6 if |Arg| >= 2**63
1342// Set PR_9 if |Arg| < 2**24 - CASE 1 OR 2
1343// OTHERWISE Set PR_8 - CASE 3 OR 4
1344//
1345// Branch out if the magnitude of the input argument is >= 2^63
1346// - do this branch before the next.
1347{ .mfi
1348 ldfe two_by_PI = [table_ptr1],16 // Load 2/pi
1349 nop.f 999
1350 dep.z bmask2 = 0x41, 57, 7 // Form mask to OR to produce B
1351 // bmask2 = 0x8200000000000000
1352}
1353{ .mib
1354 ldfe PI_BY_4 = [table_ptr2],16 // Load pi/4
1355 cmp.ge p6,p0 = GR_exp_x, GR_exp_2_to_63 // Is |x| >= 2^63
1356(p6) br.cond.spnt TANL_ARG_TOO_LARGE // Branch if |x| >= 2^63
1357}
1358;;
1359
1360{ .mmi
1361 ldfe P_0 = [table_ptr1],16 // Load P_0
1362 ldfe Inv_P_0 = [table_ptr2],16 // Load Inv_P_0
1363 nop.i 999
1364}
1365;;
1366
1367{ .mfi
1368 ldfe P_1 = [table_ptr1],16 // Load P_1
1369 fmerge.s Abs_Arg = f0, Norm_Arg // Get |x|
1370 mov GR_exp_m2tom33 = 0x2ffff - 33 // Form signexp of -2^-33
1371}
1372{ .mfi
1373 ldfe d_1 = [table_ptr2],16 // Load d_1 for 2^24 <= |x| < 2^63
1374 nop.f 999
1375 mov GR_exp_2tom33 = 0xffff - 33 // Form signexp of 2^-33
1376}
1377;;
1378
1379{ .mmi
1380 ldfe P_2 = [table_ptr1],16 // Load P_2
1381 ldfe d_2 = [table_ptr2],16 // Load d_2 for 2^24 <= |x| < 2^63
1382 cmp.ge p8,p0 = GR_exp_x, GR_exp_2_to_24 // Is |x| >= 2^24
1383}
1384;;
1385
1386// Use special scaling to right shift so N=Arg * 2/pi is in rightmost bits
1387// Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24
1388{ .mfb
1389 ldfe P_3 = [table_ptr1],16 // Load P_3
1390 fma.s1 N_fix = Norm_Arg, FR_inv_pi_2to63, FR_rshf_2to64
1391(p8) br.cond.spnt TANL_LARGER_ARG // Branch if 2^24 <= |x| < 2^63
1392}
1393;;
1394
1395// Here if 0 < |x| < 2^24
1396// ARGUMENT REDUCTION CODE - CASE 1 and 2
1397//
1398{ .mmf
1399 setf.exp TWO_TO_NEG33 = GR_exp_2tom33 // Form 2^-33
1400 setf.exp NEGTWO_TO_NEG33 = GR_exp_m2tom33 // Form -2^-33
1401 fmerge.s r = Norm_Arg,Norm_Arg // Assume r=x, ok if |x| < pi/4
1402}
1403;;
1404
1405//
1406// If |Arg| < pi/4, set PR_8, else pi/4 <=|Arg| < 2^24 - set PR_9.
1407//
1408// Case 2: Convert integer N_fix back to normalized floating-point value.
1409{ .mfi
1410 getf.sig sig_r = Norm_Arg // Get sig_r if 1/4 <= |x| < pi/4
1411 fcmp.lt.s1 p8,p9= Abs_Arg,PI_BY_4 // Test |x| < pi/4
1412 mov GR_exp_2tom2 = 0xffff - 2 // Form signexp of 2^-2
1413}
1414{ .mfi
1415 ldfps TWO_TO_NEG2, NEGTWO_TO_NEG2 = [table_ptr2] // Load 2^-2, -2^-2
1416 fms.s1 N = N_fix, FR_2tom64, FR_rshf // Use scaling to get N floated
1417 mov N_fix_gr = r0 // Assume N=0, ok if |x| < pi/4
1418}
1419;;
1420
1421//
1422// Case 1: Is |r| < 2**(-2).
1423// Arg is the same as r in this case.
1424// r = Arg
1425// c = 0
1426//
1427// Case 2: Place integer part of N in GP register.
1428{ .mfi
1429(p9) getf.sig N_fix_gr = N_fix
1430 fmerge.s c = f0, f0 // Assume c=0, ok if |x| < pi/4
1431 cmp.lt p10, p0 = GR_exp_x, GR_exp_2tom2 // Test if |x| < 1/4
1432}
1433;;
1434
1435{ .mfi
1436 setf.sig B_mask1 = bmask1 // Form mask to get 5 msb of r
1437 nop.f 999
1438 mov exp_r = GR_exp_x // Get exp_r if 1/4 <= |x| < pi/4
1439}
1440{ .mbb
1441 setf.sig B_mask2 = bmask2 // Form mask to form B from r
1442(p10) br.cond.spnt TANL_SMALL_R // Branch if 0 < |x| < 1/4
1443(p8) br.cond.spnt TANL_NORMAL_R // Branch if 1/4 <= |x| < pi/4
1444}
1445;;
1446
1447// Here if pi/4 <= |x| < 2^24
1448//
1449// Case 1: PR_3 is only affected when PR_1 is set.
1450//
1451//
1452// Case 2: w = N * P_2
1453// Case 2: s_val = -N * P_1 + Arg
1454//
1455
1456{ .mfi
1457 nop.m 999
1458 fnma.s1 s_val = N, P_1, Norm_Arg
1459 nop.i 999
1460}
1461{ .mfi
1462 nop.m 999
1463 fmpy.s1 w = N, P_2 // w = N * P_2 for |s| >= 2^-33
1464 nop.i 999
1465}
1466;;
1467
1468// Case 2_reduce: w = N * P_3 (change sign)
1469{ .mfi
1470 nop.m 999
1471 fmpy.s1 w2 = N, P_3 // w = N * P_3 for |s| < 2^-33
1472 nop.i 999
1473}
1474;;
1475
1476// Case 1_reduce: r = s + w (change sign)
1477{ .mfi
1478 nop.m 999
1479 fsub.s1 r = s_val, w // r = s_val - w for |s| >= 2^-33
1480 nop.i 999
1481}
1482;;
1483
1484// Case 2_reduce: U_1 = N * P_2 + w
1485{ .mfi
1486 nop.m 999
1487 fma.s1 U_1 = N, P_2, w2 // U_1 = N * P_2 + w for |s| < 2^-33
1488 nop.i 999
1489}
1490;;
1491
1492//
1493// Decide between case_1 and case_2 reduce:
1494// Case 1_reduce: |s| >= 2**(-33)
1495// Case 2_reduce: |s| < 2**(-33)
1496//
1497{ .mfi
1498 nop.m 999
1499 fcmp.lt.s1 p9, p8 = s_val, TWO_TO_NEG33
1500 nop.i 999
1501}
1502;;
1503
1504{ .mfi
1505 nop.m 999
1506(p9) fcmp.gt.s1 p9, p8 = s_val, NEGTWO_TO_NEG33
1507 nop.i 999
1508}
1509;;
1510
1511// Case 1_reduce: c = s - r
1512{ .mfi
1513 nop.m 999
1514 fsub.s1 c = s_val, r // c = s_val - r for |s| >= 2^-33
1515 nop.i 999
1516}
1517;;
1518
1519// Case 2_reduce: r is complete here - continue to calculate c .
1520// r = s - U_1
1521{ .mfi
1522 nop.m 999
1523(p9) fsub.s1 r = s_val, U_1
1524 nop.i 999
1525}
1526{ .mfi
1527 nop.m 999
1528(p9) fms.s1 U_2 = N, P_2, U_1
1529 nop.i 999
1530}
1531;;
1532
1533//
1534// Case 1_reduce: Is |r| < 2**(-2), if so set PR_10
1535// else set PR_13.
1536//
1537
1538{ .mfi
1539 nop.m 999
1540 fand B = B_mask1, r
1541 nop.i 999
1542}
1543{ .mfi
1544 nop.m 999
1545(p8) fcmp.lt.unc.s1 p10, p13 = r, TWO_TO_NEG2
1546 nop.i 999
1547}
1548;;
1549
1550{ .mfi
1551(p8) getf.sig sig_r = r // Get signif of r if |s| >= 2^-33
1552 nop.f 999
1553 nop.i 999
1554}
1555;;
1556
1557{ .mfi
1558(p8) getf.exp exp_r = r // Extract signexp of r if |s| >= 2^-33
1559(p10) fcmp.gt.s1 p10, p13 = r, NEGTWO_TO_NEG2
1560 nop.i 999
1561}
1562;;
1563
1564// Case 1_reduce: c is complete here.
1565// Case 1: Branch to SMALL_R or NORMAL_R.
1566// c = c + w (w has not been negated.)
1567{ .mfi
1568 nop.m 999
1569(p8) fsub.s1 c = c, w // c = c - w for |s| >= 2^-33
1570 nop.i 999
1571}
1572{ .mbb
1573 nop.m 999
1574(p10) br.cond.spnt TANL_SMALL_R // Branch if pi/4 < |x| < 2^24 and |r|<1/4
1575(p13) br.cond.sptk TANL_NORMAL_R_A // Branch if pi/4 < |x| < 2^24 and |r|>=1/4
1576}
1577;;
1578
1579
1580// Here if pi/4 < |x| < 2^24 and |s| < 2^-33
1581//
1582// Is i_1 = lsb of N_fix_gr even or odd?
1583// if i_1 == 0, set p11, else set p12.
1584//
1585{ .mfi
1586 nop.m 999
1587 fsub.s1 s_val = s_val, r
1588 add N_fix_gr = N_fix_gr, cot_flag // N = N + 1 (for cotl)
1589}
1590{ .mfi
1591 nop.m 999
1592//
1593// Case 2_reduce:
1594// U_2 = N * P_2 - U_1
1595// Not needed until later.
1596//
1597 fadd.s1 U_2 = U_2, w2
1598//
1599// Case 2_reduce:
1600// s = s - r
1601// U_2 = U_2 + w
1602//
1603 nop.i 999
1604}
1605;;
1606
1607//
1608// Case 2_reduce:
1609// c = c - U_2
1610// c is complete here
1611// Argument reduction ends here.
1612//
1613{ .mfi
1614 nop.m 999
1615 fmpy.s1 rsq = r, r
1616 tbit.z p11, p12 = N_fix_gr, 0 ;; // Set p11 if N even, p12 if odd
1617}
1618
1619{ .mfi
1620 nop.m 999
1621(p12) frcpa.s1 S_hi,p0 = f1, r
1622 nop.i 999
1623}
1624{ .mfi
1625 nop.m 999
1626 fsub.s1 c = s_val, U_1
1627 nop.i 999
1628}
1629;;
1630
1631{ .mmi
1632 add table_ptr1 = 160, table_base ;; // Point to tanl_table_p1
1633 ldfe P1_1 = [table_ptr1],144
1634 nop.i 999 ;;
1635}
1636//
1637// Load P1_1 and point to Q1_1 .
1638//
1639{ .mfi
1640 ldfe Q1_1 = [table_ptr1]
1641//
1642// N even: rsq = r * Z
1643// N odd: S_hi = frcpa(r)
1644//
1645(p12) fmerge.ns S_hi = S_hi, S_hi
1646 nop.i 999
1647}
1648{ .mfi
1649 nop.m 999
1650//
1651// Case 2_reduce:
1652// c = s - U_1
1653//
1654(p9) fsub.s1 c = c, U_2
1655 nop.i 999 ;;
1656}
1657{ .mfi
1658 nop.m 999
1659(p12) fma.s1 poly1 = S_hi, r, f1
1660 nop.i 999 ;;
1661}
1662{ .mfi
1663 nop.m 999
1664//
1665// N odd: Change sign of S_hi
1666//
1667(p11) fmpy.s1 rsq = rsq, P1_1
1668 nop.i 999 ;;
1669}
1670{ .mfi
1671 nop.m 999
1672(p12) fma.s1 S_hi = S_hi, poly1, S_hi
1673 nop.i 999 ;;
1674}
1675{ .mfi
1676 nop.m 999
1677//
1678// N even: rsq = rsq * P1_1
1679// N odd: poly1 = 1.0 + S_hi * r 16 bits partial account for necessary
1680//
1681(p11) fma.s1 Poly = r, rsq, c
1682 nop.i 999 ;;
1683}
1684{ .mfi
1685 nop.m 999
1686//
1687// N even: Poly = c + r * rsq
1688// N odd: S_hi = S_hi + S_hi*poly1 16 bits account for necessary
1689//
1690(p12) fma.s1 poly1 = S_hi, r, f1
1691(p11) tbit.z.unc p14, p15 = cot_flag, 0 ;; // p14=1 for tanl; p15=1 for cotl
1692}
1693{ .mfi
1694 nop.m 999
1695//
1696// N even: Result = Poly + r
1697// N odd: poly1 = 1.0 + S_hi * r 32 bits partial
1698//
1699(p14) fadd.s0 Result = r, Poly // for tanl
1700 nop.i 999
1701}
1702{ .mfi
1703 nop.m 999
1704(p15) fms.s0 Result = r, mOne, Poly // for cotl
1705 nop.i 999
1706}
1707;;
1708
1709{ .mfi
1710 nop.m 999
1711(p12) fma.s1 S_hi = S_hi, poly1, S_hi
1712 nop.i 999 ;;
1713}
1714{ .mfi
1715 nop.m 999
1716//
1717// N even: Result1 = Result + r
1718// N odd: S_hi = S_hi * poly1 + S_hi 32 bits
1719//
1720(p12) fma.s1 poly1 = S_hi, r, f1
1721 nop.i 999 ;;
1722}
1723{ .mfi
1724 nop.m 999
1725//
1726// N odd: poly1 = S_hi * r + 1.0 64 bits partial
1727//
1728(p12) fma.s1 S_hi = S_hi, poly1, S_hi
1729 nop.i 999 ;;
1730}
1731{ .mfi
1732 nop.m 999
1733//
1734// N odd: poly1 = S_hi * poly + 1.0 64 bits
1735//
1736(p12) fma.s1 poly1 = S_hi, r, f1
1737 nop.i 999 ;;
1738}
1739{ .mfi
1740 nop.m 999
1741//
1742// N odd: poly1 = S_hi * r + 1.0
1743//
1744(p12) fma.s1 poly1 = S_hi, c, poly1
1745 nop.i 999 ;;
1746}
1747{ .mfi
1748 nop.m 999
1749//
1750// N odd: poly1 = S_hi * c + poly1
1751//
1752(p12) fmpy.s1 S_lo = S_hi, poly1
1753 nop.i 999 ;;
1754}
1755{ .mfi
1756 nop.m 999
1757//
1758// N odd: S_lo = S_hi * poly1
1759//
1760(p12) fma.s1 S_lo = Q1_1, r, S_lo
1761(p12) tbit.z.unc p14, p15 = cot_flag, 0 // p14=1 for tanl; p15=1 for cotl
1762}
1763{ .mfi
1764 nop.m 999
1765//
1766// N odd: Result = S_hi + S_lo
1767//
1768 fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact
1769 nop.i 999 ;;
1770}
1771{ .mfi
1772 nop.m 999
1773//
1774// N odd: S_lo = S_lo + Q1_1 * r
1775//
1776(p14) fadd.s0 Result = S_hi, S_lo // for tanl
1777 nop.i 999
1778}
1779{ .mfb
1780 nop.m 999
1781(p15) fms.s0 Result = S_hi, mOne, S_lo // for cotl
1782 br.ret.sptk b0 ;; // Exit for pi/4 <= |x| < 2^24 and |s| < 2^-33
1783}
1784
1785
1786TANL_LARGER_ARG:
1787// Here if 2^24 <= |x| < 2^63
1788//
1789// ARGUMENT REDUCTION CODE - CASE 3 and 4
1790//
1791
1792{ .mmf
1793 mov GR_exp_2tom14 = 0xffff - 14 // Form signexp of 2^-14
1794 mov GR_exp_m2tom14 = 0x2ffff - 14 // Form signexp of -2^-14
1795 fmpy.s1 N_0 = Norm_Arg, Inv_P_0
1796}
1797;;
1798
1799{ .mmi
1800 setf.exp TWO_TO_NEG14 = GR_exp_2tom14 // Form 2^-14
1801 setf.exp NEGTWO_TO_NEG14 = GR_exp_m2tom14// Form -2^-14
1802 nop.i 999
1803}
1804;;
1805
1806
1807//
1808// Adjust table_ptr1 to beginning of table.
1809// N_0 = Arg * Inv_P_0
1810//
1811{ .mmi
1812 add table_ptr2 = 144, table_base ;; // Point to 2^-2
1813 ldfps TWO_TO_NEG2, NEGTWO_TO_NEG2 = [table_ptr2]
1814 nop.i 999
1815}
1816;;
1817
1818//
1819// N_0_fix = integer part of N_0 .
1820//
1821//
1822// Make N_0 the integer part.
1823//
1824{ .mfi
1825 nop.m 999
1826 fcvt.fx.s1 N_0_fix = N_0
1827 nop.i 999 ;;
1828}
1829{ .mfi
1830 setf.sig B_mask1 = bmask1 // Form mask to get 5 msb of r
1831 fcvt.xf N_0 = N_0_fix
1832 nop.i 999 ;;
1833}
1834{ .mfi
1835 setf.sig B_mask2 = bmask2 // Form mask to form B from r
1836 fnma.s1 ArgPrime = N_0, P_0, Norm_Arg
1837 nop.i 999
1838}
1839{ .mfi
1840 nop.m 999
1841 fmpy.s1 w = N_0, d_1
1842 nop.i 999 ;;
1843}
1844//
1845// ArgPrime = -N_0 * P_0 + Arg
1846// w = N_0 * d_1
1847//
1848//
1849// N = ArgPrime * 2/pi
1850//
1851// fcvt.fx.s1 N_fix = N
1852// Use special scaling to right shift so N=Arg * 2/pi is in rightmost bits
1853// Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24
1854{ .mfi
1855 nop.m 999
1856 fma.s1 N_fix = ArgPrime, FR_inv_pi_2to63, FR_rshf_2to64
1857
1858 nop.i 999 ;;
1859}
1860// Convert integer N_fix back to normalized floating-point value.
1861{ .mfi
1862 nop.m 999
1863 fms.s1 N = N_fix, FR_2tom64, FR_rshf // Use scaling to get N floated
1864 nop.i 999
1865}
1866;;
1867
1868//
1869// N is the integer part of the reduced-reduced argument.
1870// Put the integer in a GP register.
1871//
1872{ .mfi
1873 getf.sig N_fix_gr = N_fix
1874 nop.f 999
1875 nop.i 999
1876}
1877;;
1878
1879//
1880// s_val = -N*P_1 + ArgPrime
1881// w = -N*P_2 + w
1882//
1883{ .mfi
1884 nop.m 999
1885 fnma.s1 s_val = N, P_1, ArgPrime
1886 nop.i 999
1887}
1888{ .mfi
1889 nop.m 999
1890 fnma.s1 w = N, P_2, w
1891 nop.i 999
1892}
1893;;
1894
1895// Case 4: V_hi = N * P_2
1896// Case 4: U_hi = N_0 * d_1
1897{ .mfi
1898 nop.m 999
1899 fmpy.s1 V_hi = N, P_2 // V_hi = N * P_2 for |s| < 2^-14
1900 nop.i 999
1901}
1902{ .mfi
1903 nop.m 999
1904 fmpy.s1 U_hi = N_0, d_1 // U_hi = N_0 * d_1 for |s| < 2^-14
1905 nop.i 999
1906}
1907;;
1908
1909// Case 3: r = s_val + w (Z complete)
1910// Case 4: w = N * P_3
1911{ .mfi
1912 nop.m 999
1913 fadd.s1 r = s_val, w // r = s_val + w for |s| >= 2^-14
1914 nop.i 999
1915}
1916{ .mfi
1917 nop.m 999
1918 fmpy.s1 w2 = N, P_3 // w = N * P_3 for |s| < 2^-14
1919 nop.i 999
1920}
1921;;
1922
1923// Case 4: A = U_hi + V_hi
1924// Note: Worry about switched sign of V_hi, so subtract instead of add.
1925// Case 4: V_lo = -N * P_2 - V_hi (U_hi is in place of V_hi in writeup)
1926// Note: the (-) is still missing for V_hi.
1927{ .mfi
1928 nop.m 999
1929 fsub.s1 A = U_hi, V_hi // A = U_hi - V_hi for |s| < 2^-14
1930 nop.i 999
1931}
1932{ .mfi
1933 nop.m 999
1934 fnma.s1 V_lo = N, P_2, V_hi // V_lo = V_hi - N * P_2 for |s| < 2^-14
1935 nop.i 999
1936}
1937;;
1938
1939// Decide between case 3 and 4:
1940// Case 3: |s| >= 2**(-14) Set p10
1941// Case 4: |s| < 2**(-14) Set p11
1942//
1943// Case 4: U_lo = N_0 * d_1 - U_hi
1944{ .mfi
1945 nop.m 999
1946 fms.s1 U_lo = N_0, d_1, U_hi // U_lo = N_0*d_1 - U_hi for |s| < 2^-14
1947 nop.i 999
1948}
1949{ .mfi
1950 nop.m 999
1951 fcmp.lt.s1 p11, p10 = s_val, TWO_TO_NEG14
1952 nop.i 999
1953}
1954;;
1955
1956// Case 4: We need abs of both U_hi and V_hi - dont
1957// worry about switched sign of V_hi.
1958{ .mfi
1959 nop.m 999
1960 fabs V_hiabs = V_hi // |V_hi| for |s| < 2^-14
1961 nop.i 999
1962}
1963{ .mfi
1964 nop.m 999
1965(p11) fcmp.gt.s1 p11, p10 = s_val, NEGTWO_TO_NEG14
1966 nop.i 999
1967}
1968;;
1969
1970// Case 3: c = s_val - r
1971{ .mfi
1972 nop.m 999
1973 fabs U_hiabs = U_hi // |U_hi| for |s| < 2^-14
1974 nop.i 999
1975}
1976{ .mfi
1977 nop.m 999
1978 fsub.s1 c = s_val, r // c = s_val - r for |s| >= 2^-14
1979 nop.i 999
1980}
1981;;
1982
1983// For Case 3, |s| >= 2^-14, determine if |r| < 1/4
1984//
1985// Case 4: C_hi = s_val + A
1986//
1987{ .mfi
1988 nop.m 999
1989(p11) fadd.s1 C_hi = s_val, A // C_hi = s_val + A for |s| < 2^-14
1990 nop.i 999
1991}
1992{ .mfi
1993 nop.m 999
1994(p10) fcmp.lt.unc.s1 p14, p15 = r, TWO_TO_NEG2
1995 nop.i 999
1996}
1997;;
1998
1999{ .mfi
2000 getf.sig sig_r = r // Get signif of r if |s| >= 2^-33
2001 fand B = B_mask1, r
2002 nop.i 999
2003}
2004;;
2005
2006// Case 4: t = U_lo + V_lo
2007{ .mfi
2008 getf.exp exp_r = r // Extract signexp of r if |s| >= 2^-33
2009(p11) fadd.s1 t = U_lo, V_lo // t = U_lo + V_lo for |s| < 2^-14
2010 nop.i 999
2011}
2012{ .mfi
2013 nop.m 999
2014(p14) fcmp.gt.s1 p14, p15 = r, NEGTWO_TO_NEG2
2015 nop.i 999
2016}
2017;;
2018
2019// Case 3: c = (s - r) + w (c complete)
2020{ .mfi
2021 nop.m 999
2022(p10) fadd.s1 c = c, w // c = c + w for |s| >= 2^-14
2023 nop.i 999
2024}
2025{ .mbb
2026 nop.m 999
2027(p14) br.cond.spnt TANL_SMALL_R // Branch if 2^24 <= |x| < 2^63 and |r|< 1/4
2028(p15) br.cond.sptk TANL_NORMAL_R_A // Branch if 2^24 <= |x| < 2^63 and |r|>=1/4
2029}
2030;;
2031
2032
2033// Here if 2^24 <= |x| < 2^63 and |s| < 2^-14 >>>>>>> Case 4.
2034//
2035// Case 4: Set P_12 if U_hiabs >= V_hiabs
2036// Case 4: w = w + N_0 * d_2
2037// Note: the (-) is now incorporated in w .
2038{ .mfi
2039 add table_ptr1 = 160, table_base // Point to tanl_table_p1
2040 fcmp.ge.unc.s1 p12, p13 = U_hiabs, V_hiabs
2041 nop.i 999
2042}
2043{ .mfi
2044 nop.m 999
2045 fms.s1 w2 = N_0, d_2, w2
2046 nop.i 999
2047}
2048;;
2049
2050// Case 4: C_lo = s_val - C_hi
2051{ .mfi
2052 ldfe P1_1 = [table_ptr1], 16 // Load P1_1
2053 fsub.s1 C_lo = s_val, C_hi
2054 nop.i 999
2055}
2056;;
2057
2058//
2059// Case 4: a = U_hi - A
2060// a = V_hi - A (do an add to account for missing (-) on V_hi
2061//
2062{ .mfi
2063 ldfe P1_2 = [table_ptr1], 128 // Load P1_2
2064(p12) fsub.s1 a = U_hi, A
2065 nop.i 999
2066}
2067{ .mfi
2068 nop.m 999
2069(p13) fadd.s1 a = V_hi, A
2070 nop.i 999
2071}
2072;;
2073
2074// Case 4: t = U_lo + V_lo + w
2075{ .mfi
2076 ldfe Q1_1 = [table_ptr1], 16 // Load Q1_1
2077 fadd.s1 t = t, w2
2078 nop.i 999
2079}
2080;;
2081
2082// Case 4: a = (U_hi - A) + V_hi
2083// a = (V_hi - A) + U_hi
2084// In each case account for negative missing form V_hi .
2085//
2086{ .mfi
2087 ldfe Q1_2 = [table_ptr1], 16 // Load Q1_2
2088(p12) fsub.s1 a = a, V_hi
2089 nop.i 999
2090}
2091{ .mfi
2092 nop.m 999
2093(p13) fsub.s1 a = U_hi, a
2094 nop.i 999
2095}
2096;;
2097
2098//
2099// Case 4: C_lo = (s_val - C_hi) + A
2100//
2101{ .mfi
2102 nop.m 999
2103 fadd.s1 C_lo = C_lo, A
2104 nop.i 999 ;;
2105}
2106//
2107// Case 4: t = t + a
2108//
2109{ .mfi
2110 nop.m 999
2111 fadd.s1 t = t, a
2112 nop.i 999
2113}
2114;;
2115
2116// Case 4: C_lo = C_lo + t
2117// Case 4: r = C_hi + C_lo
2118{ .mfi
2119 nop.m 999
2120 fadd.s1 C_lo = C_lo, t
2121 nop.i 999
2122}
2123;;
2124
2125{ .mfi
2126 nop.m 999
2127 fadd.s1 r = C_hi, C_lo
2128 nop.i 999
2129}
2130;;
2131
2132//
2133// Case 4: c = C_hi - r
2134//
2135{ .mfi
2136 nop.m 999
2137 fsub.s1 c = C_hi, r
2138 nop.i 999
2139}
2140{ .mfi
2141 nop.m 999
2142 fmpy.s1 rsq = r, r
2143 add N_fix_gr = N_fix_gr, cot_flag // N = N + 1 (for cotl)
2144}
2145;;
2146
2147// Case 4: c = c + C_lo finished.
2148//
2149// Is i_1 = lsb of N_fix_gr even or odd?
2150// if i_1 == 0, set PR_11, else set PR_12.
2151//
2152{ .mfi
2153 nop.m 999
2154 fadd.s1 c = c , C_lo
2155 tbit.z p11, p12 = N_fix_gr, 0
2156}
2157;;
2158
2159// r and c have been computed.
2160{ .mfi
2161 nop.m 999
2162(p12) frcpa.s1 S_hi, p0 = f1, r
2163 nop.i 999
2164}
2165{ .mfi
2166 nop.m 999
2167//
2168// N odd: Change sign of S_hi
2169//
2170(p11) fma.s1 Poly = rsq, P1_2, P1_1
2171 nop.i 999 ;;
2172}
2173{ .mfi
2174 nop.m 999
2175(p12) fma.s1 P = rsq, Q1_2, Q1_1
2176 nop.i 999
2177}
2178{ .mfi
2179 nop.m 999
2180//
2181// N odd: Result = S_hi + S_lo (User supplied rounding mode for C1)
2182//
2183 fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact
2184 nop.i 999 ;;
2185}
2186{ .mfi
2187 nop.m 999
2188//
2189// N even: rsq = r * r
2190// N odd: S_hi = frcpa(r)
2191//
2192(p12) fmerge.ns S_hi = S_hi, S_hi
2193 nop.i 999
2194}
2195{ .mfi
2196 nop.m 999
2197//
2198// N even: rsq = rsq * P1_2 + P1_1
2199// N odd: poly1 = 1.0 + S_hi * r 16 bits partial account for necessary
2200//
2201(p11) fmpy.s1 Poly = rsq, Poly
2202 nop.i 999 ;;
2203}
2204{ .mfi
2205 nop.m 999
2206(p12) fma.s1 poly1 = S_hi, r,f1
2207(p11) tbit.z.unc p14, p15 = cot_flag, 0 // p14=1 for tanl; p15=1 for cotl
2208}
2209{ .mfi
2210 nop.m 999
2211//
2212// N even: Poly = Poly * rsq
2213// N odd: S_hi = S_hi + S_hi*poly1 16 bits account for necessary
2214//
2215(p11) fma.s1 Poly = r, Poly, c
2216 nop.i 999 ;;
2217}
2218{ .mfi
2219 nop.m 999
2220(p12) fma.s1 S_hi = S_hi, poly1, S_hi
2221 nop.i 999
2222}
2223{ .mfi
2224 nop.m 999
2225//
2226// N odd: S_hi = S_hi * poly1 + S_hi 32 bits
2227//
2228(p14) fadd.s0 Result = r, Poly // for tanl
2229 nop.i 999 ;;
2230}
2231
2232.pred.rel "mutex",p15,p12
2233{ .mfi
2234 nop.m 999
2235(p15) fms.s0 Result = r, mOne, Poly // for cotl
2236 nop.i 999
2237}
2238{ .mfi
2239 nop.m 999
2240(p12) fma.s1 poly1 = S_hi, r, f1
2241 nop.i 999 ;;
2242}
2243{ .mfi
2244 nop.m 999
2245//
2246// N even: Poly = Poly * r + c
2247// N odd: poly1 = 1.0 + S_hi * r 32 bits partial
2248//
2249(p12) fma.s1 S_hi = S_hi, poly1, S_hi
2250 nop.i 999 ;;
2251}
2252{ .mfi
2253 nop.m 999
2254(p12) fma.s1 poly1 = S_hi, r, f1
2255 nop.i 999 ;;
2256}
2257{ .mfi
2258 nop.m 999
2259//
2260// N even: Result = Poly + r (Rounding mode S0)
2261// N odd: poly1 = S_hi * r + 1.0 64 bits partial
2262//
2263(p12) fma.s1 S_hi = S_hi, poly1, S_hi
2264 nop.i 999 ;;
2265}
2266{ .mfi
2267 nop.m 999
2268//
2269// N odd: poly1 = S_hi * poly + S_hi 64 bits
2270//
2271(p12) fma.s1 poly1 = S_hi, r, f1
2272 nop.i 999 ;;
2273}
2274{ .mfi
2275 nop.m 999
2276//
2277// N odd: poly1 = S_hi * r + 1.0
2278//
2279(p12) fma.s1 poly1 = S_hi, c, poly1
2280 nop.i 999 ;;
2281}
2282{ .mfi
2283 nop.m 999
2284//
2285// N odd: poly1 = S_hi * c + poly1
2286//
2287(p12) fmpy.s1 S_lo = S_hi, poly1
2288 nop.i 999 ;;
2289}
2290{ .mfi
2291 nop.m 999
2292//
2293// N odd: S_lo = S_hi * poly1
2294//
2295(p12) fma.s1 S_lo = P, r, S_lo
2296(p12) tbit.z.unc p14, p15 = cot_flag, 0 ;; // p14=1 for tanl; p15=1 for cotl
2297}
2298
2299{ .mfi
2300 nop.m 999
2301(p14) fadd.s0 Result = S_hi, S_lo // for tanl
2302 nop.i 999
2303}
2304{ .mfb
2305 nop.m 999
2306//
2307// N odd: S_lo = S_lo + r * P
2308//
2309(p15) fms.s0 Result = S_hi, mOne, S_lo // for cotl
2310 br.ret.sptk b0 ;; // Exit for 2^24 <= |x| < 2^63 and |s| < 2^-14
2311}
2312
2313
2314TANL_SMALL_R:
2315// Here if |r| < 1/4
2316// r and c have been computed.
2317// *****************************************************************
2318// *****************************************************************
2319// *****************************************************************
2320// N odd: S_hi = frcpa(r)
2321// Get [i_1] - lsb of N_fix_gr. Set p11 if N even, p12 if N odd.
2322// N even: rsq = r * r
2323{ .mfi
2324 add table_ptr1 = 160, table_base // Point to tanl_table_p1
2325 frcpa.s1 S_hi, p0 = f1, r // S_hi for N odd
2326 add N_fix_gr = N_fix_gr, cot_flag // N = N + 1 (for cotl)
2327}
2328{ .mfi
2329 add table_ptr2 = 400, table_base // Point to Q1_7
2330 fmpy.s1 rsq = r, r
2331 nop.i 999
2332}
2333;;
2334
2335{ .mmi
2336 ldfe P1_1 = [table_ptr1], 16
2337;;
2338 ldfe P1_2 = [table_ptr1], 16
2339 tbit.z p11, p12 = N_fix_gr, 0
2340}
2341;;
2342
2343
2344{ .mfi
2345 ldfe P1_3 = [table_ptr1], 96
2346 nop.f 999
2347 nop.i 999
2348}
2349;;
2350
2351{ .mfi
2352(p11) ldfe P1_9 = [table_ptr1], -16
2353(p12) fmerge.ns S_hi = S_hi, S_hi
2354 nop.i 999
2355}
2356{ .mfi
2357 nop.m 999
2358(p11) fmpy.s1 r_to_the_8 = rsq, rsq
2359 nop.i 999
2360}
2361;;
2362
2363//
2364// N even: Poly2 = P1_7 + Poly2 * rsq
2365// N odd: poly2 = Q1_5 + poly2 * rsq
2366//
2367{ .mfi
2368(p11) ldfe P1_8 = [table_ptr1], -16
2369(p11) fadd.s1 CORR = rsq, f1
2370 nop.i 999
2371}
2372;;
2373
2374//
2375// N even: Poly1 = P1_2 + P1_3 * rsq
2376// N odd: poly1 = 1.0 + S_hi * r
2377// 16 bits partial account for necessary (-1)
2378//
2379{ .mmi
2380(p11) ldfe P1_7 = [table_ptr1], -16
2381;;
2382(p11) ldfe P1_6 = [table_ptr1], -16
2383 nop.i 999
2384}
2385;;
2386
2387//
2388// N even: Poly1 = P1_1 + Poly1 * rsq
2389// N odd: S_hi = S_hi + S_hi * poly1) 16 bits account for necessary
2390//
2391//
2392// N even: Poly2 = P1_5 + Poly2 * rsq
2393// N odd: poly2 = Q1_3 + poly2 * rsq
2394//
2395{ .mfi
2396(p11) ldfe P1_5 = [table_ptr1], -16
2397(p11) fmpy.s1 r_to_the_8 = r_to_the_8, r_to_the_8
2398 nop.i 999
2399}
2400{ .mfi
2401 nop.m 999
2402(p12) fma.s1 poly1 = S_hi, r, f1
2403 nop.i 999
2404}
2405;;
2406
2407//
2408// N even: Poly1 = Poly1 * rsq
2409// N odd: poly1 = 1.0 + S_hi * r 32 bits partial
2410//
2411
2412//
2413// N even: CORR = CORR * c
2414// N odd: S_hi = S_hi * poly1 + S_hi 32 bits
2415//
2416
2417//
2418// N even: Poly2 = P1_6 + Poly2 * rsq
2419// N odd: poly2 = Q1_4 + poly2 * rsq
2420//
2421
2422{ .mmf
2423(p11) ldfe P1_4 = [table_ptr1], -16
2424 nop.m 999
2425(p11) fmpy.s1 CORR = CORR, c
2426}
2427;;
2428
2429{ .mfi
2430 nop.m 999
2431(p11) fma.s1 Poly1 = P1_3, rsq, P1_2
2432 nop.i 999 ;;
2433}
2434{ .mfi
2435(p12) ldfe Q1_7 = [table_ptr2], -16
2436(p12) fma.s1 S_hi = S_hi, poly1, S_hi
2437 nop.i 999 ;;
2438}
2439{ .mfi
2440(p12) ldfe Q1_6 = [table_ptr2], -16
2441(p11) fma.s1 Poly2 = P1_9, rsq, P1_8
2442 nop.i 999 ;;
2443}
2444{ .mmi
2445(p12) ldfe Q1_5 = [table_ptr2], -16 ;;
2446(p12) ldfe Q1_4 = [table_ptr2], -16
2447 nop.i 999 ;;
2448}
2449{ .mfi
2450(p12) ldfe Q1_3 = [table_ptr2], -16
2451//
2452// N even: Poly2 = P1_8 + P1_9 * rsq
2453// N odd: poly2 = Q1_6 + Q1_7 * rsq
2454//
2455(p11) fma.s1 Poly1 = Poly1, rsq, P1_1
2456 nop.i 999 ;;
2457}
2458{ .mfi
2459(p12) ldfe Q1_2 = [table_ptr2], -16
2460(p12) fma.s1 poly1 = S_hi, r, f1
2461 nop.i 999 ;;
2462}
2463{ .mfi
2464(p12) ldfe Q1_1 = [table_ptr2], -16
2465(p11) fma.s1 Poly2 = Poly2, rsq, P1_7
2466 nop.i 999 ;;
2467}
2468{ .mfi
2469 nop.m 999
2470//
2471// N even: CORR = rsq + 1
2472// N even: r_to_the_8 = rsq * rsq
2473//
2474(p11) fmpy.s1 Poly1 = Poly1, rsq
2475 nop.i 999 ;;
2476}
2477{ .mfi
2478 nop.m 999
2479(p12) fma.s1 S_hi = S_hi, poly1, S_hi
2480 nop.i 999
2481}
2482{ .mfi
2483 nop.m 999
2484(p12) fma.s1 poly2 = Q1_7, rsq, Q1_6
2485 nop.i 999 ;;
2486}
2487{ .mfi
2488 nop.m 999
2489(p11) fma.s1 Poly2 = Poly2, rsq, P1_6
2490 nop.i 999 ;;
2491}
2492{ .mfi
2493 nop.m 999
2494(p12) fma.s1 poly1 = S_hi, r, f1
2495 nop.i 999
2496}
2497{ .mfi
2498 nop.m 999
2499(p12) fma.s1 poly2 = poly2, rsq, Q1_5
2500 nop.i 999 ;;
2501}
2502{ .mfi
2503 nop.m 999
2504(p11) fma.s1 Poly2= Poly2, rsq, P1_5
2505 nop.i 999 ;;
2506}
2507{ .mfi
2508 nop.m 999
2509(p12) fma.s1 S_hi = S_hi, poly1, S_hi
2510 nop.i 999
2511}
2512{ .mfi
2513 nop.m 999
2514(p12) fma.s1 poly2 = poly2, rsq, Q1_4
2515 nop.i 999 ;;
2516}
2517{ .mfi
2518 nop.m 999
2519//
2520// N even: r_to_the_8 = r_to_the_8 * r_to_the_8
2521// N odd: poly1 = S_hi * r + 1.0 64 bits partial
2522//
2523(p11) fma.s1 Poly2 = Poly2, rsq, P1_4
2524 nop.i 999 ;;
2525}
2526{ .mfi
2527 nop.m 999
2528//
2529// N even: Poly = CORR + Poly * r
2530// N odd: P = Q1_1 + poly2 * rsq
2531//
2532(p12) fma.s1 poly1 = S_hi, r, f1
2533 nop.i 999
2534}
2535{ .mfi
2536 nop.m 999
2537(p12) fma.s1 poly2 = poly2, rsq, Q1_3
2538 nop.i 999 ;;
2539}
2540{ .mfi
2541 nop.m 999
2542//
2543// N even: Poly2 = P1_4 + Poly2 * rsq
2544// N odd: poly2 = Q1_2 + poly2 * rsq
2545//
2546(p11) fma.s1 Poly = Poly2, r_to_the_8, Poly1
2547 nop.i 999 ;;
2548}
2549{ .mfi
2550 nop.m 999
2551(p12) fma.s1 poly1 = S_hi, c, poly1
2552 nop.i 999
2553}
2554{ .mfi
2555 nop.m 999
2556(p12) fma.s1 poly2 = poly2, rsq, Q1_2
2557 nop.i 999 ;;
2558}
2559
2560{ .mfi
2561 nop.m 999
2562//
2563// N even: Poly = Poly1 + Poly2 * r_to_the_8
2564// N odd: S_hi = S_hi * poly1 + S_hi 64 bits
2565//
2566(p11) fma.s1 Poly = Poly, r, CORR
2567 nop.i 999 ;;
2568}
2569{ .mfi
2570 nop.m 999
2571//
2572// N even: Result = r + Poly (User supplied rounding mode)
2573// N odd: poly1 = S_hi * c + poly1
2574//
2575(p12) fmpy.s1 S_lo = S_hi, poly1
2576(p11) tbit.z.unc p14, p15 = cot_flag, 0 // p14=1 for tanl; p15=1 for cotl
2577}
2578{ .mfi
2579 nop.m 999
2580(p12) fma.s1 P = poly2, rsq, Q1_1
2581 nop.i 999 ;;
2582}
2583{ .mfi
2584 nop.m 999
2585//
2586// N odd: poly1 = S_hi * r + 1.0
2587//
2588//
2589// N odd: S_lo = S_hi * poly1
2590//
2591(p14) fadd.s0 Result = Poly, r // for tanl
2592 nop.i 999
2593}
2594{ .mfi
2595 nop.m 999
2596(p15) fms.s0 Result = Poly, mOne, r // for cotl
2597 nop.i 999 ;;
2598}
2599
2600{ .mfi
2601 nop.m 999
2602//
2603// N odd: S_lo = Q1_1 * c + S_lo
2604//
2605(p12) fma.s1 S_lo = Q1_1, c, S_lo
2606 nop.i 999
2607}
2608{ .mfi
2609 nop.m 999
2610 fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact
2611 nop.i 999 ;;
2612}
2613{ .mfi
2614 nop.m 999
2615//
2616// N odd: Result = S_lo + r * P
2617//
2618(p12) fma.s1 Result = P, r, S_lo
2619(p12) tbit.z.unc p14, p15 = cot_flag, 0 ;; // p14=1 for tanl; p15=1 for cotl
2620}
2621
2622//
2623// N odd: Result = Result + S_hi (user supplied rounding mode)
2624//
2625{ .mfi
2626 nop.m 999
2627(p14) fadd.s0 Result = Result, S_hi // for tanl
2628 nop.i 999
2629}
2630{ .mfb
2631 nop.m 999
2632(p15) fms.s0 Result = Result, mOne, S_hi // for cotl
2633 br.ret.sptk b0 ;; // Exit |r| < 1/4 path
2634}
2635
2636
2637TANL_NORMAL_R:
2638// Here if 1/4 <= |x| < pi/4 or if |x| >= 2^63 and |r| >= 1/4
2639// *******************************************************************
2640// *******************************************************************
2641// *******************************************************************
2642//
2643// r and c have been computed.
2644//
2645{ .mfi
2646 nop.m 999
2647 fand B = B_mask1, r
2648 nop.i 999
2649}
2650;;
2651
2652TANL_NORMAL_R_A:
2653// Enter here if pi/4 <= |x| < 2^63 and |r| >= 1/4
2654// Get the 5 bits or r for the lookup. 1.xxxxx ....
2655{ .mmi
2656 add table_ptr1 = 416, table_base // Point to tanl_table_p2
2657 mov GR_exp_2tom65 = 0xffff - 65 // Scaling constant for B
2658 extr.u lookup = sig_r, 58, 5
2659}
2660;;
2661
2662{ .mmi
2663 ldfe P2_1 = [table_ptr1], 16
2664 setf.exp TWO_TO_NEG65 = GR_exp_2tom65 // 2^-65 for scaling B if exp_r=-2
2665 add N_fix_gr = N_fix_gr, cot_flag // N = N + 1 (for cotl)
2666}
2667;;
2668
2669.pred.rel "mutex",p11,p12
2670// B = 2^63 * 1.xxxxx 100...0
2671{ .mfi
2672 ldfe P2_2 = [table_ptr1], 16
2673 for B = B_mask2, B
2674 mov table_offset = 512 // Assume table offset is 512
2675}
2676;;
2677
2678{ .mfi
2679 ldfe P2_3 = [table_ptr1], 16
2680 fmerge.s Pos_r = f1, r
2681 tbit.nz p8,p9 = exp_r, 0
2682}
2683;;
2684
2685// Is B = 2** -2 or B= 2** -1? If 2**-1, then
2686// we want an offset of 512 for table addressing.
2687{ .mii
2688 add table_ptr2 = 1296, table_base // Point to tanl_table_cm2
2689(p9) shladd table_offset = lookup, 4, table_offset
2690(p8) shladd table_offset = lookup, 4, r0
2691}
2692;;
2693
2694{ .mmi
2695 add table_ptr1 = table_ptr1, table_offset // Point to T_hi
2696 add table_ptr2 = table_ptr2, table_offset // Point to C_hi
2697 add table_ptr3 = 2128, table_base // Point to tanl_table_scim2
2698}
2699;;
2700
2701{ .mmi
2702 ldfd T_hi = [table_ptr1], 8 // Load T_hi
2703;;
2704 ldfd C_hi = [table_ptr2], 8 // Load C_hi
2705 add table_ptr3 = table_ptr3, table_offset // Point to SC_inv
2706}
2707;;
2708
2709//
2710// x = |r| - B
2711//
2712// Convert B so it has the same exponent as Pos_r before subtracting
2713{ .mfi
2714 ldfs T_lo = [table_ptr1] // Load T_lo
2715(p9) fnma.s1 x = B, FR_2tom64, Pos_r
2716 nop.i 999
2717}
2718{ .mfi
2719 nop.m 999
2720(p8) fnma.s1 x = B, TWO_TO_NEG65, Pos_r
2721 nop.i 999
2722}
2723;;
2724
2725{ .mfi
2726 ldfs C_lo = [table_ptr2] // Load C_lo
2727 nop.f 999
2728 nop.i 999
2729}
2730;;
2731
2732{ .mfi
2733 ldfe SC_inv = [table_ptr3] // Load SC_inv
2734 fmerge.s sgn_r = r, f1
2735 tbit.z p11, p12 = N_fix_gr, 0 // p11 if N even, p12 if odd
2736
2737}
2738;;
2739
2740//
2741// xsq = x * x
2742// N even: Tx = T_hi * x
2743//
2744// N even: Tx1 = Tx + 1
2745// N odd: Cx1 = 1 - Cx
2746//
2747
2748{ .mfi
2749 nop.m 999
2750 fmpy.s1 xsq = x, x
2751 nop.i 999
2752}
2753{ .mfi
2754 nop.m 999
2755(p11) fmpy.s1 Tx = T_hi, x
2756 nop.i 999
2757}
2758;;
2759
2760//
2761// N odd: Cx = C_hi * x
2762//
2763{ .mfi
2764 nop.m 999
2765(p12) fmpy.s1 Cx = C_hi, x
2766 nop.i 999
2767}
2768;;
2769//
2770// N even and odd: P = P2_3 + P2_2 * xsq
2771//
2772{ .mfi
2773 nop.m 999
2774 fma.s1 P = P2_3, xsq, P2_2
2775 nop.i 999
2776}
2777{ .mfi
2778 nop.m 999
2779(p11) fadd.s1 Tx1 = Tx, f1
2780 nop.i 999 ;;
2781}
2782{ .mfi
2783 nop.m 999
2784//
2785// N even: D = C_hi - tanx
2786// N odd: D = T_hi + tanx
2787//
2788(p11) fmpy.s1 CORR = SC_inv, T_hi
2789 nop.i 999
2790}
2791{ .mfi
2792 nop.m 999
2793 fmpy.s1 Sx = SC_inv, x
2794 nop.i 999 ;;
2795}
2796{ .mfi
2797 nop.m 999
2798(p12) fmpy.s1 CORR = SC_inv, C_hi
2799 nop.i 999 ;;
2800}
2801{ .mfi
2802 nop.m 999
2803(p12) fsub.s1 V_hi = f1, Cx
2804 nop.i 999 ;;
2805}
2806{ .mfi
2807 nop.m 999
2808 fma.s1 P = P, xsq, P2_1
2809 nop.i 999
2810}
2811{ .mfi
2812 nop.m 999
2813//
2814// N even and odd: P = P2_1 + P * xsq
2815//
2816(p11) fma.s1 V_hi = Tx, Tx1, f1
2817 nop.i 999 ;;
2818}
2819{ .mfi
2820 nop.m 999
2821//
2822// N even: Result = sgn_r * tail + T_hi (user rounding mode for C1)
2823// N odd: Result = sgn_r * tail + C_hi (user rounding mode for C1)
2824//
2825 fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact
2826 nop.i 999 ;;
2827}
2828{ .mfi
2829 nop.m 999
2830 fmpy.s1 CORR = CORR, c
2831 nop.i 999 ;;
2832}
2833{ .mfi
2834 nop.m 999
2835(p12) fnma.s1 V_hi = Cx,V_hi,f1
2836 nop.i 999 ;;
2837}
2838{ .mfi
2839 nop.m 999
2840//
2841// N even: V_hi = Tx * Tx1 + 1
2842// N odd: Cx1 = 1 - Cx * Cx1
2843//
2844 fmpy.s1 P = P, xsq
2845 nop.i 999
2846}
2847{ .mfi
2848 nop.m 999
2849//
2850// N even and odd: P = P * xsq
2851//
2852(p11) fmpy.s1 V_hi = V_hi, T_hi
2853 nop.i 999 ;;
2854}
2855{ .mfi
2856 nop.m 999
2857//
2858// N even and odd: tail = P * tail + V_lo
2859//
2860(p11) fmpy.s1 T_hi = sgn_r, T_hi
2861 nop.i 999 ;;
2862}
2863{ .mfi
2864 nop.m 999
2865 fmpy.s1 CORR = CORR, sgn_r
2866 nop.i 999 ;;
2867}
2868{ .mfi
2869 nop.m 999
2870(p12) fmpy.s1 V_hi = V_hi,C_hi
2871 nop.i 999 ;;
2872}
2873{ .mfi
2874 nop.m 999
2875//
2876// N even: V_hi = T_hi * V_hi
2877// N odd: V_hi = C_hi * V_hi
2878//
2879 fma.s1 tanx = P, x, x
2880 nop.i 999
2881}
2882{ .mfi
2883 nop.m 999
2884(p12) fnmpy.s1 C_hi = sgn_r, C_hi
2885 nop.i 999 ;;
2886}
2887{ .mfi
2888 nop.m 999
2889//
2890// N even: V_lo = 1 - V_hi + C_hi
2891// N odd: V_lo = 1 - V_hi + T_hi
2892//
2893(p11) fadd.s1 CORR = CORR, T_lo
2894 nop.i 999
2895}
2896{ .mfi
2897 nop.m 999
2898(p12) fsub.s1 CORR = CORR, C_lo
2899 nop.i 999 ;;
2900}
2901{ .mfi
2902 nop.m 999
2903//
2904// N even and odd: tanx = x + x * P
2905// N even and odd: Sx = SC_inv * x
2906//
2907(p11) fsub.s1 D = C_hi, tanx
2908 nop.i 999
2909}
2910{ .mfi
2911 nop.m 999
2912(p12) fadd.s1 D = T_hi, tanx
2913 nop.i 999 ;;
2914}
2915{ .mfi
2916 nop.m 999
2917//
2918// N odd: CORR = SC_inv * C_hi
2919// N even: CORR = SC_inv * T_hi
2920//
2921 fnma.s1 D = V_hi, D, f1
2922 nop.i 999 ;;
2923}
2924{ .mfi
2925 nop.m 999
2926//
2927// N even and odd: D = 1 - V_hi * D
2928// N even and odd: CORR = CORR * c
2929//
2930 fma.s1 V_hi = V_hi, D, V_hi
2931 nop.i 999 ;;
2932}
2933{ .mfi
2934 nop.m 999
2935//
2936// N even and odd: V_hi = V_hi + V_hi * D
2937// N even and odd: CORR = sgn_r * CORR
2938//
2939(p11) fnma.s1 V_lo = V_hi, C_hi, f1
2940 nop.i 999
2941}
2942{ .mfi
2943 nop.m 999
2944(p12) fnma.s1 V_lo = V_hi, T_hi, f1
2945 nop.i 999 ;;
2946}
2947{ .mfi
2948 nop.m 999
2949//
2950// N even: CORR = COOR + T_lo
2951// N odd: CORR = CORR - C_lo
2952//
2953(p11) fma.s1 V_lo = tanx, V_hi, V_lo
2954 tbit.nz p15, p0 = cot_flag, 0 // p15=1 if we compute cotl
2955}
2956{ .mfi
2957 nop.m 999
2958(p12) fnma.s1 V_lo = tanx, V_hi, V_lo
2959 nop.i 999 ;;
2960}
2961
2962{ .mfi
2963 nop.m 999
2964(p15) fms.s1 T_hi = f0, f0, T_hi // to correct result's sign for cotl
2965 nop.i 999
2966}
2967{ .mfi
2968 nop.m 999
2969(p15) fms.s1 C_hi = f0, f0, C_hi // to correct result's sign for cotl
2970 nop.i 999
2971};;
2972
2973{ .mfi
2974 nop.m 999
2975(p15) fms.s1 sgn_r = f0, f0, sgn_r // to correct result's sign for cotl
2976 nop.i 999
2977};;
2978
2979{ .mfi
2980 nop.m 999
2981//
2982// N even: V_lo = V_lo + V_hi * tanx
2983// N odd: V_lo = V_lo - V_hi * tanx
2984//
2985(p11) fnma.s1 V_lo = C_lo, V_hi, V_lo
2986 nop.i 999
2987}
2988{ .mfi
2989 nop.m 999
2990(p12) fnma.s1 V_lo = T_lo, V_hi, V_lo
2991 nop.i 999 ;;
2992}
2993{ .mfi
2994 nop.m 999
2995//
2996// N even: V_lo = V_lo - V_hi * C_lo
2997// N odd: V_lo = V_lo - V_hi * T_lo
2998//
2999 fmpy.s1 V_lo = V_hi, V_lo
3000 nop.i 999 ;;
3001}
3002{ .mfi
3003 nop.m 999
3004//
3005// N even and odd: V_lo = V_lo * V_hi
3006//
3007 fadd.s1 tail = V_hi, V_lo
3008 nop.i 999 ;;
3009}
3010{ .mfi
3011 nop.m 999
3012//
3013// N even and odd: tail = V_hi + V_lo
3014//
3015 fma.s1 tail = tail, P, V_lo
3016 nop.i 999 ;;
3017}
3018{ .mfi
3019 nop.m 999
3020//
3021// N even: T_hi = sgn_r * T_hi
3022// N odd : C_hi = -sgn_r * C_hi
3023//
3024 fma.s1 tail = tail, Sx, CORR
3025 nop.i 999 ;;
3026}
3027{ .mfi
3028 nop.m 999
3029//
3030// N even and odd: tail = Sx * tail + CORR
3031//
3032 fma.s1 tail = V_hi, Sx, tail
3033 nop.i 999 ;;
3034}
3035{ .mfi
3036 nop.m 999
3037//
3038// N even an odd: tail = Sx * V_hi + tail
3039//
3040(p11) fma.s0 Result = sgn_r, tail, T_hi
3041 nop.i 999
3042}
3043{ .mfb
3044 nop.m 999
3045(p12) fma.s0 Result = sgn_r, tail, C_hi
3046 br.ret.sptk b0 ;; // Exit for 1/4 <= |r| < pi/4
3047}
3048
3049TANL_DENORMAL:
3050// Here if x denormal
3051{ .mfb
3052 getf.exp GR_signexp_x = Norm_Arg // Get sign and exponent of x
3053 nop.f 999
3054 br.cond.sptk TANL_COMMON // Return to common code
3055}
3056;;
3057
3058
3059TANL_SPECIAL:
3060TANL_UNSUPPORTED:
3061//
3062// Code for NaNs, Unsupporteds, Infs, or +/- zero ?
3063// Invalid raised for Infs and SNaNs.
3064//
3065
3066{ .mfi
3067 nop.m 999
3068 fmerge.s f10 = f8, f8 // Save input for error call
3069 tbit.nz p6, p7 = cot_flag, 0 // p6=1 if we compute cotl
3070}
3071;;
3072
3073{ .mfi
3074 nop.m 999
3075(p6) fclass.m p6, p7 = f8, 0x7 // Test for zero (cotl only)
3076 nop.i 999
3077}
3078;;
3079
3080.pred.rel "mutex", p6, p7
3081{ .mfi
3082(p6) mov GR_Parameter_Tag = 225 // (cotl)
3083(p6) frcpa.s0 f8, p0 = f1, f8 // cotl(+-0) = +-Inf
3084 nop.i 999
3085}
3086{ .mfb
3087 nop.m 999
3088(p7) fmpy.s0 f8 = f8, f0
3089(p7) br.ret.sptk b0
3090}
3091;;
3092
3093GLOBAL_IEEE754_END(tanl)
3094
3095
3096LOCAL_LIBM_ENTRY(__libm_error_region)
3097.prologue
3098
3099// (1)
3100{ .mfi
3101 add GR_Parameter_Y=-32,sp // Parameter 2 value
3102 nop.f 0
3103.save ar.pfs,GR_SAVE_PFS
3104 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
3105}
3106{ .mfi
3107.fframe 64
3108 add sp=-64,sp // Create new stack
3109 nop.f 0
3110 mov GR_SAVE_GP=gp // Save gp
3111};;
3112
3113// (2)
3114{ .mmi
3115 stfe [GR_Parameter_Y] = f1,16 // STORE Parameter 2 on stack
3116 add GR_Parameter_X = 16,sp // Parameter 1 address
3117.save b0, GR_SAVE_B0
3118 mov GR_SAVE_B0=b0 // Save b0
3119};;
3120
3121.body
3122// (3)
3123{ .mib
3124 stfe [GR_Parameter_X] = f10 // STORE Parameter 1 on stack
3125 add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
3126 nop.b 0
3127}
3128{ .mib
3129 stfe [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
3130 add GR_Parameter_Y = -16,GR_Parameter_Y
3131 br.call.sptk b0=__libm_error_support# // Call error handling function
3132};;
3133{ .mmi
3134 nop.m 0
3135 nop.m 0
3136 add GR_Parameter_RESULT = 48,sp
3137};;
3138
3139// (4)
3140{ .mmi
3141 ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
3142.restore sp
3143 add sp = 64,sp // Restore stack pointer
3144 mov b0 = GR_SAVE_B0 // Restore return address
3145};;
3146{ .mib
3147 mov gp = GR_SAVE_GP // Restore gp
3148 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
3149 br.ret.sptk b0 // Return
3150};;
3151
3152LOCAL_LIBM_END(__libm_error_region)
3153
3154.type __libm_error_support#,@function
3155.global __libm_error_support#
3156
3157
3158// *******************************************************************
3159// *******************************************************************
3160// *******************************************************************
3161//
3162// Special Code to handle very large argument case.
3163// Call int __libm_pi_by_2_reduce(x,r,c) for |arguments| >= 2**63
3164// The interface is custom:
3165// On input:
3166// (Arg or x) is in f8
3167// On output:
3168// r is in f8
3169// c is in f9
3170// N is in r8
3171// We know also that __libm_pi_by_2_reduce preserves f10-15, f71-127. We
3172// use this to eliminate save/restore of key fp registers in this calling
3173// function.
3174//
3175// *******************************************************************
3176// *******************************************************************
3177// *******************************************************************
3178
3179LOCAL_LIBM_ENTRY(__libm_callout)
3180TANL_ARG_TOO_LARGE:
3181.prologue
3182{ .mfi
3183 add table_ptr2 = 144, table_base // Point to 2^-2
3184 nop.f 999
3185.save ar.pfs,GR_SAVE_PFS
3186 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
3187}
3188;;
3189
3190// Load 2^-2, -2^-2
3191{ .mmi
3192 ldfps TWO_TO_NEG2, NEGTWO_TO_NEG2 = [table_ptr2]
3193 setf.sig B_mask1 = bmask1 // Form mask to get 5 msb of r
3194.save b0, GR_SAVE_B0
3195 mov GR_SAVE_B0=b0 // Save b0
3196};;
3197
3198.body
3199//
3200// Call argument reduction with x in f8
3201// Returns with N in r8, r in f8, c in f9
3202// Assumes f71-127 are preserved across the call
3203//
3204{ .mib
3205 setf.sig B_mask2 = bmask2 // Form mask to form B from r
3206 mov GR_SAVE_GP=gp // Save gp
3207 br.call.sptk b0=__libm_pi_by_2_reduce#
3208}
3209;;
3210
3211//
3212// Is |r| < 2**(-2)
3213//
3214{ .mfi
3215 getf.sig sig_r = r // Extract significand of r
3216 fcmp.lt.s1 p6, p0 = r, TWO_TO_NEG2
3217 mov gp = GR_SAVE_GP // Restore gp
3218}
3219;;
3220
3221{ .mfi
3222 getf.exp exp_r = r // Extract signexp of r
3223 nop.f 999
3224 mov b0 = GR_SAVE_B0 // Restore return address
3225}
3226;;
3227
3228//
3229// Get N_fix_gr
3230//
3231{ .mfi
3232 mov N_fix_gr = r8
3233(p6) fcmp.gt.unc.s1 p6, p0 = r, NEGTWO_TO_NEG2
3234 mov ar.pfs = GR_SAVE_PFS // Restore pfs
3235}
3236;;
3237
3238{ .mbb
3239 nop.m 999
3240(p6) br.cond.spnt TANL_SMALL_R // Branch if |r| < 1/4
3241 br.cond.sptk TANL_NORMAL_R // Branch if 1/4 <= |r| < pi/4
3242}
3243;;
3244
3245LOCAL_LIBM_END(__libm_callout)
3246
3247.type __libm_pi_by_2_reduce#,@function
3248.global __libm_pi_by_2_reduce#