]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/ia64/fpu/s_cosl.S
(CFLAGS-tst-align.c): Add -mpreferred-stack-boundary=4.
[thirdparty/glibc.git] / sysdeps / ia64 / fpu / s_cosl.S
CommitLineData
8da2915d
UD
1.file "sincosl.s"
2
a334319f 3// Copyright (C) 2000, 2001, Intel Corporation
8da2915d 4// All rights reserved.
a334319f
UD
5//
6// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
7// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
aeb25823
AJ
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// * Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// * Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// * The name of Intel Corporation may not be used to endorse or promote
21// products derived from this software without specific prior written
22// permission.
a334319f
UD
23//
24// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
8da2915d 26// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
a334319f 27// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
8da2915d 28// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
a334319f
UD
29// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
30// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
31// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
8da2915d 32// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
a334319f
UD
33// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35//
8da2915d 36// Intel Corporation is the author of this code, and requests that all
a334319f
UD
37// problem reports or change requests be submitted to it directly at
38// http://developer.intel.com/opensource.
8da2915d 39//
a334319f 40// *********************************************************************
8da2915d 41//
a334319f
UD
42// History:
43// 2/02/2000 (hand-optimized)
44// 4/04/00 Unwind support added
8da2915d 45//
a334319f 46// *********************************************************************
8da2915d
UD
47//
48// Function: Combined sinl(x) and cosl(x), where
49//
50// sinl(x) = sine(x), for double-extended precision x values
51// cosl(x) = cosine(x), for double-extended precision x values
52//
a334319f 53// *********************************************************************
8da2915d
UD
54//
55// Resources Used:
56//
a334319f 57// Floating-Point Registers: f8 (Input and Return Value)
8da2915d
UD
58// f32-f99
59//
60// General Purpose Registers:
a334319f
UD
61// r32-r43
62// r44-r45 (Used to pass arguments to pi_by_2 reduce routine)
8da2915d
UD
63//
64// Predicate Registers: p6-p13
65//
a334319f 66// *********************************************************************
8da2915d
UD
67//
68// IEEE Special Conditions:
69//
70// Denormal fault raised on denormal inputs
71// Overflow exceptions do not occur
a334319f 72// Underflow exceptions raised when appropriate for sin
8da2915d
UD
73// (No specialized error handling for this routine)
74// Inexact raised when appropriate by algorithm
75//
76// sinl(SNaN) = QNaN
77// sinl(QNaN) = QNaN
a334319f 78// sinl(inf) = QNaN
8da2915d 79// sinl(+/-0) = +/-0
a334319f 80// cosl(inf) = QNaN
8da2915d
UD
81// cosl(SNaN) = QNaN
82// cosl(QNaN) = QNaN
83// cosl(0) = 1
a334319f
UD
84//
85// *********************************************************************
8da2915d
UD
86//
87// Mathematical Description
88// ========================
89//
a334319f
UD
90// The computation of FSIN and FCOS is best handled in one piece of
91// code. The main reason is that given any argument Arg, computation
92// of trigonometric functions first calculate N and an approximation
8da2915d
UD
93// to alpha where
94//
95// Arg = N pi/2 + alpha, |alpha| <= pi/4.
96//
97// Since
98//
99// cosl( Arg ) = sinl( (N+1) pi/2 + alpha ),
100//
a334319f
UD
101// therefore, the code for computing sine will produce cosine as long
102// as 1 is added to N immediately after the argument reduction
8da2915d
UD
103// process.
104//
105// Let M = N if sine
a334319f 106// N+1 if cosine.
8da2915d
UD
107//
108// Now, given
109//
110// Arg = M pi/2 + alpha, |alpha| <= pi/4,
111//
a334319f 112// let I = M mod 4, or I be the two lsb of M when M is represented
8da2915d
UD
113// as 2's complement. I = [i_0 i_1]. Then
114//
a334319f 115// sinl( Arg ) = (-1)^i_0 sinl( alpha ) if i_1 = 0,
8da2915d
UD
116// = (-1)^i_0 cosl( alpha ) if i_1 = 1.
117//
118// For example:
a334319f 119// if M = -1, I = 11
8da2915d 120// sin ((-pi/2 + alpha) = (-1) cos (alpha)
a334319f 121// if M = 0, I = 00
8da2915d 122// sin (alpha) = sin (alpha)
a334319f 123// if M = 1, I = 01
8da2915d 124// sin (pi/2 + alpha) = cos (alpha)
a334319f 125// if M = 2, I = 10
8da2915d 126// sin (pi + alpha) = (-1) sin (alpha)
a334319f 127// if M = 3, I = 11
8da2915d
UD
128// sin ((3/2)pi + alpha) = (-1) cos (alpha)
129//
a334319f 130// The value of alpha is obtained by argument reduction and
8da2915d
UD
131// represented by two working precision numbers r and c where
132//
133// alpha = r + c accurately.
134//
135// The reduction method is described in a previous write up.
a334319f
UD
136// The argument reduction scheme identifies 4 cases. For Cases 2
137// and 4, because |alpha| is small, sinl(r+c) and cosl(r+c) can be
138// computed very easily by 2 or 3 terms of the Taylor series
8da2915d
UD
139// expansion as follows:
140//
141// Case 2:
142// -------
143//
a334319f
UD
144// sinl(r + c) = r + c - r^3/6 accurately
145// cosl(r + c) = 1 - 2^(-67) accurately
8da2915d
UD
146//
147// Case 4:
148// -------
149//
a334319f
UD
150// sinl(r + c) = r + c - r^3/6 + r^5/120 accurately
151// cosl(r + c) = 1 - r^2/2 + r^4/24 accurately
8da2915d 152//
a334319f
UD
153// The only cases left are Cases 1 and 3 of the argument reduction
154// procedure. These two cases will be merged since after the
155// argument is reduced in either cases, we have the reduced argument
156// represented as r + c and that the magnitude |r + c| is not small
8da2915d
UD
157// enough to allow the usage of a very short approximation.
158//
159// The required calculation is either
160//
161// sinl(r + c) = sinl(r) + correction, or
162// cosl(r + c) = cosl(r) + correction.
163//
164// Specifically,
165//
a334319f
UD
166// sinl(r + c) = sinl(r) + c sin'(r) + O(c^2)
167// = sinl(r) + c cos (r) + O(c^2)
168// = sinl(r) + c(1 - r^2/2) accurately.
8da2915d
UD
169// Similarly,
170//
a334319f
UD
171// cosl(r + c) = cosl(r) - c sinl(r) + O(c^2)
172// = cosl(r) - c(r - r^3/6) accurately.
8da2915d 173//
a334319f 174// We therefore concentrate on accurately calculating sinl(r) and
8da2915d
UD
175// cosl(r) for a working-precision number r, |r| <= pi/4 to within
176// 0.1% or so.
177//
a334319f 178// The greatest challenge of this task is that the second terms of
8da2915d 179// the Taylor series
a334319f
UD
180//
181// r - r^3/3! + r^r/5! - ...
8da2915d
UD
182//
183// and
184//
a334319f 185// 1 - r^2/2! + r^4/4! - ...
8da2915d 186//
a334319f
UD
187// are not very small when |r| is close to pi/4 and the rounding
188// errors will be a concern if simple polynomial accumulation is
189// used. When |r| < 2^-3, however, the second terms will be small
190// enough (6 bits or so of right shift) that a normal Horner
191// recurrence suffices. Hence there are two cases that we consider
8da2915d
UD
192// in the accurate computation of sinl(r) and cosl(r), |r| <= pi/4.
193//
194// Case small_r: |r| < 2^(-3)
195// --------------------------
196//
197// Since Arg = M pi/4 + r + c accurately, and M mod 4 is [i_0 i_1],
198// we have
199//
a334319f
UD
200// sinl(Arg) = (-1)^i_0 * sinl(r + c) if i_1 = 0
201// = (-1)^i_0 * cosl(r + c) if i_1 = 1
8da2915d
UD
202//
203// can be accurately approximated by
204//
a334319f 205// sinl(Arg) = (-1)^i_0 * [sinl(r) + c] if i_1 = 0
8da2915d
UD
206// = (-1)^i_0 * [cosl(r) - c*r] if i_1 = 1
207//
a334319f 208// because |r| is small and thus the second terms in the correction
8da2915d
UD
209// are unneccessary.
210//
a334319f 211// Finally, sinl(r) and cosl(r) are approximated by polynomials of
8da2915d
UD
212// moderate lengths.
213//
214// sinl(r) = r + S_1 r^3 + S_2 r^5 + ... + S_5 r^11
215// cosl(r) = 1 + C_1 r^2 + C_2 r^4 + ... + C_5 r^10
216//
a334319f
UD
217// We can make use of predicates to selectively calculate
218// sinl(r) or cosl(r) based on i_1.
8da2915d
UD
219//
220// Case normal_r: 2^(-3) <= |r| <= pi/4
221// ------------------------------------
222//
223// This case is more likely than the previous one if one considers
224// r to be uniformly distributed in [-pi/4 pi/4]. Again,
a334319f
UD
225//
226// sinl(Arg) = (-1)^i_0 * sinl(r + c) if i_1 = 0
227// = (-1)^i_0 * cosl(r + c) if i_1 = 1.
8da2915d 228//
a334319f 229// Because |r| is now larger, we need one extra term in the
8da2915d
UD
230// correction. sinl(Arg) can be accurately approximated by
231//
232// sinl(Arg) = (-1)^i_0 * [sinl(r) + c(1-r^2/2)] if i_1 = 0
233// = (-1)^i_0 * [cosl(r) - c*r*(1 - r^2/6)] i_1 = 1.
234//
a334319f 235// Finally, sinl(r) and cosl(r) are approximated by polynomials of
8da2915d
UD
236// moderate lengths.
237//
a334319f
UD
238// sinl(r) = r + PP_1_hi r^3 + PP_1_lo r^3 +
239// PP_2 r^5 + ... + PP_8 r^17
8da2915d 240//
a334319f 241// cosl(r) = 1 + QQ_1 r^2 + QQ_2 r^4 + ... + QQ_8 r^16
8da2915d 242//
a334319f
UD
243// where PP_1_hi is only about 16 bits long and QQ_1 is -1/2.
244// The crux in accurate computation is to calculate
8da2915d
UD
245//
246// r + PP_1_hi r^3 or 1 + QQ_1 r^2
247//
a334319f
UD
248// accurately as two pieces: U_hi and U_lo. The way to achieve this
249// is to obtain r_hi as a 10 sig. bit number that approximates r to
8da2915d
UD
250// roughly 8 bits or so of accuracy. (One convenient way is
251//
252// r_hi := frcpa( frcpa( r ) ).)
253//
254// This way,
255//
a334319f
UD
256// r + PP_1_hi r^3 = r + PP_1_hi r_hi^3 +
257// PP_1_hi (r^3 - r_hi^3)
258// = [r + PP_1_hi r_hi^3] +
259// [PP_1_hi (r - r_hi)
260// (r^2 + r_hi r + r_hi^2) ]
261// = U_hi + U_lo
8da2915d
UD
262//
263// Since r_hi is only 10 bit long and PP_1_hi is only 16 bit long,
a334319f
UD
264// PP_1_hi * r_hi^3 is only at most 46 bit long and thus computed
265// exactly. Furthermore, r and PP_1_hi r_hi^3 are of opposite sign
266// and that there is no more than 8 bit shift off between r and
267// PP_1_hi * r_hi^3. Hence the sum, U_hi, is representable and thus
268// calculated without any error. Finally, the fact that
8da2915d 269//
a334319f 270// |U_lo| <= 2^(-8) |U_hi|
8da2915d 271//
a334319f 272// says that U_hi + U_lo is approximating r + PP_1_hi r^3 to roughly
8da2915d
UD
273// 8 extra bits of accuracy.
274//
275// Similarly,
276//
a334319f
UD
277// 1 + QQ_1 r^2 = [1 + QQ_1 r_hi^2] +
278// [QQ_1 (r - r_hi)(r + r_hi)]
279// = U_hi + U_lo.
280//
281// Summarizing, we calculate r_hi = frcpa( frcpa( r ) ).
8da2915d
UD
282//
283// If i_1 = 0, then
284//
285// U_hi := r + PP_1_hi * r_hi^3
286// U_lo := PP_1_hi * (r - r_hi) * (r^2 + r*r_hi + r_hi^2)
287// poly := PP_1_lo r^3 + PP_2 r^5 + ... + PP_8 r^17
288// correction := c * ( 1 + C_1 r^2 )
289//
290// Else ...i_1 = 1
291//
292// U_hi := 1 + QQ_1 * r_hi * r_hi
293// U_lo := QQ_1 * (r - r_hi) * (r + r_hi)
294// poly := QQ_2 * r^4 + QQ_3 * r^6 + ... + QQ_8 r^16
295// correction := -c * r * (1 + S_1 * r^2)
296//
297// End
298//
299// Finally,
a334319f
UD
300//
301// V := poly + ( U_lo + correction )
8da2915d
UD
302//
303// / U_hi + V if i_0 = 0
a334319f 304// result := |
8da2915d
UD
305// \ (-U_hi) - V if i_0 = 1
306//
a334319f
UD
307// It is important that in the last step, negation of U_hi is
308// performed prior to the subtraction which is to be performed in
309// the user-set rounding mode.
8da2915d
UD
310//
311//
312// Algorithmic Description
313// =======================
314//
a334319f
UD
315// The argument reduction algorithm is tightly integrated into FSIN
316// and FCOS which share the same code. The following is complete and
317// self-contained. The argument reduction description given
8da2915d
UD
318// previously is repeated below.
319//
320//
a334319f 321// Step 0. Initialization.
8da2915d
UD
322//
323// If FSIN is invoked, set N_inc := 0; else if FCOS is invoked,
324// set N_inc := 1.
325//
326// Step 1. Check for exceptional and special cases.
327//
a334319f 328// * If Arg is +-0, +-inf, NaN, NaT, go to Step 10 for special
8da2915d
UD
329// handling.
330// * If |Arg| < 2^24, go to Step 2 for reduction of moderate
331// arguments. This is the most likely case.
332// * If |Arg| < 2^63, go to Step 8 for pre-reduction of large
333// arguments.
334// * If |Arg| >= 2^63, go to Step 10 for special handling.
335//
336// Step 2. Reduction of moderate arguments.
337//
a334319f
UD
338// If |Arg| < pi/4 ...quick branch
339// N_fix := N_inc (integer)
8da2915d
UD
340// r := Arg
341// c := 0.0
342// Branch to Step 4, Case_1_complete
a334319f
UD
343// Else ...cf. argument reduction
344// N := Arg * two_by_PI (fp)
345// N_fix := fcvt.fx( N ) (int)
8da2915d
UD
346// N := fcvt.xf( N_fix )
347// N_fix := N_fix + N_inc
a334319f
UD
348// s := Arg - N * P_1 (first piece of pi/2)
349// w := -N * P_2 (second piece of pi/2)
8da2915d
UD
350//
351// If |s| >= 2^(-33)
352// go to Step 3, Case_1_reduce
353// Else
354// go to Step 7, Case_2_reduce
355// Endif
356// Endif
357//
358// Step 3. Case_1_reduce.
359//
360// r := s + w
a334319f
UD
361// c := (s - r) + w ...observe order
362//
8da2915d
UD
363// Step 4. Case_1_complete
364//
365// ...At this point, the reduced argument alpha is
366// ...accurately represented as r + c.
367// If |r| < 2^(-3), go to Step 6, small_r.
368//
369// Step 5. Normal_r.
370//
371// Let [i_0 i_1] by the 2 lsb of N_fix.
372// FR_rsq := r * r
373// r_hi := frcpa( frcpa( r ) )
374// r_lo := r - r_hi
375//
376// If i_1 = 0, then
377// poly := r*FR_rsq*(PP_1_lo + FR_rsq*(PP_2 + ... FR_rsq*PP_8))
a334319f 378// U_hi := r + PP_1_hi*r_hi*r_hi*r_hi ...any order
8da2915d 379// U_lo := PP_1_hi*r_lo*(r*r + r*r_hi + r_hi*r_hi)
a334319f 380// correction := c + c*C_1*FR_rsq ...any order
8da2915d
UD
381// Else
382// poly := FR_rsq*FR_rsq*(QQ_2 + FR_rsq*(QQ_3 + ... + FR_rsq*QQ_8))
a334319f 383// U_hi := 1 + QQ_1 * r_hi * r_hi ...any order
8da2915d 384// U_lo := QQ_1 * r_lo * (r + r_hi)
a334319f 385// correction := -c*(r + S_1*FR_rsq*r) ...any order
8da2915d
UD
386// Endif
387//
a334319f 388// V := poly + (U_lo + correction) ...observe order
8da2915d
UD
389//
390// result := (i_0 == 0? 1.0 : -1.0)
391//
392// Last instruction in user-set rounding mode
393//
394// result := (i_0 == 0? result*U_hi + V :
395// result*U_hi - V)
396//
397// Return
398//
399// Step 6. Small_r.
a334319f 400//
8da2915d
UD
401// ...Use flush to zero mode without causing exception
402// Let [i_0 i_1] be the two lsb of N_fix.
403//
404// FR_rsq := r * r
405//
406// If i_1 = 0 then
407// z := FR_rsq*FR_rsq; z := FR_rsq*z *r
408// poly_lo := S_3 + FR_rsq*(S_4 + FR_rsq*S_5)
409// poly_hi := r*FR_rsq*(S_1 + FR_rsq*S_2)
410// correction := c
411// result := r
412// Else
413// z := FR_rsq*FR_rsq; z := FR_rsq*z
414// poly_lo := C_3 + FR_rsq*(C_4 + FR_rsq*C_5)
a334319f 415// poly_hi := FR_rsq*(C_1 + FR_rsq*C_2)
8da2915d
UD
416// correction := -c*r
417// result := 1
418// Endif
419//
420// poly := poly_hi + (z * poly_lo + correction)
421//
422// If i_0 = 1, result := -result
423//
424// Last operation. Perform in user-set rounding mode
425//
426// result := (i_0 == 0? result + poly :
427// result - poly )
428// Return
429//
430// Step 7. Case_2_reduce.
431//
a334319f 432// ...Refer to the write up for argument reduction for
8da2915d
UD
433// ...rationale. The reduction algorithm below is taken from
434// ...argument reduction description and integrated this.
435//
436// w := N*P_3
a334319f
UD
437// U_1 := N*P_2 + w ...FMA
438// U_2 := (N*P_2 - U_1) + w ...2 FMA
8da2915d 439// ...U_1 + U_2 is N*(P_2+P_3) accurately
a334319f 440//
8da2915d
UD
441// r := s - U_1
442// c := ( (s - r) - U_1 ) - U_2
443//
444// ...The mathematical sum r + c approximates the reduced
445// ...argument accurately. Note that although compared to
446// ...Case 1, this case requires much more work to reduce
447// ...the argument, the subsequent calculation needed for
448// ...any of the trigonometric function is very little because
a334319f 449// ...|alpha| < 1.01*2^(-33) and thus two terms of the
8da2915d
UD
450// ...Taylor series expansion suffices.
451//
452// If i_1 = 0 then
a334319f 453// poly := c + S_1 * r * r * r ...any order
8da2915d
UD
454// result := r
455// Else
456// poly := -2^(-67)
457// result := 1.0
458// Endif
a334319f 459//
8da2915d
UD
460// If i_0 = 1, result := -result
461//
462// Last operation. Perform in user-set rounding mode
463//
464// result := (i_0 == 0? result + poly :
465// result - poly )
a334319f 466//
8da2915d
UD
467// Return
468//
a334319f 469//
8da2915d 470// Step 8. Pre-reduction of large arguments.
a334319f 471//
8da2915d
UD
472// ...Again, the following reduction procedure was described
473// ...in the separate write up for argument reduction, which
474// ...is tightly integrated here.
475
476// N_0 := Arg * Inv_P_0
477// N_0_fix := fcvt.fx( N_0 )
478// N_0 := fcvt.xf( N_0_fix)
a334319f 479
8da2915d
UD
480// Arg' := Arg - N_0 * P_0
481// w := N_0 * d_1
482// N := Arg' * two_by_PI
483// N_fix := fcvt.fx( N )
484// N := fcvt.xf( N_fix )
a334319f 485// N_fix := N_fix + N_inc
8da2915d
UD
486//
487// s := Arg' - N * P_1
488// w := w - N * P_2
489//
490// If |s| >= 2^(-14)
491// go to Step 3
492// Else
493// go to Step 9
494// Endif
495//
496// Step 9. Case_4_reduce.
a334319f 497//
8da2915d 498// ...first obtain N_0*d_1 and -N*P_2 accurately
a334319f
UD
499// U_hi := N_0 * d_1 V_hi := -N*P_2
500// U_lo := N_0 * d_1 - U_hi V_lo := -N*P_2 - U_hi ...FMAs
8da2915d
UD
501//
502// ...compute the contribution from N_0*d_1 and -N*P_3
503// w := -N*P_3
504// w := w + N_0*d_2
a334319f 505// t := U_lo + V_lo + w ...any order
8da2915d
UD
506//
507// ...at this point, the mathematical value
508// ...s + U_hi + V_hi + t approximates the true reduced argument
509// ...accurately. Just need to compute this accurately.
510//
511// ...Calculate U_hi + V_hi accurately:
512// A := U_hi + V_hi
513// if |U_hi| >= |V_hi| then
514// a := (U_hi - A) + V_hi
515// else
516// a := (V_hi - A) + U_hi
517// endif
518// ...order in computing "a" must be observed. This branch is
519// ...best implemented by predicates.
a334319f 520// ...A + a is U_hi + V_hi accurately. Moreover, "a" is
8da2915d
UD
521// ...much smaller than A: |a| <= (1/2)ulp(A).
522//
523// ...Just need to calculate s + A + a + t
a334319f
UD
524// C_hi := s + A t := t + a
525// C_lo := (s - C_hi) + A
8da2915d
UD
526// C_lo := C_lo + t
527//
528// ...Final steps for reduction
529// r := C_hi + C_lo
530// c := (C_hi - r) + C_lo
531//
532// ...At this point, we have r and c
533// ...And all we need is a couple of terms of the corresponding
534// ...Taylor series.
535//
536// If i_1 = 0
537// poly := c + r*FR_rsq*(S_1 + FR_rsq*S_2)
538// result := r
539// Else
540// poly := FR_rsq*(C_1 + FR_rsq*C_2)
541// result := 1
542// Endif
543//
544// If i_0 = 1, result := -result
545//
546// Last operation. Perform in user-set rounding mode
547//
548// result := (i_0 == 0? result + poly :
549// result - poly )
550// Return
a334319f 551//
8da2915d
UD
552// Large Arguments: For arguments above 2**63, a Payne-Hanek
553// style argument reduction is used and pi_by_2 reduce is called.
554//
555
a334319f
UD
556#include "libm_support.h"
557
558#ifdef _LIBC
559.rodata
560#else
561.data
562#endif
563.align 64
564
565FSINCOSL_CONSTANTS:
566ASM_TYPE_DIRECTIVE(FSINCOSL_CONSTANTS,@object)
567data4 0x4B800000, 0xCB800000, 0x00000000,0x00000000 // two**24, -two**24
568data4 0x4E44152A, 0xA2F9836E, 0x00003FFE,0x00000000 // Inv_pi_by_2
569data4 0xCE81B9F1, 0xC84D32B0, 0x00004016,0x00000000 // P_0
570data4 0x2168C235, 0xC90FDAA2, 0x00003FFF,0x00000000 // P_1
571data4 0xFC8F8CBB, 0xECE675D1, 0x0000BFBD,0x00000000 // P_2
572data4 0xACC19C60, 0xB7ED8FBB, 0x0000BF7C,0x00000000 // P_3
573data4 0x5F000000, 0xDF000000, 0x00000000,0x00000000 // two_to_63, -two_to_63
574data4 0x6EC6B45A, 0xA397E504, 0x00003FE7,0x00000000 // Inv_P_0
575data4 0xDBD171A1, 0x8D848E89, 0x0000BFBF,0x00000000 // d_1
576data4 0x18A66F8E, 0xD5394C36, 0x0000BF7C,0x00000000 // d_2
577data4 0x2168C234, 0xC90FDAA2, 0x00003FFE,0x00000000 // pi_by_4
578data4 0x2168C234, 0xC90FDAA2, 0x0000BFFE,0x00000000 // neg_pi_by_4
579data4 0x3E000000, 0xBE000000, 0x00000000,0x00000000 // two**-3, -two**-3
580data4 0x2F000000, 0xAF000000, 0x9E000000,0x00000000 // two**-33, -two**-33, -two**-67
581data4 0xA21C0BC9, 0xCC8ABEBC, 0x00003FCE,0x00000000 // PP_8
582data4 0x720221DA, 0xD7468A05, 0x0000BFD6,0x00000000 // PP_7
583data4 0x640AD517, 0xB092382F, 0x00003FDE,0x00000000 // PP_6
584data4 0xD1EB75A4, 0xD7322B47, 0x0000BFE5,0x00000000 // PP_5
585data4 0xFFFFFFFE, 0xFFFFFFFF, 0x0000BFFD,0x00000000 // C_1
586data4 0x00000000, 0xAAAA0000, 0x0000BFFC,0x00000000 // PP_1_hi
587data4 0xBAF69EEA, 0xB8EF1D2A, 0x00003FEC,0x00000000 // PP_4
588data4 0x0D03BB69, 0xD00D00D0, 0x0000BFF2,0x00000000 // PP_3
589data4 0x88888962, 0x88888888, 0x00003FF8,0x00000000 // PP_2
590data4 0xAAAB0000, 0xAAAAAAAA, 0x0000BFEC,0x00000000 // PP_1_lo
591data4 0xC2B0FE52, 0xD56232EF, 0x00003FD2,0x00000000 // QQ_8
592data4 0x2B48DCA6, 0xC9C99ABA, 0x0000BFDA,0x00000000 // QQ_7
593data4 0x9C716658, 0x8F76C650, 0x00003FE2,0x00000000 // QQ_6
594data4 0xFDA8D0FC, 0x93F27DBA, 0x0000BFE9,0x00000000 // QQ_5
595data4 0xAAAAAAAA, 0xAAAAAAAA, 0x0000BFFC,0x00000000 // S_1
596data4 0x00000000, 0x80000000, 0x0000BFFE,0x00000000 // QQ_1
597data4 0x0C6E5041, 0xD00D00D0, 0x00003FEF,0x00000000 // QQ_4
598data4 0x0B607F60, 0xB60B60B6, 0x0000BFF5,0x00000000 // QQ_3
599data4 0xAAAAAA9B, 0xAAAAAAAA, 0x00003FFA,0x00000000 // QQ_2
600data4 0xFFFFFFFE, 0xFFFFFFFF, 0x0000BFFD,0x00000000 // C_1
601data4 0xAAAA719F, 0xAAAAAAAA, 0x00003FFA,0x00000000 // C_2
602data4 0x0356F994, 0xB60B60B6, 0x0000BFF5,0x00000000 // C_3
603data4 0xB2385EA9, 0xD00CFFD5, 0x00003FEF,0x00000000 // C_4
604data4 0x292A14CD, 0x93E4BD18, 0x0000BFE9,0x00000000 // C_5
605data4 0xAAAAAAAA, 0xAAAAAAAA, 0x0000BFFC,0x00000000 // S_1
606data4 0x888868DB, 0x88888888, 0x00003FF8,0x00000000 // S_2
607data4 0x055EFD4B, 0xD00D00D0, 0x0000BFF2,0x00000000 // S_3
608data4 0x839730B9, 0xB8EF1C5D, 0x00003FEC,0x00000000 // S_4
609data4 0xE5B3F492, 0xD71EA3A4, 0x0000BFE5,0x00000000 // S_5
610data4 0x38800000, 0xB8800000, 0x00000000 // two**-14, -two**-14
611ASM_SIZE_DIRECTIVE(FSINCOSL_CONSTANTS)
612
613FR_Input_X = f8
614FR_Neg_Two_to_M3 = f32
615FR_Two_to_63 = f32
616FR_Two_to_24 = f33
617FR_Pi_by_4 = f33
618FR_Two_to_M14 = f34
619FR_Two_to_M33 = f35
620FR_Neg_Two_to_24 = f36
621FR_Neg_Pi_by_4 = f36
622FR_Neg_Two_to_M14 = f37
623FR_Neg_Two_to_M33 = f38
624FR_Neg_Two_to_M67 = f39
625FR_Inv_pi_by_2 = f40
626FR_N_float = f41
627FR_N_fix = f42
628FR_P_1 = f43
629FR_P_2 = f44
630FR_P_3 = f45
631FR_s = f46
632FR_w = f47
633FR_c = f48
634FR_r = f49
635FR_Z = f50
636FR_A = f51
637FR_a = f52
638FR_t = f53
639FR_U_1 = f54
640FR_U_2 = f55
641FR_C_1 = f56
642FR_C_2 = f57
643FR_C_3 = f58
644FR_C_4 = f59
645FR_C_5 = f60
646FR_S_1 = f61
647FR_S_2 = f62
648FR_S_3 = f63
649FR_S_4 = f64
650FR_S_5 = f65
651FR_poly_hi = f66
652FR_poly_lo = f67
653FR_r_hi = f68
654FR_r_lo = f69
655FR_rsq = f70
656FR_r_cubed = f71
657FR_C_hi = f72
658FR_N_0 = f73
659FR_d_1 = f74
660FR_V = f75
661FR_V_hi = f75
662FR_V_lo = f76
663FR_U_hi = f77
664FR_U_lo = f78
665FR_U_hiabs = f79
666FR_V_hiabs = f80
667FR_PP_8 = f81
668FR_QQ_8 = f81
669FR_PP_7 = f82
670FR_QQ_7 = f82
671FR_PP_6 = f83
672FR_QQ_6 = f83
673FR_PP_5 = f84
674FR_QQ_5 = f84
675FR_PP_4 = f85
676FR_QQ_4 = f85
677FR_PP_3 = f86
678FR_QQ_3 = f86
679FR_PP_2 = f87
680FR_QQ_2 = f87
681FR_QQ_1 = f88
682FR_N_0_fix = f89
683FR_Inv_P_0 = f90
684FR_corr = f91
685FR_poly = f92
686FR_d_2 = f93
687FR_Two_to_M3 = f94
688FR_Neg_Two_to_63 = f94
689FR_P_0 = f95
690FR_C_lo = f96
691FR_PP_1 = f97
692FR_PP_1_lo = f98
693FR_ArgPrime = f99
694
695GR_Table_Base = r32
696GR_Table_Base1 = r33
697GR_i_0 = r34
698GR_i_1 = r35
699GR_N_Inc = r36
700GR_Sin_or_Cos = r37
8da2915d
UD
701
702// Added for unwind support
703
704GR_SAVE_B0 = r39
705GR_SAVE_GP = r40
706GR_SAVE_PFS = r41
707
708
a334319f
UD
709.global sinl#
710.global cosl#
711#ifdef _LIBC
712.global __sinl#
713.global __cosl#
714#endif
0ecb606c 715
a334319f
UD
716.section .text
717.proc sinl#
718#ifdef _LIBC
719.proc __sinl#
720#endif
721.align 64
722sinl:
723#ifdef _LIBC
724__sinl:
725#endif
8da2915d 726{ .mlx
a334319f
UD
727alloc GR_Table_Base = ar.pfs,0,12,2,0
728(p0) movl GR_Sin_or_Cos = 0x0 ;;
8da2915d 729}
a334319f
UD
730
731{ .mmi
732 nop.m 999
733(p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp
734 nop.i 999
8da2915d
UD
735}
736;;
737
a334319f
UD
738{ .mmb
739 ld8 GR_Table_Base = [GR_Table_Base]
8da2915d 740 nop.m 999
a334319f 741(p0) br.cond.sptk L(SINCOSL_CONTINUE) ;;
8da2915d
UD
742}
743;;
744
745
a334319f
UD
746.endp sinl#
747ASM_SIZE_DIRECTIVE(sinl#)
748
749.section .text
750.proc cosl#
751cosl:
752#ifdef _LIBC
753.proc __cosl#
754__cosl:
755#endif
8da2915d 756{ .mlx
a334319f
UD
757alloc GR_Table_Base= ar.pfs,0,12,2,0
758(p0) movl GR_Sin_or_Cos = 0x1 ;;
8da2915d
UD
759}
760;;
761
a334319f 762{ .mmi
8da2915d 763 nop.m 999
a334319f 764(p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp
8da2915d
UD
765 nop.i 999
766}
767;;
768
a334319f
UD
769{ .mmb
770 ld8 GR_Table_Base = [GR_Table_Base]
771 nop.m 999
772 nop.b 999
8da2915d
UD
773}
774;;
775
776
a334319f
UD
777
778//
779// Load Table Address
780//
781
782L(SINCOSL_CONTINUE):
783{ .mmi
784(p0) add GR_Table_Base1 = 96, GR_Table_Base
785(p0) ldfs FR_Two_to_24 = [GR_Table_Base], 4
786// GR_Sin_or_Cos denotes
787(p0) mov r39 = b0 ;;
8da2915d 788}
a334319f
UD
789{ .mmi
790 nop.m 0
791//
792// Load 2**24, load 2**63.
793//
794(p0) ldfs FR_Neg_Two_to_24 = [GR_Table_Base], 12
795 nop.i 0
8da2915d
UD
796}
797{ .mfi
a334319f
UD
798(p0) ldfs FR_Two_to_63 = [GR_Table_Base1], 4
799//
800// Check for unnormals - unsupported operands. We do not want
801// to generate denormal exception
802// Check for NatVals, QNaNs, SNaNs, +/-Infs
803// Check for EM unsupporteds
804// Check for Zero
805//
806(p0) fclass.m.unc p6, p0 = FR_Input_X, 0x1E3
807 nop.i 0
808};;
809{ .mmf
810 nop.m 999
811(p0) ldfs FR_Neg_Two_to_63 = [GR_Table_Base1], 12
812(p0) fclass.nm.unc p8, p0 = FR_Input_X, 0x1FF
8da2915d 813}
a334319f
UD
814{ .mfb
815 nop.m 999
816(p0) fclass.m.unc p10, p0 = FR_Input_X, 0x007
817(p6) br.cond.spnt L(SINCOSL_SPECIAL) ;;
8da2915d 818}
a334319f
UD
819{ .mib
820 nop.m 999
821 nop.i 999
822(p8) br.cond.spnt L(SINCOSL_SPECIAL) ;;
8da2915d 823}
0ecb606c 824{ .mib
a334319f
UD
825 nop.m 999
826 nop.i 999
827//
828// Branch if +/- NaN, Inf.
829// Load -2**24, load -2**63.
830//
831(p10) br.cond.spnt L(SINCOSL_ZERO) ;;
8da2915d 832}
a334319f
UD
833{ .mmb
834(p0) ldfe FR_Inv_pi_by_2 = [GR_Table_Base], 16
835(p0) ldfe FR_Inv_P_0 = [GR_Table_Base1], 16
836 nop.b 999 ;;
8da2915d 837}
a334319f
UD
838{ .mmb
839(p0) ldfe FR_d_1 = [GR_Table_Base1], 16
840//
841// Raise possible denormal operand flag with useful fcmp
842// Is x <= -2**63
843// Load Inv_P_0 for pre-reduction
844// Load Inv_pi_by_2
845//
846(p0) ldfe FR_P_0 = [GR_Table_Base], 16
847 nop.b 999 ;;
8da2915d 848}
a334319f
UD
849{ .mmb
850(p0) ldfe FR_d_2 = [GR_Table_Base1], 16
851//
852// Load P_0
853// Load d_1
854// Is x >= 2**63
855// Is x <= -2**24?
856//
857(p0) ldfe FR_P_1 = [GR_Table_Base], 16
858 nop.b 999 ;;
8da2915d 859}
a334319f
UD
860//
861// Load P_1
862// Load d_2
863// Is x >= 2**24?
864//
0ecb606c 865{ .mfi
a334319f
UD
866(p0) ldfe FR_P_2 = [GR_Table_Base], 16
867(p0) fcmp.le.unc.s1 p7, p8 = FR_Input_X, FR_Neg_Two_to_24
868 nop.i 999 ;;
8da2915d 869}
a334319f
UD
870{ .mbb
871(p0) ldfe FR_P_3 = [GR_Table_Base], 16
872 nop.b 999
873 nop.b 999 ;;
8da2915d
UD
874}
875{ .mfi
a334319f
UD
876 nop.m 999
877(p8) fcmp.ge.s1 p7, p0 = FR_Input_X, FR_Two_to_24
878 nop.i 999
8da2915d 879}
0ecb606c 880{ .mfi
a334319f
UD
881(p0) ldfe FR_Pi_by_4 = [GR_Table_Base1], 16
882//
883// Branch if +/- zero.
884// Decide about the paths to take:
885// If -2**24 < FR_Input_X < 2**24 - CASE 1 OR 2
886// OTHERWISE - CASE 3 OR 4
887//
888(p0) fcmp.le.unc.s0 p10, p11 = FR_Input_X, FR_Neg_Two_to_63
889 nop.i 999 ;;
8da2915d 890}
a334319f
UD
891{ .mmi
892(p0) ldfe FR_Neg_Pi_by_4 = [GR_Table_Base1], 16 ;;
893(p0) ldfs FR_Two_to_M3 = [GR_Table_Base1], 4
894 nop.i 999
8da2915d 895}
0ecb606c 896{ .mfi
a334319f
UD
897 nop.m 999
898(p11) fcmp.ge.s1 p10, p0 = FR_Input_X, FR_Two_to_63
899 nop.i 999 ;;
8da2915d
UD
900}
901{ .mib
a334319f
UD
902(p0) ldfs FR_Neg_Two_to_M3 = [GR_Table_Base1], 12
903 nop.i 999
904//
905// Load P_2
906// Load P_3
907// Load pi_by_4
908// Load neg_pi_by_4
909// Load 2**(-3)
910// Load -2**(-3).
911//
912(p10) br.cond.spnt L(SINCOSL_ARG_TOO_LARGE) ;;
0ecb606c 913}
a334319f
UD
914{ .mib
915 nop.m 999
916 nop.i 999
917//
918// Branch out if x >= 2**63. Use Payne-Hanek Reduction
919//
920(p7) br.cond.spnt L(SINCOSL_LARGER_ARG) ;;
8da2915d
UD
921}
922{ .mfi
a334319f
UD
923 nop.m 999
924//
925// Branch if Arg <= -2**24 or Arg >= 2**24 and use pre-reduction.
926//
927(p0) fma.s1 FR_N_float = FR_Input_X, FR_Inv_pi_by_2, f0
928 nop.i 999 ;;
8da2915d
UD
929}
930{ .mfi
a334319f
UD
931 nop.m 999
932(p0) fcmp.lt.unc.s1 p6, p7 = FR_Input_X, FR_Pi_by_4
933 nop.i 999 ;;
0ecb606c 934}
a334319f
UD
935{ .mfi
936 nop.m 999
937//
938// Select the case when |Arg| < pi/4
939// Else Select the case when |Arg| >= pi/4
940//
941(p0) fcvt.fx.s1 FR_N_fix = FR_N_float
942 nop.i 999 ;;
0ecb606c 943}
a334319f
UD
944{ .mfi
945 nop.m 999
8da2915d
UD
946//
947// N = Arg * 2/pi
948// Check if Arg < pi/4
949//
a334319f
UD
950(p6) fcmp.gt.s1 p6, p7 = FR_Input_X, FR_Neg_Pi_by_4
951 nop.i 999 ;;
952}
8da2915d
UD
953//
954// Case 2: Convert integer N_fix back to normalized floating-point value.
955// Case 1: p8 is only affected when p6 is set
956//
a334319f
UD
957{ .mfi
958(p7) ldfs FR_Two_to_M33 = [GR_Table_Base1], 4
8da2915d
UD
959//
960// Grab the integer part of N and call it N_fix
961//
a334319f
UD
962(p6) fmerge.se FR_r = FR_Input_X, FR_Input_X
963// If |x| < pi/4, r = x and c = 0
8da2915d 964// lf |x| < pi/4, is x < 2**(-3).
a334319f 965// r = Arg
8da2915d 966// c = 0
a334319f 967(p6) mov GR_N_Inc = GR_Sin_or_Cos ;;
8da2915d 968}
a334319f
UD
969{ .mmf
970 nop.m 999
971(p7) ldfs FR_Neg_Two_to_M33 = [GR_Table_Base1], 4
972(p6) fmerge.se FR_c = f0, f0
973}
974{ .mfi
975 nop.m 999
976(p6) fcmp.lt.unc.s1 p8, p9 = FR_Input_X, FR_Two_to_M3
977 nop.i 999 ;;
978}
979{ .mfi
980 nop.m 999
8da2915d
UD
981//
982// lf |x| < pi/4, is -2**(-3)< x < 2**(-3) - set p8.
a334319f
UD
983// If |x| >= pi/4,
984// Create the right N for |x| < pi/4 and otherwise
8da2915d
UD
985// Case 2: Place integer part of N in GP register
986//
a334319f
UD
987(p7) fcvt.xf FR_N_float = FR_N_fix
988 nop.i 999 ;;
8da2915d 989}
a334319f
UD
990{ .mmf
991 nop.m 999
992(p7) getf.sig GR_N_Inc = FR_N_fix
993(p8) fcmp.gt.s1 p8, p0 = FR_Input_X, FR_Neg_Two_to_M3 ;;
8da2915d 994}
a334319f
UD
995{ .mib
996 nop.m 999
997 nop.i 999
998//
999// Load 2**(-33), -2**(-33)
1000//
1001(p8) br.cond.spnt L(SINCOSL_SMALL_R) ;;
8da2915d 1002}
a334319f
UD
1003{ .mib
1004 nop.m 999
1005 nop.i 999
1006(p6) br.cond.sptk L(SINCOSL_NORMAL_R) ;;
8da2915d 1007}
a334319f
UD
1008//
1009// if |x| < pi/4, branch based on |x| < 2**(-3) or otherwise.
1010//
1011//
1012// In this branch, |x| >= pi/4.
1013//
8da2915d 1014{ .mfi
a334319f
UD
1015(p0) ldfs FR_Neg_Two_to_M67 = [GR_Table_Base1], 8
1016//
1017// Load -2**(-67)
1018//
1019(p0) fnma.s1 FR_s = FR_N_float, FR_P_1, FR_Input_X
1020//
1021// w = N * P_2
1022// s = -N * P_1 + Arg
1023//
1024(p0) add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos
8da2915d
UD
1025}
1026{ .mfi
a334319f
UD
1027 nop.m 999
1028(p0) fma.s1 FR_w = FR_N_float, FR_P_2, f0
1029 nop.i 999 ;;
8da2915d
UD
1030}
1031{ .mfi
a334319f
UD
1032 nop.m 999
1033//
1034// Adjust N_fix by N_inc to determine whether sine or
1035// cosine is being calculated
1036//
1037(p0) fcmp.lt.unc.s1 p7, p6 = FR_s, FR_Two_to_M33
1038 nop.i 999 ;;
8da2915d
UD
1039}
1040{ .mfi
a334319f
UD
1041 nop.m 999
1042(p7) fcmp.gt.s1 p7, p6 = FR_s, FR_Neg_Two_to_M33
1043 nop.i 999 ;;
8da2915d
UD
1044}
1045{ .mfi
a334319f
UD
1046 nop.m 999
1047// Remember x >= pi/4.
1048// Is s <= -2**(-33) or s >= 2**(-33) (p6)
1049// or -2**(-33) < s < 2**(-33) (p7)
1050(p6) fms.s1 FR_r = FR_s, f1, FR_w
1051 nop.i 999
8da2915d
UD
1052}
1053{ .mfi
a334319f
UD
1054 nop.m 999
1055(p7) fma.s1 FR_w = FR_N_float, FR_P_3, f0
1056 nop.i 999 ;;
8da2915d
UD
1057}
1058{ .mfi
a334319f
UD
1059 nop.m 999
1060(p7) fma.s1 FR_U_1 = FR_N_float, FR_P_2, FR_w
1061 nop.i 999
0ecb606c 1062}
0ecb606c 1063{ .mfi
a334319f
UD
1064 nop.m 999
1065(p6) fms.s1 FR_c = FR_s, f1, FR_r
1066 nop.i 999 ;;
0ecb606c 1067}
0ecb606c 1068{ .mfi
a334319f
UD
1069 nop.m 999
1070//
1071// For big s: r = s - w: No futher reduction is necessary
1072// For small s: w = N * P_3 (change sign) More reduction
1073//
1074(p6) fcmp.lt.unc.s1 p8, p9 = FR_r, FR_Two_to_M3
1075 nop.i 999 ;;
8da2915d
UD
1076}
1077{ .mfi
a334319f
UD
1078 nop.m 999
1079(p8) fcmp.gt.s1 p8, p9 = FR_r, FR_Neg_Two_to_M3
1080 nop.i 999 ;;
8da2915d
UD
1081}
1082{ .mfi
a334319f 1083 nop.m 999
8da2915d 1084(p7) fms.s1 FR_r = FR_s, f1, FR_U_1
a334319f 1085 nop.i 999
0ecb606c 1086}
a334319f
UD
1087{ .mfb
1088 nop.m 999
8da2915d
UD
1089//
1090// For big s: Is |r| < 2**(-3)?
1091// For big s: c = S - r
1092// For small s: U_1 = N * P_2 + w
1093//
1094// If p8 is set, prepare to branch to Small_R.
1095// If p9 is set, prepare to branch to Normal_R.
1096// For big s, r is complete here.
1097//
a334319f
UD
1098(p6) fms.s1 FR_c = FR_c, f1, FR_w
1099//
8da2915d
UD
1100// For big s: c = c + w (w has not been negated.)
1101// For small s: r = S - U_1
1102//
a334319f 1103(p8) br.cond.spnt L(SINCOSL_SMALL_R) ;;
8da2915d 1104}
a334319f
UD
1105{ .mib
1106 nop.m 999
1107 nop.i 999
1108(p9) br.cond.sptk L(SINCOSL_NORMAL_R) ;;
8da2915d 1109}
a334319f
UD
1110{ .mfi
1111(p7) add GR_Table_Base1 = 224, GR_Table_Base1
8da2915d 1112//
a334319f 1113// Branch to SINCOSL_SMALL_R or SINCOSL_NORMAL_R
8da2915d 1114//
a334319f
UD
1115(p7) fms.s1 FR_U_2 = FR_N_float, FR_P_2, FR_U_1
1116//
8da2915d
UD
1117// c = S - U_1
1118// r = S_1 * r
1119//
1120//
a334319f 1121(p7) extr.u GR_i_1 = GR_N_Inc, 0, 1 ;;
8da2915d
UD
1122}
1123{ .mmi
a334319f 1124 nop.m 999
8da2915d
UD
1125//
1126// Get [i_0,i_1] - two lsb of N_fix_gr.
1127// Do dummy fmpy so inexact is always set.
1128//
a334319f
UD
1129(p7) cmp.eq.unc p9, p10 = 0x0, GR_i_1
1130(p7) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;;
8da2915d 1131}
a334319f 1132//
8da2915d
UD
1133// For small s: U_2 = N * P_2 - U_1
1134// S_1 stored constant - grab the one stored with the
1135// coefficients.
a334319f 1136//
8da2915d 1137{ .mfi
a334319f 1138(p7) ldfe FR_S_1 = [GR_Table_Base1], 16
8da2915d
UD
1139//
1140// Check if i_1 and i_0 != 0
1141//
a334319f
UD
1142(p10) fma.s1 FR_poly = f0, f1, FR_Neg_Two_to_M67
1143(p7) cmp.eq.unc p11, p12 = 0x0, GR_i_0 ;;
8da2915d
UD
1144}
1145{ .mfi
a334319f
UD
1146 nop.m 999
1147(p7) fms.s1 FR_s = FR_s, f1, FR_r
1148 nop.i 999
8da2915d
UD
1149}
1150{ .mfi
a334319f
UD
1151 nop.m 999
1152//
8da2915d
UD
1153// S = S - r
1154// U_2 = U_2 + w
1155// load S_1
1156//
a334319f
UD
1157(p7) fma.s1 FR_rsq = FR_r, FR_r, f0
1158 nop.i 999 ;;
8da2915d
UD
1159}
1160{ .mfi
a334319f
UD
1161 nop.m 999
1162(p7) fma.s1 FR_U_2 = FR_U_2, f1, FR_w
1163 nop.i 999
8da2915d
UD
1164}
1165{ .mfi
a334319f
UD
1166 nop.m 999
1167(p7) fmerge.se FR_Input_X = FR_r, FR_r
1168 nop.i 999 ;;
8da2915d
UD
1169}
1170{ .mfi
a334319f
UD
1171 nop.m 999
1172(p10) fma.s1 FR_Input_X = f0, f1, f1
1173 nop.i 999 ;;
8da2915d
UD
1174}
1175{ .mfi
a334319f
UD
1176 nop.m 999
1177//
8da2915d
UD
1178// FR_rsq = r * r
1179// Save r as the result.
1180//
a334319f
UD
1181(p7) fms.s1 FR_c = FR_s, f1, FR_U_1
1182 nop.i 999 ;;
8da2915d
UD
1183}
1184{ .mfi
a334319f
UD
1185 nop.m 999
1186//
8da2915d
UD
1187// if ( i_1 ==0) poly = c + S_1*r*r*r
1188// else Result = 1
1189//
a334319f
UD
1190(p12) fnma.s1 FR_Input_X = FR_Input_X, f1, f0
1191 nop.i 999
8da2915d
UD
1192}
1193{ .mfi
a334319f
UD
1194 nop.m 999
1195(p7) fma.s1 FR_r = FR_S_1, FR_r, f0
1196 nop.i 999 ;;
8da2915d
UD
1197}
1198{ .mfi
a334319f
UD
1199 nop.m 999
1200(p7) fma.s0 FR_S_1 = FR_S_1, FR_S_1, f0
1201 nop.i 999 ;;
8da2915d
UD
1202}
1203{ .mfi
a334319f 1204 nop.m 999
8da2915d
UD
1205//
1206// If i_1 != 0, poly = 2**(-67)
1207//
a334319f
UD
1208(p7) fms.s1 FR_c = FR_c, f1, FR_U_2
1209 nop.i 999 ;;
8da2915d
UD
1210}
1211{ .mfi
a334319f
UD
1212 nop.m 999
1213//
8da2915d 1214// c = c - U_2
a334319f 1215//
8da2915d 1216(p9) fma.s1 FR_poly = FR_r, FR_rsq, FR_c
a334319f 1217 nop.i 999 ;;
8da2915d
UD
1218}
1219{ .mfi
a334319f 1220 nop.m 999
8da2915d
UD
1221//
1222// i_0 != 0, so Result = -Result
1223//
a334319f
UD
1224(p11) fma.s0 FR_Input_X = FR_Input_X, f1, FR_poly
1225 nop.i 999 ;;
8da2915d
UD
1226}
1227{ .mfb
a334319f
UD
1228 nop.m 999
1229(p12) fms.s0 FR_Input_X = FR_Input_X, f1, FR_poly
8da2915d
UD
1230//
1231// if (i_0 == 0), Result = Result + poly
1232// else Result = Result - poly
1233//
a334319f
UD
1234(p0) br.ret.sptk b0 ;;
1235}
1236L(SINCOSL_LARGER_ARG):
1237{ .mfi
1238 nop.m 999
1239(p0) fma.s1 FR_N_0 = FR_Input_X, FR_Inv_P_0, f0
1240 nop.i 999
8da2915d
UD
1241}
1242;;
1243
a334319f
UD
1244// This path for argument > 2*24
1245// Adjust table_ptr1 to beginning of table.
8da2915d 1246//
a334319f
UD
1247
1248{ .mmi
1249 nop.m 999
1250(p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp
1251 nop.i 999
8da2915d
UD
1252}
1253;;
1254
a334319f
UD
1255{ .mmi
1256 ld8 GR_Table_Base = [GR_Table_Base]
1257 nop.m 999
1258 nop.i 999
1259}
1260;;
1261
1262
1263//
1264// Point to 2*-14
8da2915d
UD
1265// N_0 = Arg * Inv_P_0
1266//
1267{ .mmi
a334319f
UD
1268(p0) add GR_Table_Base = 688, GR_Table_Base ;;
1269(p0) ldfs FR_Two_to_M14 = [GR_Table_Base], 4
1270 nop.i 999 ;;
8da2915d
UD
1271}
1272{ .mfi
a334319f
UD
1273(p0) ldfs FR_Neg_Two_to_M14 = [GR_Table_Base], 0
1274 nop.f 999
1275 nop.i 999 ;;
8da2915d
UD
1276}
1277{ .mfi
a334319f 1278 nop.m 999
8da2915d 1279//
a334319f 1280// Load values 2**(-14) and -2**(-14)
8da2915d 1281//
a334319f
UD
1282(p0) fcvt.fx.s1 FR_N_0_fix = FR_N_0
1283 nop.i 999 ;;
8da2915d
UD
1284}
1285{ .mfi
a334319f 1286 nop.m 999
8da2915d
UD
1287//
1288// N_0_fix = integer part of N_0
1289//
a334319f
UD
1290(p0) fcvt.xf FR_N_0 = FR_N_0_fix
1291 nop.i 999 ;;
8da2915d
UD
1292}
1293{ .mfi
a334319f 1294 nop.m 999
8da2915d
UD
1295//
1296// Make N_0 the integer part
1297//
a334319f
UD
1298(p0) fnma.s1 FR_ArgPrime = FR_N_0, FR_P_0, FR_Input_X
1299 nop.i 999
8da2915d
UD
1300}
1301{ .mfi
a334319f
UD
1302 nop.m 999
1303(p0) fma.s1 FR_w = FR_N_0, FR_d_1, f0
1304 nop.i 999 ;;
8da2915d
UD
1305}
1306{ .mfi
a334319f 1307 nop.m 999
8da2915d
UD
1308//
1309// Arg' = -N_0 * P_0 + Arg
1310// w = N_0 * d_1
1311//
a334319f
UD
1312(p0) fma.s1 FR_N_float = FR_ArgPrime, FR_Inv_pi_by_2, f0
1313 nop.i 999 ;;
8da2915d
UD
1314}
1315{ .mfi
a334319f 1316 nop.m 999
8da2915d 1317//
a334319f 1318// N = A' * 2/pi
8da2915d 1319//
a334319f
UD
1320(p0) fcvt.fx.s1 FR_N_fix = FR_N_float
1321 nop.i 999 ;;
8da2915d
UD
1322}
1323{ .mfi
a334319f 1324 nop.m 999
8da2915d 1325//
a334319f 1326// N_fix is the integer part
8da2915d 1327//
a334319f
UD
1328(p0) fcvt.xf FR_N_float = FR_N_fix
1329 nop.i 999 ;;
8da2915d
UD
1330}
1331{ .mfi
a334319f
UD
1332(p0) getf.sig GR_N_Inc = FR_N_fix
1333 nop.f 999
1334 nop.i 999 ;;
8da2915d
UD
1335}
1336{ .mii
a334319f
UD
1337 nop.m 999
1338 nop.i 999 ;;
1339(p0) add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos ;;
8da2915d
UD
1340}
1341{ .mfi
a334319f 1342 nop.m 999
8da2915d
UD
1343//
1344// N is the integer part of the reduced-reduced argument.
1345// Put the integer in a GP register
1346//
a334319f
UD
1347(p0) fnma.s1 FR_s = FR_N_float, FR_P_1, FR_ArgPrime
1348 nop.i 999
8da2915d
UD
1349}
1350{ .mfi
a334319f
UD
1351 nop.m 999
1352(p0) fnma.s1 FR_w = FR_N_float, FR_P_2, FR_w
1353 nop.i 999 ;;
8da2915d
UD
1354}
1355{ .mfi
a334319f 1356 nop.m 999
8da2915d
UD
1357//
1358// s = -N*P_1 + Arg'
1359// w = -N*P_2 + w
1360// N_fix_gr = N_fix_gr + N_inc
1361//
a334319f
UD
1362(p0) fcmp.lt.unc.s1 p9, p8 = FR_s, FR_Two_to_M14
1363 nop.i 999 ;;
8da2915d
UD
1364}
1365{ .mfi
a334319f
UD
1366 nop.m 999
1367(p9) fcmp.gt.s1 p9, p8 = FR_s, FR_Neg_Two_to_M14
1368 nop.i 999 ;;
8da2915d
UD
1369}
1370{ .mfi
a334319f 1371 nop.m 999
8da2915d
UD
1372//
1373// For |s| > 2**(-14) r = S + w (r complete)
1374// Else U_hi = N_0 * d_1
1375//
1376(p9) fma.s1 FR_V_hi = FR_N_float, FR_P_2, f0
a334319f 1377 nop.i 999
8da2915d
UD
1378}
1379{ .mfi
a334319f 1380 nop.m 999
8da2915d 1381(p9) fma.s1 FR_U_hi = FR_N_0, FR_d_1, f0
a334319f 1382 nop.i 999 ;;
8da2915d
UD
1383}
1384{ .mfi
a334319f 1385 nop.m 999
8da2915d
UD
1386//
1387// Either S <= -2**(-14) or S >= 2**(-14)
1388// or -2**(-14) < s < 2**(-14)
1389//
1390(p8) fma.s1 FR_r = FR_s, f1, FR_w
a334319f 1391 nop.i 999
8da2915d
UD
1392}
1393{ .mfi
a334319f 1394 nop.m 999
8da2915d 1395(p9) fma.s1 FR_w = FR_N_float, FR_P_3, f0
a334319f 1396 nop.i 999 ;;
8da2915d
UD
1397}
1398{ .mfi
a334319f 1399 nop.m 999
8da2915d
UD
1400//
1401// We need abs of both U_hi and V_hi - don't
1402// worry about switched sign of V_hi.
1403//
1404(p9) fms.s1 FR_A = FR_U_hi, f1, FR_V_hi
a334319f 1405 nop.i 999
8da2915d
UD
1406}
1407{ .mfi
a334319f 1408 nop.m 999
8da2915d 1409//
a334319f 1410// Big s: finish up c = (S - r) + w (c complete)
8da2915d
UD
1411// Case 4: A = U_hi + V_hi
1412// Note: Worry about switched sign of V_hi, so subtract instead of add.
1413//
1414(p9) fnma.s1 FR_V_lo = FR_N_float, FR_P_2, FR_V_hi
a334319f 1415 nop.i 999 ;;
8da2915d
UD
1416}
1417{ .mmf
a334319f
UD
1418 nop.m 999
1419 nop.m 999
8da2915d
UD
1420(p9) fms.s1 FR_U_lo = FR_N_0, FR_d_1, FR_U_hi
1421}
1422{ .mfi
a334319f 1423 nop.m 999
8da2915d 1424(p9) fmerge.s FR_V_hiabs = f0, FR_V_hi
a334319f 1425 nop.i 999 ;;
8da2915d
UD
1426}
1427{ .mfi
a334319f 1428 nop.m 999
8da2915d
UD
1429// For big s: c = S - r
1430// For small s do more work: U_lo = N_0 * d_1 - U_hi
1431//
1432(p9) fmerge.s FR_U_hiabs = f0, FR_U_hi
a334319f 1433 nop.i 999
8da2915d
UD
1434}
1435{ .mfi
a334319f 1436 nop.m 999
8da2915d 1437//
a334319f 1438// For big s: Is |r| < 2**(-3)
8da2915d
UD
1439// For big s: if p12 set, prepare to branch to Small_R.
1440// For big s: If p13 set, prepare to branch to Normal_R.
1441//
a334319f
UD
1442(p8) fms.s1 FR_c = FR_s, f1, FR_r
1443 nop.i 999 ;;
8da2915d
UD
1444}
1445{ .mfi
a334319f 1446 nop.m 999
8da2915d
UD
1447//
1448// For small S: V_hi = N * P_2
1449// w = N * P_3
1450// Note the product does not include the (-) as in the writeup
1451// so (-) missing for V_hi and w.
1452//
1453(p8) fcmp.lt.unc.s1 p12, p13 = FR_r, FR_Two_to_M3
a334319f 1454 nop.i 999 ;;
8da2915d
UD
1455}
1456{ .mfi
a334319f 1457 nop.m 999
8da2915d 1458(p12) fcmp.gt.s1 p12, p13 = FR_r, FR_Neg_Two_to_M3
a334319f 1459 nop.i 999 ;;
8da2915d
UD
1460}
1461{ .mfi
a334319f 1462 nop.m 999
8da2915d 1463(p8) fma.s1 FR_c = FR_c, f1, FR_w
a334319f 1464 nop.i 999
8da2915d
UD
1465}
1466{ .mfb
a334319f 1467 nop.m 999
8da2915d 1468(p9) fms.s1 FR_w = FR_N_0, FR_d_2, FR_w
a334319f 1469(p12) br.cond.spnt L(SINCOSL_SMALL_R) ;;
8da2915d
UD
1470}
1471{ .mib
a334319f
UD
1472 nop.m 999
1473 nop.i 999
1474(p13) br.cond.sptk L(SINCOSL_NORMAL_R) ;;
8da2915d
UD
1475}
1476{ .mfi
a334319f
UD
1477 nop.m 999
1478//
1479// Big s: Vector off when |r| < 2**(-3). Recall that p8 will be true.
8da2915d
UD
1480// The remaining stuff is for Case 4.
1481// Small s: V_lo = N * P_2 + U_hi (U_hi is in place of V_hi in writeup)
1482// Note: the (-) is still missing for V_lo.
1483// Small s: w = w + N_0 * d_2
1484// Note: the (-) is now incorporated in w.
1485//
a334319f
UD
1486(p9) fcmp.ge.unc.s1 p10, p11 = FR_U_hiabs, FR_V_hiabs
1487(p0) extr.u GR_i_1 = GR_N_Inc, 0, 1
8da2915d
UD
1488}
1489{ .mfi
a334319f 1490 nop.m 999
8da2915d
UD
1491//
1492// C_hi = S + A
1493//
a334319f
UD
1494(p9) fma.s1 FR_t = FR_U_lo, f1, FR_V_lo
1495(p0) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;;
8da2915d
UD
1496}
1497{ .mfi
a334319f 1498 nop.m 999
8da2915d 1499//
a334319f 1500// t = U_lo + V_lo
8da2915d
UD
1501//
1502//
a334319f
UD
1503(p10) fms.s1 FR_a = FR_U_hi, f1, FR_A
1504 nop.i 999 ;;
8da2915d
UD
1505}
1506{ .mfi
a334319f
UD
1507 nop.m 999
1508(p11) fma.s1 FR_a = FR_V_hi, f1, FR_A
1509 nop.i 999
1510}
1511;;
1512
1513{ .mmi
1514 nop.m 999
1515(p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp
1516 nop.i 999
1517}
1518;;
1519
1520{ .mmi
1521 ld8 GR_Table_Base = [GR_Table_Base]
1522 nop.m 999
1523 nop.i 999
8da2915d
UD
1524}
1525;;
1526
a334319f 1527
8da2915d 1528{ .mfi
a334319f 1529(p0) add GR_Table_Base = 528, GR_Table_Base
8da2915d
UD
1530//
1531// Is U_hiabs >= V_hiabs?
1532//
a334319f
UD
1533(p9) fma.s1 FR_C_hi = FR_s, f1, FR_A
1534 nop.i 999 ;;
8da2915d
UD
1535}
1536{ .mmi
a334319f
UD
1537(p0) ldfe FR_C_1 = [GR_Table_Base], 16 ;;
1538(p0) ldfe FR_C_2 = [GR_Table_Base], 64
1539 nop.i 999 ;;
8da2915d
UD
1540}
1541//
1542// c = c + C_lo finished.
1543// Load C_2
1544//
1545{ .mfi
a334319f 1546(p0) ldfe FR_S_1 = [GR_Table_Base], 16
8da2915d 1547//
a334319f 1548// C_lo = S - C_hi
8da2915d 1549//
a334319f
UD
1550(p0) fma.s1 FR_t = FR_t, f1, FR_w
1551 nop.i 999 ;;
8da2915d
UD
1552}
1553//
1554// r and c have been computed.
1555// Make sure ftz mode is set - should be automatic when using wre
1556// |r| < 2**(-3)
1557// Get [i_0,i_1] - two lsb of N_fix.
1558// Load S_1
1559//
1560{ .mfi
a334319f 1561(p0) ldfe FR_S_2 = [GR_Table_Base], 64
8da2915d 1562//
a334319f 1563// t = t + w
8da2915d 1564//
a334319f
UD
1565(p10) fms.s1 FR_a = FR_a, f1, FR_V_hi
1566(p0) cmp.eq.unc p9, p10 = 0x0, GR_i_0 ;;
8da2915d
UD
1567}
1568{ .mfi
a334319f 1569 nop.m 999
8da2915d
UD
1570//
1571// For larger u than v: a = U_hi - A
1572// Else a = V_hi - A (do an add to account for missing (-) on V_hi
1573//
a334319f
UD
1574(p0) fms.s1 FR_C_lo = FR_s, f1, FR_C_hi
1575 nop.i 999 ;;
8da2915d
UD
1576}
1577{ .mfi
a334319f
UD
1578 nop.m 999
1579(p11) fms.s1 FR_a = FR_U_hi, f1, FR_a
1580(p0) cmp.eq.unc p11, p12 = 0x0, GR_i_1 ;;
8da2915d
UD
1581}
1582{ .mfi
a334319f 1583 nop.m 999
8da2915d
UD
1584//
1585// If u > v: a = (U_hi - A) + V_hi
1586// Else a = (V_hi - A) + U_hi
1587// In each case account for negative missing from V_hi.
1588//
a334319f
UD
1589(p0) fma.s1 FR_C_lo = FR_C_lo, f1, FR_A
1590 nop.i 999 ;;
8da2915d
UD
1591}
1592{ .mfi
a334319f 1593 nop.m 999
8da2915d 1594//
a334319f 1595// C_lo = (S - C_hi) + A
8da2915d 1596//
a334319f
UD
1597(p0) fma.s1 FR_t = FR_t, f1, FR_a
1598 nop.i 999 ;;
8da2915d
UD
1599}
1600{ .mfi
a334319f 1601 nop.m 999
8da2915d 1602//
a334319f 1603// t = t + a
8da2915d 1604//
a334319f
UD
1605(p0) fma.s1 FR_C_lo = FR_C_lo, f1, FR_t
1606 nop.i 999 ;;
8da2915d
UD
1607}
1608{ .mfi
a334319f 1609 nop.m 999
8da2915d
UD
1610//
1611// C_lo = C_lo + t
a334319f 1612// Adjust Table_Base to beginning of table
8da2915d 1613//
a334319f
UD
1614(p0) fma.s1 FR_r = FR_C_hi, f1, FR_C_lo
1615 nop.i 999 ;;
8da2915d
UD
1616}
1617{ .mfi
a334319f 1618 nop.m 999
8da2915d
UD
1619//
1620// Load S_2
1621//
a334319f
UD
1622(p0) fma.s1 FR_rsq = FR_r, FR_r, f0
1623 nop.i 999
8da2915d
UD
1624}
1625{ .mfi
a334319f 1626 nop.m 999
8da2915d 1627//
a334319f 1628// Table_Base points to C_1
8da2915d
UD
1629// r = C_hi + C_lo
1630//
a334319f
UD
1631(p0) fms.s1 FR_c = FR_C_hi, f1, FR_r
1632 nop.i 999 ;;
8da2915d
UD
1633}
1634{ .mfi
a334319f 1635 nop.m 999
8da2915d
UD
1636//
1637// if i_1 ==0: poly = S_2 * FR_rsq + S_1
1638// else poly = C_2 * FR_rsq + C_1
1639//
a334319f
UD
1640(p11) fma.s1 FR_Input_X = f0, f1, FR_r
1641 nop.i 999 ;;
8da2915d
UD
1642}
1643{ .mfi
a334319f
UD
1644 nop.m 999
1645(p12) fma.s1 FR_Input_X = f0, f1, f1
1646 nop.i 999 ;;
8da2915d
UD
1647}
1648{ .mfi
a334319f 1649 nop.m 999
8da2915d 1650//
a334319f 1651// Compute r_cube = FR_rsq * r
8da2915d 1652//
a334319f
UD
1653(p11) fma.s1 FR_poly = FR_rsq, FR_S_2, FR_S_1
1654 nop.i 999 ;;
8da2915d
UD
1655}
1656{ .mfi
a334319f
UD
1657 nop.m 999
1658(p12) fma.s1 FR_poly = FR_rsq, FR_C_2, FR_C_1
1659 nop.i 999
8da2915d
UD
1660}
1661{ .mfi
a334319f 1662 nop.m 999
8da2915d
UD
1663//
1664// Compute FR_rsq = r * r
1665// Is i_1 == 0 ?
1666//
a334319f
UD
1667(p0) fma.s1 FR_r_cubed = FR_rsq, FR_r, f0
1668 nop.i 999 ;;
8da2915d
UD
1669}
1670{ .mfi
a334319f 1671 nop.m 999
8da2915d
UD
1672//
1673// c = C_hi - r
1674// Load C_1
1675//
a334319f
UD
1676(p0) fma.s1 FR_c = FR_c, f1, FR_C_lo
1677 nop.i 999
8da2915d
UD
1678}
1679{ .mfi
a334319f 1680 nop.m 999
8da2915d
UD
1681//
1682// if i_1 ==0: poly = r_cube * poly + c
1683// else poly = FR_rsq * poly
1684//
a334319f
UD
1685(p10) fms.s1 FR_Input_X = f0, f1, FR_Input_X
1686 nop.i 999 ;;
8da2915d
UD
1687}
1688{ .mfi
a334319f 1689 nop.m 999
8da2915d
UD
1690//
1691// if i_1 ==0: Result = r
1692// else Result = 1.0
1693//
a334319f
UD
1694(p11) fma.s1 FR_poly = FR_r_cubed, FR_poly, FR_c
1695 nop.i 999 ;;
8da2915d
UD
1696}
1697{ .mfi
a334319f
UD
1698 nop.m 999
1699(p12) fma.s1 FR_poly = FR_rsq, FR_poly, f0
1700 nop.i 999 ;;
8da2915d
UD
1701}
1702{ .mfi
a334319f 1703 nop.m 999
8da2915d 1704//
a334319f 1705// if i_0 !=0: Result = -Result
8da2915d 1706//
a334319f
UD
1707(p9) fma.s0 FR_Input_X = FR_Input_X, f1, FR_poly
1708 nop.i 999 ;;
8da2915d
UD
1709}
1710{ .mfb
a334319f
UD
1711 nop.m 999
1712(p10) fms.s0 FR_Input_X = FR_Input_X, f1, FR_poly
8da2915d
UD
1713//
1714// if i_0 == 0: Result = Result + poly
1715// else Result = Result - poly
1716//
a334319f 1717(p0) br.ret.sptk b0 ;;
8da2915d 1718}
a334319f
UD
1719L(SINCOSL_SMALL_R):
1720{ .mii
1721 nop.m 999
1722(p0) extr.u GR_i_1 = GR_N_Inc, 0, 1 ;;
8da2915d
UD
1723//
1724//
1725// Compare both i_1 and i_0 with 0.
1726// if i_1 == 0, set p9.
1727// if i_0 == 0, set p11.
1728//
a334319f
UD
1729(p0) cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;;
1730}
8da2915d 1731{ .mfi
a334319f
UD
1732 nop.m 999
1733(p0) fma.s1 FR_rsq = FR_r, FR_r, f0
1734(p0) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;;
1735}
1736{ .mfi
1737 nop.m 999
1738//
1739// Z = Z * FR_rsq
1740//
1741(p10) fnma.s1 FR_c = FR_c, FR_r, f0
1742(p0) cmp.eq.unc p11, p12 = 0x0, GR_i_0
8da2915d
UD
1743}
1744;;
1745
a334319f
UD
1746// ******************************************************************
1747// ******************************************************************
1748// ******************************************************************
1749// r and c have been computed.
1750// We know whether this is the sine or cosine routine.
1751// Make sure ftz mode is set - should be automatic when using wre
1752// |r| < 2**(-3)
1753//
1754// Set table_ptr1 to beginning of constant table.
1755// Get [i_0,i_1] - two lsb of N_fix_gr.
1756//
1757
8da2915d 1758{ .mmi
a334319f
UD
1759 nop.m 999
1760(p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp
8da2915d
UD
1761 nop.i 999
1762}
1763;;
1764
1765{ .mmi
a334319f
UD
1766 ld8 GR_Table_Base = [GR_Table_Base]
1767 nop.m 999
8da2915d
UD
1768 nop.i 999
1769}
1770;;
1771
a334319f
UD
1772
1773//
1774// Set table_ptr1 to point to S_5.
1775// Set table_ptr1 to point to C_5.
1776// Compute FR_rsq = r * r
1777//
0ecb606c 1778{ .mfi
a334319f
UD
1779(p9) add GR_Table_Base = 672, GR_Table_Base
1780(p10) fmerge.s FR_r = f1, f1
1781(p10) add GR_Table_Base = 592, GR_Table_Base ;;
1782}
1783//
1784// Set table_ptr1 to point to S_5.
1785// Set table_ptr1 to point to C_5.
1786//
1787{ .mmi
1788(p9) ldfe FR_S_5 = [GR_Table_Base], -16 ;;
1789//
1790// if (i_1 == 0) load S_5
1791// if (i_1 != 0) load C_5
1792//
1793(p9) ldfe FR_S_4 = [GR_Table_Base], -16
1794 nop.i 999 ;;
8da2915d
UD
1795}
1796{ .mmf
a334319f
UD
1797(p10) ldfe FR_C_5 = [GR_Table_Base], -16
1798//
1799// Z = FR_rsq * FR_rsq
1800//
1801(p9) ldfe FR_S_3 = [GR_Table_Base], -16
1802//
1803// Compute FR_rsq = r * r
1804// if (i_1 == 0) load S_4
1805// if (i_1 != 0) load C_4
1806//
1807(p0) fma.s1 FR_Z = FR_rsq, FR_rsq, f0 ;;
8da2915d 1808}
a334319f
UD
1809//
1810// if (i_1 == 0) load S_3
1811// if (i_1 != 0) load C_3
1812//
8da2915d 1813{ .mmi
a334319f
UD
1814(p9) ldfe FR_S_2 = [GR_Table_Base], -16 ;;
1815//
1816// if (i_1 == 0) load S_2
1817// if (i_1 != 0) load C_2
1818//
1819(p9) ldfe FR_S_1 = [GR_Table_Base], -16
1820 nop.i 999
1821}
1822{ .mmi
1823(p10) ldfe FR_C_4 = [GR_Table_Base], -16 ;;
1824(p10) ldfe FR_C_3 = [GR_Table_Base], -16
1825 nop.i 999 ;;
1826}
1827{ .mmi
1828(p10) ldfe FR_C_2 = [GR_Table_Base], -16 ;;
1829(p10) ldfe FR_C_1 = [GR_Table_Base], -16
1830 nop.i 999
8da2915d
UD
1831}
1832{ .mfi
a334319f
UD
1833 nop.m 999
1834//
1835// if (i_1 != 0):
1836// poly_lo = FR_rsq * C_5 + C_4
1837// poly_hi = FR_rsq * C_2 + C_1
1838//
1839(p9) fma.s1 FR_Z = FR_Z, FR_r, f0
1840 nop.i 999 ;;
8da2915d
UD
1841}
1842{ .mfi
a334319f
UD
1843 nop.m 999
1844//
1845// if (i_1 == 0) load S_1
1846// if (i_1 != 0) load C_1
1847//
1848(p9) fma.s1 FR_poly_lo = FR_rsq, FR_S_5, FR_S_4
1849 nop.i 999
8da2915d
UD
1850}
1851{ .mfi
a334319f
UD
1852 nop.m 999
1853//
1854// c = -c * r
1855// dummy fmpy's to flag inexact.
1856//
1857(p9) fma.s0 FR_S_4 = FR_S_4, FR_S_4, f0
1858 nop.i 999 ;;
8da2915d
UD
1859}
1860{ .mfi
a334319f
UD
1861 nop.m 999
1862//
1863// poly_lo = FR_rsq * poly_lo + C_3
1864// poly_hi = FR_rsq * poly_hi
1865//
1866(p0) fma.s1 FR_Z = FR_Z, FR_rsq, f0
1867 nop.i 999 ;;
8da2915d
UD
1868}
1869{ .mfi
a334319f
UD
1870 nop.m 999
1871(p9) fma.s1 FR_poly_hi = FR_rsq, FR_S_2, FR_S_1
1872 nop.i 999
8da2915d
UD
1873}
1874{ .mfi
a334319f
UD
1875 nop.m 999
1876//
1877// if (i_1 == 0):
1878// poly_lo = FR_rsq * S_5 + S_4
1879// poly_hi = FR_rsq * S_2 + S_1
1880//
1881(p10) fma.s1 FR_poly_lo = FR_rsq, FR_C_5, FR_C_4
1882 nop.i 999 ;;
8da2915d
UD
1883}
1884{ .mfi
a334319f
UD
1885 nop.m 999
1886//
1887// if (i_1 == 0):
1888// Z = Z * r for only one of the small r cases - not there
1889// in original implementation notes.
1890//
1891(p9) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_S_3
1892 nop.i 999 ;;
8da2915d
UD
1893}
1894{ .mfi
a334319f
UD
1895 nop.m 999
1896(p10) fma.s1 FR_poly_hi = FR_rsq, FR_C_2, FR_C_1
1897 nop.i 999
8da2915d
UD
1898}
1899{ .mfi
a334319f
UD
1900 nop.m 999
1901(p10) fma.s0 FR_C_1 = FR_C_1, FR_C_1, f0
1902 nop.i 999 ;;
8da2915d
UD
1903}
1904{ .mfi
a334319f
UD
1905 nop.m 999
1906(p9) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0
1907 nop.i 999
8da2915d
UD
1908}
1909{ .mfi
a334319f
UD
1910 nop.m 999
1911//
1912// poly_lo = FR_rsq * poly_lo + S_3
1913// poly_hi = FR_rsq * poly_hi
1914//
1915(p10) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_C_3
1916 nop.i 999 ;;
8da2915d
UD
1917}
1918{ .mfi
a334319f
UD
1919 nop.m 999
1920(p10) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0
1921 nop.i 999 ;;
8da2915d
UD
1922}
1923{ .mfi
a334319f
UD
1924 nop.m 999
1925//
1926// if (i_1 == 0): dummy fmpy's to flag inexact
1927// r = 1
1928//
1929(p9) fma.s1 FR_poly_hi = FR_r, FR_poly_hi, f0
1930 nop.i 999
8da2915d
UD
1931}
1932{ .mfi
a334319f
UD
1933 nop.m 999
1934//
1935// poly_hi = r * poly_hi
1936//
1937(p0) fma.s1 FR_poly = FR_Z, FR_poly_lo, FR_c
1938 nop.i 999 ;;
8da2915d
UD
1939}
1940{ .mfi
a334319f
UD
1941 nop.m 999
1942(p12) fms.s1 FR_r = f0, f1, FR_r
1943 nop.i 999 ;;
8da2915d
UD
1944}
1945{ .mfi
a334319f
UD
1946 nop.m 999
1947//
1948// poly_hi = Z * poly_lo + c
1949// if i_0 == 1: r = -r
1950//
1951(p0) fma.s1 FR_poly = FR_poly, f1, FR_poly_hi
1952 nop.i 999 ;;
8da2915d 1953}
a334319f
UD
1954{ .mfi
1955 nop.m 999
1956(p12) fms.s0 FR_Input_X = FR_r, f1, FR_poly
1957 nop.i 999
1958}
1959{ .mfb
1960 nop.m 999
1961//
1962// poly = poly + poly_hi
1963//
1964(p11) fma.s0 FR_Input_X = FR_r, f1, FR_poly
8da2915d
UD
1965//
1966// if (i_0 == 0) Result = r + poly
1967// if (i_0 != 0) Result = r - poly
1968//
a334319f
UD
1969(p0) br.ret.sptk b0 ;;
1970}
1971L(SINCOSL_NORMAL_R):
1972{ .mii
1973 nop.m 999
1974(p0) extr.u GR_i_1 = GR_N_Inc, 0, 1 ;;
1975//
1976// Set table_ptr1 and table_ptr2 to base address of
1977// constant table.
1978(p0) cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;;
1979}
8da2915d 1980{ .mfi
a334319f
UD
1981 nop.m 999
1982(p0) fma.s1 FR_rsq = FR_r, FR_r, f0
1983(p0) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;;
8da2915d 1984}
a334319f
UD
1985{ .mfi
1986 nop.m 999
1987(p0) frcpa.s1 FR_r_hi, p6 = f1, FR_r
1988(p0) cmp.eq.unc p11, p12 = 0x0, GR_i_0
8da2915d
UD
1989}
1990;;
1991
a334319f
UD
1992// ******************************************************************
1993// ******************************************************************
1994// ******************************************************************
8da2915d 1995//
a334319f
UD
1996// r and c have been computed.
1997// We known whether this is the sine or cosine routine.
1998// Make sure ftz mode is set - should be automatic when using wre
1999// Get [i_0,i_1] - two lsb of N_fix_gr alone.
8da2915d 2000//
a334319f
UD
2001
2002{ .mmi
2003 nop.m 999
2004(p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp
8da2915d
UD
2005 nop.i 999
2006}
2007;;
2008
2009{ .mmi
a334319f
UD
2010 ld8 GR_Table_Base = [GR_Table_Base]
2011 nop.m 999
8da2915d
UD
2012 nop.i 999
2013}
2014;;
2015
2016
2017{ .mfi
a334319f
UD
2018(p10) add GR_Table_Base = 384, GR_Table_Base
2019(p12) fms.s1 FR_Input_X = f0, f1, f1
2020(p9) add GR_Table_Base = 224, GR_Table_Base ;;
8da2915d
UD
2021}
2022{ .mfi
a334319f
UD
2023(p10) ldfe FR_QQ_8 = [GR_Table_Base], 16
2024//
2025// if (i_1==0) poly = poly * FR_rsq + PP_1_lo
2026// else poly = FR_rsq * poly
2027//
2028(p11) fma.s1 FR_Input_X = f0, f1, f1
2029 nop.i 999 ;;
2030}
2031{ .mmb
2032(p10) ldfe FR_QQ_7 = [GR_Table_Base], 16
2033//
2034// Adjust table pointers based on i_0
2035// Compute rsq = r * r
2036//
2037(p9) ldfe FR_PP_8 = [GR_Table_Base], 16
2038 nop.b 999 ;;
8da2915d
UD
2039}
2040{ .mfi
a334319f
UD
2041 nop.m 999
2042(p0) fma.s1 FR_r_cubed = FR_r, FR_rsq, f0
2043 nop.i 999 ;;
8da2915d
UD
2044}
2045{ .mmf
a334319f
UD
2046(p9) ldfe FR_PP_7 = [GR_Table_Base], 16
2047(p10) ldfe FR_QQ_6 = [GR_Table_Base], 16
2048//
2049// Load PP_8 and QQ_8; PP_7 and QQ_7
2050//
2051(p0) frcpa.s1 FR_r_hi, p6 = f1, FR_r_hi ;;
8da2915d 2052}
a334319f
UD
2053//
2054// if (i_1==0) poly = PP_7 + FR_rsq * PP_8.
2055// else poly = QQ_7 + FR_rsq * QQ_8.
2056//
2057{ .mmb
2058(p9) ldfe FR_PP_6 = [GR_Table_Base], 16
2059(p10) ldfe FR_QQ_5 = [GR_Table_Base], 16
2060 nop.b 999 ;;
8da2915d 2061}
a334319f
UD
2062{ .mmb
2063(p9) ldfe FR_PP_5 = [GR_Table_Base], 16
2064(p10) ldfe FR_S_1 = [GR_Table_Base], 16
2065 nop.b 999 ;;
2066}
2067{ .mmb
2068(p10) ldfe FR_QQ_1 = [GR_Table_Base], 16
2069(p9) ldfe FR_C_1 = [GR_Table_Base], 16
2070 nop.b 999 ;;
2071}
2072{ .mmb
2073(p10) ldfe FR_QQ_4 = [GR_Table_Base], 16
2074(p9) ldfe FR_PP_1 = [GR_Table_Base], 16
2075 nop.b 999 ;;
2076}
2077{ .mmb
2078(p10) ldfe FR_QQ_3 = [GR_Table_Base], 16
2079//
2080// if (i_1=0) corr = corr + c*c
2081// else corr = corr * c
2082//
2083(p9) ldfe FR_PP_4 = [GR_Table_Base], 16
2084 nop.b 999 ;;
8da2915d
UD
2085}
2086{ .mfi
a334319f
UD
2087 nop.m 999
2088(p10) fma.s1 FR_poly = FR_rsq, FR_QQ_8, FR_QQ_7
2089 nop.i 999 ;;
2090}
2091//
2092// if (i_1=0) poly = rsq * poly + PP_5
2093// else poly = rsq * poly + QQ_5
2094// Load PP_4 or QQ_4
2095//
2096{ .mmi
2097(p9) ldfe FR_PP_3 = [GR_Table_Base], 16 ;;
2098(p10) ldfe FR_QQ_2 = [GR_Table_Base], 16
2099 nop.i 999
8da2915d
UD
2100}
2101{ .mfi
a334319f
UD
2102 nop.m 999
2103//
2104// r_hi = frcpa(frcpa(r)).
2105// r_cube = r * FR_rsq.
2106//
2107(p9) fma.s1 FR_poly = FR_rsq, FR_PP_8, FR_PP_7
2108 nop.i 999 ;;
8da2915d 2109}
a334319f
UD
2110//
2111// Do dummy multiplies so inexact is always set.
2112//
8da2915d 2113{ .mfi
a334319f
UD
2114(p9) ldfe FR_PP_2 = [GR_Table_Base], 16
2115//
2116// r_lo = r - r_hi
2117//
2118(p9) fma.s1 FR_U_lo = FR_r_hi, FR_r_hi, f0
2119 nop.i 999 ;;
2120}
2121{ .mbb
2122(p9) ldfe FR_PP_1_lo = [GR_Table_Base], 16
2123 nop.b 999
2124 nop.b 999 ;;
8da2915d
UD
2125}
2126{ .mfi
a334319f
UD
2127 nop.m 999
2128(p10) fma.s1 FR_corr = FR_S_1, FR_r_cubed, FR_r
2129 nop.i 999
8da2915d
UD
2130}
2131{ .mfi
a334319f
UD
2132 nop.m 999
2133(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_6
2134 nop.i 999 ;;
8da2915d
UD
2135}
2136{ .mfi
a334319f
UD
2137 nop.m 999
2138//
2139// if (i_1=0) U_lo = r_hi * r_hi
2140// else U_lo = r_hi + r
2141//
2142(p9) fma.s1 FR_corr = FR_C_1, FR_rsq, f0
2143 nop.i 999 ;;
8da2915d
UD
2144}
2145{ .mfi
a334319f
UD
2146 nop.m 999
2147//
2148// if (i_1=0) corr = C_1 * rsq
2149// else corr = S_1 * r_cubed + r
2150//
2151(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_6
2152 nop.i 999 ;;
8da2915d
UD
2153}
2154{ .mfi
a334319f
UD
2155 nop.m 999
2156(p10) fma.s1 FR_U_lo = FR_r_hi, f1, FR_r
2157 nop.i 999
8da2915d
UD
2158}
2159{ .mfi
a334319f
UD
2160 nop.m 999
2161//
2162// if (i_1=0) U_hi = r_hi + U_hi
2163// else U_hi = QQ_1 * U_hi + 1
2164//
2165(p9) fma.s1 FR_U_lo = FR_r, FR_r_hi, FR_U_lo
2166 nop.i 999 ;;
8da2915d
UD
2167}
2168{ .mfi
a334319f
UD
2169 nop.m 999
2170//
2171// U_hi = r_hi * r_hi
2172//
2173(p0) fms.s1 FR_r_lo = FR_r, f1, FR_r_hi
2174 nop.i 999
8da2915d
UD
2175}
2176{ .mfi
a334319f
UD
2177 nop.m 999
2178//
2179// Load PP_1, PP_6, PP_5, and C_1
2180// Load QQ_1, QQ_6, QQ_5, and S_1
2181//
2182(p0) fma.s1 FR_U_hi = FR_r_hi, FR_r_hi, f0
2183 nop.i 999 ;;
8da2915d
UD
2184}
2185{ .mfi
a334319f
UD
2186 nop.m 999
2187(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_5
2188 nop.i 999
8da2915d
UD
2189}
2190{ .mfi
a334319f
UD
2191 nop.m 999
2192(p10) fnma.s1 FR_corr = FR_corr, FR_c, f0
2193 nop.i 999 ;;
8da2915d
UD
2194}
2195{ .mfi
a334319f
UD
2196 nop.m 999
2197//
2198// if (i_1=0) U_lo = r * r_hi + U_lo
2199// else U_lo = r_lo * U_lo
2200//
2201(p9) fma.s1 FR_corr = FR_corr, FR_c, FR_c
2202 nop.i 999 ;;
8da2915d
UD
2203}
2204{ .mfi
a334319f
UD
2205 nop.m 999
2206(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_5
2207 nop.i 999
8da2915d
UD
2208}
2209{ .mfi
a334319f
UD
2210 nop.m 999
2211//
2212// if (i_1 =0) U_hi = r + U_hi
2213// if (i_1 =0) U_lo = r_lo * U_lo
2214//
2215//
2216(p9) fma.s0 FR_PP_5 = FR_PP_5, FR_PP_4, f0
2217 nop.i 999 ;;
8da2915d
UD
2218}
2219{ .mfi
a334319f
UD
2220 nop.m 999
2221(p9) fma.s1 FR_U_lo = FR_r, FR_r, FR_U_lo
2222 nop.i 999 ;;
8da2915d
UD
2223}
2224{ .mfi
a334319f
UD
2225 nop.m 999
2226(p10) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0
2227 nop.i 999 ;;
8da2915d
UD
2228}
2229{ .mfi
a334319f
UD
2230 nop.m 999
2231//
2232// if (i_1=0) poly = poly * rsq + PP_6
2233// else poly = poly * rsq + QQ_6
2234//
2235(p9) fma.s1 FR_U_hi = FR_r_hi, FR_U_hi, f0
2236 nop.i 999
8da2915d
UD
2237}
2238{ .mfi
a334319f
UD
2239 nop.m 999
2240(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_4
2241 nop.i 999 ;;
8da2915d
UD
2242}
2243{ .mfi
a334319f
UD
2244 nop.m 999
2245(p10) fma.s1 FR_U_hi = FR_QQ_1, FR_U_hi, f1
2246 nop.i 999
8da2915d
UD
2247}
2248{ .mfi
a334319f
UD
2249 nop.m 999
2250(p10) fma.s0 FR_QQ_5 = FR_QQ_5, FR_QQ_5, f0
2251 nop.i 999 ;;
8da2915d
UD
2252}
2253{ .mfi
a334319f
UD
2254 nop.m 999
2255//
2256// if (i_1!=0) U_hi = PP_1 * U_hi
2257// if (i_1!=0) U_lo = r * r + U_lo
2258// Load PP_3 or QQ_3
2259//
2260(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_4
2261 nop.i 999 ;;
8da2915d
UD
2262}
2263{ .mfi
a334319f
UD
2264 nop.m 999
2265(p9) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0
2266 nop.i 999 ;;
8da2915d
UD
2267}
2268{ .mfi
a334319f
UD
2269 nop.m 999
2270(p10) fma.s1 FR_U_lo = FR_QQ_1,FR_U_lo, f0
2271 nop.i 999 ;;
8da2915d
UD
2272}
2273{ .mfi
a334319f
UD
2274 nop.m 999
2275(p9) fma.s1 FR_U_hi = FR_PP_1, FR_U_hi, f0
2276 nop.i 999
8da2915d
UD
2277}
2278{ .mfi
a334319f
UD
2279 nop.m 999
2280(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_3
2281 nop.i 999 ;;
8da2915d
UD
2282}
2283{ .mfi
a334319f
UD
2284 nop.m 999
2285//
2286// Load PP_2, QQ_2
2287//
2288(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_3
2289 nop.i 999 ;;
8da2915d
UD
2290}
2291{ .mfi
a334319f
UD
2292 nop.m 999
2293//
2294// if (i_1==0) poly = FR_rsq * poly + PP_3
2295// else poly = FR_rsq * poly + QQ_3
2296// Load PP_1_lo
2297//
2298(p9) fma.s1 FR_U_lo = FR_PP_1, FR_U_lo, f0
2299 nop.i 999 ;;
8da2915d
UD
2300}
2301{ .mfi
a334319f
UD
2302 nop.m 999
2303//
2304// if (i_1 =0) poly = poly * rsq + pp_r4
2305// else poly = poly * rsq + qq_r4
2306//
2307(p9) fma.s1 FR_U_hi = FR_r, f1, FR_U_hi
2308 nop.i 999
8da2915d
UD
2309}
2310{ .mfi
a334319f
UD
2311 nop.m 999
2312(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_2
2313 nop.i 999 ;;
8da2915d
UD
2314}
2315{ .mfi
a334319f
UD
2316 nop.m 999
2317//
2318// if (i_1==0) U_lo = PP_1_hi * U_lo
2319// else U_lo = QQ_1 * U_lo
2320//
2321(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_2
2322 nop.i 999 ;;
8da2915d
UD
2323}
2324{ .mfi
a334319f
UD
2325 nop.m 999
2326//
2327// if (i_0==0) Result = 1
2328// else Result = -1
2329//
2330(p0) fma.s1 FR_V = FR_U_lo, f1, FR_corr
2331 nop.i 999 ;;
8da2915d
UD
2332}
2333{ .mfi
a334319f
UD
2334 nop.m 999
2335(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0
2336 nop.i 999 ;;
8da2915d
UD
2337}
2338{ .mfi
a334319f
UD
2339 nop.m 999
2340//
2341// if (i_1==0) poly = FR_rsq * poly + PP_2
2342// else poly = FR_rsq * poly + QQ_2
2343//
2344(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_1_lo
2345 nop.i 999 ;;
8da2915d
UD
2346}
2347{ .mfi
a334319f
UD
2348 nop.m 999
2349(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0
2350 nop.i 999 ;;
8da2915d 2351}
a334319f
UD
2352{ .mfi
2353 nop.m 999
2354//
2355// V = U_lo + corr
2356//
2357(p9) fma.s1 FR_poly = FR_r_cubed, FR_poly, f0
2358 nop.i 999 ;;
0ecb606c 2359}
0ecb606c 2360{ .mfi
a334319f
UD
2361 nop.m 999
2362//
2363// if (i_1==0) poly = r_cube * poly
2364// else poly = FR_rsq * poly
2365//
2366(p0) fma.s1 FR_V = FR_poly, f1, FR_V
2367 nop.i 999 ;;
8da2915d
UD
2368}
2369{ .mfi
a334319f
UD
2370 nop.m 999
2371(p12) fms.s0 FR_Input_X = FR_Input_X, FR_U_hi, FR_V
2372 nop.i 999
8da2915d
UD
2373}
2374{ .mfb
a334319f
UD
2375 nop.m 999
2376//
2377// V = V + poly
2378//
2379(p11) fma.s0 FR_Input_X = FR_Input_X, FR_U_hi, FR_V
2380//
2381// if (i_0==0) Result = Result * U_hi + V
2382// else Result = Result * U_hi - V
2383//
2384(p0) br.ret.sptk b0
2385};;
0ecb606c 2386
a334319f
UD
2387//
2388// If cosine, FR_Input_X = 1
2389// If sine, FR_Input_X = +/-Zero (Input FR_Input_X)
2390// Results are exact, no exceptions
2391//
0ecb606c 2392
a334319f
UD
2393L(SINCOSL_ZERO):
2394{ .mbb
2395(p0) cmp.eq.unc p6, p7 = 0x1, GR_Sin_or_Cos
2396 nop.b 999
2397 nop.b 999 ;;
2398}
2399{ .mfi
2400 nop.m 999
2401(p7) fmerge.s FR_Input_X = FR_Input_X, FR_Input_X
2402 nop.i 999
8da2915d 2403}
a334319f
UD
2404{ .mfb
2405 nop.m 999
2406(p6) fmerge.s FR_Input_X = f1, f1
2407(p0) br.ret.sptk b0 ;;
2408}
2409L(SINCOSL_SPECIAL):
8da2915d
UD
2410{ .mfb
2411 nop.m 999
2412//
2413// Path for Arg = +/- QNaN, SNaN, Inf
2414// Invalid can be raised. SNaNs
2415// become QNaNs
2416//
a334319f
UD
2417(p0) fmpy.s0 FR_Input_X = FR_Input_X, f0
2418(p0) br.ret.sptk b0 ;;
8da2915d 2419}
a334319f
UD
2420.endp cosl#
2421ASM_SIZE_DIRECTIVE(cosl#)
8da2915d 2422
a334319f
UD
2423// Call int pi_by_2_reduce(double* x, double *y)
2424// for |arguments| >= 2**63
2425// Address to save r and c as double
2426//
2427// sp+32 -> f0
2428// r45 sp+16 -> f0
2429// r44 -> sp -> InputX
2430//
0ecb606c 2431
a334319f
UD
2432.proc __libm_callout
2433__libm_callout:
2434L(SINCOSL_ARG_TOO_LARGE):
8da2915d
UD
2435.prologue
2436{ .mfi
a334319f 2437 add r45=-32,sp // Parameter: r address
8da2915d
UD
2438 nop.f 0
2439.save ar.pfs,GR_SAVE_PFS
2440 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
a334319f
UD
2441}
2442{ .mfi
2443.fframe 64
2444 add sp=-64,sp // Create new stack
2445 nop.f 0
2446 mov GR_SAVE_GP=gp // Save gp
8da2915d
UD
2447};;
2448{ .mmi
a334319f
UD
2449 stfe [r45] = f0,16 // Clear Parameter r on stack
2450 add r44 = 16,sp // Parameter x address
8da2915d
UD
2451.save b0, GR_SAVE_B0
2452 mov GR_SAVE_B0=b0 // Save b0
2453};;
2454.body
2455{ .mib
a334319f
UD
2456 stfe [r45] = f0,-16 // Clear Parameter c on stack
2457 nop.i 0
2458 nop.b 0
2459}
2460{ .mib
2461 stfe [r44] = FR_Input_X // Store Parameter x on stack
8da2915d 2462 nop.i 0
a334319f 2463(p0) br.call.sptk b0=__libm_pi_by_2_reduce# ;;
8da2915d 2464};;
a334319f
UD
2465{ .mii
2466(p0) ldfe FR_Input_X =[r44],16
2467//
2468// Get r and c off stack
2469//
2470(p0) adds GR_Table_Base1 = -16, GR_Table_Base1
2471//
2472// Get r and c off stack
2473//
2474(p0) add GR_N_Inc = GR_Sin_or_Cos,r8 ;;
2475}
2476{ .mmb
2477(p0) ldfe FR_r =[r45],16
2478//
2479// Get X off the stack
2480// Readjust Table ptr
2481//
2482(p0) ldfs FR_Two_to_M3 = [GR_Table_Base1],4
2483 nop.b 999 ;;
2484}
2485{ .mmb
2486(p0) ldfs FR_Neg_Two_to_M3 = [GR_Table_Base1],0
2487(p0) ldfe FR_c =[r45]
2488 nop.b 999 ;;
2489}
8da2915d 2490{ .mfi
a334319f
UD
2491.restore sp
2492 add sp = 64,sp // Restore stack pointer
2493(p0) fcmp.lt.unc.s1 p6, p0 = FR_r, FR_Two_to_M3
8da2915d
UD
2494 mov b0 = GR_SAVE_B0 // Restore return address
2495};;
a334319f 2496{ .mib
8da2915d
UD
2497 mov gp = GR_SAVE_GP // Restore gp
2498 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
a334319f 2499 nop.b 0
8da2915d 2500};;
a334319f
UD
2501{ .mfi
2502 nop.m 999
2503(p6) fcmp.gt.unc.s1 p6, p0 = FR_r, FR_Neg_Two_to_M3
2504 nop.i 999 ;;
2505}
2506{ .mib
2507 nop.m 999
2508 nop.i 999
2509(p6) br.cond.spnt L(SINCOSL_SMALL_R) ;;
2510}
2511{ .mib
2512 nop.m 999
2513 nop.i 999
2514(p0) br.cond.sptk L(SINCOSL_NORMAL_R) ;;
2515}
2516.endp __libm_callout
2517ASM_SIZE_DIRECTIVE(__libm_callout)
8da2915d
UD
2518.type __libm_pi_by_2_reduce#,@function
2519.global __libm_pi_by_2_reduce#