]>
Commit | Line | Data |
---|---|---|
8da2915d UD |
1 | .file "sincosl.s" |
2 | ||
a334319f | 3 | // Copyright (C) 2000, 2001, Intel Corporation |
8da2915d | 4 | // All rights reserved. |
a334319f UD |
5 | // |
6 | // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, | |
7 | // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. | |
aeb25823 AJ |
8 | // |
9 | // Redistribution and use in source and binary forms, with or without | |
10 | // modification, are permitted provided that the following conditions are | |
11 | // met: | |
12 | // | |
13 | // * Redistributions of source code must retain the above copyright | |
14 | // notice, this list of conditions and the following disclaimer. | |
15 | // | |
16 | // * Redistributions in binary form must reproduce the above copyright | |
17 | // notice, this list of conditions and the following disclaimer in the | |
18 | // documentation and/or other materials provided with the distribution. | |
19 | // | |
20 | // * The name of Intel Corporation may not be used to endorse or promote | |
21 | // products derived from this software without specific prior written | |
22 | // permission. | |
a334319f UD |
23 | // |
24 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
25 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
8da2915d | 26 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
a334319f | 27 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS |
8da2915d | 28 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
a334319f UD |
29 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
30 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
31 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
8da2915d | 32 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING |
a334319f UD |
33 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
34 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
35 | // | |
8da2915d | 36 | // Intel Corporation is the author of this code, and requests that all |
a334319f UD |
37 | // problem reports or change requests be submitted to it directly at |
38 | // http://developer.intel.com/opensource. | |
8da2915d | 39 | // |
a334319f | 40 | // ********************************************************************* |
8da2915d | 41 | // |
a334319f UD |
42 | // History: |
43 | // 2/02/2000 (hand-optimized) | |
44 | // 4/04/00 Unwind support added | |
8da2915d | 45 | // |
a334319f | 46 | // ********************************************************************* |
8da2915d UD |
47 | // |
48 | // Function: Combined sinl(x) and cosl(x), where | |
49 | // | |
50 | // sinl(x) = sine(x), for double-extended precision x values | |
51 | // cosl(x) = cosine(x), for double-extended precision x values | |
52 | // | |
a334319f | 53 | // ********************************************************************* |
8da2915d UD |
54 | // |
55 | // Resources Used: | |
56 | // | |
a334319f | 57 | // Floating-Point Registers: f8 (Input and Return Value) |
8da2915d UD |
58 | // f32-f99 |
59 | // | |
60 | // General Purpose Registers: | |
a334319f UD |
61 | // r32-r43 |
62 | // r44-r45 (Used to pass arguments to pi_by_2 reduce routine) | |
8da2915d UD |
63 | // |
64 | // Predicate Registers: p6-p13 | |
65 | // | |
a334319f | 66 | // ********************************************************************* |
8da2915d UD |
67 | // |
68 | // IEEE Special Conditions: | |
69 | // | |
70 | // Denormal fault raised on denormal inputs | |
71 | // Overflow exceptions do not occur | |
a334319f | 72 | // Underflow exceptions raised when appropriate for sin |
8da2915d UD |
73 | // (No specialized error handling for this routine) |
74 | // Inexact raised when appropriate by algorithm | |
75 | // | |
76 | // sinl(SNaN) = QNaN | |
77 | // sinl(QNaN) = QNaN | |
a334319f | 78 | // sinl(inf) = QNaN |
8da2915d | 79 | // sinl(+/-0) = +/-0 |
a334319f | 80 | // cosl(inf) = QNaN |
8da2915d UD |
81 | // cosl(SNaN) = QNaN |
82 | // cosl(QNaN) = QNaN | |
83 | // cosl(0) = 1 | |
a334319f UD |
84 | // |
85 | // ********************************************************************* | |
8da2915d UD |
86 | // |
87 | // Mathematical Description | |
88 | // ======================== | |
89 | // | |
a334319f UD |
90 | // The computation of FSIN and FCOS is best handled in one piece of |
91 | // code. The main reason is that given any argument Arg, computation | |
92 | // of trigonometric functions first calculate N and an approximation | |
8da2915d UD |
93 | // to alpha where |
94 | // | |
95 | // Arg = N pi/2 + alpha, |alpha| <= pi/4. | |
96 | // | |
97 | // Since | |
98 | // | |
99 | // cosl( Arg ) = sinl( (N+1) pi/2 + alpha ), | |
100 | // | |
a334319f UD |
101 | // therefore, the code for computing sine will produce cosine as long |
102 | // as 1 is added to N immediately after the argument reduction | |
8da2915d UD |
103 | // process. |
104 | // | |
105 | // Let M = N if sine | |
a334319f | 106 | // N+1 if cosine. |
8da2915d UD |
107 | // |
108 | // Now, given | |
109 | // | |
110 | // Arg = M pi/2 + alpha, |alpha| <= pi/4, | |
111 | // | |
a334319f | 112 | // let I = M mod 4, or I be the two lsb of M when M is represented |
8da2915d UD |
113 | // as 2's complement. I = [i_0 i_1]. Then |
114 | // | |
a334319f | 115 | // sinl( Arg ) = (-1)^i_0 sinl( alpha ) if i_1 = 0, |
8da2915d UD |
116 | // = (-1)^i_0 cosl( alpha ) if i_1 = 1. |
117 | // | |
118 | // For example: | |
a334319f | 119 | // if M = -1, I = 11 |
8da2915d | 120 | // sin ((-pi/2 + alpha) = (-1) cos (alpha) |
a334319f | 121 | // if M = 0, I = 00 |
8da2915d | 122 | // sin (alpha) = sin (alpha) |
a334319f | 123 | // if M = 1, I = 01 |
8da2915d | 124 | // sin (pi/2 + alpha) = cos (alpha) |
a334319f | 125 | // if M = 2, I = 10 |
8da2915d | 126 | // sin (pi + alpha) = (-1) sin (alpha) |
a334319f | 127 | // if M = 3, I = 11 |
8da2915d UD |
128 | // sin ((3/2)pi + alpha) = (-1) cos (alpha) |
129 | // | |
a334319f | 130 | // The value of alpha is obtained by argument reduction and |
8da2915d UD |
131 | // represented by two working precision numbers r and c where |
132 | // | |
133 | // alpha = r + c accurately. | |
134 | // | |
135 | // The reduction method is described in a previous write up. | |
a334319f UD |
136 | // The argument reduction scheme identifies 4 cases. For Cases 2 |
137 | // and 4, because |alpha| is small, sinl(r+c) and cosl(r+c) can be | |
138 | // computed very easily by 2 or 3 terms of the Taylor series | |
8da2915d UD |
139 | // expansion as follows: |
140 | // | |
141 | // Case 2: | |
142 | // ------- | |
143 | // | |
a334319f UD |
144 | // sinl(r + c) = r + c - r^3/6 accurately |
145 | // cosl(r + c) = 1 - 2^(-67) accurately | |
8da2915d UD |
146 | // |
147 | // Case 4: | |
148 | // ------- | |
149 | // | |
a334319f UD |
150 | // sinl(r + c) = r + c - r^3/6 + r^5/120 accurately |
151 | // cosl(r + c) = 1 - r^2/2 + r^4/24 accurately | |
8da2915d | 152 | // |
a334319f UD |
153 | // The only cases left are Cases 1 and 3 of the argument reduction |
154 | // procedure. These two cases will be merged since after the | |
155 | // argument is reduced in either cases, we have the reduced argument | |
156 | // represented as r + c and that the magnitude |r + c| is not small | |
8da2915d UD |
157 | // enough to allow the usage of a very short approximation. |
158 | // | |
159 | // The required calculation is either | |
160 | // | |
161 | // sinl(r + c) = sinl(r) + correction, or | |
162 | // cosl(r + c) = cosl(r) + correction. | |
163 | // | |
164 | // Specifically, | |
165 | // | |
a334319f UD |
166 | // sinl(r + c) = sinl(r) + c sin'(r) + O(c^2) |
167 | // = sinl(r) + c cos (r) + O(c^2) | |
168 | // = sinl(r) + c(1 - r^2/2) accurately. | |
8da2915d UD |
169 | // Similarly, |
170 | // | |
a334319f UD |
171 | // cosl(r + c) = cosl(r) - c sinl(r) + O(c^2) |
172 | // = cosl(r) - c(r - r^3/6) accurately. | |
8da2915d | 173 | // |
a334319f | 174 | // We therefore concentrate on accurately calculating sinl(r) and |
8da2915d UD |
175 | // cosl(r) for a working-precision number r, |r| <= pi/4 to within |
176 | // 0.1% or so. | |
177 | // | |
a334319f | 178 | // The greatest challenge of this task is that the second terms of |
8da2915d | 179 | // the Taylor series |
a334319f UD |
180 | // |
181 | // r - r^3/3! + r^r/5! - ... | |
8da2915d UD |
182 | // |
183 | // and | |
184 | // | |
a334319f | 185 | // 1 - r^2/2! + r^4/4! - ... |
8da2915d | 186 | // |
a334319f UD |
187 | // are not very small when |r| is close to pi/4 and the rounding |
188 | // errors will be a concern if simple polynomial accumulation is | |
189 | // used. When |r| < 2^-3, however, the second terms will be small | |
190 | // enough (6 bits or so of right shift) that a normal Horner | |
191 | // recurrence suffices. Hence there are two cases that we consider | |
8da2915d UD |
192 | // in the accurate computation of sinl(r) and cosl(r), |r| <= pi/4. |
193 | // | |
194 | // Case small_r: |r| < 2^(-3) | |
195 | // -------------------------- | |
196 | // | |
197 | // Since Arg = M pi/4 + r + c accurately, and M mod 4 is [i_0 i_1], | |
198 | // we have | |
199 | // | |
a334319f UD |
200 | // sinl(Arg) = (-1)^i_0 * sinl(r + c) if i_1 = 0 |
201 | // = (-1)^i_0 * cosl(r + c) if i_1 = 1 | |
8da2915d UD |
202 | // |
203 | // can be accurately approximated by | |
204 | // | |
a334319f | 205 | // sinl(Arg) = (-1)^i_0 * [sinl(r) + c] if i_1 = 0 |
8da2915d UD |
206 | // = (-1)^i_0 * [cosl(r) - c*r] if i_1 = 1 |
207 | // | |
a334319f | 208 | // because |r| is small and thus the second terms in the correction |
8da2915d UD |
209 | // are unneccessary. |
210 | // | |
a334319f | 211 | // Finally, sinl(r) and cosl(r) are approximated by polynomials of |
8da2915d UD |
212 | // moderate lengths. |
213 | // | |
214 | // sinl(r) = r + S_1 r^3 + S_2 r^5 + ... + S_5 r^11 | |
215 | // cosl(r) = 1 + C_1 r^2 + C_2 r^4 + ... + C_5 r^10 | |
216 | // | |
a334319f UD |
217 | // We can make use of predicates to selectively calculate |
218 | // sinl(r) or cosl(r) based on i_1. | |
8da2915d UD |
219 | // |
220 | // Case normal_r: 2^(-3) <= |r| <= pi/4 | |
221 | // ------------------------------------ | |
222 | // | |
223 | // This case is more likely than the previous one if one considers | |
224 | // r to be uniformly distributed in [-pi/4 pi/4]. Again, | |
a334319f UD |
225 | // |
226 | // sinl(Arg) = (-1)^i_0 * sinl(r + c) if i_1 = 0 | |
227 | // = (-1)^i_0 * cosl(r + c) if i_1 = 1. | |
8da2915d | 228 | // |
a334319f | 229 | // Because |r| is now larger, we need one extra term in the |
8da2915d UD |
230 | // correction. sinl(Arg) can be accurately approximated by |
231 | // | |
232 | // sinl(Arg) = (-1)^i_0 * [sinl(r) + c(1-r^2/2)] if i_1 = 0 | |
233 | // = (-1)^i_0 * [cosl(r) - c*r*(1 - r^2/6)] i_1 = 1. | |
234 | // | |
a334319f | 235 | // Finally, sinl(r) and cosl(r) are approximated by polynomials of |
8da2915d UD |
236 | // moderate lengths. |
237 | // | |
a334319f UD |
238 | // sinl(r) = r + PP_1_hi r^3 + PP_1_lo r^3 + |
239 | // PP_2 r^5 + ... + PP_8 r^17 | |
8da2915d | 240 | // |
a334319f | 241 | // cosl(r) = 1 + QQ_1 r^2 + QQ_2 r^4 + ... + QQ_8 r^16 |
8da2915d | 242 | // |
a334319f UD |
243 | // where PP_1_hi is only about 16 bits long and QQ_1 is -1/2. |
244 | // The crux in accurate computation is to calculate | |
8da2915d UD |
245 | // |
246 | // r + PP_1_hi r^3 or 1 + QQ_1 r^2 | |
247 | // | |
a334319f UD |
248 | // accurately as two pieces: U_hi and U_lo. The way to achieve this |
249 | // is to obtain r_hi as a 10 sig. bit number that approximates r to | |
8da2915d UD |
250 | // roughly 8 bits or so of accuracy. (One convenient way is |
251 | // | |
252 | // r_hi := frcpa( frcpa( r ) ).) | |
253 | // | |
254 | // This way, | |
255 | // | |
a334319f UD |
256 | // r + PP_1_hi r^3 = r + PP_1_hi r_hi^3 + |
257 | // PP_1_hi (r^3 - r_hi^3) | |
258 | // = [r + PP_1_hi r_hi^3] + | |
259 | // [PP_1_hi (r - r_hi) | |
260 | // (r^2 + r_hi r + r_hi^2) ] | |
261 | // = U_hi + U_lo | |
8da2915d UD |
262 | // |
263 | // Since r_hi is only 10 bit long and PP_1_hi is only 16 bit long, | |
a334319f UD |
264 | // PP_1_hi * r_hi^3 is only at most 46 bit long and thus computed |
265 | // exactly. Furthermore, r and PP_1_hi r_hi^3 are of opposite sign | |
266 | // and that there is no more than 8 bit shift off between r and | |
267 | // PP_1_hi * r_hi^3. Hence the sum, U_hi, is representable and thus | |
268 | // calculated without any error. Finally, the fact that | |
8da2915d | 269 | // |
a334319f | 270 | // |U_lo| <= 2^(-8) |U_hi| |
8da2915d | 271 | // |
a334319f | 272 | // says that U_hi + U_lo is approximating r + PP_1_hi r^3 to roughly |
8da2915d UD |
273 | // 8 extra bits of accuracy. |
274 | // | |
275 | // Similarly, | |
276 | // | |
a334319f UD |
277 | // 1 + QQ_1 r^2 = [1 + QQ_1 r_hi^2] + |
278 | // [QQ_1 (r - r_hi)(r + r_hi)] | |
279 | // = U_hi + U_lo. | |
280 | // | |
281 | // Summarizing, we calculate r_hi = frcpa( frcpa( r ) ). | |
8da2915d UD |
282 | // |
283 | // If i_1 = 0, then | |
284 | // | |
285 | // U_hi := r + PP_1_hi * r_hi^3 | |
286 | // U_lo := PP_1_hi * (r - r_hi) * (r^2 + r*r_hi + r_hi^2) | |
287 | // poly := PP_1_lo r^3 + PP_2 r^5 + ... + PP_8 r^17 | |
288 | // correction := c * ( 1 + C_1 r^2 ) | |
289 | // | |
290 | // Else ...i_1 = 1 | |
291 | // | |
292 | // U_hi := 1 + QQ_1 * r_hi * r_hi | |
293 | // U_lo := QQ_1 * (r - r_hi) * (r + r_hi) | |
294 | // poly := QQ_2 * r^4 + QQ_3 * r^6 + ... + QQ_8 r^16 | |
295 | // correction := -c * r * (1 + S_1 * r^2) | |
296 | // | |
297 | // End | |
298 | // | |
299 | // Finally, | |
a334319f UD |
300 | // |
301 | // V := poly + ( U_lo + correction ) | |
8da2915d UD |
302 | // |
303 | // / U_hi + V if i_0 = 0 | |
a334319f | 304 | // result := | |
8da2915d UD |
305 | // \ (-U_hi) - V if i_0 = 1 |
306 | // | |
a334319f UD |
307 | // It is important that in the last step, negation of U_hi is |
308 | // performed prior to the subtraction which is to be performed in | |
309 | // the user-set rounding mode. | |
8da2915d UD |
310 | // |
311 | // | |
312 | // Algorithmic Description | |
313 | // ======================= | |
314 | // | |
a334319f UD |
315 | // The argument reduction algorithm is tightly integrated into FSIN |
316 | // and FCOS which share the same code. The following is complete and | |
317 | // self-contained. The argument reduction description given | |
8da2915d UD |
318 | // previously is repeated below. |
319 | // | |
320 | // | |
a334319f | 321 | // Step 0. Initialization. |
8da2915d UD |
322 | // |
323 | // If FSIN is invoked, set N_inc := 0; else if FCOS is invoked, | |
324 | // set N_inc := 1. | |
325 | // | |
326 | // Step 1. Check for exceptional and special cases. | |
327 | // | |
a334319f | 328 | // * If Arg is +-0, +-inf, NaN, NaT, go to Step 10 for special |
8da2915d UD |
329 | // handling. |
330 | // * If |Arg| < 2^24, go to Step 2 for reduction of moderate | |
331 | // arguments. This is the most likely case. | |
332 | // * If |Arg| < 2^63, go to Step 8 for pre-reduction of large | |
333 | // arguments. | |
334 | // * If |Arg| >= 2^63, go to Step 10 for special handling. | |
335 | // | |
336 | // Step 2. Reduction of moderate arguments. | |
337 | // | |
a334319f UD |
338 | // If |Arg| < pi/4 ...quick branch |
339 | // N_fix := N_inc (integer) | |
8da2915d UD |
340 | // r := Arg |
341 | // c := 0.0 | |
342 | // Branch to Step 4, Case_1_complete | |
a334319f UD |
343 | // Else ...cf. argument reduction |
344 | // N := Arg * two_by_PI (fp) | |
345 | // N_fix := fcvt.fx( N ) (int) | |
8da2915d UD |
346 | // N := fcvt.xf( N_fix ) |
347 | // N_fix := N_fix + N_inc | |
a334319f UD |
348 | // s := Arg - N * P_1 (first piece of pi/2) |
349 | // w := -N * P_2 (second piece of pi/2) | |
8da2915d UD |
350 | // |
351 | // If |s| >= 2^(-33) | |
352 | // go to Step 3, Case_1_reduce | |
353 | // Else | |
354 | // go to Step 7, Case_2_reduce | |
355 | // Endif | |
356 | // Endif | |
357 | // | |
358 | // Step 3. Case_1_reduce. | |
359 | // | |
360 | // r := s + w | |
a334319f UD |
361 | // c := (s - r) + w ...observe order |
362 | // | |
8da2915d UD |
363 | // Step 4. Case_1_complete |
364 | // | |
365 | // ...At this point, the reduced argument alpha is | |
366 | // ...accurately represented as r + c. | |
367 | // If |r| < 2^(-3), go to Step 6, small_r. | |
368 | // | |
369 | // Step 5. Normal_r. | |
370 | // | |
371 | // Let [i_0 i_1] by the 2 lsb of N_fix. | |
372 | // FR_rsq := r * r | |
373 | // r_hi := frcpa( frcpa( r ) ) | |
374 | // r_lo := r - r_hi | |
375 | // | |
376 | // If i_1 = 0, then | |
377 | // poly := r*FR_rsq*(PP_1_lo + FR_rsq*(PP_2 + ... FR_rsq*PP_8)) | |
a334319f | 378 | // U_hi := r + PP_1_hi*r_hi*r_hi*r_hi ...any order |
8da2915d | 379 | // U_lo := PP_1_hi*r_lo*(r*r + r*r_hi + r_hi*r_hi) |
a334319f | 380 | // correction := c + c*C_1*FR_rsq ...any order |
8da2915d UD |
381 | // Else |
382 | // poly := FR_rsq*FR_rsq*(QQ_2 + FR_rsq*(QQ_3 + ... + FR_rsq*QQ_8)) | |
a334319f | 383 | // U_hi := 1 + QQ_1 * r_hi * r_hi ...any order |
8da2915d | 384 | // U_lo := QQ_1 * r_lo * (r + r_hi) |
a334319f | 385 | // correction := -c*(r + S_1*FR_rsq*r) ...any order |
8da2915d UD |
386 | // Endif |
387 | // | |
a334319f | 388 | // V := poly + (U_lo + correction) ...observe order |
8da2915d UD |
389 | // |
390 | // result := (i_0 == 0? 1.0 : -1.0) | |
391 | // | |
392 | // Last instruction in user-set rounding mode | |
393 | // | |
394 | // result := (i_0 == 0? result*U_hi + V : | |
395 | // result*U_hi - V) | |
396 | // | |
397 | // Return | |
398 | // | |
399 | // Step 6. Small_r. | |
a334319f | 400 | // |
8da2915d UD |
401 | // ...Use flush to zero mode without causing exception |
402 | // Let [i_0 i_1] be the two lsb of N_fix. | |
403 | // | |
404 | // FR_rsq := r * r | |
405 | // | |
406 | // If i_1 = 0 then | |
407 | // z := FR_rsq*FR_rsq; z := FR_rsq*z *r | |
408 | // poly_lo := S_3 + FR_rsq*(S_4 + FR_rsq*S_5) | |
409 | // poly_hi := r*FR_rsq*(S_1 + FR_rsq*S_2) | |
410 | // correction := c | |
411 | // result := r | |
412 | // Else | |
413 | // z := FR_rsq*FR_rsq; z := FR_rsq*z | |
414 | // poly_lo := C_3 + FR_rsq*(C_4 + FR_rsq*C_5) | |
a334319f | 415 | // poly_hi := FR_rsq*(C_1 + FR_rsq*C_2) |
8da2915d UD |
416 | // correction := -c*r |
417 | // result := 1 | |
418 | // Endif | |
419 | // | |
420 | // poly := poly_hi + (z * poly_lo + correction) | |
421 | // | |
422 | // If i_0 = 1, result := -result | |
423 | // | |
424 | // Last operation. Perform in user-set rounding mode | |
425 | // | |
426 | // result := (i_0 == 0? result + poly : | |
427 | // result - poly ) | |
428 | // Return | |
429 | // | |
430 | // Step 7. Case_2_reduce. | |
431 | // | |
a334319f | 432 | // ...Refer to the write up for argument reduction for |
8da2915d UD |
433 | // ...rationale. The reduction algorithm below is taken from |
434 | // ...argument reduction description and integrated this. | |
435 | // | |
436 | // w := N*P_3 | |
a334319f UD |
437 | // U_1 := N*P_2 + w ...FMA |
438 | // U_2 := (N*P_2 - U_1) + w ...2 FMA | |
8da2915d | 439 | // ...U_1 + U_2 is N*(P_2+P_3) accurately |
a334319f | 440 | // |
8da2915d UD |
441 | // r := s - U_1 |
442 | // c := ( (s - r) - U_1 ) - U_2 | |
443 | // | |
444 | // ...The mathematical sum r + c approximates the reduced | |
445 | // ...argument accurately. Note that although compared to | |
446 | // ...Case 1, this case requires much more work to reduce | |
447 | // ...the argument, the subsequent calculation needed for | |
448 | // ...any of the trigonometric function is very little because | |
a334319f | 449 | // ...|alpha| < 1.01*2^(-33) and thus two terms of the |
8da2915d UD |
450 | // ...Taylor series expansion suffices. |
451 | // | |
452 | // If i_1 = 0 then | |
a334319f | 453 | // poly := c + S_1 * r * r * r ...any order |
8da2915d UD |
454 | // result := r |
455 | // Else | |
456 | // poly := -2^(-67) | |
457 | // result := 1.0 | |
458 | // Endif | |
a334319f | 459 | // |
8da2915d UD |
460 | // If i_0 = 1, result := -result |
461 | // | |
462 | // Last operation. Perform in user-set rounding mode | |
463 | // | |
464 | // result := (i_0 == 0? result + poly : | |
465 | // result - poly ) | |
a334319f | 466 | // |
8da2915d UD |
467 | // Return |
468 | // | |
a334319f | 469 | // |
8da2915d | 470 | // Step 8. Pre-reduction of large arguments. |
a334319f | 471 | // |
8da2915d UD |
472 | // ...Again, the following reduction procedure was described |
473 | // ...in the separate write up for argument reduction, which | |
474 | // ...is tightly integrated here. | |
475 | ||
476 | // N_0 := Arg * Inv_P_0 | |
477 | // N_0_fix := fcvt.fx( N_0 ) | |
478 | // N_0 := fcvt.xf( N_0_fix) | |
a334319f | 479 | |
8da2915d UD |
480 | // Arg' := Arg - N_0 * P_0 |
481 | // w := N_0 * d_1 | |
482 | // N := Arg' * two_by_PI | |
483 | // N_fix := fcvt.fx( N ) | |
484 | // N := fcvt.xf( N_fix ) | |
a334319f | 485 | // N_fix := N_fix + N_inc |
8da2915d UD |
486 | // |
487 | // s := Arg' - N * P_1 | |
488 | // w := w - N * P_2 | |
489 | // | |
490 | // If |s| >= 2^(-14) | |
491 | // go to Step 3 | |
492 | // Else | |
493 | // go to Step 9 | |
494 | // Endif | |
495 | // | |
496 | // Step 9. Case_4_reduce. | |
a334319f | 497 | // |
8da2915d | 498 | // ...first obtain N_0*d_1 and -N*P_2 accurately |
a334319f UD |
499 | // U_hi := N_0 * d_1 V_hi := -N*P_2 |
500 | // U_lo := N_0 * d_1 - U_hi V_lo := -N*P_2 - U_hi ...FMAs | |
8da2915d UD |
501 | // |
502 | // ...compute the contribution from N_0*d_1 and -N*P_3 | |
503 | // w := -N*P_3 | |
504 | // w := w + N_0*d_2 | |
a334319f | 505 | // t := U_lo + V_lo + w ...any order |
8da2915d UD |
506 | // |
507 | // ...at this point, the mathematical value | |
508 | // ...s + U_hi + V_hi + t approximates the true reduced argument | |
509 | // ...accurately. Just need to compute this accurately. | |
510 | // | |
511 | // ...Calculate U_hi + V_hi accurately: | |
512 | // A := U_hi + V_hi | |
513 | // if |U_hi| >= |V_hi| then | |
514 | // a := (U_hi - A) + V_hi | |
515 | // else | |
516 | // a := (V_hi - A) + U_hi | |
517 | // endif | |
518 | // ...order in computing "a" must be observed. This branch is | |
519 | // ...best implemented by predicates. | |
a334319f | 520 | // ...A + a is U_hi + V_hi accurately. Moreover, "a" is |
8da2915d UD |
521 | // ...much smaller than A: |a| <= (1/2)ulp(A). |
522 | // | |
523 | // ...Just need to calculate s + A + a + t | |
a334319f UD |
524 | // C_hi := s + A t := t + a |
525 | // C_lo := (s - C_hi) + A | |
8da2915d UD |
526 | // C_lo := C_lo + t |
527 | // | |
528 | // ...Final steps for reduction | |
529 | // r := C_hi + C_lo | |
530 | // c := (C_hi - r) + C_lo | |
531 | // | |
532 | // ...At this point, we have r and c | |
533 | // ...And all we need is a couple of terms of the corresponding | |
534 | // ...Taylor series. | |
535 | // | |
536 | // If i_1 = 0 | |
537 | // poly := c + r*FR_rsq*(S_1 + FR_rsq*S_2) | |
538 | // result := r | |
539 | // Else | |
540 | // poly := FR_rsq*(C_1 + FR_rsq*C_2) | |
541 | // result := 1 | |
542 | // Endif | |
543 | // | |
544 | // If i_0 = 1, result := -result | |
545 | // | |
546 | // Last operation. Perform in user-set rounding mode | |
547 | // | |
548 | // result := (i_0 == 0? result + poly : | |
549 | // result - poly ) | |
550 | // Return | |
a334319f | 551 | // |
8da2915d UD |
552 | // Large Arguments: For arguments above 2**63, a Payne-Hanek |
553 | // style argument reduction is used and pi_by_2 reduce is called. | |
554 | // | |
555 | ||
a334319f UD |
556 | #include "libm_support.h" |
557 | ||
558 | #ifdef _LIBC | |
559 | .rodata | |
560 | #else | |
561 | .data | |
562 | #endif | |
563 | .align 64 | |
564 | ||
565 | FSINCOSL_CONSTANTS: | |
566 | ASM_TYPE_DIRECTIVE(FSINCOSL_CONSTANTS,@object) | |
567 | data4 0x4B800000, 0xCB800000, 0x00000000,0x00000000 // two**24, -two**24 | |
568 | data4 0x4E44152A, 0xA2F9836E, 0x00003FFE,0x00000000 // Inv_pi_by_2 | |
569 | data4 0xCE81B9F1, 0xC84D32B0, 0x00004016,0x00000000 // P_0 | |
570 | data4 0x2168C235, 0xC90FDAA2, 0x00003FFF,0x00000000 // P_1 | |
571 | data4 0xFC8F8CBB, 0xECE675D1, 0x0000BFBD,0x00000000 // P_2 | |
572 | data4 0xACC19C60, 0xB7ED8FBB, 0x0000BF7C,0x00000000 // P_3 | |
573 | data4 0x5F000000, 0xDF000000, 0x00000000,0x00000000 // two_to_63, -two_to_63 | |
574 | data4 0x6EC6B45A, 0xA397E504, 0x00003FE7,0x00000000 // Inv_P_0 | |
575 | data4 0xDBD171A1, 0x8D848E89, 0x0000BFBF,0x00000000 // d_1 | |
576 | data4 0x18A66F8E, 0xD5394C36, 0x0000BF7C,0x00000000 // d_2 | |
577 | data4 0x2168C234, 0xC90FDAA2, 0x00003FFE,0x00000000 // pi_by_4 | |
578 | data4 0x2168C234, 0xC90FDAA2, 0x0000BFFE,0x00000000 // neg_pi_by_4 | |
579 | data4 0x3E000000, 0xBE000000, 0x00000000,0x00000000 // two**-3, -two**-3 | |
580 | data4 0x2F000000, 0xAF000000, 0x9E000000,0x00000000 // two**-33, -two**-33, -two**-67 | |
581 | data4 0xA21C0BC9, 0xCC8ABEBC, 0x00003FCE,0x00000000 // PP_8 | |
582 | data4 0x720221DA, 0xD7468A05, 0x0000BFD6,0x00000000 // PP_7 | |
583 | data4 0x640AD517, 0xB092382F, 0x00003FDE,0x00000000 // PP_6 | |
584 | data4 0xD1EB75A4, 0xD7322B47, 0x0000BFE5,0x00000000 // PP_5 | |
585 | data4 0xFFFFFFFE, 0xFFFFFFFF, 0x0000BFFD,0x00000000 // C_1 | |
586 | data4 0x00000000, 0xAAAA0000, 0x0000BFFC,0x00000000 // PP_1_hi | |
587 | data4 0xBAF69EEA, 0xB8EF1D2A, 0x00003FEC,0x00000000 // PP_4 | |
588 | data4 0x0D03BB69, 0xD00D00D0, 0x0000BFF2,0x00000000 // PP_3 | |
589 | data4 0x88888962, 0x88888888, 0x00003FF8,0x00000000 // PP_2 | |
590 | data4 0xAAAB0000, 0xAAAAAAAA, 0x0000BFEC,0x00000000 // PP_1_lo | |
591 | data4 0xC2B0FE52, 0xD56232EF, 0x00003FD2,0x00000000 // QQ_8 | |
592 | data4 0x2B48DCA6, 0xC9C99ABA, 0x0000BFDA,0x00000000 // QQ_7 | |
593 | data4 0x9C716658, 0x8F76C650, 0x00003FE2,0x00000000 // QQ_6 | |
594 | data4 0xFDA8D0FC, 0x93F27DBA, 0x0000BFE9,0x00000000 // QQ_5 | |
595 | data4 0xAAAAAAAA, 0xAAAAAAAA, 0x0000BFFC,0x00000000 // S_1 | |
596 | data4 0x00000000, 0x80000000, 0x0000BFFE,0x00000000 // QQ_1 | |
597 | data4 0x0C6E5041, 0xD00D00D0, 0x00003FEF,0x00000000 // QQ_4 | |
598 | data4 0x0B607F60, 0xB60B60B6, 0x0000BFF5,0x00000000 // QQ_3 | |
599 | data4 0xAAAAAA9B, 0xAAAAAAAA, 0x00003FFA,0x00000000 // QQ_2 | |
600 | data4 0xFFFFFFFE, 0xFFFFFFFF, 0x0000BFFD,0x00000000 // C_1 | |
601 | data4 0xAAAA719F, 0xAAAAAAAA, 0x00003FFA,0x00000000 // C_2 | |
602 | data4 0x0356F994, 0xB60B60B6, 0x0000BFF5,0x00000000 // C_3 | |
603 | data4 0xB2385EA9, 0xD00CFFD5, 0x00003FEF,0x00000000 // C_4 | |
604 | data4 0x292A14CD, 0x93E4BD18, 0x0000BFE9,0x00000000 // C_5 | |
605 | data4 0xAAAAAAAA, 0xAAAAAAAA, 0x0000BFFC,0x00000000 // S_1 | |
606 | data4 0x888868DB, 0x88888888, 0x00003FF8,0x00000000 // S_2 | |
607 | data4 0x055EFD4B, 0xD00D00D0, 0x0000BFF2,0x00000000 // S_3 | |
608 | data4 0x839730B9, 0xB8EF1C5D, 0x00003FEC,0x00000000 // S_4 | |
609 | data4 0xE5B3F492, 0xD71EA3A4, 0x0000BFE5,0x00000000 // S_5 | |
610 | data4 0x38800000, 0xB8800000, 0x00000000 // two**-14, -two**-14 | |
611 | ASM_SIZE_DIRECTIVE(FSINCOSL_CONSTANTS) | |
612 | ||
613 | FR_Input_X = f8 | |
614 | FR_Neg_Two_to_M3 = f32 | |
615 | FR_Two_to_63 = f32 | |
616 | FR_Two_to_24 = f33 | |
617 | FR_Pi_by_4 = f33 | |
618 | FR_Two_to_M14 = f34 | |
619 | FR_Two_to_M33 = f35 | |
620 | FR_Neg_Two_to_24 = f36 | |
621 | FR_Neg_Pi_by_4 = f36 | |
622 | FR_Neg_Two_to_M14 = f37 | |
623 | FR_Neg_Two_to_M33 = f38 | |
624 | FR_Neg_Two_to_M67 = f39 | |
625 | FR_Inv_pi_by_2 = f40 | |
626 | FR_N_float = f41 | |
627 | FR_N_fix = f42 | |
628 | FR_P_1 = f43 | |
629 | FR_P_2 = f44 | |
630 | FR_P_3 = f45 | |
631 | FR_s = f46 | |
632 | FR_w = f47 | |
633 | FR_c = f48 | |
634 | FR_r = f49 | |
635 | FR_Z = f50 | |
636 | FR_A = f51 | |
637 | FR_a = f52 | |
638 | FR_t = f53 | |
639 | FR_U_1 = f54 | |
640 | FR_U_2 = f55 | |
641 | FR_C_1 = f56 | |
642 | FR_C_2 = f57 | |
643 | FR_C_3 = f58 | |
644 | FR_C_4 = f59 | |
645 | FR_C_5 = f60 | |
646 | FR_S_1 = f61 | |
647 | FR_S_2 = f62 | |
648 | FR_S_3 = f63 | |
649 | FR_S_4 = f64 | |
650 | FR_S_5 = f65 | |
651 | FR_poly_hi = f66 | |
652 | FR_poly_lo = f67 | |
653 | FR_r_hi = f68 | |
654 | FR_r_lo = f69 | |
655 | FR_rsq = f70 | |
656 | FR_r_cubed = f71 | |
657 | FR_C_hi = f72 | |
658 | FR_N_0 = f73 | |
659 | FR_d_1 = f74 | |
660 | FR_V = f75 | |
661 | FR_V_hi = f75 | |
662 | FR_V_lo = f76 | |
663 | FR_U_hi = f77 | |
664 | FR_U_lo = f78 | |
665 | FR_U_hiabs = f79 | |
666 | FR_V_hiabs = f80 | |
667 | FR_PP_8 = f81 | |
668 | FR_QQ_8 = f81 | |
669 | FR_PP_7 = f82 | |
670 | FR_QQ_7 = f82 | |
671 | FR_PP_6 = f83 | |
672 | FR_QQ_6 = f83 | |
673 | FR_PP_5 = f84 | |
674 | FR_QQ_5 = f84 | |
675 | FR_PP_4 = f85 | |
676 | FR_QQ_4 = f85 | |
677 | FR_PP_3 = f86 | |
678 | FR_QQ_3 = f86 | |
679 | FR_PP_2 = f87 | |
680 | FR_QQ_2 = f87 | |
681 | FR_QQ_1 = f88 | |
682 | FR_N_0_fix = f89 | |
683 | FR_Inv_P_0 = f90 | |
684 | FR_corr = f91 | |
685 | FR_poly = f92 | |
686 | FR_d_2 = f93 | |
687 | FR_Two_to_M3 = f94 | |
688 | FR_Neg_Two_to_63 = f94 | |
689 | FR_P_0 = f95 | |
690 | FR_C_lo = f96 | |
691 | FR_PP_1 = f97 | |
692 | FR_PP_1_lo = f98 | |
693 | FR_ArgPrime = f99 | |
694 | ||
695 | GR_Table_Base = r32 | |
696 | GR_Table_Base1 = r33 | |
697 | GR_i_0 = r34 | |
698 | GR_i_1 = r35 | |
699 | GR_N_Inc = r36 | |
700 | GR_Sin_or_Cos = r37 | |
8da2915d UD |
701 | |
702 | // Added for unwind support | |
703 | ||
704 | GR_SAVE_B0 = r39 | |
705 | GR_SAVE_GP = r40 | |
706 | GR_SAVE_PFS = r41 | |
707 | ||
708 | ||
a334319f UD |
709 | .global sinl# |
710 | .global cosl# | |
711 | #ifdef _LIBC | |
712 | .global __sinl# | |
713 | .global __cosl# | |
714 | #endif | |
0ecb606c | 715 | |
a334319f UD |
716 | .section .text |
717 | .proc sinl# | |
718 | #ifdef _LIBC | |
719 | .proc __sinl# | |
720 | #endif | |
721 | .align 64 | |
722 | sinl: | |
723 | #ifdef _LIBC | |
724 | __sinl: | |
725 | #endif | |
8da2915d | 726 | { .mlx |
a334319f UD |
727 | alloc GR_Table_Base = ar.pfs,0,12,2,0 |
728 | (p0) movl GR_Sin_or_Cos = 0x0 ;; | |
8da2915d | 729 | } |
a334319f UD |
730 | |
731 | { .mmi | |
732 | nop.m 999 | |
733 | (p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp | |
734 | nop.i 999 | |
8da2915d UD |
735 | } |
736 | ;; | |
737 | ||
a334319f UD |
738 | { .mmb |
739 | ld8 GR_Table_Base = [GR_Table_Base] | |
8da2915d | 740 | nop.m 999 |
a334319f | 741 | (p0) br.cond.sptk L(SINCOSL_CONTINUE) ;; |
8da2915d UD |
742 | } |
743 | ;; | |
744 | ||
745 | ||
a334319f UD |
746 | .endp sinl# |
747 | ASM_SIZE_DIRECTIVE(sinl#) | |
748 | ||
749 | .section .text | |
750 | .proc cosl# | |
751 | cosl: | |
752 | #ifdef _LIBC | |
753 | .proc __cosl# | |
754 | __cosl: | |
755 | #endif | |
8da2915d | 756 | { .mlx |
a334319f UD |
757 | alloc GR_Table_Base= ar.pfs,0,12,2,0 |
758 | (p0) movl GR_Sin_or_Cos = 0x1 ;; | |
8da2915d UD |
759 | } |
760 | ;; | |
761 | ||
a334319f | 762 | { .mmi |
8da2915d | 763 | nop.m 999 |
a334319f | 764 | (p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp |
8da2915d UD |
765 | nop.i 999 |
766 | } | |
767 | ;; | |
768 | ||
a334319f UD |
769 | { .mmb |
770 | ld8 GR_Table_Base = [GR_Table_Base] | |
771 | nop.m 999 | |
772 | nop.b 999 | |
8da2915d UD |
773 | } |
774 | ;; | |
775 | ||
776 | ||
a334319f UD |
777 | |
778 | // | |
779 | // Load Table Address | |
780 | // | |
781 | ||
782 | L(SINCOSL_CONTINUE): | |
783 | { .mmi | |
784 | (p0) add GR_Table_Base1 = 96, GR_Table_Base | |
785 | (p0) ldfs FR_Two_to_24 = [GR_Table_Base], 4 | |
786 | // GR_Sin_or_Cos denotes | |
787 | (p0) mov r39 = b0 ;; | |
8da2915d | 788 | } |
a334319f UD |
789 | { .mmi |
790 | nop.m 0 | |
791 | // | |
792 | // Load 2**24, load 2**63. | |
793 | // | |
794 | (p0) ldfs FR_Neg_Two_to_24 = [GR_Table_Base], 12 | |
795 | nop.i 0 | |
8da2915d UD |
796 | } |
797 | { .mfi | |
a334319f UD |
798 | (p0) ldfs FR_Two_to_63 = [GR_Table_Base1], 4 |
799 | // | |
800 | // Check for unnormals - unsupported operands. We do not want | |
801 | // to generate denormal exception | |
802 | // Check for NatVals, QNaNs, SNaNs, +/-Infs | |
803 | // Check for EM unsupporteds | |
804 | // Check for Zero | |
805 | // | |
806 | (p0) fclass.m.unc p6, p0 = FR_Input_X, 0x1E3 | |
807 | nop.i 0 | |
808 | };; | |
809 | { .mmf | |
810 | nop.m 999 | |
811 | (p0) ldfs FR_Neg_Two_to_63 = [GR_Table_Base1], 12 | |
812 | (p0) fclass.nm.unc p8, p0 = FR_Input_X, 0x1FF | |
8da2915d | 813 | } |
a334319f UD |
814 | { .mfb |
815 | nop.m 999 | |
816 | (p0) fclass.m.unc p10, p0 = FR_Input_X, 0x007 | |
817 | (p6) br.cond.spnt L(SINCOSL_SPECIAL) ;; | |
8da2915d | 818 | } |
a334319f UD |
819 | { .mib |
820 | nop.m 999 | |
821 | nop.i 999 | |
822 | (p8) br.cond.spnt L(SINCOSL_SPECIAL) ;; | |
8da2915d | 823 | } |
0ecb606c | 824 | { .mib |
a334319f UD |
825 | nop.m 999 |
826 | nop.i 999 | |
827 | // | |
828 | // Branch if +/- NaN, Inf. | |
829 | // Load -2**24, load -2**63. | |
830 | // | |
831 | (p10) br.cond.spnt L(SINCOSL_ZERO) ;; | |
8da2915d | 832 | } |
a334319f UD |
833 | { .mmb |
834 | (p0) ldfe FR_Inv_pi_by_2 = [GR_Table_Base], 16 | |
835 | (p0) ldfe FR_Inv_P_0 = [GR_Table_Base1], 16 | |
836 | nop.b 999 ;; | |
8da2915d | 837 | } |
a334319f UD |
838 | { .mmb |
839 | (p0) ldfe FR_d_1 = [GR_Table_Base1], 16 | |
840 | // | |
841 | // Raise possible denormal operand flag with useful fcmp | |
842 | // Is x <= -2**63 | |
843 | // Load Inv_P_0 for pre-reduction | |
844 | // Load Inv_pi_by_2 | |
845 | // | |
846 | (p0) ldfe FR_P_0 = [GR_Table_Base], 16 | |
847 | nop.b 999 ;; | |
8da2915d | 848 | } |
a334319f UD |
849 | { .mmb |
850 | (p0) ldfe FR_d_2 = [GR_Table_Base1], 16 | |
851 | // | |
852 | // Load P_0 | |
853 | // Load d_1 | |
854 | // Is x >= 2**63 | |
855 | // Is x <= -2**24? | |
856 | // | |
857 | (p0) ldfe FR_P_1 = [GR_Table_Base], 16 | |
858 | nop.b 999 ;; | |
8da2915d | 859 | } |
a334319f UD |
860 | // |
861 | // Load P_1 | |
862 | // Load d_2 | |
863 | // Is x >= 2**24? | |
864 | // | |
0ecb606c | 865 | { .mfi |
a334319f UD |
866 | (p0) ldfe FR_P_2 = [GR_Table_Base], 16 |
867 | (p0) fcmp.le.unc.s1 p7, p8 = FR_Input_X, FR_Neg_Two_to_24 | |
868 | nop.i 999 ;; | |
8da2915d | 869 | } |
a334319f UD |
870 | { .mbb |
871 | (p0) ldfe FR_P_3 = [GR_Table_Base], 16 | |
872 | nop.b 999 | |
873 | nop.b 999 ;; | |
8da2915d UD |
874 | } |
875 | { .mfi | |
a334319f UD |
876 | nop.m 999 |
877 | (p8) fcmp.ge.s1 p7, p0 = FR_Input_X, FR_Two_to_24 | |
878 | nop.i 999 | |
8da2915d | 879 | } |
0ecb606c | 880 | { .mfi |
a334319f UD |
881 | (p0) ldfe FR_Pi_by_4 = [GR_Table_Base1], 16 |
882 | // | |
883 | // Branch if +/- zero. | |
884 | // Decide about the paths to take: | |
885 | // If -2**24 < FR_Input_X < 2**24 - CASE 1 OR 2 | |
886 | // OTHERWISE - CASE 3 OR 4 | |
887 | // | |
888 | (p0) fcmp.le.unc.s0 p10, p11 = FR_Input_X, FR_Neg_Two_to_63 | |
889 | nop.i 999 ;; | |
8da2915d | 890 | } |
a334319f UD |
891 | { .mmi |
892 | (p0) ldfe FR_Neg_Pi_by_4 = [GR_Table_Base1], 16 ;; | |
893 | (p0) ldfs FR_Two_to_M3 = [GR_Table_Base1], 4 | |
894 | nop.i 999 | |
8da2915d | 895 | } |
0ecb606c | 896 | { .mfi |
a334319f UD |
897 | nop.m 999 |
898 | (p11) fcmp.ge.s1 p10, p0 = FR_Input_X, FR_Two_to_63 | |
899 | nop.i 999 ;; | |
8da2915d UD |
900 | } |
901 | { .mib | |
a334319f UD |
902 | (p0) ldfs FR_Neg_Two_to_M3 = [GR_Table_Base1], 12 |
903 | nop.i 999 | |
904 | // | |
905 | // Load P_2 | |
906 | // Load P_3 | |
907 | // Load pi_by_4 | |
908 | // Load neg_pi_by_4 | |
909 | // Load 2**(-3) | |
910 | // Load -2**(-3). | |
911 | // | |
912 | (p10) br.cond.spnt L(SINCOSL_ARG_TOO_LARGE) ;; | |
0ecb606c | 913 | } |
a334319f UD |
914 | { .mib |
915 | nop.m 999 | |
916 | nop.i 999 | |
917 | // | |
918 | // Branch out if x >= 2**63. Use Payne-Hanek Reduction | |
919 | // | |
920 | (p7) br.cond.spnt L(SINCOSL_LARGER_ARG) ;; | |
8da2915d UD |
921 | } |
922 | { .mfi | |
a334319f UD |
923 | nop.m 999 |
924 | // | |
925 | // Branch if Arg <= -2**24 or Arg >= 2**24 and use pre-reduction. | |
926 | // | |
927 | (p0) fma.s1 FR_N_float = FR_Input_X, FR_Inv_pi_by_2, f0 | |
928 | nop.i 999 ;; | |
8da2915d UD |
929 | } |
930 | { .mfi | |
a334319f UD |
931 | nop.m 999 |
932 | (p0) fcmp.lt.unc.s1 p6, p7 = FR_Input_X, FR_Pi_by_4 | |
933 | nop.i 999 ;; | |
0ecb606c | 934 | } |
a334319f UD |
935 | { .mfi |
936 | nop.m 999 | |
937 | // | |
938 | // Select the case when |Arg| < pi/4 | |
939 | // Else Select the case when |Arg| >= pi/4 | |
940 | // | |
941 | (p0) fcvt.fx.s1 FR_N_fix = FR_N_float | |
942 | nop.i 999 ;; | |
0ecb606c | 943 | } |
a334319f UD |
944 | { .mfi |
945 | nop.m 999 | |
8da2915d UD |
946 | // |
947 | // N = Arg * 2/pi | |
948 | // Check if Arg < pi/4 | |
949 | // | |
a334319f UD |
950 | (p6) fcmp.gt.s1 p6, p7 = FR_Input_X, FR_Neg_Pi_by_4 |
951 | nop.i 999 ;; | |
952 | } | |
8da2915d UD |
953 | // |
954 | // Case 2: Convert integer N_fix back to normalized floating-point value. | |
955 | // Case 1: p8 is only affected when p6 is set | |
956 | // | |
a334319f UD |
957 | { .mfi |
958 | (p7) ldfs FR_Two_to_M33 = [GR_Table_Base1], 4 | |
8da2915d UD |
959 | // |
960 | // Grab the integer part of N and call it N_fix | |
961 | // | |
a334319f UD |
962 | (p6) fmerge.se FR_r = FR_Input_X, FR_Input_X |
963 | // If |x| < pi/4, r = x and c = 0 | |
8da2915d | 964 | // lf |x| < pi/4, is x < 2**(-3). |
a334319f | 965 | // r = Arg |
8da2915d | 966 | // c = 0 |
a334319f | 967 | (p6) mov GR_N_Inc = GR_Sin_or_Cos ;; |
8da2915d | 968 | } |
a334319f UD |
969 | { .mmf |
970 | nop.m 999 | |
971 | (p7) ldfs FR_Neg_Two_to_M33 = [GR_Table_Base1], 4 | |
972 | (p6) fmerge.se FR_c = f0, f0 | |
973 | } | |
974 | { .mfi | |
975 | nop.m 999 | |
976 | (p6) fcmp.lt.unc.s1 p8, p9 = FR_Input_X, FR_Two_to_M3 | |
977 | nop.i 999 ;; | |
978 | } | |
979 | { .mfi | |
980 | nop.m 999 | |
8da2915d UD |
981 | // |
982 | // lf |x| < pi/4, is -2**(-3)< x < 2**(-3) - set p8. | |
a334319f UD |
983 | // If |x| >= pi/4, |
984 | // Create the right N for |x| < pi/4 and otherwise | |
8da2915d UD |
985 | // Case 2: Place integer part of N in GP register |
986 | // | |
a334319f UD |
987 | (p7) fcvt.xf FR_N_float = FR_N_fix |
988 | nop.i 999 ;; | |
8da2915d | 989 | } |
a334319f UD |
990 | { .mmf |
991 | nop.m 999 | |
992 | (p7) getf.sig GR_N_Inc = FR_N_fix | |
993 | (p8) fcmp.gt.s1 p8, p0 = FR_Input_X, FR_Neg_Two_to_M3 ;; | |
8da2915d | 994 | } |
a334319f UD |
995 | { .mib |
996 | nop.m 999 | |
997 | nop.i 999 | |
998 | // | |
999 | // Load 2**(-33), -2**(-33) | |
1000 | // | |
1001 | (p8) br.cond.spnt L(SINCOSL_SMALL_R) ;; | |
8da2915d | 1002 | } |
a334319f UD |
1003 | { .mib |
1004 | nop.m 999 | |
1005 | nop.i 999 | |
1006 | (p6) br.cond.sptk L(SINCOSL_NORMAL_R) ;; | |
8da2915d | 1007 | } |
a334319f UD |
1008 | // |
1009 | // if |x| < pi/4, branch based on |x| < 2**(-3) or otherwise. | |
1010 | // | |
1011 | // | |
1012 | // In this branch, |x| >= pi/4. | |
1013 | // | |
8da2915d | 1014 | { .mfi |
a334319f UD |
1015 | (p0) ldfs FR_Neg_Two_to_M67 = [GR_Table_Base1], 8 |
1016 | // | |
1017 | // Load -2**(-67) | |
1018 | // | |
1019 | (p0) fnma.s1 FR_s = FR_N_float, FR_P_1, FR_Input_X | |
1020 | // | |
1021 | // w = N * P_2 | |
1022 | // s = -N * P_1 + Arg | |
1023 | // | |
1024 | (p0) add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos | |
8da2915d UD |
1025 | } |
1026 | { .mfi | |
a334319f UD |
1027 | nop.m 999 |
1028 | (p0) fma.s1 FR_w = FR_N_float, FR_P_2, f0 | |
1029 | nop.i 999 ;; | |
8da2915d UD |
1030 | } |
1031 | { .mfi | |
a334319f UD |
1032 | nop.m 999 |
1033 | // | |
1034 | // Adjust N_fix by N_inc to determine whether sine or | |
1035 | // cosine is being calculated | |
1036 | // | |
1037 | (p0) fcmp.lt.unc.s1 p7, p6 = FR_s, FR_Two_to_M33 | |
1038 | nop.i 999 ;; | |
8da2915d UD |
1039 | } |
1040 | { .mfi | |
a334319f UD |
1041 | nop.m 999 |
1042 | (p7) fcmp.gt.s1 p7, p6 = FR_s, FR_Neg_Two_to_M33 | |
1043 | nop.i 999 ;; | |
8da2915d UD |
1044 | } |
1045 | { .mfi | |
a334319f UD |
1046 | nop.m 999 |
1047 | // Remember x >= pi/4. | |
1048 | // Is s <= -2**(-33) or s >= 2**(-33) (p6) | |
1049 | // or -2**(-33) < s < 2**(-33) (p7) | |
1050 | (p6) fms.s1 FR_r = FR_s, f1, FR_w | |
1051 | nop.i 999 | |
8da2915d UD |
1052 | } |
1053 | { .mfi | |
a334319f UD |
1054 | nop.m 999 |
1055 | (p7) fma.s1 FR_w = FR_N_float, FR_P_3, f0 | |
1056 | nop.i 999 ;; | |
8da2915d UD |
1057 | } |
1058 | { .mfi | |
a334319f UD |
1059 | nop.m 999 |
1060 | (p7) fma.s1 FR_U_1 = FR_N_float, FR_P_2, FR_w | |
1061 | nop.i 999 | |
0ecb606c | 1062 | } |
0ecb606c | 1063 | { .mfi |
a334319f UD |
1064 | nop.m 999 |
1065 | (p6) fms.s1 FR_c = FR_s, f1, FR_r | |
1066 | nop.i 999 ;; | |
0ecb606c | 1067 | } |
0ecb606c | 1068 | { .mfi |
a334319f UD |
1069 | nop.m 999 |
1070 | // | |
1071 | // For big s: r = s - w: No futher reduction is necessary | |
1072 | // For small s: w = N * P_3 (change sign) More reduction | |
1073 | // | |
1074 | (p6) fcmp.lt.unc.s1 p8, p9 = FR_r, FR_Two_to_M3 | |
1075 | nop.i 999 ;; | |
8da2915d UD |
1076 | } |
1077 | { .mfi | |
a334319f UD |
1078 | nop.m 999 |
1079 | (p8) fcmp.gt.s1 p8, p9 = FR_r, FR_Neg_Two_to_M3 | |
1080 | nop.i 999 ;; | |
8da2915d UD |
1081 | } |
1082 | { .mfi | |
a334319f | 1083 | nop.m 999 |
8da2915d | 1084 | (p7) fms.s1 FR_r = FR_s, f1, FR_U_1 |
a334319f | 1085 | nop.i 999 |
0ecb606c | 1086 | } |
a334319f UD |
1087 | { .mfb |
1088 | nop.m 999 | |
8da2915d UD |
1089 | // |
1090 | // For big s: Is |r| < 2**(-3)? | |
1091 | // For big s: c = S - r | |
1092 | // For small s: U_1 = N * P_2 + w | |
1093 | // | |
1094 | // If p8 is set, prepare to branch to Small_R. | |
1095 | // If p9 is set, prepare to branch to Normal_R. | |
1096 | // For big s, r is complete here. | |
1097 | // | |
a334319f UD |
1098 | (p6) fms.s1 FR_c = FR_c, f1, FR_w |
1099 | // | |
8da2915d UD |
1100 | // For big s: c = c + w (w has not been negated.) |
1101 | // For small s: r = S - U_1 | |
1102 | // | |
a334319f | 1103 | (p8) br.cond.spnt L(SINCOSL_SMALL_R) ;; |
8da2915d | 1104 | } |
a334319f UD |
1105 | { .mib |
1106 | nop.m 999 | |
1107 | nop.i 999 | |
1108 | (p9) br.cond.sptk L(SINCOSL_NORMAL_R) ;; | |
8da2915d | 1109 | } |
a334319f UD |
1110 | { .mfi |
1111 | (p7) add GR_Table_Base1 = 224, GR_Table_Base1 | |
8da2915d | 1112 | // |
a334319f | 1113 | // Branch to SINCOSL_SMALL_R or SINCOSL_NORMAL_R |
8da2915d | 1114 | // |
a334319f UD |
1115 | (p7) fms.s1 FR_U_2 = FR_N_float, FR_P_2, FR_U_1 |
1116 | // | |
8da2915d UD |
1117 | // c = S - U_1 |
1118 | // r = S_1 * r | |
1119 | // | |
1120 | // | |
a334319f | 1121 | (p7) extr.u GR_i_1 = GR_N_Inc, 0, 1 ;; |
8da2915d UD |
1122 | } |
1123 | { .mmi | |
a334319f | 1124 | nop.m 999 |
8da2915d UD |
1125 | // |
1126 | // Get [i_0,i_1] - two lsb of N_fix_gr. | |
1127 | // Do dummy fmpy so inexact is always set. | |
1128 | // | |
a334319f UD |
1129 | (p7) cmp.eq.unc p9, p10 = 0x0, GR_i_1 |
1130 | (p7) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;; | |
8da2915d | 1131 | } |
a334319f | 1132 | // |
8da2915d UD |
1133 | // For small s: U_2 = N * P_2 - U_1 |
1134 | // S_1 stored constant - grab the one stored with the | |
1135 | // coefficients. | |
a334319f | 1136 | // |
8da2915d | 1137 | { .mfi |
a334319f | 1138 | (p7) ldfe FR_S_1 = [GR_Table_Base1], 16 |
8da2915d UD |
1139 | // |
1140 | // Check if i_1 and i_0 != 0 | |
1141 | // | |
a334319f UD |
1142 | (p10) fma.s1 FR_poly = f0, f1, FR_Neg_Two_to_M67 |
1143 | (p7) cmp.eq.unc p11, p12 = 0x0, GR_i_0 ;; | |
8da2915d UD |
1144 | } |
1145 | { .mfi | |
a334319f UD |
1146 | nop.m 999 |
1147 | (p7) fms.s1 FR_s = FR_s, f1, FR_r | |
1148 | nop.i 999 | |
8da2915d UD |
1149 | } |
1150 | { .mfi | |
a334319f UD |
1151 | nop.m 999 |
1152 | // | |
8da2915d UD |
1153 | // S = S - r |
1154 | // U_2 = U_2 + w | |
1155 | // load S_1 | |
1156 | // | |
a334319f UD |
1157 | (p7) fma.s1 FR_rsq = FR_r, FR_r, f0 |
1158 | nop.i 999 ;; | |
8da2915d UD |
1159 | } |
1160 | { .mfi | |
a334319f UD |
1161 | nop.m 999 |
1162 | (p7) fma.s1 FR_U_2 = FR_U_2, f1, FR_w | |
1163 | nop.i 999 | |
8da2915d UD |
1164 | } |
1165 | { .mfi | |
a334319f UD |
1166 | nop.m 999 |
1167 | (p7) fmerge.se FR_Input_X = FR_r, FR_r | |
1168 | nop.i 999 ;; | |
8da2915d UD |
1169 | } |
1170 | { .mfi | |
a334319f UD |
1171 | nop.m 999 |
1172 | (p10) fma.s1 FR_Input_X = f0, f1, f1 | |
1173 | nop.i 999 ;; | |
8da2915d UD |
1174 | } |
1175 | { .mfi | |
a334319f UD |
1176 | nop.m 999 |
1177 | // | |
8da2915d UD |
1178 | // FR_rsq = r * r |
1179 | // Save r as the result. | |
1180 | // | |
a334319f UD |
1181 | (p7) fms.s1 FR_c = FR_s, f1, FR_U_1 |
1182 | nop.i 999 ;; | |
8da2915d UD |
1183 | } |
1184 | { .mfi | |
a334319f UD |
1185 | nop.m 999 |
1186 | // | |
8da2915d UD |
1187 | // if ( i_1 ==0) poly = c + S_1*r*r*r |
1188 | // else Result = 1 | |
1189 | // | |
a334319f UD |
1190 | (p12) fnma.s1 FR_Input_X = FR_Input_X, f1, f0 |
1191 | nop.i 999 | |
8da2915d UD |
1192 | } |
1193 | { .mfi | |
a334319f UD |
1194 | nop.m 999 |
1195 | (p7) fma.s1 FR_r = FR_S_1, FR_r, f0 | |
1196 | nop.i 999 ;; | |
8da2915d UD |
1197 | } |
1198 | { .mfi | |
a334319f UD |
1199 | nop.m 999 |
1200 | (p7) fma.s0 FR_S_1 = FR_S_1, FR_S_1, f0 | |
1201 | nop.i 999 ;; | |
8da2915d UD |
1202 | } |
1203 | { .mfi | |
a334319f | 1204 | nop.m 999 |
8da2915d UD |
1205 | // |
1206 | // If i_1 != 0, poly = 2**(-67) | |
1207 | // | |
a334319f UD |
1208 | (p7) fms.s1 FR_c = FR_c, f1, FR_U_2 |
1209 | nop.i 999 ;; | |
8da2915d UD |
1210 | } |
1211 | { .mfi | |
a334319f UD |
1212 | nop.m 999 |
1213 | // | |
8da2915d | 1214 | // c = c - U_2 |
a334319f | 1215 | // |
8da2915d | 1216 | (p9) fma.s1 FR_poly = FR_r, FR_rsq, FR_c |
a334319f | 1217 | nop.i 999 ;; |
8da2915d UD |
1218 | } |
1219 | { .mfi | |
a334319f | 1220 | nop.m 999 |
8da2915d UD |
1221 | // |
1222 | // i_0 != 0, so Result = -Result | |
1223 | // | |
a334319f UD |
1224 | (p11) fma.s0 FR_Input_X = FR_Input_X, f1, FR_poly |
1225 | nop.i 999 ;; | |
8da2915d UD |
1226 | } |
1227 | { .mfb | |
a334319f UD |
1228 | nop.m 999 |
1229 | (p12) fms.s0 FR_Input_X = FR_Input_X, f1, FR_poly | |
8da2915d UD |
1230 | // |
1231 | // if (i_0 == 0), Result = Result + poly | |
1232 | // else Result = Result - poly | |
1233 | // | |
a334319f UD |
1234 | (p0) br.ret.sptk b0 ;; |
1235 | } | |
1236 | L(SINCOSL_LARGER_ARG): | |
1237 | { .mfi | |
1238 | nop.m 999 | |
1239 | (p0) fma.s1 FR_N_0 = FR_Input_X, FR_Inv_P_0, f0 | |
1240 | nop.i 999 | |
8da2915d UD |
1241 | } |
1242 | ;; | |
1243 | ||
a334319f UD |
1244 | // This path for argument > 2*24 |
1245 | // Adjust table_ptr1 to beginning of table. | |
8da2915d | 1246 | // |
a334319f UD |
1247 | |
1248 | { .mmi | |
1249 | nop.m 999 | |
1250 | (p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp | |
1251 | nop.i 999 | |
8da2915d UD |
1252 | } |
1253 | ;; | |
1254 | ||
a334319f UD |
1255 | { .mmi |
1256 | ld8 GR_Table_Base = [GR_Table_Base] | |
1257 | nop.m 999 | |
1258 | nop.i 999 | |
1259 | } | |
1260 | ;; | |
1261 | ||
1262 | ||
1263 | // | |
1264 | // Point to 2*-14 | |
8da2915d UD |
1265 | // N_0 = Arg * Inv_P_0 |
1266 | // | |
1267 | { .mmi | |
a334319f UD |
1268 | (p0) add GR_Table_Base = 688, GR_Table_Base ;; |
1269 | (p0) ldfs FR_Two_to_M14 = [GR_Table_Base], 4 | |
1270 | nop.i 999 ;; | |
8da2915d UD |
1271 | } |
1272 | { .mfi | |
a334319f UD |
1273 | (p0) ldfs FR_Neg_Two_to_M14 = [GR_Table_Base], 0 |
1274 | nop.f 999 | |
1275 | nop.i 999 ;; | |
8da2915d UD |
1276 | } |
1277 | { .mfi | |
a334319f | 1278 | nop.m 999 |
8da2915d | 1279 | // |
a334319f | 1280 | // Load values 2**(-14) and -2**(-14) |
8da2915d | 1281 | // |
a334319f UD |
1282 | (p0) fcvt.fx.s1 FR_N_0_fix = FR_N_0 |
1283 | nop.i 999 ;; | |
8da2915d UD |
1284 | } |
1285 | { .mfi | |
a334319f | 1286 | nop.m 999 |
8da2915d UD |
1287 | // |
1288 | // N_0_fix = integer part of N_0 | |
1289 | // | |
a334319f UD |
1290 | (p0) fcvt.xf FR_N_0 = FR_N_0_fix |
1291 | nop.i 999 ;; | |
8da2915d UD |
1292 | } |
1293 | { .mfi | |
a334319f | 1294 | nop.m 999 |
8da2915d UD |
1295 | // |
1296 | // Make N_0 the integer part | |
1297 | // | |
a334319f UD |
1298 | (p0) fnma.s1 FR_ArgPrime = FR_N_0, FR_P_0, FR_Input_X |
1299 | nop.i 999 | |
8da2915d UD |
1300 | } |
1301 | { .mfi | |
a334319f UD |
1302 | nop.m 999 |
1303 | (p0) fma.s1 FR_w = FR_N_0, FR_d_1, f0 | |
1304 | nop.i 999 ;; | |
8da2915d UD |
1305 | } |
1306 | { .mfi | |
a334319f | 1307 | nop.m 999 |
8da2915d UD |
1308 | // |
1309 | // Arg' = -N_0 * P_0 + Arg | |
1310 | // w = N_0 * d_1 | |
1311 | // | |
a334319f UD |
1312 | (p0) fma.s1 FR_N_float = FR_ArgPrime, FR_Inv_pi_by_2, f0 |
1313 | nop.i 999 ;; | |
8da2915d UD |
1314 | } |
1315 | { .mfi | |
a334319f | 1316 | nop.m 999 |
8da2915d | 1317 | // |
a334319f | 1318 | // N = A' * 2/pi |
8da2915d | 1319 | // |
a334319f UD |
1320 | (p0) fcvt.fx.s1 FR_N_fix = FR_N_float |
1321 | nop.i 999 ;; | |
8da2915d UD |
1322 | } |
1323 | { .mfi | |
a334319f | 1324 | nop.m 999 |
8da2915d | 1325 | // |
a334319f | 1326 | // N_fix is the integer part |
8da2915d | 1327 | // |
a334319f UD |
1328 | (p0) fcvt.xf FR_N_float = FR_N_fix |
1329 | nop.i 999 ;; | |
8da2915d UD |
1330 | } |
1331 | { .mfi | |
a334319f UD |
1332 | (p0) getf.sig GR_N_Inc = FR_N_fix |
1333 | nop.f 999 | |
1334 | nop.i 999 ;; | |
8da2915d UD |
1335 | } |
1336 | { .mii | |
a334319f UD |
1337 | nop.m 999 |
1338 | nop.i 999 ;; | |
1339 | (p0) add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos ;; | |
8da2915d UD |
1340 | } |
1341 | { .mfi | |
a334319f | 1342 | nop.m 999 |
8da2915d UD |
1343 | // |
1344 | // N is the integer part of the reduced-reduced argument. | |
1345 | // Put the integer in a GP register | |
1346 | // | |
a334319f UD |
1347 | (p0) fnma.s1 FR_s = FR_N_float, FR_P_1, FR_ArgPrime |
1348 | nop.i 999 | |
8da2915d UD |
1349 | } |
1350 | { .mfi | |
a334319f UD |
1351 | nop.m 999 |
1352 | (p0) fnma.s1 FR_w = FR_N_float, FR_P_2, FR_w | |
1353 | nop.i 999 ;; | |
8da2915d UD |
1354 | } |
1355 | { .mfi | |
a334319f | 1356 | nop.m 999 |
8da2915d UD |
1357 | // |
1358 | // s = -N*P_1 + Arg' | |
1359 | // w = -N*P_2 + w | |
1360 | // N_fix_gr = N_fix_gr + N_inc | |
1361 | // | |
a334319f UD |
1362 | (p0) fcmp.lt.unc.s1 p9, p8 = FR_s, FR_Two_to_M14 |
1363 | nop.i 999 ;; | |
8da2915d UD |
1364 | } |
1365 | { .mfi | |
a334319f UD |
1366 | nop.m 999 |
1367 | (p9) fcmp.gt.s1 p9, p8 = FR_s, FR_Neg_Two_to_M14 | |
1368 | nop.i 999 ;; | |
8da2915d UD |
1369 | } |
1370 | { .mfi | |
a334319f | 1371 | nop.m 999 |
8da2915d UD |
1372 | // |
1373 | // For |s| > 2**(-14) r = S + w (r complete) | |
1374 | // Else U_hi = N_0 * d_1 | |
1375 | // | |
1376 | (p9) fma.s1 FR_V_hi = FR_N_float, FR_P_2, f0 | |
a334319f | 1377 | nop.i 999 |
8da2915d UD |
1378 | } |
1379 | { .mfi | |
a334319f | 1380 | nop.m 999 |
8da2915d | 1381 | (p9) fma.s1 FR_U_hi = FR_N_0, FR_d_1, f0 |
a334319f | 1382 | nop.i 999 ;; |
8da2915d UD |
1383 | } |
1384 | { .mfi | |
a334319f | 1385 | nop.m 999 |
8da2915d UD |
1386 | // |
1387 | // Either S <= -2**(-14) or S >= 2**(-14) | |
1388 | // or -2**(-14) < s < 2**(-14) | |
1389 | // | |
1390 | (p8) fma.s1 FR_r = FR_s, f1, FR_w | |
a334319f | 1391 | nop.i 999 |
8da2915d UD |
1392 | } |
1393 | { .mfi | |
a334319f | 1394 | nop.m 999 |
8da2915d | 1395 | (p9) fma.s1 FR_w = FR_N_float, FR_P_3, f0 |
a334319f | 1396 | nop.i 999 ;; |
8da2915d UD |
1397 | } |
1398 | { .mfi | |
a334319f | 1399 | nop.m 999 |
8da2915d UD |
1400 | // |
1401 | // We need abs of both U_hi and V_hi - don't | |
1402 | // worry about switched sign of V_hi. | |
1403 | // | |
1404 | (p9) fms.s1 FR_A = FR_U_hi, f1, FR_V_hi | |
a334319f | 1405 | nop.i 999 |
8da2915d UD |
1406 | } |
1407 | { .mfi | |
a334319f | 1408 | nop.m 999 |
8da2915d | 1409 | // |
a334319f | 1410 | // Big s: finish up c = (S - r) + w (c complete) |
8da2915d UD |
1411 | // Case 4: A = U_hi + V_hi |
1412 | // Note: Worry about switched sign of V_hi, so subtract instead of add. | |
1413 | // | |
1414 | (p9) fnma.s1 FR_V_lo = FR_N_float, FR_P_2, FR_V_hi | |
a334319f | 1415 | nop.i 999 ;; |
8da2915d UD |
1416 | } |
1417 | { .mmf | |
a334319f UD |
1418 | nop.m 999 |
1419 | nop.m 999 | |
8da2915d UD |
1420 | (p9) fms.s1 FR_U_lo = FR_N_0, FR_d_1, FR_U_hi |
1421 | } | |
1422 | { .mfi | |
a334319f | 1423 | nop.m 999 |
8da2915d | 1424 | (p9) fmerge.s FR_V_hiabs = f0, FR_V_hi |
a334319f | 1425 | nop.i 999 ;; |
8da2915d UD |
1426 | } |
1427 | { .mfi | |
a334319f | 1428 | nop.m 999 |
8da2915d UD |
1429 | // For big s: c = S - r |
1430 | // For small s do more work: U_lo = N_0 * d_1 - U_hi | |
1431 | // | |
1432 | (p9) fmerge.s FR_U_hiabs = f0, FR_U_hi | |
a334319f | 1433 | nop.i 999 |
8da2915d UD |
1434 | } |
1435 | { .mfi | |
a334319f | 1436 | nop.m 999 |
8da2915d | 1437 | // |
a334319f | 1438 | // For big s: Is |r| < 2**(-3) |
8da2915d UD |
1439 | // For big s: if p12 set, prepare to branch to Small_R. |
1440 | // For big s: If p13 set, prepare to branch to Normal_R. | |
1441 | // | |
a334319f UD |
1442 | (p8) fms.s1 FR_c = FR_s, f1, FR_r |
1443 | nop.i 999 ;; | |
8da2915d UD |
1444 | } |
1445 | { .mfi | |
a334319f | 1446 | nop.m 999 |
8da2915d UD |
1447 | // |
1448 | // For small S: V_hi = N * P_2 | |
1449 | // w = N * P_3 | |
1450 | // Note the product does not include the (-) as in the writeup | |
1451 | // so (-) missing for V_hi and w. | |
1452 | // | |
1453 | (p8) fcmp.lt.unc.s1 p12, p13 = FR_r, FR_Two_to_M3 | |
a334319f | 1454 | nop.i 999 ;; |
8da2915d UD |
1455 | } |
1456 | { .mfi | |
a334319f | 1457 | nop.m 999 |
8da2915d | 1458 | (p12) fcmp.gt.s1 p12, p13 = FR_r, FR_Neg_Two_to_M3 |
a334319f | 1459 | nop.i 999 ;; |
8da2915d UD |
1460 | } |
1461 | { .mfi | |
a334319f | 1462 | nop.m 999 |
8da2915d | 1463 | (p8) fma.s1 FR_c = FR_c, f1, FR_w |
a334319f | 1464 | nop.i 999 |
8da2915d UD |
1465 | } |
1466 | { .mfb | |
a334319f | 1467 | nop.m 999 |
8da2915d | 1468 | (p9) fms.s1 FR_w = FR_N_0, FR_d_2, FR_w |
a334319f | 1469 | (p12) br.cond.spnt L(SINCOSL_SMALL_R) ;; |
8da2915d UD |
1470 | } |
1471 | { .mib | |
a334319f UD |
1472 | nop.m 999 |
1473 | nop.i 999 | |
1474 | (p13) br.cond.sptk L(SINCOSL_NORMAL_R) ;; | |
8da2915d UD |
1475 | } |
1476 | { .mfi | |
a334319f UD |
1477 | nop.m 999 |
1478 | // | |
1479 | // Big s: Vector off when |r| < 2**(-3). Recall that p8 will be true. | |
8da2915d UD |
1480 | // The remaining stuff is for Case 4. |
1481 | // Small s: V_lo = N * P_2 + U_hi (U_hi is in place of V_hi in writeup) | |
1482 | // Note: the (-) is still missing for V_lo. | |
1483 | // Small s: w = w + N_0 * d_2 | |
1484 | // Note: the (-) is now incorporated in w. | |
1485 | // | |
a334319f UD |
1486 | (p9) fcmp.ge.unc.s1 p10, p11 = FR_U_hiabs, FR_V_hiabs |
1487 | (p0) extr.u GR_i_1 = GR_N_Inc, 0, 1 | |
8da2915d UD |
1488 | } |
1489 | { .mfi | |
a334319f | 1490 | nop.m 999 |
8da2915d UD |
1491 | // |
1492 | // C_hi = S + A | |
1493 | // | |
a334319f UD |
1494 | (p9) fma.s1 FR_t = FR_U_lo, f1, FR_V_lo |
1495 | (p0) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;; | |
8da2915d UD |
1496 | } |
1497 | { .mfi | |
a334319f | 1498 | nop.m 999 |
8da2915d | 1499 | // |
a334319f | 1500 | // t = U_lo + V_lo |
8da2915d UD |
1501 | // |
1502 | // | |
a334319f UD |
1503 | (p10) fms.s1 FR_a = FR_U_hi, f1, FR_A |
1504 | nop.i 999 ;; | |
8da2915d UD |
1505 | } |
1506 | { .mfi | |
a334319f UD |
1507 | nop.m 999 |
1508 | (p11) fma.s1 FR_a = FR_V_hi, f1, FR_A | |
1509 | nop.i 999 | |
1510 | } | |
1511 | ;; | |
1512 | ||
1513 | { .mmi | |
1514 | nop.m 999 | |
1515 | (p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp | |
1516 | nop.i 999 | |
1517 | } | |
1518 | ;; | |
1519 | ||
1520 | { .mmi | |
1521 | ld8 GR_Table_Base = [GR_Table_Base] | |
1522 | nop.m 999 | |
1523 | nop.i 999 | |
8da2915d UD |
1524 | } |
1525 | ;; | |
1526 | ||
a334319f | 1527 | |
8da2915d | 1528 | { .mfi |
a334319f | 1529 | (p0) add GR_Table_Base = 528, GR_Table_Base |
8da2915d UD |
1530 | // |
1531 | // Is U_hiabs >= V_hiabs? | |
1532 | // | |
a334319f UD |
1533 | (p9) fma.s1 FR_C_hi = FR_s, f1, FR_A |
1534 | nop.i 999 ;; | |
8da2915d UD |
1535 | } |
1536 | { .mmi | |
a334319f UD |
1537 | (p0) ldfe FR_C_1 = [GR_Table_Base], 16 ;; |
1538 | (p0) ldfe FR_C_2 = [GR_Table_Base], 64 | |
1539 | nop.i 999 ;; | |
8da2915d UD |
1540 | } |
1541 | // | |
1542 | // c = c + C_lo finished. | |
1543 | // Load C_2 | |
1544 | // | |
1545 | { .mfi | |
a334319f | 1546 | (p0) ldfe FR_S_1 = [GR_Table_Base], 16 |
8da2915d | 1547 | // |
a334319f | 1548 | // C_lo = S - C_hi |
8da2915d | 1549 | // |
a334319f UD |
1550 | (p0) fma.s1 FR_t = FR_t, f1, FR_w |
1551 | nop.i 999 ;; | |
8da2915d UD |
1552 | } |
1553 | // | |
1554 | // r and c have been computed. | |
1555 | // Make sure ftz mode is set - should be automatic when using wre | |
1556 | // |r| < 2**(-3) | |
1557 | // Get [i_0,i_1] - two lsb of N_fix. | |
1558 | // Load S_1 | |
1559 | // | |
1560 | { .mfi | |
a334319f | 1561 | (p0) ldfe FR_S_2 = [GR_Table_Base], 64 |
8da2915d | 1562 | // |
a334319f | 1563 | // t = t + w |
8da2915d | 1564 | // |
a334319f UD |
1565 | (p10) fms.s1 FR_a = FR_a, f1, FR_V_hi |
1566 | (p0) cmp.eq.unc p9, p10 = 0x0, GR_i_0 ;; | |
8da2915d UD |
1567 | } |
1568 | { .mfi | |
a334319f | 1569 | nop.m 999 |
8da2915d UD |
1570 | // |
1571 | // For larger u than v: a = U_hi - A | |
1572 | // Else a = V_hi - A (do an add to account for missing (-) on V_hi | |
1573 | // | |
a334319f UD |
1574 | (p0) fms.s1 FR_C_lo = FR_s, f1, FR_C_hi |
1575 | nop.i 999 ;; | |
8da2915d UD |
1576 | } |
1577 | { .mfi | |
a334319f UD |
1578 | nop.m 999 |
1579 | (p11) fms.s1 FR_a = FR_U_hi, f1, FR_a | |
1580 | (p0) cmp.eq.unc p11, p12 = 0x0, GR_i_1 ;; | |
8da2915d UD |
1581 | } |
1582 | { .mfi | |
a334319f | 1583 | nop.m 999 |
8da2915d UD |
1584 | // |
1585 | // If u > v: a = (U_hi - A) + V_hi | |
1586 | // Else a = (V_hi - A) + U_hi | |
1587 | // In each case account for negative missing from V_hi. | |
1588 | // | |
a334319f UD |
1589 | (p0) fma.s1 FR_C_lo = FR_C_lo, f1, FR_A |
1590 | nop.i 999 ;; | |
8da2915d UD |
1591 | } |
1592 | { .mfi | |
a334319f | 1593 | nop.m 999 |
8da2915d | 1594 | // |
a334319f | 1595 | // C_lo = (S - C_hi) + A |
8da2915d | 1596 | // |
a334319f UD |
1597 | (p0) fma.s1 FR_t = FR_t, f1, FR_a |
1598 | nop.i 999 ;; | |
8da2915d UD |
1599 | } |
1600 | { .mfi | |
a334319f | 1601 | nop.m 999 |
8da2915d | 1602 | // |
a334319f | 1603 | // t = t + a |
8da2915d | 1604 | // |
a334319f UD |
1605 | (p0) fma.s1 FR_C_lo = FR_C_lo, f1, FR_t |
1606 | nop.i 999 ;; | |
8da2915d UD |
1607 | } |
1608 | { .mfi | |
a334319f | 1609 | nop.m 999 |
8da2915d UD |
1610 | // |
1611 | // C_lo = C_lo + t | |
a334319f | 1612 | // Adjust Table_Base to beginning of table |
8da2915d | 1613 | // |
a334319f UD |
1614 | (p0) fma.s1 FR_r = FR_C_hi, f1, FR_C_lo |
1615 | nop.i 999 ;; | |
8da2915d UD |
1616 | } |
1617 | { .mfi | |
a334319f | 1618 | nop.m 999 |
8da2915d UD |
1619 | // |
1620 | // Load S_2 | |
1621 | // | |
a334319f UD |
1622 | (p0) fma.s1 FR_rsq = FR_r, FR_r, f0 |
1623 | nop.i 999 | |
8da2915d UD |
1624 | } |
1625 | { .mfi | |
a334319f | 1626 | nop.m 999 |
8da2915d | 1627 | // |
a334319f | 1628 | // Table_Base points to C_1 |
8da2915d UD |
1629 | // r = C_hi + C_lo |
1630 | // | |
a334319f UD |
1631 | (p0) fms.s1 FR_c = FR_C_hi, f1, FR_r |
1632 | nop.i 999 ;; | |
8da2915d UD |
1633 | } |
1634 | { .mfi | |
a334319f | 1635 | nop.m 999 |
8da2915d UD |
1636 | // |
1637 | // if i_1 ==0: poly = S_2 * FR_rsq + S_1 | |
1638 | // else poly = C_2 * FR_rsq + C_1 | |
1639 | // | |
a334319f UD |
1640 | (p11) fma.s1 FR_Input_X = f0, f1, FR_r |
1641 | nop.i 999 ;; | |
8da2915d UD |
1642 | } |
1643 | { .mfi | |
a334319f UD |
1644 | nop.m 999 |
1645 | (p12) fma.s1 FR_Input_X = f0, f1, f1 | |
1646 | nop.i 999 ;; | |
8da2915d UD |
1647 | } |
1648 | { .mfi | |
a334319f | 1649 | nop.m 999 |
8da2915d | 1650 | // |
a334319f | 1651 | // Compute r_cube = FR_rsq * r |
8da2915d | 1652 | // |
a334319f UD |
1653 | (p11) fma.s1 FR_poly = FR_rsq, FR_S_2, FR_S_1 |
1654 | nop.i 999 ;; | |
8da2915d UD |
1655 | } |
1656 | { .mfi | |
a334319f UD |
1657 | nop.m 999 |
1658 | (p12) fma.s1 FR_poly = FR_rsq, FR_C_2, FR_C_1 | |
1659 | nop.i 999 | |
8da2915d UD |
1660 | } |
1661 | { .mfi | |
a334319f | 1662 | nop.m 999 |
8da2915d UD |
1663 | // |
1664 | // Compute FR_rsq = r * r | |
1665 | // Is i_1 == 0 ? | |
1666 | // | |
a334319f UD |
1667 | (p0) fma.s1 FR_r_cubed = FR_rsq, FR_r, f0 |
1668 | nop.i 999 ;; | |
8da2915d UD |
1669 | } |
1670 | { .mfi | |
a334319f | 1671 | nop.m 999 |
8da2915d UD |
1672 | // |
1673 | // c = C_hi - r | |
1674 | // Load C_1 | |
1675 | // | |
a334319f UD |
1676 | (p0) fma.s1 FR_c = FR_c, f1, FR_C_lo |
1677 | nop.i 999 | |
8da2915d UD |
1678 | } |
1679 | { .mfi | |
a334319f | 1680 | nop.m 999 |
8da2915d UD |
1681 | // |
1682 | // if i_1 ==0: poly = r_cube * poly + c | |
1683 | // else poly = FR_rsq * poly | |
1684 | // | |
a334319f UD |
1685 | (p10) fms.s1 FR_Input_X = f0, f1, FR_Input_X |
1686 | nop.i 999 ;; | |
8da2915d UD |
1687 | } |
1688 | { .mfi | |
a334319f | 1689 | nop.m 999 |
8da2915d UD |
1690 | // |
1691 | // if i_1 ==0: Result = r | |
1692 | // else Result = 1.0 | |
1693 | // | |
a334319f UD |
1694 | (p11) fma.s1 FR_poly = FR_r_cubed, FR_poly, FR_c |
1695 | nop.i 999 ;; | |
8da2915d UD |
1696 | } |
1697 | { .mfi | |
a334319f UD |
1698 | nop.m 999 |
1699 | (p12) fma.s1 FR_poly = FR_rsq, FR_poly, f0 | |
1700 | nop.i 999 ;; | |
8da2915d UD |
1701 | } |
1702 | { .mfi | |
a334319f | 1703 | nop.m 999 |
8da2915d | 1704 | // |
a334319f | 1705 | // if i_0 !=0: Result = -Result |
8da2915d | 1706 | // |
a334319f UD |
1707 | (p9) fma.s0 FR_Input_X = FR_Input_X, f1, FR_poly |
1708 | nop.i 999 ;; | |
8da2915d UD |
1709 | } |
1710 | { .mfb | |
a334319f UD |
1711 | nop.m 999 |
1712 | (p10) fms.s0 FR_Input_X = FR_Input_X, f1, FR_poly | |
8da2915d UD |
1713 | // |
1714 | // if i_0 == 0: Result = Result + poly | |
1715 | // else Result = Result - poly | |
1716 | // | |
a334319f | 1717 | (p0) br.ret.sptk b0 ;; |
8da2915d | 1718 | } |
a334319f UD |
1719 | L(SINCOSL_SMALL_R): |
1720 | { .mii | |
1721 | nop.m 999 | |
1722 | (p0) extr.u GR_i_1 = GR_N_Inc, 0, 1 ;; | |
8da2915d UD |
1723 | // |
1724 | // | |
1725 | // Compare both i_1 and i_0 with 0. | |
1726 | // if i_1 == 0, set p9. | |
1727 | // if i_0 == 0, set p11. | |
1728 | // | |
a334319f UD |
1729 | (p0) cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;; |
1730 | } | |
8da2915d | 1731 | { .mfi |
a334319f UD |
1732 | nop.m 999 |
1733 | (p0) fma.s1 FR_rsq = FR_r, FR_r, f0 | |
1734 | (p0) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;; | |
1735 | } | |
1736 | { .mfi | |
1737 | nop.m 999 | |
1738 | // | |
1739 | // Z = Z * FR_rsq | |
1740 | // | |
1741 | (p10) fnma.s1 FR_c = FR_c, FR_r, f0 | |
1742 | (p0) cmp.eq.unc p11, p12 = 0x0, GR_i_0 | |
8da2915d UD |
1743 | } |
1744 | ;; | |
1745 | ||
a334319f UD |
1746 | // ****************************************************************** |
1747 | // ****************************************************************** | |
1748 | // ****************************************************************** | |
1749 | // r and c have been computed. | |
1750 | // We know whether this is the sine or cosine routine. | |
1751 | // Make sure ftz mode is set - should be automatic when using wre | |
1752 | // |r| < 2**(-3) | |
1753 | // | |
1754 | // Set table_ptr1 to beginning of constant table. | |
1755 | // Get [i_0,i_1] - two lsb of N_fix_gr. | |
1756 | // | |
1757 | ||
8da2915d | 1758 | { .mmi |
a334319f UD |
1759 | nop.m 999 |
1760 | (p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp | |
8da2915d UD |
1761 | nop.i 999 |
1762 | } | |
1763 | ;; | |
1764 | ||
1765 | { .mmi | |
a334319f UD |
1766 | ld8 GR_Table_Base = [GR_Table_Base] |
1767 | nop.m 999 | |
8da2915d UD |
1768 | nop.i 999 |
1769 | } | |
1770 | ;; | |
1771 | ||
a334319f UD |
1772 | |
1773 | // | |
1774 | // Set table_ptr1 to point to S_5. | |
1775 | // Set table_ptr1 to point to C_5. | |
1776 | // Compute FR_rsq = r * r | |
1777 | // | |
0ecb606c | 1778 | { .mfi |
a334319f UD |
1779 | (p9) add GR_Table_Base = 672, GR_Table_Base |
1780 | (p10) fmerge.s FR_r = f1, f1 | |
1781 | (p10) add GR_Table_Base = 592, GR_Table_Base ;; | |
1782 | } | |
1783 | // | |
1784 | // Set table_ptr1 to point to S_5. | |
1785 | // Set table_ptr1 to point to C_5. | |
1786 | // | |
1787 | { .mmi | |
1788 | (p9) ldfe FR_S_5 = [GR_Table_Base], -16 ;; | |
1789 | // | |
1790 | // if (i_1 == 0) load S_5 | |
1791 | // if (i_1 != 0) load C_5 | |
1792 | // | |
1793 | (p9) ldfe FR_S_4 = [GR_Table_Base], -16 | |
1794 | nop.i 999 ;; | |
8da2915d UD |
1795 | } |
1796 | { .mmf | |
a334319f UD |
1797 | (p10) ldfe FR_C_5 = [GR_Table_Base], -16 |
1798 | // | |
1799 | // Z = FR_rsq * FR_rsq | |
1800 | // | |
1801 | (p9) ldfe FR_S_3 = [GR_Table_Base], -16 | |
1802 | // | |
1803 | // Compute FR_rsq = r * r | |
1804 | // if (i_1 == 0) load S_4 | |
1805 | // if (i_1 != 0) load C_4 | |
1806 | // | |
1807 | (p0) fma.s1 FR_Z = FR_rsq, FR_rsq, f0 ;; | |
8da2915d | 1808 | } |
a334319f UD |
1809 | // |
1810 | // if (i_1 == 0) load S_3 | |
1811 | // if (i_1 != 0) load C_3 | |
1812 | // | |
8da2915d | 1813 | { .mmi |
a334319f UD |
1814 | (p9) ldfe FR_S_2 = [GR_Table_Base], -16 ;; |
1815 | // | |
1816 | // if (i_1 == 0) load S_2 | |
1817 | // if (i_1 != 0) load C_2 | |
1818 | // | |
1819 | (p9) ldfe FR_S_1 = [GR_Table_Base], -16 | |
1820 | nop.i 999 | |
1821 | } | |
1822 | { .mmi | |
1823 | (p10) ldfe FR_C_4 = [GR_Table_Base], -16 ;; | |
1824 | (p10) ldfe FR_C_3 = [GR_Table_Base], -16 | |
1825 | nop.i 999 ;; | |
1826 | } | |
1827 | { .mmi | |
1828 | (p10) ldfe FR_C_2 = [GR_Table_Base], -16 ;; | |
1829 | (p10) ldfe FR_C_1 = [GR_Table_Base], -16 | |
1830 | nop.i 999 | |
8da2915d UD |
1831 | } |
1832 | { .mfi | |
a334319f UD |
1833 | nop.m 999 |
1834 | // | |
1835 | // if (i_1 != 0): | |
1836 | // poly_lo = FR_rsq * C_5 + C_4 | |
1837 | // poly_hi = FR_rsq * C_2 + C_1 | |
1838 | // | |
1839 | (p9) fma.s1 FR_Z = FR_Z, FR_r, f0 | |
1840 | nop.i 999 ;; | |
8da2915d UD |
1841 | } |
1842 | { .mfi | |
a334319f UD |
1843 | nop.m 999 |
1844 | // | |
1845 | // if (i_1 == 0) load S_1 | |
1846 | // if (i_1 != 0) load C_1 | |
1847 | // | |
1848 | (p9) fma.s1 FR_poly_lo = FR_rsq, FR_S_5, FR_S_4 | |
1849 | nop.i 999 | |
8da2915d UD |
1850 | } |
1851 | { .mfi | |
a334319f UD |
1852 | nop.m 999 |
1853 | // | |
1854 | // c = -c * r | |
1855 | // dummy fmpy's to flag inexact. | |
1856 | // | |
1857 | (p9) fma.s0 FR_S_4 = FR_S_4, FR_S_4, f0 | |
1858 | nop.i 999 ;; | |
8da2915d UD |
1859 | } |
1860 | { .mfi | |
a334319f UD |
1861 | nop.m 999 |
1862 | // | |
1863 | // poly_lo = FR_rsq * poly_lo + C_3 | |
1864 | // poly_hi = FR_rsq * poly_hi | |
1865 | // | |
1866 | (p0) fma.s1 FR_Z = FR_Z, FR_rsq, f0 | |
1867 | nop.i 999 ;; | |
8da2915d UD |
1868 | } |
1869 | { .mfi | |
a334319f UD |
1870 | nop.m 999 |
1871 | (p9) fma.s1 FR_poly_hi = FR_rsq, FR_S_2, FR_S_1 | |
1872 | nop.i 999 | |
8da2915d UD |
1873 | } |
1874 | { .mfi | |
a334319f UD |
1875 | nop.m 999 |
1876 | // | |
1877 | // if (i_1 == 0): | |
1878 | // poly_lo = FR_rsq * S_5 + S_4 | |
1879 | // poly_hi = FR_rsq * S_2 + S_1 | |
1880 | // | |
1881 | (p10) fma.s1 FR_poly_lo = FR_rsq, FR_C_5, FR_C_4 | |
1882 | nop.i 999 ;; | |
8da2915d UD |
1883 | } |
1884 | { .mfi | |
a334319f UD |
1885 | nop.m 999 |
1886 | // | |
1887 | // if (i_1 == 0): | |
1888 | // Z = Z * r for only one of the small r cases - not there | |
1889 | // in original implementation notes. | |
1890 | // | |
1891 | (p9) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_S_3 | |
1892 | nop.i 999 ;; | |
8da2915d UD |
1893 | } |
1894 | { .mfi | |
a334319f UD |
1895 | nop.m 999 |
1896 | (p10) fma.s1 FR_poly_hi = FR_rsq, FR_C_2, FR_C_1 | |
1897 | nop.i 999 | |
8da2915d UD |
1898 | } |
1899 | { .mfi | |
a334319f UD |
1900 | nop.m 999 |
1901 | (p10) fma.s0 FR_C_1 = FR_C_1, FR_C_1, f0 | |
1902 | nop.i 999 ;; | |
8da2915d UD |
1903 | } |
1904 | { .mfi | |
a334319f UD |
1905 | nop.m 999 |
1906 | (p9) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0 | |
1907 | nop.i 999 | |
8da2915d UD |
1908 | } |
1909 | { .mfi | |
a334319f UD |
1910 | nop.m 999 |
1911 | // | |
1912 | // poly_lo = FR_rsq * poly_lo + S_3 | |
1913 | // poly_hi = FR_rsq * poly_hi | |
1914 | // | |
1915 | (p10) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_C_3 | |
1916 | nop.i 999 ;; | |
8da2915d UD |
1917 | } |
1918 | { .mfi | |
a334319f UD |
1919 | nop.m 999 |
1920 | (p10) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0 | |
1921 | nop.i 999 ;; | |
8da2915d UD |
1922 | } |
1923 | { .mfi | |
a334319f UD |
1924 | nop.m 999 |
1925 | // | |
1926 | // if (i_1 == 0): dummy fmpy's to flag inexact | |
1927 | // r = 1 | |
1928 | // | |
1929 | (p9) fma.s1 FR_poly_hi = FR_r, FR_poly_hi, f0 | |
1930 | nop.i 999 | |
8da2915d UD |
1931 | } |
1932 | { .mfi | |
a334319f UD |
1933 | nop.m 999 |
1934 | // | |
1935 | // poly_hi = r * poly_hi | |
1936 | // | |
1937 | (p0) fma.s1 FR_poly = FR_Z, FR_poly_lo, FR_c | |
1938 | nop.i 999 ;; | |
8da2915d UD |
1939 | } |
1940 | { .mfi | |
a334319f UD |
1941 | nop.m 999 |
1942 | (p12) fms.s1 FR_r = f0, f1, FR_r | |
1943 | nop.i 999 ;; | |
8da2915d UD |
1944 | } |
1945 | { .mfi | |
a334319f UD |
1946 | nop.m 999 |
1947 | // | |
1948 | // poly_hi = Z * poly_lo + c | |
1949 | // if i_0 == 1: r = -r | |
1950 | // | |
1951 | (p0) fma.s1 FR_poly = FR_poly, f1, FR_poly_hi | |
1952 | nop.i 999 ;; | |
8da2915d | 1953 | } |
a334319f UD |
1954 | { .mfi |
1955 | nop.m 999 | |
1956 | (p12) fms.s0 FR_Input_X = FR_r, f1, FR_poly | |
1957 | nop.i 999 | |
1958 | } | |
1959 | { .mfb | |
1960 | nop.m 999 | |
1961 | // | |
1962 | // poly = poly + poly_hi | |
1963 | // | |
1964 | (p11) fma.s0 FR_Input_X = FR_r, f1, FR_poly | |
8da2915d UD |
1965 | // |
1966 | // if (i_0 == 0) Result = r + poly | |
1967 | // if (i_0 != 0) Result = r - poly | |
1968 | // | |
a334319f UD |
1969 | (p0) br.ret.sptk b0 ;; |
1970 | } | |
1971 | L(SINCOSL_NORMAL_R): | |
1972 | { .mii | |
1973 | nop.m 999 | |
1974 | (p0) extr.u GR_i_1 = GR_N_Inc, 0, 1 ;; | |
1975 | // | |
1976 | // Set table_ptr1 and table_ptr2 to base address of | |
1977 | // constant table. | |
1978 | (p0) cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;; | |
1979 | } | |
8da2915d | 1980 | { .mfi |
a334319f UD |
1981 | nop.m 999 |
1982 | (p0) fma.s1 FR_rsq = FR_r, FR_r, f0 | |
1983 | (p0) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;; | |
8da2915d | 1984 | } |
a334319f UD |
1985 | { .mfi |
1986 | nop.m 999 | |
1987 | (p0) frcpa.s1 FR_r_hi, p6 = f1, FR_r | |
1988 | (p0) cmp.eq.unc p11, p12 = 0x0, GR_i_0 | |
8da2915d UD |
1989 | } |
1990 | ;; | |
1991 | ||
a334319f UD |
1992 | // ****************************************************************** |
1993 | // ****************************************************************** | |
1994 | // ****************************************************************** | |
8da2915d | 1995 | // |
a334319f UD |
1996 | // r and c have been computed. |
1997 | // We known whether this is the sine or cosine routine. | |
1998 | // Make sure ftz mode is set - should be automatic when using wre | |
1999 | // Get [i_0,i_1] - two lsb of N_fix_gr alone. | |
8da2915d | 2000 | // |
a334319f UD |
2001 | |
2002 | { .mmi | |
2003 | nop.m 999 | |
2004 | (p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp | |
8da2915d UD |
2005 | nop.i 999 |
2006 | } | |
2007 | ;; | |
2008 | ||
2009 | { .mmi | |
a334319f UD |
2010 | ld8 GR_Table_Base = [GR_Table_Base] |
2011 | nop.m 999 | |
8da2915d UD |
2012 | nop.i 999 |
2013 | } | |
2014 | ;; | |
2015 | ||
2016 | ||
2017 | { .mfi | |
a334319f UD |
2018 | (p10) add GR_Table_Base = 384, GR_Table_Base |
2019 | (p12) fms.s1 FR_Input_X = f0, f1, f1 | |
2020 | (p9) add GR_Table_Base = 224, GR_Table_Base ;; | |
8da2915d UD |
2021 | } |
2022 | { .mfi | |
a334319f UD |
2023 | (p10) ldfe FR_QQ_8 = [GR_Table_Base], 16 |
2024 | // | |
2025 | // if (i_1==0) poly = poly * FR_rsq + PP_1_lo | |
2026 | // else poly = FR_rsq * poly | |
2027 | // | |
2028 | (p11) fma.s1 FR_Input_X = f0, f1, f1 | |
2029 | nop.i 999 ;; | |
2030 | } | |
2031 | { .mmb | |
2032 | (p10) ldfe FR_QQ_7 = [GR_Table_Base], 16 | |
2033 | // | |
2034 | // Adjust table pointers based on i_0 | |
2035 | // Compute rsq = r * r | |
2036 | // | |
2037 | (p9) ldfe FR_PP_8 = [GR_Table_Base], 16 | |
2038 | nop.b 999 ;; | |
8da2915d UD |
2039 | } |
2040 | { .mfi | |
a334319f UD |
2041 | nop.m 999 |
2042 | (p0) fma.s1 FR_r_cubed = FR_r, FR_rsq, f0 | |
2043 | nop.i 999 ;; | |
8da2915d UD |
2044 | } |
2045 | { .mmf | |
a334319f UD |
2046 | (p9) ldfe FR_PP_7 = [GR_Table_Base], 16 |
2047 | (p10) ldfe FR_QQ_6 = [GR_Table_Base], 16 | |
2048 | // | |
2049 | // Load PP_8 and QQ_8; PP_7 and QQ_7 | |
2050 | // | |
2051 | (p0) frcpa.s1 FR_r_hi, p6 = f1, FR_r_hi ;; | |
8da2915d | 2052 | } |
a334319f UD |
2053 | // |
2054 | // if (i_1==0) poly = PP_7 + FR_rsq * PP_8. | |
2055 | // else poly = QQ_7 + FR_rsq * QQ_8. | |
2056 | // | |
2057 | { .mmb | |
2058 | (p9) ldfe FR_PP_6 = [GR_Table_Base], 16 | |
2059 | (p10) ldfe FR_QQ_5 = [GR_Table_Base], 16 | |
2060 | nop.b 999 ;; | |
8da2915d | 2061 | } |
a334319f UD |
2062 | { .mmb |
2063 | (p9) ldfe FR_PP_5 = [GR_Table_Base], 16 | |
2064 | (p10) ldfe FR_S_1 = [GR_Table_Base], 16 | |
2065 | nop.b 999 ;; | |
2066 | } | |
2067 | { .mmb | |
2068 | (p10) ldfe FR_QQ_1 = [GR_Table_Base], 16 | |
2069 | (p9) ldfe FR_C_1 = [GR_Table_Base], 16 | |
2070 | nop.b 999 ;; | |
2071 | } | |
2072 | { .mmb | |
2073 | (p10) ldfe FR_QQ_4 = [GR_Table_Base], 16 | |
2074 | (p9) ldfe FR_PP_1 = [GR_Table_Base], 16 | |
2075 | nop.b 999 ;; | |
2076 | } | |
2077 | { .mmb | |
2078 | (p10) ldfe FR_QQ_3 = [GR_Table_Base], 16 | |
2079 | // | |
2080 | // if (i_1=0) corr = corr + c*c | |
2081 | // else corr = corr * c | |
2082 | // | |
2083 | (p9) ldfe FR_PP_4 = [GR_Table_Base], 16 | |
2084 | nop.b 999 ;; | |
8da2915d UD |
2085 | } |
2086 | { .mfi | |
a334319f UD |
2087 | nop.m 999 |
2088 | (p10) fma.s1 FR_poly = FR_rsq, FR_QQ_8, FR_QQ_7 | |
2089 | nop.i 999 ;; | |
2090 | } | |
2091 | // | |
2092 | // if (i_1=0) poly = rsq * poly + PP_5 | |
2093 | // else poly = rsq * poly + QQ_5 | |
2094 | // Load PP_4 or QQ_4 | |
2095 | // | |
2096 | { .mmi | |
2097 | (p9) ldfe FR_PP_3 = [GR_Table_Base], 16 ;; | |
2098 | (p10) ldfe FR_QQ_2 = [GR_Table_Base], 16 | |
2099 | nop.i 999 | |
8da2915d UD |
2100 | } |
2101 | { .mfi | |
a334319f UD |
2102 | nop.m 999 |
2103 | // | |
2104 | // r_hi = frcpa(frcpa(r)). | |
2105 | // r_cube = r * FR_rsq. | |
2106 | // | |
2107 | (p9) fma.s1 FR_poly = FR_rsq, FR_PP_8, FR_PP_7 | |
2108 | nop.i 999 ;; | |
8da2915d | 2109 | } |
a334319f UD |
2110 | // |
2111 | // Do dummy multiplies so inexact is always set. | |
2112 | // | |
8da2915d | 2113 | { .mfi |
a334319f UD |
2114 | (p9) ldfe FR_PP_2 = [GR_Table_Base], 16 |
2115 | // | |
2116 | // r_lo = r - r_hi | |
2117 | // | |
2118 | (p9) fma.s1 FR_U_lo = FR_r_hi, FR_r_hi, f0 | |
2119 | nop.i 999 ;; | |
2120 | } | |
2121 | { .mbb | |
2122 | (p9) ldfe FR_PP_1_lo = [GR_Table_Base], 16 | |
2123 | nop.b 999 | |
2124 | nop.b 999 ;; | |
8da2915d UD |
2125 | } |
2126 | { .mfi | |
a334319f UD |
2127 | nop.m 999 |
2128 | (p10) fma.s1 FR_corr = FR_S_1, FR_r_cubed, FR_r | |
2129 | nop.i 999 | |
8da2915d UD |
2130 | } |
2131 | { .mfi | |
a334319f UD |
2132 | nop.m 999 |
2133 | (p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_6 | |
2134 | nop.i 999 ;; | |
8da2915d UD |
2135 | } |
2136 | { .mfi | |
a334319f UD |
2137 | nop.m 999 |
2138 | // | |
2139 | // if (i_1=0) U_lo = r_hi * r_hi | |
2140 | // else U_lo = r_hi + r | |
2141 | // | |
2142 | (p9) fma.s1 FR_corr = FR_C_1, FR_rsq, f0 | |
2143 | nop.i 999 ;; | |
8da2915d UD |
2144 | } |
2145 | { .mfi | |
a334319f UD |
2146 | nop.m 999 |
2147 | // | |
2148 | // if (i_1=0) corr = C_1 * rsq | |
2149 | // else corr = S_1 * r_cubed + r | |
2150 | // | |
2151 | (p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_6 | |
2152 | nop.i 999 ;; | |
8da2915d UD |
2153 | } |
2154 | { .mfi | |
a334319f UD |
2155 | nop.m 999 |
2156 | (p10) fma.s1 FR_U_lo = FR_r_hi, f1, FR_r | |
2157 | nop.i 999 | |
8da2915d UD |
2158 | } |
2159 | { .mfi | |
a334319f UD |
2160 | nop.m 999 |
2161 | // | |
2162 | // if (i_1=0) U_hi = r_hi + U_hi | |
2163 | // else U_hi = QQ_1 * U_hi + 1 | |
2164 | // | |
2165 | (p9) fma.s1 FR_U_lo = FR_r, FR_r_hi, FR_U_lo | |
2166 | nop.i 999 ;; | |
8da2915d UD |
2167 | } |
2168 | { .mfi | |
a334319f UD |
2169 | nop.m 999 |
2170 | // | |
2171 | // U_hi = r_hi * r_hi | |
2172 | // | |
2173 | (p0) fms.s1 FR_r_lo = FR_r, f1, FR_r_hi | |
2174 | nop.i 999 | |
8da2915d UD |
2175 | } |
2176 | { .mfi | |
a334319f UD |
2177 | nop.m 999 |
2178 | // | |
2179 | // Load PP_1, PP_6, PP_5, and C_1 | |
2180 | // Load QQ_1, QQ_6, QQ_5, and S_1 | |
2181 | // | |
2182 | (p0) fma.s1 FR_U_hi = FR_r_hi, FR_r_hi, f0 | |
2183 | nop.i 999 ;; | |
8da2915d UD |
2184 | } |
2185 | { .mfi | |
a334319f UD |
2186 | nop.m 999 |
2187 | (p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_5 | |
2188 | nop.i 999 | |
8da2915d UD |
2189 | } |
2190 | { .mfi | |
a334319f UD |
2191 | nop.m 999 |
2192 | (p10) fnma.s1 FR_corr = FR_corr, FR_c, f0 | |
2193 | nop.i 999 ;; | |
8da2915d UD |
2194 | } |
2195 | { .mfi | |
a334319f UD |
2196 | nop.m 999 |
2197 | // | |
2198 | // if (i_1=0) U_lo = r * r_hi + U_lo | |
2199 | // else U_lo = r_lo * U_lo | |
2200 | // | |
2201 | (p9) fma.s1 FR_corr = FR_corr, FR_c, FR_c | |
2202 | nop.i 999 ;; | |
8da2915d UD |
2203 | } |
2204 | { .mfi | |
a334319f UD |
2205 | nop.m 999 |
2206 | (p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_5 | |
2207 | nop.i 999 | |
8da2915d UD |
2208 | } |
2209 | { .mfi | |
a334319f UD |
2210 | nop.m 999 |
2211 | // | |
2212 | // if (i_1 =0) U_hi = r + U_hi | |
2213 | // if (i_1 =0) U_lo = r_lo * U_lo | |
2214 | // | |
2215 | // | |
2216 | (p9) fma.s0 FR_PP_5 = FR_PP_5, FR_PP_4, f0 | |
2217 | nop.i 999 ;; | |
8da2915d UD |
2218 | } |
2219 | { .mfi | |
a334319f UD |
2220 | nop.m 999 |
2221 | (p9) fma.s1 FR_U_lo = FR_r, FR_r, FR_U_lo | |
2222 | nop.i 999 ;; | |
8da2915d UD |
2223 | } |
2224 | { .mfi | |
a334319f UD |
2225 | nop.m 999 |
2226 | (p10) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0 | |
2227 | nop.i 999 ;; | |
8da2915d UD |
2228 | } |
2229 | { .mfi | |
a334319f UD |
2230 | nop.m 999 |
2231 | // | |
2232 | // if (i_1=0) poly = poly * rsq + PP_6 | |
2233 | // else poly = poly * rsq + QQ_6 | |
2234 | // | |
2235 | (p9) fma.s1 FR_U_hi = FR_r_hi, FR_U_hi, f0 | |
2236 | nop.i 999 | |
8da2915d UD |
2237 | } |
2238 | { .mfi | |
a334319f UD |
2239 | nop.m 999 |
2240 | (p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_4 | |
2241 | nop.i 999 ;; | |
8da2915d UD |
2242 | } |
2243 | { .mfi | |
a334319f UD |
2244 | nop.m 999 |
2245 | (p10) fma.s1 FR_U_hi = FR_QQ_1, FR_U_hi, f1 | |
2246 | nop.i 999 | |
8da2915d UD |
2247 | } |
2248 | { .mfi | |
a334319f UD |
2249 | nop.m 999 |
2250 | (p10) fma.s0 FR_QQ_5 = FR_QQ_5, FR_QQ_5, f0 | |
2251 | nop.i 999 ;; | |
8da2915d UD |
2252 | } |
2253 | { .mfi | |
a334319f UD |
2254 | nop.m 999 |
2255 | // | |
2256 | // if (i_1!=0) U_hi = PP_1 * U_hi | |
2257 | // if (i_1!=0) U_lo = r * r + U_lo | |
2258 | // Load PP_3 or QQ_3 | |
2259 | // | |
2260 | (p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_4 | |
2261 | nop.i 999 ;; | |
8da2915d UD |
2262 | } |
2263 | { .mfi | |
a334319f UD |
2264 | nop.m 999 |
2265 | (p9) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0 | |
2266 | nop.i 999 ;; | |
8da2915d UD |
2267 | } |
2268 | { .mfi | |
a334319f UD |
2269 | nop.m 999 |
2270 | (p10) fma.s1 FR_U_lo = FR_QQ_1,FR_U_lo, f0 | |
2271 | nop.i 999 ;; | |
8da2915d UD |
2272 | } |
2273 | { .mfi | |
a334319f UD |
2274 | nop.m 999 |
2275 | (p9) fma.s1 FR_U_hi = FR_PP_1, FR_U_hi, f0 | |
2276 | nop.i 999 | |
8da2915d UD |
2277 | } |
2278 | { .mfi | |
a334319f UD |
2279 | nop.m 999 |
2280 | (p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_3 | |
2281 | nop.i 999 ;; | |
8da2915d UD |
2282 | } |
2283 | { .mfi | |
a334319f UD |
2284 | nop.m 999 |
2285 | // | |
2286 | // Load PP_2, QQ_2 | |
2287 | // | |
2288 | (p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_3 | |
2289 | nop.i 999 ;; | |
8da2915d UD |
2290 | } |
2291 | { .mfi | |
a334319f UD |
2292 | nop.m 999 |
2293 | // | |
2294 | // if (i_1==0) poly = FR_rsq * poly + PP_3 | |
2295 | // else poly = FR_rsq * poly + QQ_3 | |
2296 | // Load PP_1_lo | |
2297 | // | |
2298 | (p9) fma.s1 FR_U_lo = FR_PP_1, FR_U_lo, f0 | |
2299 | nop.i 999 ;; | |
8da2915d UD |
2300 | } |
2301 | { .mfi | |
a334319f UD |
2302 | nop.m 999 |
2303 | // | |
2304 | // if (i_1 =0) poly = poly * rsq + pp_r4 | |
2305 | // else poly = poly * rsq + qq_r4 | |
2306 | // | |
2307 | (p9) fma.s1 FR_U_hi = FR_r, f1, FR_U_hi | |
2308 | nop.i 999 | |
8da2915d UD |
2309 | } |
2310 | { .mfi | |
a334319f UD |
2311 | nop.m 999 |
2312 | (p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_2 | |
2313 | nop.i 999 ;; | |
8da2915d UD |
2314 | } |
2315 | { .mfi | |
a334319f UD |
2316 | nop.m 999 |
2317 | // | |
2318 | // if (i_1==0) U_lo = PP_1_hi * U_lo | |
2319 | // else U_lo = QQ_1 * U_lo | |
2320 | // | |
2321 | (p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_2 | |
2322 | nop.i 999 ;; | |
8da2915d UD |
2323 | } |
2324 | { .mfi | |
a334319f UD |
2325 | nop.m 999 |
2326 | // | |
2327 | // if (i_0==0) Result = 1 | |
2328 | // else Result = -1 | |
2329 | // | |
2330 | (p0) fma.s1 FR_V = FR_U_lo, f1, FR_corr | |
2331 | nop.i 999 ;; | |
8da2915d UD |
2332 | } |
2333 | { .mfi | |
a334319f UD |
2334 | nop.m 999 |
2335 | (p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0 | |
2336 | nop.i 999 ;; | |
8da2915d UD |
2337 | } |
2338 | { .mfi | |
a334319f UD |
2339 | nop.m 999 |
2340 | // | |
2341 | // if (i_1==0) poly = FR_rsq * poly + PP_2 | |
2342 | // else poly = FR_rsq * poly + QQ_2 | |
2343 | // | |
2344 | (p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_1_lo | |
2345 | nop.i 999 ;; | |
8da2915d UD |
2346 | } |
2347 | { .mfi | |
a334319f UD |
2348 | nop.m 999 |
2349 | (p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0 | |
2350 | nop.i 999 ;; | |
8da2915d | 2351 | } |
a334319f UD |
2352 | { .mfi |
2353 | nop.m 999 | |
2354 | // | |
2355 | // V = U_lo + corr | |
2356 | // | |
2357 | (p9) fma.s1 FR_poly = FR_r_cubed, FR_poly, f0 | |
2358 | nop.i 999 ;; | |
0ecb606c | 2359 | } |
0ecb606c | 2360 | { .mfi |
a334319f UD |
2361 | nop.m 999 |
2362 | // | |
2363 | // if (i_1==0) poly = r_cube * poly | |
2364 | // else poly = FR_rsq * poly | |
2365 | // | |
2366 | (p0) fma.s1 FR_V = FR_poly, f1, FR_V | |
2367 | nop.i 999 ;; | |
8da2915d UD |
2368 | } |
2369 | { .mfi | |
a334319f UD |
2370 | nop.m 999 |
2371 | (p12) fms.s0 FR_Input_X = FR_Input_X, FR_U_hi, FR_V | |
2372 | nop.i 999 | |
8da2915d UD |
2373 | } |
2374 | { .mfb | |
a334319f UD |
2375 | nop.m 999 |
2376 | // | |
2377 | // V = V + poly | |
2378 | // | |
2379 | (p11) fma.s0 FR_Input_X = FR_Input_X, FR_U_hi, FR_V | |
2380 | // | |
2381 | // if (i_0==0) Result = Result * U_hi + V | |
2382 | // else Result = Result * U_hi - V | |
2383 | // | |
2384 | (p0) br.ret.sptk b0 | |
2385 | };; | |
0ecb606c | 2386 | |
a334319f UD |
2387 | // |
2388 | // If cosine, FR_Input_X = 1 | |
2389 | // If sine, FR_Input_X = +/-Zero (Input FR_Input_X) | |
2390 | // Results are exact, no exceptions | |
2391 | // | |
0ecb606c | 2392 | |
a334319f UD |
2393 | L(SINCOSL_ZERO): |
2394 | { .mbb | |
2395 | (p0) cmp.eq.unc p6, p7 = 0x1, GR_Sin_or_Cos | |
2396 | nop.b 999 | |
2397 | nop.b 999 ;; | |
2398 | } | |
2399 | { .mfi | |
2400 | nop.m 999 | |
2401 | (p7) fmerge.s FR_Input_X = FR_Input_X, FR_Input_X | |
2402 | nop.i 999 | |
8da2915d | 2403 | } |
a334319f UD |
2404 | { .mfb |
2405 | nop.m 999 | |
2406 | (p6) fmerge.s FR_Input_X = f1, f1 | |
2407 | (p0) br.ret.sptk b0 ;; | |
2408 | } | |
2409 | L(SINCOSL_SPECIAL): | |
8da2915d UD |
2410 | { .mfb |
2411 | nop.m 999 | |
2412 | // | |
2413 | // Path for Arg = +/- QNaN, SNaN, Inf | |
2414 | // Invalid can be raised. SNaNs | |
2415 | // become QNaNs | |
2416 | // | |
a334319f UD |
2417 | (p0) fmpy.s0 FR_Input_X = FR_Input_X, f0 |
2418 | (p0) br.ret.sptk b0 ;; | |
8da2915d | 2419 | } |
a334319f UD |
2420 | .endp cosl# |
2421 | ASM_SIZE_DIRECTIVE(cosl#) | |
8da2915d | 2422 | |
a334319f UD |
2423 | // Call int pi_by_2_reduce(double* x, double *y) |
2424 | // for |arguments| >= 2**63 | |
2425 | // Address to save r and c as double | |
2426 | // | |
2427 | // sp+32 -> f0 | |
2428 | // r45 sp+16 -> f0 | |
2429 | // r44 -> sp -> InputX | |
2430 | // | |
0ecb606c | 2431 | |
a334319f UD |
2432 | .proc __libm_callout |
2433 | __libm_callout: | |
2434 | L(SINCOSL_ARG_TOO_LARGE): | |
8da2915d UD |
2435 | .prologue |
2436 | { .mfi | |
a334319f | 2437 | add r45=-32,sp // Parameter: r address |
8da2915d UD |
2438 | nop.f 0 |
2439 | .save ar.pfs,GR_SAVE_PFS | |
2440 | mov GR_SAVE_PFS=ar.pfs // Save ar.pfs | |
a334319f UD |
2441 | } |
2442 | { .mfi | |
2443 | .fframe 64 | |
2444 | add sp=-64,sp // Create new stack | |
2445 | nop.f 0 | |
2446 | mov GR_SAVE_GP=gp // Save gp | |
8da2915d UD |
2447 | };; |
2448 | { .mmi | |
a334319f UD |
2449 | stfe [r45] = f0,16 // Clear Parameter r on stack |
2450 | add r44 = 16,sp // Parameter x address | |
8da2915d UD |
2451 | .save b0, GR_SAVE_B0 |
2452 | mov GR_SAVE_B0=b0 // Save b0 | |
2453 | };; | |
2454 | .body | |
2455 | { .mib | |
a334319f UD |
2456 | stfe [r45] = f0,-16 // Clear Parameter c on stack |
2457 | nop.i 0 | |
2458 | nop.b 0 | |
2459 | } | |
2460 | { .mib | |
2461 | stfe [r44] = FR_Input_X // Store Parameter x on stack | |
8da2915d | 2462 | nop.i 0 |
a334319f | 2463 | (p0) br.call.sptk b0=__libm_pi_by_2_reduce# ;; |
8da2915d | 2464 | };; |
a334319f UD |
2465 | { .mii |
2466 | (p0) ldfe FR_Input_X =[r44],16 | |
2467 | // | |
2468 | // Get r and c off stack | |
2469 | // | |
2470 | (p0) adds GR_Table_Base1 = -16, GR_Table_Base1 | |
2471 | // | |
2472 | // Get r and c off stack | |
2473 | // | |
2474 | (p0) add GR_N_Inc = GR_Sin_or_Cos,r8 ;; | |
2475 | } | |
2476 | { .mmb | |
2477 | (p0) ldfe FR_r =[r45],16 | |
2478 | // | |
2479 | // Get X off the stack | |
2480 | // Readjust Table ptr | |
2481 | // | |
2482 | (p0) ldfs FR_Two_to_M3 = [GR_Table_Base1],4 | |
2483 | nop.b 999 ;; | |
2484 | } | |
2485 | { .mmb | |
2486 | (p0) ldfs FR_Neg_Two_to_M3 = [GR_Table_Base1],0 | |
2487 | (p0) ldfe FR_c =[r45] | |
2488 | nop.b 999 ;; | |
2489 | } | |
8da2915d | 2490 | { .mfi |
a334319f UD |
2491 | .restore sp |
2492 | add sp = 64,sp // Restore stack pointer | |
2493 | (p0) fcmp.lt.unc.s1 p6, p0 = FR_r, FR_Two_to_M3 | |
8da2915d UD |
2494 | mov b0 = GR_SAVE_B0 // Restore return address |
2495 | };; | |
a334319f | 2496 | { .mib |
8da2915d UD |
2497 | mov gp = GR_SAVE_GP // Restore gp |
2498 | mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs | |
a334319f | 2499 | nop.b 0 |
8da2915d | 2500 | };; |
a334319f UD |
2501 | { .mfi |
2502 | nop.m 999 | |
2503 | (p6) fcmp.gt.unc.s1 p6, p0 = FR_r, FR_Neg_Two_to_M3 | |
2504 | nop.i 999 ;; | |
2505 | } | |
2506 | { .mib | |
2507 | nop.m 999 | |
2508 | nop.i 999 | |
2509 | (p6) br.cond.spnt L(SINCOSL_SMALL_R) ;; | |
2510 | } | |
2511 | { .mib | |
2512 | nop.m 999 | |
2513 | nop.i 999 | |
2514 | (p0) br.cond.sptk L(SINCOSL_NORMAL_R) ;; | |
2515 | } | |
2516 | .endp __libm_callout | |
2517 | ASM_SIZE_DIRECTIVE(__libm_callout) | |
8da2915d UD |
2518 | .type __libm_pi_by_2_reduce#,@function |
2519 | .global __libm_pi_by_2_reduce# |