]>
Commit | Line | Data |
---|---|---|
d5efd131 MF |
1 | .file "atanl.s" |
2 | ||
3 | ||
4 | // Copyright (c) 2000 - 2005, Intel Corporation | |
5 | // All rights reserved. | |
6 | // | |
7 | // Contributed 2000 by the Intel Numerics Group, Intel Corporation | |
8 | // | |
9 | // Redistribution and use in source and binary forms, with or without | |
10 | // modification, are permitted provided that the following conditions are | |
11 | // met: | |
12 | // | |
13 | // * Redistributions of source code must retain the above copyright | |
14 | // notice, this list of conditions and the following disclaimer. | |
15 | // | |
16 | // * Redistributions in binary form must reproduce the above copyright | |
17 | // notice, this list of conditions and the following disclaimer in the | |
18 | // documentation and/or other materials provided with the distribution. | |
19 | // | |
20 | // * The name of Intel Corporation may not be used to endorse or promote | |
21 | // products derived from this software without specific prior written | |
22 | // permission. | |
23 | ||
0347518d MF |
24 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
25 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
d5efd131 | 26 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
0347518d | 27 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS |
d5efd131 | 28 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
0347518d MF |
29 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
30 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
31 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
d5efd131 | 32 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING |
0347518d MF |
33 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
34 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
35 | // | |
d5efd131 | 36 | // Intel Corporation is the author of this code, and requests that all |
0347518d | 37 | // problem reports or change requests be submitted to it directly at |
d5efd131 MF |
38 | // http://www.intel.com/software/products/opensource/libraries/num.htm. |
39 | // | |
40 | // | |
41 | //********************************************************************* | |
42 | // | |
43 | // History | |
44 | // 02/02/00 (hand-optimized) | |
45 | // 04/04/00 Unwind support added | |
46 | // 08/15/00 Bundle added after call to __libm_error_support to properly | |
47 | // set [the previously overwritten] GR_Parameter_RESULT. | |
48 | // 03/13/01 Fixed flags when denormal raised on intermediate result | |
49 | // 01/08/02 Improved speed. | |
50 | // 02/06/02 Corrected .section statement | |
51 | // 05/20/02 Cleaned up namespace and sf0 syntax | |
52 | // 02/10/03 Reordered header: .section, .global, .proc, .align; | |
53 | // used data8 for long double table values | |
54 | // 03/31/05 Reformatted delimiters between data tables | |
55 | // | |
56 | //********************************************************************* | |
57 | // | |
58 | // Function: atanl(x) = inverse tangent(x), for double extended x values | |
59 | // Function: atan2l(y,x) = atan(y/x), for double extended y, x values | |
60 | // | |
61 | // API | |
62 | // | |
63 | // long double atanl (long double x) | |
64 | // long double atan2l (long double y, long double x) | |
65 | // | |
66 | //********************************************************************* | |
67 | // | |
68 | // Resources Used: | |
69 | // | |
70 | // Floating-Point Registers: f8 (Input and Return Value) | |
71 | // f9 (Input for atan2l) | |
72 | // f10-f15, f32-f83 | |
73 | // | |
74 | // General Purpose Registers: | |
75 | // r32-r51 | |
76 | // r49-r52 (Arguments to error support for 0,0 case) | |
77 | // | |
78 | // Predicate Registers: p6-p15 | |
79 | // | |
80 | //********************************************************************* | |
81 | // | |
82 | // IEEE Special Conditions: | |
83 | // | |
84 | // Denormal fault raised on denormal inputs | |
0347518d | 85 | // Underflow exceptions may occur |
d5efd131 MF |
86 | // Special error handling for the y=0 and x=0 case |
87 | // Inexact raised when appropriate by algorithm | |
88 | // | |
89 | // atanl(SNaN) = QNaN | |
90 | // atanl(QNaN) = QNaN | |
91 | // atanl(+/-0) = +/- 0 | |
0347518d | 92 | // atanl(+/-Inf) = +/-pi/2 |
d5efd131 MF |
93 | // |
94 | // atan2l(Any NaN for x or y) = QNaN | |
0347518d MF |
95 | // atan2l(+/-0,x) = +/-0 for x > 0 |
96 | // atan2l(+/-0,x) = +/-pi for x < 0 | |
97 | // atan2l(+/-0,+0) = +/-0 | |
98 | // atan2l(+/-0,-0) = +/-pi | |
d5efd131 MF |
99 | // atan2l(y,+/-0) = pi/2 y > 0 |
100 | // atan2l(y,+/-0) = -pi/2 y < 0 | |
101 | // atan2l(+/-y, Inf) = +/-0 for finite y > 0 | |
0347518d MF |
102 | // atan2l(+/-Inf, x) = +/-pi/2 for finite x |
103 | // atan2l(+/-y, -Inf) = +/-pi for finite y > 0 | |
d5efd131 MF |
104 | // atan2l(+/-Inf, Inf) = +/-pi/4 |
105 | // atan2l(+/-Inf, -Inf) = +/-3pi/4 | |
106 | // | |
107 | //********************************************************************* | |
108 | // | |
109 | // Mathematical Description | |
110 | // --------------------------- | |
111 | // | |
112 | // The function ATANL( Arg_Y, Arg_X ) returns the "argument" | |
113 | // or the "phase" of the complex number | |
114 | // | |
115 | // Arg_X + i Arg_Y | |
116 | // | |
117 | // or equivalently, the angle in radians from the positive | |
118 | // x-axis to the line joining the origin and the point | |
119 | // (Arg_X,Arg_Y) | |
120 | // | |
121 | // | |
122 | // (Arg_X, Arg_Y) x | |
123 | // \ | |
124 | // \ | |
125 | // \ | |
126 | // \ | |
127 | // \ angle between is ATANL(Arg_Y,Arg_X) | |
128 | ||
129 | ||
130 | ||
131 | ||
132 | // \ | |
133 | // ------------------> X-axis | |
134 | ||
135 | // Origin | |
136 | // | |
137 | // Moreover, this angle is reported in the range [-pi,pi] thus | |
138 | // | |
139 | // -pi <= ATANL( Arg_Y, Arg_X ) <= pi. | |
140 | // | |
141 | // From the geometry, it is easy to define ATANL when one of | |
142 | // Arg_X or Arg_Y is +-0 or +-inf: | |
143 | // | |
144 | // | |
145 | // \ Y | | |
146 | // X \ | +0 | -0 | +inf | -inf | finite non-zero | |
147 | // \ | | | | | | |
148 | // ______________________________________________________ | |
149 | // | | | | | |
150 | // +-0 | Invalid/ | pi/2 | -pi/2 | sign(Y)*pi/2 | |
151 | // | qNaN | | | | |
152 | // -------------------------------------------------------- | |
153 | // | | | | | | |
154 | // +inf | +0 | -0 | pi/4 | -pi/4 | sign(Y)*0 | |
155 | // -------------------------------------------------------- | |
156 | // | | | | | | |
157 | // -inf | +pi | -pi | 3pi/4 | -3pi/4 | sign(Y)*pi | |
158 | // -------------------------------------------------------- | |
159 | // finite | X>0? | pi/2 | -pi/2 | normal case | |
160 | // non-zero| sign(Y)*0: | | | | |
161 | // | sign(Y)*pi | | | | |
162 | // | |
163 | // | |
164 | // One must take note that ATANL is NOT the arctangent of the | |
165 | // value Arg_Y/Arg_X; but rather ATANL and arctan are related | |
166 | // in a slightly more complicated way as follows: | |
167 | // | |
168 | // Let U := max(|Arg_X|, |Arg_Y|); V := min(|Arg_X|, |Arg_Y|); | |
169 | // sign_X be the sign bit of Arg_X, i.e., sign_X is 0 or 1; | |
170 | // s_X be the sign of Arg_X, i.e., s_X = (-1)^sign_X; | |
171 | // | |
172 | // sign_Y be the sign bit of Arg_Y, i.e., sign_Y is 0 or 1; | |
173 | // s_Y be the sign of Arg_Y, i.e., s_Y = (-1)^sign_Y; | |
174 | // | |
175 | // swap be 0 if |Arg_X| >= |Arg_Y| and 1 otherwise. | |
176 | // | |
177 | // Then, ATANL(Arg_Y, Arg_X) = | |
178 | // | |
179 | // / arctan(V/U) \ sign_X = 0 & swap = 0 | |
180 | // | pi/2 - arctan(V/U) | sign_X = 0 & swap = 1 | |
181 | // s_Y * | | | |
182 | // | pi - arctan(V/U) | sign_X = 1 & swap = 0 | |
183 | // \ pi/2 + arctan(V/U) / sign_X = 1 & swap = 1 | |
184 | // | |
185 | // | |
186 | // This relationship also suggest that the algorithm's major | |
187 | // task is to calculate arctan(V/U) for 0 < V <= U; and the | |
188 | // final Result is given by | |
189 | // | |
190 | // s_Y * { (P_hi + P_lo) + sigma * arctan(V/U) } | |
191 | // | |
192 | // where | |
193 | // | |
194 | // (P_hi,P_lo) represents M(sign_X,swap)*(pi/2) accurately | |
195 | // | |
196 | // M(sign_X,swap) = 0 for sign_X = 0 and swap = 0 | |
197 | // 1 for swap = 1 | |
198 | // 2 for sign_X = 1 and swap = 0 | |
199 | // | |
200 | // and | |
201 | // | |
202 | // sigma = { (sign_X XOR swap) : -1.0 : 1.0 } | |
203 | // | |
204 | // = (-1) ^ ( sign_X XOR swap ) | |
205 | // | |
206 | // Both (P_hi,P_lo) and sigma can be stored in a table and fetched | |
207 | // using (sign_X,swap) as an index. (P_hi, P_lo) can be stored as a | |
208 | // double-precision, and single-precision pair; and sigma can | |
209 | // obviously be just a single-precision number. | |
210 | // | |
211 | // In the algorithm we propose, arctan(V/U) is calculated to high accuracy | |
212 | // as A_hi + A_lo. Consequently, the Result ATANL( Arg_Y, Arg_X ) is | |
213 | // given by | |
214 | // | |
215 | // s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo) | |
216 | // | |
217 | // We now discuss the calculation of arctan(V/U) for 0 < V <= U. | |
218 | // | |
219 | // For (V/U) < 2^(-3), we use a simple polynomial of the form | |
220 | // | |
221 | // z + z^3*(P_1 + z^2*(P_2 + z^2*(P_3 + ... + P_8))) | |
222 | // | |
223 | // where z = V/U. | |
224 | // | |
225 | // For the sake of accuracy, the first term "z" must approximate V/U to | |
226 | // extra precision. For z^3 and higher power, a working precision | |
227 | // approximation to V/U suffices. Thus, we obtain: | |
228 | // | |
229 | // z_hi + z_lo = V/U to extra precision and | |
230 | // z = V/U to working precision | |
231 | // | |
232 | // The value arctan(V/U) is delivered as two pieces (A_hi, A_lo) | |
233 | // | |
234 | // (A_hi,A_lo) = (z_hi, z^3*(P_1 + ... + P_8) + z_lo). | |
235 | // | |
236 | // | |
237 | // For 2^(-3) <= (V/U) <= 1, we use a table-driven approach. | |
238 | // Consider | |
239 | // | |
240 | // (V/U) = 2^k * 1.b_1 b_2 .... b_63 b_64 b_65 .... | |
241 | // | |
242 | // Define | |
243 | // | |
244 | // z_hi = 2^k * 1.b_1 b_2 b_3 b_4 1 | |
245 | // | |
246 | // then | |
247 | // / \ | |
248 | // | (V/U) - z_hi | | |
249 | ||
250 | // arctan(V/U) = arctan(z_hi) + acrtan| -------------- | | |
251 | // | 1 + (V/U)*z_hi | | |
252 | // \ / | |
253 | // | |
254 | // / \ | |
255 | // | V - z_hi*U | | |
256 | ||
257 | // = arctan(z_hi) + acrtan| -------------- | | |
258 | // | U + V*z_hi | | |
259 | // \ / | |
260 | // | |
261 | // = arctan(z_hi) + acrtan( V' / U' ) | |
262 | // | |
263 | // | |
264 | // where | |
265 | // | |
266 | // V' = V - U*z_hi; U' = U + V*z_hi. | |
267 | // | |
268 | // Let | |
269 | // | |
270 | // w_hi + w_lo = V'/U' to extra precision and | |
271 | // w = V'/U' to working precision | |
272 | // | |
273 | // then we can approximate arctan(V'/U') by | |
274 | // | |
275 | // arctan(V'/U') = w_hi + w_lo | |
276 | // + w^3*(Q_1 + w^2*(Q_2 + w^2*(Q_3 + w^2*Q_4))) | |
277 | // | |
278 | // = w_hi + w_lo + poly | |
279 | // | |
280 | // Finally, arctan(z_hi) is calculated beforehand and stored in a table | |
281 | // as Tbl_hi, Tbl_lo. Thus, | |
282 | // | |
283 | // (A_hi, A_lo) = (Tbl_hi, w_hi+(poly+(w_lo+Tbl_lo))) | |
284 | // | |
285 | // This completes the mathematical description. | |
286 | // | |
287 | // | |
288 | // Algorithm | |
289 | // ------------- | |
290 | // | |
291 | // Step 0. Check for unsupported format. | |
292 | // | |
293 | // If | |
294 | // ( expo(Arg_X) not zero AND msb(Arg_X) = 0 ) OR | |
295 | // ( expo(Arg_Y) not zero AND msb(Arg_Y) = 0 ) | |
296 | // | |
297 | // then one of the arguments is unsupported. Generate an | |
298 | // invalid and return qNaN. | |
299 | // | |
300 | // Step 1. Initialize | |
301 | // | |
302 | // Normalize Arg_X and Arg_Y and set the following | |
303 | // | |
304 | // sign_X := sign_bit(Arg_X) | |
305 | // s_Y := (sign_bit(Arg_Y)==0? 1.0 : -1.0) | |
306 | // swap := (|Arg_X| >= |Arg_Y|? 0 : 1 ) | |
307 | // U := max( |Arg_X|, |Arg_Y| ) | |
308 | // V := min( |Arg_X|, |Arg_Y| ) | |
309 | // | |
310 | // execute: frcpa E, pred, V, U | |
311 | // If pred is 0, go to Step 5 for special cases handling. | |
312 | // | |
313 | // Step 2. Decide on branch. | |
314 | // | |
315 | // Q := E * V | |
316 | // If Q < 2^(-3) go to Step 4 for simple polynomial case. | |
317 | // | |
318 | // Step 3. Table-driven algorithm. | |
319 | // | |
320 | // Q is represented as | |
321 | // | |
322 | // 2^(-k) * 1.b_1 b_2 b_3 ... b_63; k = 0,-1,-2,-3 | |
323 | // | |
324 | // and that if k = 0, b_1 = b_2 = b_3 = b_4 = 0. | |
325 | // | |
326 | // Define | |
327 | // | |
328 | // z_hi := 2^(-k) * 1.b_1 b_2 b_3 b_4 1 | |
329 | // | |
330 | // (note that there are 49 possible values of z_hi). | |
331 | // | |
332 | // ...We now calculate V' and U'. While V' is representable | |
333 | // ...as a 64-bit number because of cancellation, U' is | |
334 | // ...not in general a 64-bit number. Obtaining U' accurately | |
335 | // ...requires two working precision numbers | |
336 | // | |
337 | // U_prime_hi := U + V * z_hi ...WP approx. to U' | |
338 | // U_prime_lo := ( U - U_prime_hi ) + V*z_hi ...observe order | |
339 | // V_prime := V - U * z_hi ...this is exact | |
340 | // | |
341 | // C_hi := frcpa (1.0, U_prime_hi) ...C_hi approx 1/U'_hi | |
342 | // | |
343 | // loop 3 times | |
344 | // C_hi := C_hi + C_hi*(1.0 - C_hi*U_prime_hi) | |
345 | // | |
346 | // ...at this point C_hi is (1/U_prime_hi) to roughly 64 bits | |
347 | // | |
348 | // w_hi := V_prime * C_hi ...w_hi is V_prime/U_prime to | |
349 | // ...roughly working precision | |
350 | // | |
351 | // ...note that we want w_hi + w_lo to approximate | |
352 | // ...V_prime/(U_prime_hi + U_prime_lo) to extra precision | |
353 | // ...but for now, w_hi is good enough for the polynomial | |
354 | // ...calculation. | |
355 | // | |
356 | // wsq := w_hi*w_hi | |
357 | // poly := w_hi*wsq*(Q_1 + wsq*(Q_2 + wsq*(Q_3 + wsq*Q_4))) | |
358 | // | |
359 | // Fetch | |
360 | // (Tbl_hi, Tbl_lo) = atan(z_hi) indexed by (k,b_1,b_2,b_3,b_4) | |
361 | // ...Tbl_hi is a double-precision number | |
362 | // ...Tbl_lo is a single-precision number | |
363 | // | |
364 | // (P_hi, P_lo) := M(sign_X,swap)*(Pi_by_2_hi, Pi_by_2_lo) | |
365 | // ...as discussed previous. Again; the implementation can | |
366 | // ...chose to fetch P_hi and P_lo from a table indexed by | |
367 | // ...(sign_X, swap). | |
368 | // ...P_hi is a double-precision number; | |
369 | // ...P_lo is a single-precision number. | |
370 | // | |
371 | // ...calculate w_lo so that w_hi + w_lo is V'/U' accurately | |
372 | // w_lo := ((V_prime - w_hi*U_prime_hi) - | |
373 | // w_hi*U_prime_lo) * C_hi ...observe order | |
374 | // | |
375 | // | |
376 | // ...Ready to deliver arctan(V'/U') as A_hi, A_lo | |
377 | // A_hi := Tbl_hi | |
378 | // A_lo := w_hi + (poly + (Tbl_lo + w_lo)) ...observe order | |
379 | // | |
380 | // ...Deliver final Result | |
381 | // ...s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo) | |
382 | // | |
383 | // sigma := ( (sign_X XOR swap) ? -1.0 : 1.0 ) | |
384 | // ...sigma can be obtained by a table lookup using | |
385 | // ...(sign_X,swap) as index and stored as single precision | |
386 | // ...sigma should be calculated earlier | |
387 | // | |
388 | // P_hi := s_Y*P_hi | |
389 | // A_hi := s_Y*A_hi | |
390 | // | |
391 | // Res_hi := P_hi + sigma*A_hi ...this is exact because | |
392 | // ...both P_hi and Tbl_hi | |
393 | // ...are double-precision | |
394 | // ...and |Tbl_hi| > 2^(-4) | |
395 | // ...P_hi is either 0 or | |
396 | // ...between (1,4) | |
397 | // | |
398 | // Res_lo := sigma*A_lo + P_lo | |
399 | // | |
400 | // Return Res_hi + s_Y*Res_lo in user-defined rounding control | |
401 | // | |
402 | // Step 4. Simple polynomial case. | |
403 | // | |
404 | // ...E and Q are inherited from Step 2. | |
405 | // | |
406 | // A_hi := Q ...Q is inherited from Step 2 Q approx V/U | |
407 | // | |
408 | // loop 3 times | |
409 | // E := E + E2(1.0 - E*U1 | |
410 | // ...at this point E approximates 1/U to roughly working precision | |
411 | // | |
412 | // z := V * E ...z approximates V/U to roughly working precision | |
413 | // zsq := z * z | |
414 | // z4 := zsq * zsq; z8 := z4 * z4 | |
415 | // | |
416 | // poly1 := P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8))) | |
417 | // poly2 := zsq*(P_1 + zsq*(P_2 + zsq*P_3)) | |
418 | // | |
419 | // poly := poly1 + z8*poly2 | |
420 | // | |
421 | // z_lo := (V - A_hi*U)*E | |
422 | // | |
423 | // A_lo := z*poly + z_lo | |
424 | // ...A_hi, A_lo approximate arctan(V/U) accurately | |
425 | // | |
426 | // (P_hi, P_lo) := M(sign_X,swap)*(Pi_by_2_hi, Pi_by_2_lo) | |
427 | // ...one can store the M(sign_X,swap) as single precision | |
428 | // ...values | |
429 | // | |
430 | // ...Deliver final Result | |
431 | // ...s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo) | |
432 | // | |
433 | // sigma := ( (sign_X XOR swap) ? -1.0 : 1.0 ) | |
434 | // ...sigma can be obtained by a table lookup using | |
435 | // ...(sign_X,swap) as index and stored as single precision | |
436 | // ...sigma should be calculated earlier | |
437 | // | |
438 | // P_hi := s_Y*P_hi | |
439 | // A_hi := s_Y*A_hi | |
440 | // | |
441 | // Res_hi := P_hi + sigma*A_hi ...need to compute | |
442 | // ...P_hi + sigma*A_hi | |
443 | // ...exactly | |
444 | // | |
445 | // tmp := (P_hi - Res_hi) + sigma*A_hi | |
446 | // | |
447 | // Res_lo := s_Y*(sigma*A_lo + P_lo) + tmp | |
448 | // | |
449 | // Return Res_hi + Res_lo in user-defined rounding control | |
450 | // | |
451 | // Step 5. Special Cases | |
452 | // | |
453 | // These are detected early in the function by fclass instructions. | |
454 | // | |
455 | // We are in one of those special cases when X or Y is 0,+-inf or NaN | |
456 | // | |
457 | // If one of X and Y is NaN, return X+Y (which will generate | |
458 | // invalid in case one is a signaling NaN). Otherwise, | |
459 | // return the Result as described in the table | |
460 | // | |
461 | // | |
462 | // | |
463 | // \ Y | | |
464 | // X \ | +0 | -0 | +inf | -inf | finite non-zero | |
465 | // \ | | | | | | |
466 | // ______________________________________________________ | |
467 | // | | | | | |
468 | // +-0 | Invalid/ | pi/2 | -pi/2 | sign(Y)*pi/2 | |
469 | // | qNaN | | | | |
470 | // -------------------------------------------------------- | |
471 | // | | | | | | |
472 | // +inf | +0 | -0 | pi/4 | -pi/4 | sign(Y)*0 | |
473 | // -------------------------------------------------------- | |
474 | // | | | | | | |
475 | // -inf | +pi | -pi | 3pi/4 | -3pi/4 | sign(Y)*pi | |
476 | // -------------------------------------------------------- | |
477 | // finite | X>0? | pi/2 | -pi/2 | | |
478 | // non-zero| sign(Y)*0: | | | N/A | |
479 | // | sign(Y)*pi | | | | |
480 | // | |
481 | // | |
482 | ||
483 | ArgY_orig = f8 | |
484 | Result = f8 | |
485 | FR_RESULT = f8 | |
486 | ArgX_orig = f9 | |
487 | ArgX = f10 | |
488 | FR_X = f10 | |
489 | ArgY = f11 | |
490 | FR_Y = f11 | |
491 | s_Y = f12 | |
492 | U = f13 | |
493 | V = f14 | |
494 | E = f15 | |
495 | Q = f32 | |
496 | z_hi = f33 | |
497 | U_prime_hi = f34 | |
498 | U_prime_lo = f35 | |
499 | V_prime = f36 | |
500 | C_hi = f37 | |
501 | w_hi = f38 | |
502 | w_lo = f39 | |
503 | wsq = f40 | |
504 | poly = f41 | |
505 | Tbl_hi = f42 | |
506 | Tbl_lo = f43 | |
507 | P_hi = f44 | |
508 | P_lo = f45 | |
509 | A_hi = f46 | |
510 | A_lo = f47 | |
511 | sigma = f48 | |
512 | Res_hi = f49 | |
513 | Res_lo = f50 | |
514 | Z = f52 | |
515 | zsq = f53 | |
516 | z4 = f54 | |
517 | z8 = f54 | |
518 | poly1 = f55 | |
519 | poly2 = f56 | |
520 | z_lo = f57 | |
521 | tmp = f58 | |
522 | P_1 = f59 | |
523 | Q_1 = f60 | |
524 | P_2 = f61 | |
525 | Q_2 = f62 | |
526 | P_3 = f63 | |
527 | Q_3 = f64 | |
528 | P_4 = f65 | |
529 | Q_4 = f66 | |
530 | P_5 = f67 | |
531 | P_6 = f68 | |
532 | P_7 = f69 | |
533 | P_8 = f70 | |
534 | U_hold = f71 | |
535 | TWO_TO_NEG3 = f72 | |
536 | C_hi_hold = f73 | |
537 | E_hold = f74 | |
538 | M = f75 | |
539 | ArgX_abs = f76 | |
540 | ArgY_abs = f77 | |
541 | Result_lo = f78 | |
542 | A_temp = f79 | |
543 | FR_temp = f80 | |
544 | Xsq = f81 | |
545 | Ysq = f82 | |
546 | tmp_small = f83 | |
547 | ||
548 | GR_SAVE_PFS = r33 | |
549 | GR_SAVE_B0 = r34 | |
550 | GR_SAVE_GP = r35 | |
551 | sign_X = r36 | |
0347518d MF |
552 | sign_Y = r37 |
553 | swap = r38 | |
554 | table_ptr1 = r39 | |
555 | table_ptr2 = r40 | |
556 | k = r41 | |
557 | lookup = r42 | |
558 | exp_ArgX = r43 | |
559 | exp_ArgY = r44 | |
560 | exponent_Q = r45 | |
561 | significand_Q = r46 | |
562 | special = r47 | |
563 | sp_exp_Q = r48 | |
564 | sp_exp_4sig_Q = r49 | |
565 | table_base = r50 | |
d5efd131 MF |
566 | int_temp = r51 |
567 | ||
568 | GR_Parameter_X = r49 | |
569 | GR_Parameter_Y = r50 | |
570 | GR_Parameter_RESULT = r51 | |
571 | GR_Parameter_TAG = r52 | |
572 | GR_temp = r52 | |
573 | ||
574 | RODATA | |
0347518d | 575 | .align 16 |
d5efd131 MF |
576 | |
577 | LOCAL_OBJECT_START(Constants_atan) | |
578 | // double pi/2 | |
579 | data8 0x3FF921FB54442D18 | |
580 | // single lo_pi/2, two**(-3) | |
581 | data4 0x248D3132, 0x3E000000 | |
582 | data8 0xAAAAAAAAAAAAAAA3, 0xBFFD // P_1 | |
583 | data8 0xCCCCCCCCCCCC54B2, 0x3FFC // P_2 | |
584 | data8 0x9249249247E4D0C2, 0xBFFC // P_3 | |
585 | data8 0xE38E38E058870889, 0x3FFB // P_4 | |
586 | data8 0xBA2E895B290149F8, 0xBFFB // P_5 | |
587 | data8 0x9D88E6D4250F733D, 0x3FFB // P_6 | |
588 | data8 0x884E51FFFB8745A0, 0xBFFB // P_7 | |
589 | data8 0xE1C7412B394396BD, 0x3FFA // P_8 | |
590 | data8 0xAAAAAAAAAAAAA52F, 0xBFFD // Q_1 | |
591 | data8 0xCCCCCCCCC75B60D3, 0x3FFC // Q_2 | |
592 | data8 0x924923AD011F1940, 0xBFFC // Q_3 | |
593 | data8 0xE36F716D2A5F89BD, 0x3FFB // Q_4 | |
594 | // | |
595 | // Entries Tbl_hi (double precision) | |
596 | // B = 1+Index/16+1/32 Index = 0 | |
597 | // Entries Tbl_lo (single precision) | |
598 | // B = 1+Index/16+1/32 Index = 0 | |
599 | // | |
0347518d | 600 | data8 0x3FE9A000A935BD8E |
d5efd131 MF |
601 | data4 0x23ACA08F, 0x00000000 |
602 | // | |
603 | // Entries Tbl_hi (double precision) Index = 0,1,...,15 | |
604 | // B = 2^(-1)*(1+Index/16+1/32) | |
605 | // Entries Tbl_lo (single precision) | |
606 | // Index = 0,1,...,15 B = 2^(-1)*(1+Index/16+1/32) | |
607 | // | |
0347518d | 608 | data8 0x3FDE77EB7F175A34 |
d5efd131 | 609 | data4 0x238729EE, 0x00000000 |
0347518d | 610 | data8 0x3FE0039C73C1A40B |
d5efd131 | 611 | data4 0x249334DB, 0x00000000 |
0347518d | 612 | data8 0x3FE0C6145B5B43DA |
d5efd131 | 613 | data4 0x22CBA7D1, 0x00000000 |
0347518d | 614 | data8 0x3FE1835A88BE7C13 |
d5efd131 | 615 | data4 0x246310E7, 0x00000000 |
0347518d | 616 | data8 0x3FE23B71E2CC9E6A |
d5efd131 | 617 | data4 0x236210E5, 0x00000000 |
0347518d | 618 | data8 0x3FE2EE628406CBCA |
d5efd131 | 619 | data4 0x2462EAF5, 0x00000000 |
0347518d | 620 | data8 0x3FE39C391CD41719 |
d5efd131 | 621 | data4 0x24B73EF3, 0x00000000 |
0347518d | 622 | data8 0x3FE445065B795B55 |
d5efd131 | 623 | data4 0x24C11260, 0x00000000 |
0347518d | 624 | data8 0x3FE4E8DE5BB6EC04 |
d5efd131 | 625 | data4 0x242519EE, 0x00000000 |
0347518d | 626 | data8 0x3FE587D81F732FBA |
d5efd131 | 627 | data4 0x24D4346C, 0x00000000 |
0347518d | 628 | data8 0x3FE6220D115D7B8D |
d5efd131 | 629 | data4 0x24ED487B, 0x00000000 |
0347518d | 630 | data8 0x3FE6B798920B3D98 |
d5efd131 | 631 | data4 0x2495FF1E, 0x00000000 |
0347518d | 632 | data8 0x3FE748978FBA8E0F |
d5efd131 | 633 | data4 0x223D9531, 0x00000000 |
0347518d | 634 | data8 0x3FE7D528289FA093 |
d5efd131 | 635 | data4 0x242B0411, 0x00000000 |
0347518d | 636 | data8 0x3FE85D69576CC2C5 |
d5efd131 | 637 | data4 0x2335B374, 0x00000000 |
0347518d | 638 | data8 0x3FE8E17AA99CC05D |
d5efd131 MF |
639 | data4 0x24C27CFB, 0x00000000 |
640 | // | |
641 | // Entries Tbl_hi (double precision) Index = 0,1,...,15 | |
642 | // B = 2^(-2)*(1+Index/16+1/32) | |
643 | // Entries Tbl_lo (single precision) | |
644 | // Index = 0,1,...,15 B = 2^(-2)*(1+Index/16+1/32) | |
645 | // | |
0347518d | 646 | data8 0x3FD025FA510665B5 |
d5efd131 MF |
647 | data4 0x24263482, 0x00000000 |
648 | data8 0x3FD1151A362431C9 | |
649 | data4 0x242C8DC9, 0x00000000 | |
650 | data8 0x3FD2025567E47C95 | |
651 | data4 0x245CF9BA, 0x00000000 | |
652 | data8 0x3FD2ED987A823CFE | |
653 | data4 0x235C892C, 0x00000000 | |
654 | data8 0x3FD3D6D129271134 | |
655 | data4 0x2389BE52, 0x00000000 | |
656 | data8 0x3FD4BDEE586890E6 | |
657 | data4 0x24436471, 0x00000000 | |
658 | data8 0x3FD5A2E0175E0F4E | |
659 | data4 0x2389DBD4, 0x00000000 | |
660 | data8 0x3FD685979F5FA6FD | |
661 | data4 0x2476D43F, 0x00000000 | |
662 | data8 0x3FD7660752817501 | |
663 | data4 0x24711774, 0x00000000 | |
664 | data8 0x3FD84422B8DF95D7 | |
665 | data4 0x23EBB501, 0x00000000 | |
666 | data8 0x3FD91FDE7CD0C662 | |
667 | data4 0x23883A0C, 0x00000000 | |
668 | data8 0x3FD9F93066168001 | |
669 | data4 0x240DF63F, 0x00000000 | |
670 | data8 0x3FDAD00F5422058B | |
671 | data4 0x23FE261A, 0x00000000 | |
672 | data8 0x3FDBA473378624A5 | |
673 | data4 0x23A8CD0E, 0x00000000 | |
674 | data8 0x3FDC76550AAD71F8 | |
675 | data4 0x2422D1D0, 0x00000000 | |
676 | data8 0x3FDD45AEC9EC862B | |
677 | data4 0x2344A109, 0x00000000 | |
678 | // | |
679 | // Entries Tbl_hi (double precision) Index = 0,1,...,15 | |
680 | // B = 2^(-3)*(1+Index/16+1/32) | |
681 | // Entries Tbl_lo (single precision) | |
682 | // Index = 0,1,...,15 B = 2^(-3)*(1+Index/16+1/32) | |
683 | // | |
684 | data8 0x3FC068D584212B3D | |
685 | data4 0x239874B6, 0x00000000 | |
686 | data8 0x3FC1646541060850 | |
687 | data4 0x2335E774, 0x00000000 | |
688 | data8 0x3FC25F6E171A535C | |
689 | data4 0x233E36BE, 0x00000000 | |
690 | data8 0x3FC359E8EDEB99A3 | |
691 | data4 0x239680A3, 0x00000000 | |
692 | data8 0x3FC453CEC6092A9E | |
693 | data4 0x230FB29E, 0x00000000 | |
694 | data8 0x3FC54D18BA11570A | |
695 | data4 0x230C1418, 0x00000000 | |
696 | data8 0x3FC645BFFFB3AA73 | |
697 | data4 0x23F0564A, 0x00000000 | |
698 | data8 0x3FC73DBDE8A7D201 | |
699 | data4 0x23D4A5E1, 0x00000000 | |
700 | data8 0x3FC8350BE398EBC7 | |
701 | data4 0x23D4ADDA, 0x00000000 | |
702 | data8 0x3FC92BA37D050271 | |
703 | data4 0x23BCB085, 0x00000000 | |
704 | data8 0x3FCA217E601081A5 | |
705 | data4 0x23BC841D, 0x00000000 | |
706 | data8 0x3FCB1696574D780B | |
707 | data4 0x23CF4A8E, 0x00000000 | |
708 | data8 0x3FCC0AE54D768466 | |
709 | data4 0x23BECC90, 0x00000000 | |
710 | data8 0x3FCCFE654E1D5395 | |
711 | data4 0x2323DCD2, 0x00000000 | |
712 | data8 0x3FCDF110864C9D9D | |
713 | data4 0x23F53F3A, 0x00000000 | |
714 | data8 0x3FCEE2E1451D980C | |
715 | data4 0x23CCB11F, 0x00000000 | |
716 | // | |
717 | data8 0x400921FB54442D18, 0x3CA1A62633145C07 // PI two doubles | |
718 | data8 0x3FF921FB54442D18, 0x3C91A62633145C07 // PI_by_2 two dbles | |
719 | data8 0x3FE921FB54442D18, 0x3C81A62633145C07 // PI_by_4 two dbles | |
720 | data8 0x4002D97C7F3321D2, 0x3C9A79394C9E8A0A // 3PI_by_4 two dbles | |
721 | LOCAL_OBJECT_END(Constants_atan) | |
722 | ||
723 | ||
724 | .section .text | |
725 | GLOBAL_IEEE754_ENTRY(atanl) | |
726 | ||
727 | // Use common code with atan2l after setting x=1.0 | |
728 | { .mfi | |
729 | alloc r32 = ar.pfs, 0, 17, 4, 0 | |
730 | fma.s1 Ysq = ArgY_orig, ArgY_orig, f0 // Form y*y | |
731 | nop.i 999 | |
732 | } | |
733 | { .mfi | |
734 | addl table_ptr1 = @ltoff(Constants_atan#), gp // Address of table pointer | |
735 | fma.s1 Xsq = f1, f1, f0 // Form x*x | |
736 | nop.i 999 | |
737 | } | |
738 | ;; | |
739 | ||
740 | { .mfi | |
741 | ld8 table_ptr1 = [table_ptr1] // Get table pointer | |
742 | fnorm.s1 ArgY = ArgY_orig | |
743 | nop.i 999 | |
744 | } | |
745 | { .mfi | |
746 | nop.m 999 | |
747 | fnorm.s1 ArgX = f1 | |
748 | nop.i 999 | |
749 | } | |
750 | ;; | |
751 | ||
752 | { .mfi | |
753 | getf.exp sign_X = f1 // Get signexp of x | |
754 | fmerge.s ArgX_abs = f0, f1 // Form |x| | |
755 | nop.i 999 | |
756 | } | |
757 | { .mfi | |
758 | nop.m 999 | |
759 | fnorm.s1 ArgX_orig = f1 | |
760 | nop.i 999 | |
761 | } | |
762 | ;; | |
763 | ||
764 | { .mfi | |
765 | getf.exp sign_Y = ArgY_orig // Get signexp of y | |
766 | fmerge.s ArgY_abs = f0, ArgY_orig // Form |y| | |
767 | mov table_base = table_ptr1 // Save base pointer to tables | |
768 | } | |
769 | ;; | |
770 | ||
771 | { .mfi | |
772 | ldfd P_hi = [table_ptr1],8 // Load double precision hi part of pi | |
773 | fclass.m p8,p0 = ArgY_orig, 0x1e7 // Test y natval, nan, inf, zero | |
0347518d | 774 | nop.i 999 |
d5efd131 MF |
775 | } |
776 | ;; | |
777 | ||
778 | { .mfi | |
779 | ldfps P_lo, TWO_TO_NEG3 = [table_ptr1], 8 // Load P_lo and constant 2^-3 | |
0347518d MF |
780 | nop.f 999 |
781 | nop.i 999 | |
d5efd131 MF |
782 | } |
783 | { .mfi | |
784 | nop.m 999 | |
785 | fma.s1 M = f1, f1, f0 // Set M = 1.0 | |
0347518d | 786 | nop.i 999 |
d5efd131 MF |
787 | } |
788 | ;; | |
789 | ||
790 | // | |
791 | // Check for everything - if false, then must be pseudo-zero | |
792 | // or pseudo-nan (IA unsupporteds). | |
793 | // | |
794 | { .mfb | |
795 | nop.m 999 | |
796 | fclass.m p0,p12 = f1, 0x1FF // Test x unsupported | |
797 | (p8) br.cond.spnt ATANL_Y_SPECIAL // Branch if y natval, nan, inf, zero | |
798 | } | |
799 | ;; | |
800 | ||
801 | // U = max(ArgX_abs,ArgY_abs) | |
802 | // V = min(ArgX_abs,ArgY_abs) | |
803 | { .mfi | |
804 | nop.m 999 | |
805 | fcmp.ge.s1 p6,p7 = Xsq, Ysq // Test for |x| >= |y| using squares | |
0347518d | 806 | nop.i 999 |
d5efd131 MF |
807 | } |
808 | { .mfb | |
809 | nop.m 999 | |
810 | fma.s1 V = ArgX_abs, f1, f0 // Set V assuming |x| < |y| | |
811 | br.cond.sptk ATANL_COMMON // Branch to common code | |
812 | } | |
813 | ;; | |
814 | ||
815 | GLOBAL_IEEE754_END(atanl) | |
816 | ||
817 | GLOBAL_IEEE754_ENTRY(atan2l) | |
818 | ||
819 | { .mfi | |
820 | alloc r32 = ar.pfs, 0, 17, 4, 0 | |
821 | fma.s1 Ysq = ArgY_orig, ArgY_orig, f0 // Form y*y | |
822 | nop.i 999 | |
823 | } | |
824 | { .mfi | |
825 | addl table_ptr1 = @ltoff(Constants_atan#), gp // Address of table pointer | |
826 | fma.s1 Xsq = ArgX_orig, ArgX_orig, f0 // Form x*x | |
827 | nop.i 999 | |
828 | } | |
829 | ;; | |
830 | ||
831 | { .mfi | |
832 | ld8 table_ptr1 = [table_ptr1] // Get table pointer | |
833 | fnorm.s1 ArgY = ArgY_orig | |
834 | nop.i 999 | |
835 | } | |
836 | { .mfi | |
837 | nop.m 999 | |
838 | fnorm.s1 ArgX = ArgX_orig | |
839 | nop.i 999 | |
840 | } | |
841 | ;; | |
842 | ||
843 | { .mfi | |
844 | getf.exp sign_X = ArgX_orig // Get signexp of x | |
845 | fmerge.s ArgX_abs = f0, ArgX_orig // Form |x| | |
846 | nop.i 999 | |
847 | } | |
848 | ;; | |
849 | ||
850 | { .mfi | |
851 | getf.exp sign_Y = ArgY_orig // Get signexp of y | |
852 | fmerge.s ArgY_abs = f0, ArgY_orig // Form |y| | |
853 | mov table_base = table_ptr1 // Save base pointer to tables | |
854 | } | |
855 | ;; | |
856 | ||
857 | { .mfi | |
858 | ldfd P_hi = [table_ptr1],8 // Load double precision hi part of pi | |
859 | fclass.m p8,p0 = ArgY_orig, 0x1e7 // Test y natval, nan, inf, zero | |
0347518d | 860 | nop.i 999 |
d5efd131 MF |
861 | } |
862 | ;; | |
863 | ||
864 | { .mfi | |
865 | ldfps P_lo, TWO_TO_NEG3 = [table_ptr1], 8 // Load P_lo and constant 2^-3 | |
866 | fclass.m p9,p0 = ArgX_orig, 0x1e7 // Test x natval, nan, inf, zero | |
0347518d | 867 | nop.i 999 |
d5efd131 MF |
868 | } |
869 | { .mfi | |
870 | nop.m 999 | |
871 | fma.s1 M = f1, f1, f0 // Set M = 1.0 | |
0347518d | 872 | nop.i 999 |
d5efd131 MF |
873 | } |
874 | ;; | |
875 | ||
876 | // | |
877 | // Check for everything - if false, then must be pseudo-zero | |
878 | // or pseudo-nan (IA unsupporteds). | |
879 | // | |
880 | { .mfb | |
881 | nop.m 999 | |
882 | fclass.m p0,p12 = ArgX_orig, 0x1FF // Test x unsupported | |
883 | (p8) br.cond.spnt ATANL_Y_SPECIAL // Branch if y natval, nan, inf, zero | |
884 | } | |
885 | ;; | |
886 | ||
887 | // U = max(ArgX_abs,ArgY_abs) | |
888 | // V = min(ArgX_abs,ArgY_abs) | |
889 | { .mfi | |
890 | nop.m 999 | |
891 | fcmp.ge.s1 p6,p7 = Xsq, Ysq // Test for |x| >= |y| using squares | |
0347518d | 892 | nop.i 999 |
d5efd131 MF |
893 | } |
894 | { .mfb | |
895 | nop.m 999 | |
896 | fma.s1 V = ArgX_abs, f1, f0 // Set V assuming |x| < |y| | |
897 | (p9) br.cond.spnt ATANL_X_SPECIAL // Branch if x natval, nan, inf, zero | |
898 | } | |
899 | ;; | |
900 | ||
901 | // Now common code for atanl and atan2l | |
902 | ATANL_COMMON: | |
903 | { .mfi | |
904 | nop.m 999 | |
905 | fclass.m p0,p13 = ArgY_orig, 0x1FF // Test y unsupported | |
906 | shr sign_X = sign_X, 17 // Get sign bit of x | |
907 | } | |
908 | { .mfi | |
909 | nop.m 999 | |
910 | fma.s1 U = ArgY_abs, f1, f0 // Set U assuming |x| < |y| | |
911 | adds table_ptr1 = 176, table_ptr1 // Point to Q4 | |
912 | } | |
913 | ;; | |
914 | ||
915 | { .mfi | |
916 | (p6) add swap = r0, r0 // Set swap=0 if |x| >= |y| | |
917 | (p6) frcpa.s1 E, p0 = ArgY_abs, ArgX_abs // Compute E if |x| >= |y| | |
918 | shr sign_Y = sign_Y, 17 // Get sign bit of y | |
919 | } | |
920 | { .mfb | |
921 | nop.m 999 | |
922 | (p6) fma.s1 V = ArgY_abs, f1, f0 // Set V if |x| >= |y| | |
923 | (p12) br.cond.spnt ATANL_UNSUPPORTED // Branch if x unsupported | |
924 | } | |
925 | ;; | |
926 | ||
927 | // Set p8 if y >=0 | |
928 | // Set p9 if y < 0 | |
929 | // Set p10 if |x| >= |y| and x >=0 | |
930 | // Set p11 if |x| >= |y| and x < 0 | |
931 | { .mfi | |
932 | cmp.eq p8, p9 = 0, sign_Y // Test for y >= 0 | |
933 | (p7) frcpa.s1 E, p0 = ArgX_abs, ArgY_abs // Compute E if |x| < |y| | |
934 | (p7) add swap = 1, r0 // Set swap=1 if |x| < |y| | |
935 | } | |
936 | { .mfb | |
937 | (p6) cmp.eq.unc p10, p11 = 0, sign_X // If |x| >= |y|, test for x >= 0 | |
938 | (p6) fma.s1 U = ArgX_abs, f1, f0 // Set U if |x| >= |y| | |
939 | (p13) br.cond.spnt ATANL_UNSUPPORTED // Branch if y unsupported | |
940 | } | |
941 | ;; | |
942 | ||
943 | // | |
944 | // if p8, s_Y = 1.0 | |
945 | // if p9, s_Y = -1.0 | |
946 | // | |
947 | .pred.rel "mutex",p8,p9 | |
948 | { .mfi | |
949 | nop.m 999 | |
950 | (p8) fadd.s1 s_Y = f0, f1 // If y >= 0 set s_Y = 1.0 | |
951 | nop.i 999 | |
952 | } | |
953 | { .mfi | |
954 | nop.m 999 | |
955 | (p9) fsub.s1 s_Y = f0, f1 // If y < 0 set s_Y = -1.0 | |
956 | nop.i 999 | |
957 | } | |
958 | ;; | |
959 | ||
960 | .pred.rel "mutex",p10,p11 | |
961 | { .mfi | |
962 | nop.m 999 | |
963 | (p10) fsub.s1 M = M, f1 // If |x| >= |y| and x >=0, set M=0 | |
964 | nop.i 999 | |
965 | } | |
966 | { .mfi | |
967 | nop.m 999 | |
968 | (p11) fadd.s1 M = M, f1 // If |x| >= |y| and x < 0, set M=2.0 | |
969 | nop.i 999 | |
970 | } | |
971 | ;; | |
972 | ||
973 | { .mfi | |
974 | nop.m 999 | |
975 | fcmp.eq.s0 p0, p9 = ArgX_orig, ArgY_orig // Dummy to set denormal flag | |
976 | nop.i 999 | |
977 | } | |
978 | // ************************************************* | |
979 | // ********************* STEP2 ********************* | |
980 | // ************************************************* | |
981 | // | |
982 | // Q = E * V | |
983 | // | |
984 | { .mfi | |
985 | nop.m 999 | |
986 | fmpy.s1 Q = E, V | |
987 | nop.i 999 | |
988 | } | |
989 | ;; | |
990 | ||
991 | { .mfi | |
992 | nop.m 999 | |
993 | fnma.s1 E_hold = E, U, f1 // E_hold = 1.0 - E*U (1) if POLY path | |
994 | nop.i 999 | |
995 | } | |
996 | ;; | |
997 | ||
0347518d | 998 | // Create a single precision representation of the signexp of Q with the |
d5efd131 MF |
999 | // 4 most significant bits of the significand followed by a 1 and then 18 0's |
1000 | { .mfi | |
1001 | nop.m 999 | |
1002 | fmpy.s1 P_hi = M, P_hi | |
1003 | dep.z special = 0x1, 18, 1 // Form 0x0000000000040000 | |
1004 | } | |
1005 | { .mfi | |
1006 | nop.m 999 | |
1007 | fmpy.s1 P_lo = M, P_lo | |
1008 | add table_ptr2 = 32, table_ptr1 | |
1009 | } | |
1010 | ;; | |
1011 | ||
1012 | { .mfi | |
1013 | nop.m 999 | |
1014 | fma.s1 A_temp = Q, f1, f0 // Set A_temp if POLY path | |
1015 | nop.i 999 | |
1016 | } | |
1017 | { .mfi | |
1018 | nop.m 999 | |
1019 | fma.s1 E = E, E_hold, E // E = E + E*E_hold (1) if POLY path | |
1020 | nop.i 999 | |
1021 | } | |
1022 | ;; | |
1023 | ||
1024 | // | |
1025 | // Is Q < 2**(-3)? | |
1026 | // swap = xor(swap,sign_X) | |
1027 | // | |
1028 | { .mfi | |
1029 | nop.m 999 | |
1030 | fcmp.lt.s1 p9, p0 = Q, TWO_TO_NEG3 // Test Q < 2^-3 | |
1031 | xor swap = sign_X, swap | |
1032 | } | |
1033 | ;; | |
1034 | ||
1035 | // P_hi = s_Y * P_hi | |
1036 | { .mmf | |
1037 | getf.exp exponent_Q = Q // Get signexp of Q | |
1038 | cmp.eq.unc p7, p6 = 0x00000, swap | |
1039 | fmpy.s1 P_hi = s_Y, P_hi | |
1040 | } | |
1041 | ;; | |
1042 | ||
1043 | // | |
1044 | // if (PR_1) sigma = -1.0 | |
1045 | // if (PR_2) sigma = 1.0 | |
1046 | // | |
1047 | { .mfi | |
1048 | getf.sig significand_Q = Q // Get significand of Q | |
1049 | (p6) fsub.s1 sigma = f0, f1 | |
1050 | nop.i 999 | |
1051 | } | |
1052 | { .mfb | |
1053 | (p9) add table_ptr1 = 128, table_base // Point to P8 if POLY path | |
1054 | (p7) fadd.s1 sigma = f0, f1 | |
1055 | (p9) br.cond.spnt ATANL_POLY // Branch to POLY if 0 < Q < 2^-3 | |
1056 | } | |
1057 | ;; | |
1058 | ||
1059 | // | |
1060 | // ************************************************* | |
1061 | // ******************** STEP3 ********************** | |
1062 | // ************************************************* | |
1063 | // | |
1064 | // lookup = b_1 b_2 b_3 B_4 | |
1065 | // | |
1066 | { .mmi | |
1067 | nop.m 999 | |
1068 | nop.m 999 | |
1069 | andcm k = 0x0003, exponent_Q // k=0,1,2,3 for exp_Q=0,-1,-2,-3 | |
1070 | } | |
1071 | ;; | |
1072 | ||
1073 | // | |
0347518d | 1074 | // Generate sign_exp_Q b_1 b_2 b_3 b_4 1 0 0 0 ... 0 in single precision |
d5efd131 MF |
1075 | // representation. Note sign of Q is always 0. |
1076 | // | |
1077 | { .mfi | |
1078 | cmp.eq p8, p9 = 0x0000, k // Test k=0 | |
1079 | nop.f 999 | |
1080 | extr.u lookup = significand_Q, 59, 4 // Extract b_1 b_2 b_3 b_4 for index | |
1081 | } | |
1082 | { .mfi | |
1083 | sub sp_exp_Q = 0x7f, k // Form single prec biased exp of Q | |
1084 | nop.f 999 | |
1085 | sub k = k, r0, 1 // Decrement k | |
1086 | } | |
1087 | ;; | |
1088 | ||
1089 | // Form pointer to B index table | |
1090 | { .mfi | |
1091 | ldfe Q_4 = [table_ptr1], -16 // Load Q_4 | |
1092 | nop.f 999 | |
1093 | (p9) shl k = k, 8 // k = 0, 256, or 512 | |
1094 | } | |
1095 | { .mfi | |
1096 | (p9) shladd table_ptr2 = lookup, 4, table_ptr2 | |
1097 | nop.f 999 | |
1098 | shladd sp_exp_4sig_Q = sp_exp_Q, 4, lookup // Shift and add in 4 high bits | |
1099 | } | |
1100 | ;; | |
1101 | ||
1102 | { .mmi | |
1103 | (p8) add table_ptr2 = -16, table_ptr2 // Pointer if original k was 0 | |
1104 | (p9) add table_ptr2 = k, table_ptr2 // Pointer if k was 1, 2, 3 | |
1105 | dep special = sp_exp_4sig_Q, special, 19, 13 // Form z_hi as single prec | |
1106 | } | |
1107 | ;; | |
1108 | ||
1109 | // z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0 | |
1110 | { .mmi | |
1111 | ldfd Tbl_hi = [table_ptr2], 8 // Load Tbl_hi from index table | |
1112 | ;; | |
1113 | setf.s z_hi = special // Form z_hi | |
1114 | nop.i 999 | |
1115 | } | |
1116 | { .mmi | |
1117 | ldfs Tbl_lo = [table_ptr2], 8 // Load Tbl_lo from index table | |
1118 | ;; | |
1119 | ldfe Q_3 = [table_ptr1], -16 // Load Q_3 | |
1120 | nop.i 999 | |
1121 | } | |
1122 | ;; | |
1123 | ||
1124 | { .mmi | |
1125 | ldfe Q_2 = [table_ptr1], -16 // Load Q_2 | |
1126 | nop.m 999 | |
1127 | nop.i 999 | |
1128 | } | |
1129 | ;; | |
1130 | ||
1131 | { .mmf | |
1132 | ldfe Q_1 = [table_ptr1], -16 // Load Q_1 | |
1133 | nop.m 999 | |
1134 | nop.f 999 | |
1135 | } | |
1136 | ;; | |
1137 | ||
1138 | { .mfi | |
1139 | nop.m 999 | |
1140 | fma.s1 U_prime_hi = V, z_hi, U // U_prime_hi = U + V * z_hi | |
1141 | nop.i 999 | |
1142 | } | |
1143 | { .mfi | |
1144 | nop.m 999 | |
1145 | fnma.s1 V_prime = U, z_hi, V // V_prime = V - U * z_hi | |
1146 | nop.i 999 | |
1147 | } | |
1148 | ;; | |
1149 | ||
1150 | { .mfi | |
1151 | nop.m 999 | |
1152 | mov A_hi = Tbl_hi // Start with A_hi = Tbl_hi | |
1153 | nop.i 999 | |
1154 | } | |
1155 | ;; | |
1156 | ||
1157 | { .mfi | |
1158 | nop.m 999 | |
1159 | fsub.s1 U_hold = U, U_prime_hi // U_hold = U - U_prime_hi | |
1160 | nop.i 999 | |
1161 | } | |
1162 | ;; | |
1163 | ||
1164 | { .mfi | |
1165 | nop.m 999 | |
1166 | frcpa.s1 C_hi, p0 = f1, U_prime_hi // C_hi = frcpa(1,U_prime_hi) | |
1167 | nop.i 999 | |
1168 | } | |
1169 | ;; | |
1170 | ||
1171 | { .mfi | |
1172 | nop.m 999 | |
1173 | fmpy.s1 A_hi = s_Y, A_hi // A_hi = s_Y * A_hi | |
1174 | nop.i 999 | |
1175 | } | |
1176 | ;; | |
1177 | ||
1178 | { .mfi | |
1179 | nop.m 999 | |
1180 | fma.s1 U_prime_lo = z_hi, V, U_hold // U_prime_lo = U_hold + V * z_hi | |
1181 | nop.i 999 | |
1182 | } | |
1183 | ;; | |
1184 | ||
1185 | // C_hi_hold = 1 - C_hi * U_prime_hi (1) | |
1186 | { .mfi | |
1187 | nop.m 999 | |
0347518d | 1188 | fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 |
d5efd131 MF |
1189 | nop.i 999 |
1190 | } | |
1191 | ;; | |
1192 | ||
1193 | { .mfi | |
1194 | nop.m 999 | |
1195 | fma.s1 Res_hi = sigma, A_hi, P_hi // Res_hi = P_hi + sigma * A_hi | |
1196 | nop.i 999 | |
1197 | } | |
1198 | ;; | |
1199 | ||
1200 | { .mfi | |
1201 | nop.m 999 | |
1202 | fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (1) | |
1203 | nop.i 999 | |
1204 | } | |
1205 | ;; | |
1206 | ||
1207 | // C_hi_hold = 1 - C_hi * U_prime_hi (2) | |
1208 | { .mfi | |
1209 | nop.m 999 | |
1210 | fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 | |
1211 | nop.i 999 | |
1212 | } | |
1213 | ;; | |
1214 | ||
1215 | { .mfi | |
1216 | nop.m 999 | |
1217 | fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (2) | |
1218 | nop.i 999 | |
1219 | } | |
1220 | ;; | |
1221 | ||
1222 | // C_hi_hold = 1 - C_hi * U_prime_hi (3) | |
1223 | { .mfi | |
1224 | nop.m 999 | |
0347518d | 1225 | fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 |
d5efd131 MF |
1226 | nop.i 999 |
1227 | } | |
1228 | ;; | |
1229 | ||
1230 | { .mfi | |
1231 | nop.m 999 | |
1232 | fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (3) | |
1233 | nop.i 999 | |
1234 | } | |
1235 | ;; | |
1236 | ||
1237 | { .mfi | |
1238 | nop.m 999 | |
1239 | fmpy.s1 w_hi = V_prime, C_hi // w_hi = V_prime * C_hi | |
1240 | nop.i 999 | |
1241 | } | |
1242 | ;; | |
1243 | ||
1244 | { .mfi | |
1245 | nop.m 999 | |
1246 | fmpy.s1 wsq = w_hi, w_hi // wsq = w_hi * w_hi | |
1247 | nop.i 999 | |
1248 | } | |
1249 | { .mfi | |
1250 | nop.m 999 | |
1251 | fnma.s1 w_lo = w_hi, U_prime_hi, V_prime // w_lo = V_prime-w_hi*U_prime_hi | |
1252 | nop.i 999 | |
1253 | } | |
1254 | ;; | |
1255 | ||
1256 | { .mfi | |
1257 | nop.m 999 | |
1258 | fma.s1 poly = wsq, Q_4, Q_3 // poly = Q_3 + wsq * Q_4 | |
1259 | nop.i 999 | |
1260 | } | |
1261 | { .mfi | |
1262 | nop.m 999 | |
1263 | fnma.s1 w_lo = w_hi, U_prime_lo, w_lo // w_lo = w_lo - w_hi * U_prime_lo | |
1264 | nop.i 999 | |
1265 | } | |
1266 | ;; | |
1267 | ||
1268 | { .mfi | |
1269 | nop.m 999 | |
1270 | fma.s1 poly = wsq, poly, Q_2 // poly = Q_2 + wsq * poly | |
1271 | nop.i 999 | |
1272 | } | |
1273 | { .mfi | |
1274 | nop.m 999 | |
1275 | fmpy.s1 w_lo = C_hi, w_lo // w_lo = = w_lo * C_hi | |
1276 | nop.i 999 | |
1277 | } | |
1278 | ;; | |
1279 | ||
1280 | { .mfi | |
1281 | nop.m 999 | |
1282 | fma.s1 poly = wsq, poly, Q_1 // poly = Q_1 + wsq * poly | |
1283 | nop.i 999 | |
1284 | } | |
1285 | { .mfi | |
1286 | nop.m 999 | |
1287 | fadd.s1 A_lo = Tbl_lo, w_lo // A_lo = Tbl_lo + w_lo | |
1288 | nop.i 999 | |
1289 | } | |
1290 | ;; | |
1291 | ||
1292 | { .mfi | |
1293 | nop.m 999 | |
1294 | fmpy.s0 Q_1 = Q_1, Q_1 // Dummy operation to raise inexact | |
1295 | nop.i 999 | |
1296 | } | |
1297 | ;; | |
1298 | ||
1299 | { .mfi | |
1300 | nop.m 999 | |
1301 | fmpy.s1 poly = wsq, poly // poly = wsq * poly | |
1302 | nop.i 999 | |
1303 | } | |
1304 | ;; | |
1305 | ||
1306 | { .mfi | |
1307 | nop.m 999 | |
1308 | fmpy.s1 poly = w_hi, poly // poly = w_hi * poly | |
1309 | nop.i 999 | |
1310 | } | |
1311 | ;; | |
1312 | ||
1313 | { .mfi | |
1314 | nop.m 999 | |
1315 | fadd.s1 A_lo = A_lo, poly // A_lo = A_lo + poly | |
1316 | nop.i 999 | |
1317 | } | |
1318 | ;; | |
1319 | ||
1320 | { .mfi | |
1321 | nop.m 999 | |
1322 | fadd.s1 A_lo = A_lo, w_hi // A_lo = A_lo + w_hi | |
1323 | nop.i 999 | |
1324 | } | |
1325 | ;; | |
1326 | ||
1327 | { .mfi | |
1328 | nop.m 999 | |
1329 | fma.s1 Res_lo = sigma, A_lo, P_lo // Res_lo = P_lo + sigma * A_lo | |
1330 | nop.i 999 | |
1331 | } | |
1332 | ;; | |
1333 | ||
1334 | // | |
1335 | // Result = Res_hi + Res_lo * s_Y (User Supplied Rounding Mode) | |
1336 | // | |
1337 | { .mfb | |
1338 | nop.m 999 | |
1339 | fma.s0 Result = Res_lo, s_Y, Res_hi | |
1340 | br.ret.sptk b0 // Exit table path 2^-3 <= V/U < 1 | |
1341 | } | |
1342 | ;; | |
1343 | ||
1344 | ||
0347518d | 1345 | ATANL_POLY: |
d5efd131 MF |
1346 | // Here if 0 < V/U < 2^-3 |
1347 | // | |
1348 | // *********************************************** | |
1349 | // ******************** STEP4 ******************** | |
1350 | // *********************************************** | |
1351 | ||
1352 | // | |
1353 | // Following: | |
1354 | // Iterate 3 times E = E + E*(1.0 - E*U) | |
1355 | // Also load P_8, P_7, P_6, P_5, P_4 | |
1356 | // | |
1357 | { .mfi | |
1358 | ldfe P_8 = [table_ptr1], -16 // Load P_8 | |
1359 | fnma.s1 z_lo = A_temp, U, V // z_lo = V - A_temp * U | |
1360 | nop.i 999 | |
1361 | } | |
1362 | { .mfi | |
1363 | nop.m 999 | |
1364 | fnma.s1 E_hold = E, U, f1 // E_hold = 1.0 - E*U (2) | |
1365 | nop.i 999 | |
1366 | } | |
1367 | ;; | |
1368 | ||
1369 | { .mmi | |
1370 | ldfe P_7 = [table_ptr1], -16 // Load P_7 | |
1371 | ;; | |
1372 | ldfe P_6 = [table_ptr1], -16 // Load P_6 | |
1373 | nop.i 999 | |
1374 | } | |
1375 | ;; | |
1376 | ||
1377 | { .mfi | |
1378 | ldfe P_5 = [table_ptr1], -16 // Load P_5 | |
1379 | fma.s1 E = E, E_hold, E // E = E + E_hold*E (2) | |
1380 | nop.i 999 | |
1381 | } | |
1382 | ;; | |
1383 | ||
1384 | { .mmi | |
1385 | ldfe P_4 = [table_ptr1], -16 // Load P_4 | |
1386 | ;; | |
1387 | ldfe P_3 = [table_ptr1], -16 // Load P_3 | |
1388 | nop.i 999 | |
1389 | } | |
1390 | ;; | |
1391 | ||
1392 | { .mfi | |
1393 | ldfe P_2 = [table_ptr1], -16 // Load P_2 | |
1394 | fnma.s1 E_hold = E, U, f1 // E_hold = 1.0 - E*U (3) | |
1395 | nop.i 999 | |
1396 | } | |
1397 | { .mlx | |
1398 | nop.m 999 | |
1399 | movl int_temp = 0x24005 // Signexp for small neg number | |
1400 | } | |
1401 | ;; | |
1402 | ||
1403 | { .mmf | |
1404 | ldfe P_1 = [table_ptr1], -16 // Load P_1 | |
1405 | setf.exp tmp_small = int_temp // Form small neg number | |
1406 | fma.s1 E = E, E_hold, E // E = E + E_hold*E (3) | |
1407 | } | |
1408 | ;; | |
1409 | ||
1410 | // | |
1411 | // | |
1412 | // At this point E approximates 1/U to roughly working precision | |
1413 | // Z = V*E approximates V/U | |
1414 | // | |
1415 | { .mfi | |
1416 | nop.m 999 | |
1417 | fmpy.s1 Z = V, E // Z = V * E | |
1418 | nop.i 999 | |
1419 | } | |
1420 | { .mfi | |
1421 | nop.m 999 | |
1422 | fmpy.s1 z_lo = z_lo, E // z_lo = z_lo * E | |
1423 | nop.i 999 | |
1424 | } | |
1425 | ;; | |
1426 | ||
1427 | // | |
1428 | // Now what we want to do is | |
1429 | // poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8))) | |
1430 | // poly2 = zsq*(P_1 + zsq*(P_2 + zsq*P_3)) | |
1431 | // | |
1432 | // | |
1433 | // Fixup added to force inexact later - | |
1434 | // A_hi = A_temp + z_lo | |
1435 | // z_lo = (A_temp - A_hi) + z_lo | |
1436 | // | |
1437 | { .mfi | |
1438 | nop.m 999 | |
1439 | fmpy.s1 zsq = Z, Z // zsq = Z * Z | |
1440 | nop.i 999 | |
1441 | } | |
1442 | { .mfi | |
1443 | nop.m 999 | |
1444 | fadd.s1 A_hi = A_temp, z_lo // A_hi = A_temp + z_lo | |
1445 | nop.i 999 | |
1446 | } | |
1447 | ;; | |
1448 | ||
1449 | { .mfi | |
1450 | nop.m 999 | |
1451 | fma.s1 poly1 = zsq, P_8, P_7 // poly1 = P_7 + zsq * P_8 | |
1452 | nop.i 999 | |
1453 | } | |
1454 | { .mfi | |
1455 | nop.m 999 | |
1456 | fma.s1 poly2 = zsq, P_3, P_2 // poly2 = P_2 + zsq * P_3 | |
1457 | nop.i 999 | |
1458 | } | |
1459 | ;; | |
1460 | ||
1461 | { .mfi | |
1462 | nop.m 999 | |
1463 | fmpy.s1 z4 = zsq, zsq // z4 = zsq * zsq | |
1464 | nop.i 999 | |
1465 | } | |
1466 | { .mfi | |
1467 | nop.m 999 | |
1468 | fsub.s1 A_temp = A_temp, A_hi // A_temp = A_temp - A_hi | |
1469 | nop.i 999 | |
1470 | } | |
1471 | ;; | |
1472 | ||
1473 | { .mfi | |
1474 | nop.m 999 | |
1475 | fmerge.s tmp = A_hi, A_hi // Copy tmp = A_hi | |
1476 | nop.i 999 | |
1477 | } | |
1478 | ;; | |
1479 | ||
1480 | { .mfi | |
1481 | nop.m 999 | |
1482 | fma.s1 poly1 = zsq, poly1, P_6 // poly1 = P_6 + zsq * poly1 | |
1483 | nop.i 999 | |
1484 | } | |
1485 | { .mfi | |
1486 | nop.m 999 | |
1487 | fma.s1 poly2 = zsq, poly2, P_1 // poly2 = P_2 + zsq * poly2 | |
1488 | nop.i 999 | |
1489 | } | |
1490 | ;; | |
1491 | ||
1492 | { .mfi | |
1493 | nop.m 999 | |
1494 | fmpy.s1 z8 = z4, z4 // z8 = z4 * z4 | |
1495 | nop.i 999 | |
1496 | } | |
1497 | { .mfi | |
1498 | nop.m 999 | |
1499 | fadd.s1 z_lo = A_temp, z_lo // z_lo = (A_temp - A_hi) + z_lo | |
1500 | nop.i 999 | |
1501 | } | |
1502 | ;; | |
1503 | ||
1504 | { .mfi | |
1505 | nop.m 999 | |
1506 | fma.s1 poly1 = zsq, poly1, P_5 // poly1 = P_5 + zsq * poly1 | |
1507 | nop.i 999 | |
1508 | } | |
1509 | { .mfi | |
1510 | nop.m 999 | |
1511 | fmpy.s1 poly2 = poly2, zsq // poly2 = zsq * poly2 | |
1512 | nop.i 999 | |
1513 | } | |
1514 | ;; | |
1515 | ||
1516 | // Create small GR double in case need to raise underflow | |
1517 | { .mfi | |
1518 | nop.m 999 | |
1519 | fma.s1 poly1 = zsq, poly1, P_4 // poly1 = P_4 + zsq * poly1 | |
1520 | dep GR_temp = -1,r0,0,53 | |
1521 | } | |
1522 | ;; | |
1523 | ||
1524 | // Create small double in case need to raise underflow | |
1525 | { .mfi | |
0347518d | 1526 | setf.d FR_temp = GR_temp |
d5efd131 MF |
1527 | fma.s1 poly = z8, poly1, poly2 // poly = poly2 + z8 * poly1 |
1528 | nop.i 999 | |
1529 | } | |
1530 | ;; | |
1531 | ||
1532 | { .mfi | |
1533 | nop.m 999 | |
1534 | fma.s1 A_lo = Z, poly, z_lo // A_lo = z_lo + Z * poly | |
1535 | nop.i 999 | |
1536 | } | |
1537 | ;; | |
1538 | ||
1539 | { .mfi | |
1540 | nop.m 999 | |
1541 | fadd.s1 A_hi = tmp, A_lo // A_hi = tmp + A_lo | |
1542 | nop.i 999 | |
1543 | } | |
1544 | ;; | |
1545 | ||
1546 | { .mfi | |
1547 | nop.m 999 | |
1548 | fsub.s1 tmp = tmp, A_hi // tmp = tmp - A_hi | |
1549 | nop.i 999 | |
1550 | } | |
1551 | { .mfi | |
1552 | nop.m 999 | |
1553 | fmpy.s1 A_hi = s_Y, A_hi // A_hi = s_Y * A_hi | |
1554 | nop.i 999 | |
1555 | } | |
1556 | ;; | |
1557 | ||
1558 | { .mfi | |
1559 | nop.m 999 | |
1560 | fadd.s1 A_lo = tmp, A_lo // A_lo = tmp + A_lo | |
1561 | nop.i 999 | |
1562 | } | |
1563 | { .mfi | |
1564 | nop.m 999 | |
1565 | fma.s1 Res_hi = sigma, A_hi, P_hi // Res_hi = P_hi + sigma * A_hi | |
1566 | nop.i 999 | |
1567 | } | |
1568 | ;; | |
1569 | ||
1570 | { .mfi | |
1571 | nop.m 999 | |
1572 | fsub.s1 tmp = P_hi, Res_hi // tmp = P_hi - Res_hi | |
1573 | nop.i 999 | |
1574 | } | |
1575 | ;; | |
1576 | ||
1577 | // | |
1578 | // Test if A_lo is zero | |
1579 | // | |
1580 | { .mfi | |
1581 | nop.m 999 | |
1582 | fclass.m p6,p0 = A_lo, 0x007 // Test A_lo = 0 | |
1583 | nop.i 999 | |
1584 | } | |
1585 | ;; | |
1586 | ||
1587 | { .mfi | |
1588 | nop.m 999 | |
1589 | (p6) mov A_lo = tmp_small // If A_lo zero, make very small | |
1590 | nop.i 999 | |
1591 | } | |
1592 | ;; | |
1593 | ||
1594 | { .mfi | |
1595 | nop.m 999 | |
1596 | fma.s1 tmp = A_hi, sigma, tmp // tmp = sigma * A_hi + tmp | |
1597 | nop.i 999 | |
1598 | } | |
1599 | { .mfi | |
1600 | nop.m 999 | |
1601 | fma.s1 sigma = A_lo, sigma, P_lo // sigma = A_lo * sigma + P_lo | |
1602 | nop.i 999 | |
1603 | } | |
1604 | ;; | |
1605 | ||
1606 | { .mfi | |
1607 | nop.m 999 | |
1608 | fma.s1 Res_lo = s_Y, sigma, tmp // Res_lo = s_Y * sigma + tmp | |
1609 | nop.i 999 | |
1610 | } | |
1611 | ;; | |
1612 | ||
1613 | // | |
1614 | // Test if Res_lo is denormal | |
1615 | // | |
1616 | { .mfi | |
1617 | nop.m 999 | |
1618 | fclass.m p14, p15 = Res_lo, 0x0b | |
1619 | nop.i 999 | |
1620 | } | |
1621 | ;; | |
1622 | ||
1623 | // | |
1624 | // Compute Result = Res_lo + Res_hi. Use s3 if Res_lo is denormal. | |
1625 | // | |
1626 | { .mfi | |
1627 | nop.m 999 | |
1628 | (p14) fadd.s3 Result = Res_lo, Res_hi // Result for Res_lo denormal | |
1629 | nop.i 999 | |
1630 | } | |
1631 | { .mfi | |
1632 | nop.m 999 | |
1633 | (p15) fadd.s0 Result = Res_lo, Res_hi // Result for Res_lo normal | |
1634 | nop.i 999 | |
1635 | } | |
1636 | ;; | |
1637 | ||
0347518d | 1638 | // |
d5efd131 | 1639 | // If Res_lo is denormal test if Result equals zero |
0347518d | 1640 | // |
d5efd131 MF |
1641 | { .mfi |
1642 | nop.m 999 | |
1643 | (p14) fclass.m.unc p14, p0 = Result, 0x07 | |
1644 | nop.i 999 | |
1645 | } | |
1646 | ;; | |
1647 | ||
1648 | // | |
1649 | // If Res_lo is denormal and Result equals zero, raise inexact, underflow | |
1650 | // by squaring small double | |
1651 | // | |
1652 | { .mfb | |
1653 | nop.m 999 | |
1654 | (p14) fmpy.d.s0 FR_temp = FR_temp, FR_temp | |
1655 | br.ret.sptk b0 // Exit POLY path, 0 < Q < 2^-3 | |
1656 | } | |
1657 | ;; | |
1658 | ||
1659 | ||
0347518d | 1660 | ATANL_UNSUPPORTED: |
d5efd131 MF |
1661 | { .mfb |
1662 | nop.m 999 | |
0347518d | 1663 | fmpy.s0 Result = ArgX,ArgY |
d5efd131 MF |
1664 | br.ret.sptk b0 |
1665 | } | |
1666 | ;; | |
1667 | ||
1668 | // Here if y natval, nan, inf, zero | |
1669 | ATANL_Y_SPECIAL: | |
1670 | // Here if x natval, nan, inf, zero | |
1671 | ATANL_X_SPECIAL: | |
1672 | { .mfi | |
1673 | nop.m 999 | |
1674 | fclass.m p13,p12 = ArgY_orig, 0x0c3 // Test y nan | |
1675 | nop.i 999 | |
1676 | } | |
1677 | ;; | |
1678 | ||
1679 | { .mfi | |
1680 | nop.m 999 | |
1681 | fclass.m p15,p14 = ArgY_orig, 0x103 // Test y natval | |
1682 | nop.i 999 | |
1683 | } | |
1684 | ;; | |
1685 | ||
1686 | { .mfi | |
1687 | nop.m 999 | |
1688 | (p12) fclass.m p13,p0 = ArgX_orig, 0x0c3 // Test x nan | |
1689 | nop.i 999 | |
1690 | } | |
1691 | ;; | |
1692 | ||
1693 | { .mfi | |
1694 | nop.m 999 | |
1695 | (p14) fclass.m p15,p0 = ArgX_orig, 0x103 // Test x natval | |
1696 | nop.i 999 | |
1697 | } | |
1698 | ;; | |
1699 | ||
1700 | { .mfb | |
1701 | nop.m 999 | |
1702 | (p13) fmpy.s0 Result = ArgX_orig, ArgY_orig // Result nan if x or y nan | |
1703 | (p13) br.ret.spnt b0 // Exit if x or y nan | |
1704 | } | |
1705 | ;; | |
1706 | ||
1707 | { .mfb | |
1708 | nop.m 999 | |
1709 | (p15) fmpy.s0 Result = ArgX_orig, ArgY_orig // Result natval if x or y natval | |
1710 | (p15) br.ret.spnt b0 // Exit if x or y natval | |
1711 | } | |
1712 | ;; | |
1713 | ||
1714 | ||
1715 | // Here if x or y inf or zero | |
0347518d | 1716 | ATANL_SPECIAL_HANDLING: |
d5efd131 MF |
1717 | { .mfi |
1718 | nop.m 999 | |
1719 | fclass.m p6, p7 = ArgY_orig, 0x007 // Test y zero | |
1720 | mov special = 992 // Offset to table | |
1721 | } | |
1722 | ;; | |
1723 | ||
1724 | { .mfb | |
1725 | add table_ptr1 = table_base, special // Point to 3pi/4 | |
1726 | fcmp.eq.s0 p0, p9 = ArgX_orig, ArgY_orig // Dummy to set denormal flag | |
1727 | (p7) br.cond.spnt ATANL_ArgY_Not_ZERO // Branch if y not zero | |
1728 | } | |
1729 | ;; | |
1730 | ||
1731 | // Here if y zero | |
1732 | { .mmf | |
1733 | ldfd Result = [table_ptr1], 8 // Get pi high | |
1734 | nop.m 999 | |
1735 | fclass.m p14, p0 = ArgX, 0x035 // Test for x>=+0 | |
1736 | } | |
1737 | ;; | |
1738 | ||
1739 | { .mmf | |
1740 | nop.m 999 | |
1741 | ldfd Result_lo = [table_ptr1], -8 // Get pi lo | |
1742 | fclass.m p15, p0 = ArgX, 0x036 // Test for x<=-0 | |
1743 | } | |
1744 | ;; | |
1745 | ||
1746 | // | |
1747 | // Return sign_Y * 0 when ArgX > +0 | |
1748 | // | |
1749 | { .mfi | |
1750 | nop.m 999 | |
1751 | (p14) fmerge.s Result = ArgY, f0 // If x>=+0, y=0, hi sgn(y)*0 | |
1752 | nop.i 999 | |
1753 | } | |
1754 | ;; | |
1755 | ||
1756 | { .mfi | |
1757 | nop.m 999 | |
1758 | fclass.m p13, p0 = ArgX, 0x007 // Test for x=0 | |
1759 | nop.i 999 | |
1760 | } | |
1761 | ;; | |
1762 | ||
1763 | { .mfi | |
1764 | nop.m 999 | |
1765 | (p14) fmerge.s Result_lo = ArgY, f0 // If x>=+0, y=0, lo sgn(y)*0 | |
1766 | nop.i 999 | |
1767 | } | |
1768 | ;; | |
1769 | ||
1770 | { .mfi | |
1771 | (p13) mov GR_Parameter_TAG = 36 // Error tag for x=0, y=0 | |
1772 | nop.f 999 | |
1773 | nop.i 999 | |
1774 | } | |
1775 | ;; | |
1776 | ||
1777 | // | |
1778 | // Return sign_Y * pi when ArgX < -0 | |
1779 | // | |
1780 | { .mfi | |
1781 | nop.m 999 | |
1782 | (p15) fmerge.s Result = ArgY, Result // If x<0, y=0, hi=sgn(y)*pi | |
1783 | nop.i 999 | |
1784 | } | |
1785 | ;; | |
1786 | ||
1787 | { .mfi | |
1788 | nop.m 999 | |
1789 | (p15) fmerge.s Result_lo = ArgY, Result_lo // If x<0, y=0, lo=sgn(y)*pi | |
1790 | nop.i 999 | |
1791 | } | |
1792 | ;; | |
1793 | ||
1794 | // | |
1795 | // Call error support function for atan(0,0) | |
1796 | // | |
1797 | { .mfb | |
1798 | nop.m 999 | |
1799 | fadd.s0 Result = Result, Result_lo | |
1800 | (p13) br.cond.spnt __libm_error_region // Branch if atan(0,0) | |
1801 | } | |
1802 | ;; | |
1803 | ||
1804 | { .mib | |
1805 | nop.m 999 | |
1806 | nop.i 999 | |
1807 | br.ret.sptk b0 // Exit for y=0, x not 0 | |
1808 | } | |
1809 | ;; | |
1810 | ||
1811 | // Here if y not zero | |
0347518d | 1812 | ATANL_ArgY_Not_ZERO: |
d5efd131 MF |
1813 | { .mfi |
1814 | nop.m 999 | |
1815 | fclass.m p0, p10 = ArgY, 0x023 // Test y inf | |
1816 | nop.i 999 | |
1817 | } | |
1818 | ;; | |
1819 | ||
1820 | { .mfb | |
1821 | nop.m 999 | |
1822 | fclass.m p6, p0 = ArgX, 0x017 // Test for 0 <= |x| < inf | |
1823 | (p10) br.cond.spnt ATANL_ArgY_Not_INF // Branch if 0 < |y| < inf | |
1824 | } | |
1825 | ;; | |
1826 | ||
1827 | // Here if y=inf | |
1828 | // | |
1829 | // Return +PI/2 when ArgY = +Inf and ArgX = +/-0 or normal | |
1830 | // Return -PI/2 when ArgY = -Inf and ArgX = +/-0 or normal | |
1831 | // Return +PI/4 when ArgY = +Inf and ArgX = +Inf | |
1832 | // Return -PI/4 when ArgY = -Inf and ArgX = +Inf | |
1833 | // Return +3PI/4 when ArgY = +Inf and ArgX = -Inf | |
1834 | // Return -3PI/4 when ArgY = -Inf and ArgX = -Inf | |
1835 | // | |
1836 | { .mfi | |
1837 | nop.m 999 | |
1838 | fclass.m p7, p0 = ArgX, 0x021 // Test for x=+inf | |
1839 | nop.i 999 | |
1840 | } | |
1841 | ;; | |
1842 | ||
1843 | { .mfi | |
0347518d | 1844 | (p6) add table_ptr1 = 16, table_ptr1 // Point to pi/2, if x finite |
d5efd131 MF |
1845 | fclass.m p8, p0 = ArgX, 0x022 // Test for x=-inf |
1846 | nop.i 999 | |
1847 | } | |
1848 | ;; | |
1849 | ||
1850 | { .mmi | |
1851 | (p7) add table_ptr1 = 32, table_ptr1 // Point to pi/4 if x=+inf | |
1852 | ;; | |
1853 | (p8) add table_ptr1 = 48, table_ptr1 // Point to 3pi/4 if x=-inf | |
1854 | ||
1855 | nop.i 999 | |
1856 | } | |
1857 | ;; | |
1858 | ||
1859 | { .mmi | |
1860 | ldfd Result = [table_ptr1], 8 // Load pi/2, pi/4, or 3pi/4 hi | |
1861 | ;; | |
1862 | ldfd Result_lo = [table_ptr1], -8 // Load pi/2, pi/4, or 3pi/4 lo | |
1863 | nop.i 999 | |
1864 | } | |
1865 | ;; | |
1866 | ||
1867 | { .mfi | |
1868 | nop.m 999 | |
1869 | fmerge.s Result = ArgY, Result // Merge sgn(y) in hi | |
1870 | nop.i 999 | |
1871 | } | |
1872 | ;; | |
1873 | ||
1874 | { .mfi | |
1875 | nop.m 999 | |
1876 | fmerge.s Result_lo = ArgY, Result_lo // Merge sgn(y) in lo | |
1877 | nop.i 999 | |
1878 | } | |
1879 | ;; | |
1880 | ||
1881 | { .mfb | |
1882 | nop.m 999 | |
1883 | fadd.s0 Result = Result, Result_lo // Compute complete result | |
1884 | br.ret.sptk b0 // Exit for y=inf | |
1885 | } | |
1886 | ;; | |
1887 | ||
1888 | // Here if y not INF, and x=0 or INF | |
0347518d | 1889 | ATANL_ArgY_Not_INF: |
d5efd131 MF |
1890 | // |
1891 | // Return +PI/2 when ArgY NOT Inf, ArgY > 0 and ArgX = +/-0 | |
1892 | // Return -PI/2 when ArgY NOT Inf, ArgY < 0 and ArgX = +/-0 | |
1893 | // Return +0 when ArgY NOT Inf, ArgY > 0 and ArgX = +Inf | |
1894 | // Return -0 when ArgY NOT Inf, ArgY > 0 and ArgX = +Inf | |
1895 | // Return +PI when ArgY NOT Inf, ArgY > 0 and ArgX = -Inf | |
1896 | // Return -PI when ArgY NOT Inf, ArgY > 0 and ArgX = -Inf | |
1897 | // | |
1898 | { .mfi | |
1899 | nop.m 999 | |
1900 | fclass.m p7, p9 = ArgX, 0x021 // Test for x=+inf | |
1901 | nop.i 999 | |
1902 | } | |
1903 | ;; | |
1904 | ||
1905 | { .mfi | |
1906 | nop.m 999 | |
1907 | fclass.m p6, p0 = ArgX, 0x007 // Test for x=0 | |
1908 | nop.i 999 | |
1909 | } | |
1910 | ;; | |
1911 | ||
1912 | { .mfi | |
1913 | (p6) add table_ptr1 = 16, table_ptr1 // Point to pi/2 | |
1914 | fclass.m p8, p0 = ArgX, 0x022 // Test for x=-inf | |
1915 | nop.i 999 | |
1916 | } | |
1917 | ;; | |
1918 | ||
1919 | .pred.rel "mutex",p7,p9 | |
1920 | { .mfi | |
1921 | (p9) ldfd Result = [table_ptr1], 8 // Load pi or pi/2 hi | |
1922 | (p7) fmerge.s Result = ArgY, f0 // If y not inf, x=+inf, sgn(y)*0 | |
1923 | nop.i 999 | |
1924 | } | |
1925 | ;; | |
1926 | ||
1927 | { .mfi | |
1928 | (p9) ldfd Result_lo = [table_ptr1], -8 // Load pi or pi/2 lo | |
1929 | (p7) fnorm.s0 Result = Result // If y not inf, x=+inf normalize | |
1930 | nop.i 999 | |
1931 | } | |
1932 | ;; | |
1933 | ||
1934 | { .mfi | |
1935 | nop.m 999 | |
1936 | (p9) fmerge.s Result = ArgY, Result // Merge sgn(y) in hi | |
1937 | nop.i 999 | |
1938 | } | |
1939 | ;; | |
1940 | ||
1941 | { .mfi | |
1942 | nop.m 999 | |
1943 | (p9) fmerge.s Result_lo = ArgY, Result_lo // Merge sgn(y) in lo | |
1944 | nop.i 999 | |
1945 | } | |
1946 | ;; | |
1947 | ||
1948 | { .mfb | |
1949 | nop.m 999 | |
1950 | (p9) fadd.s0 Result = Result, Result_lo // Compute complete result | |
1951 | br.ret.spnt b0 // Exit for y not inf, x=0,inf | |
1952 | } | |
1953 | ;; | |
1954 | ||
1955 | GLOBAL_IEEE754_END(atan2l) | |
0347518d | 1956 | |
d5efd131 MF |
1957 | LOCAL_LIBM_ENTRY(__libm_error_region) |
1958 | .prologue | |
1959 | { .mfi | |
1960 | add GR_Parameter_Y=-32,sp // Parameter 2 value | |
1961 | nop.f 0 | |
1962 | .save ar.pfs,GR_SAVE_PFS | |
1963 | mov GR_SAVE_PFS=ar.pfs // Save ar.pfs | |
1964 | } | |
1965 | { .mfi | |
1966 | .fframe 64 | |
1967 | add sp=-64,sp // Create new stack | |
1968 | nop.f 0 | |
1969 | mov GR_SAVE_GP=gp // Save gp | |
1970 | };; | |
1971 | { .mmi | |
1972 | stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack | |
1973 | add GR_Parameter_X = 16,sp // Parameter 1 address | |
1974 | .save b0, GR_SAVE_B0 | |
1975 | mov GR_SAVE_B0=b0 // Save b0 | |
1976 | };; | |
1977 | .body | |
1978 | { .mib | |
1979 | stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack | |
1980 | add GR_Parameter_RESULT = 0,GR_Parameter_Y | |
1981 | nop.b 0 // Parameter 3 address | |
1982 | } | |
1983 | { .mib | |
1984 | stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack | |
1985 | add GR_Parameter_Y = -16,GR_Parameter_Y | |
1986 | br.call.sptk b0=__libm_error_support# // Call error handling function | |
1987 | };; | |
1988 | { .mmi | |
1989 | nop.m 0 | |
1990 | nop.m 0 | |
1991 | add GR_Parameter_RESULT = 48,sp | |
1992 | };; | |
1993 | { .mmi | |
1994 | ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack | |
1995 | .restore sp | |
1996 | add sp = 64,sp // Restore stack pointer | |
1997 | mov b0 = GR_SAVE_B0 // Restore return address | |
1998 | };; | |
1999 | { .mib | |
2000 | mov gp = GR_SAVE_GP // Restore gp | |
2001 | mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs | |
2002 | br.ret.sptk b0 // Return | |
2003 | };; | |
2004 | ||
2005 | LOCAL_LIBM_END(__libm_error_region#) | |
2006 | .type __libm_error_support#,@function | |
2007 | .global __libm_error_support# |