]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/ia64/fpu/libm_reduce.S
ia64: move from main tree
[thirdparty/glibc.git] / sysdeps / ia64 / fpu / libm_reduce.S
1 .file "libm_reduce.s"
2
3
4 // Copyright (c) 2000 - 2003, Intel Corporation
5 // All rights reserved.
6 //
7 // Contributed 2000 by the Intel Numerics Group, Intel Corporation
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
11 // met:
12 //
13 // * Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
15 //
16 // * Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
19 //
20 // * The name of Intel Corporation may not be used to endorse or promote
21 // products derived from this software without specific prior written
22 // permission.
23
24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 //
36 // Intel Corporation is the author of this code, and requests that all
37 // problem reports or change requests be submitted to it directly at
38 // http://www.intel.com/software/products/opensource/libraries/num.htm.
39 //
40 // History:
41 // 02/02/00 Initial Version
42 // 05/13/02 Rescheduled for speed, changed interface to pass
43 // parameters in fp registers
44 // 02/10/03 Reordered header: .section, .global, .proc, .align;
45 // used data8 for long double data storage
46 //
47 //*********************************************************************
48 //*********************************************************************
49 //
50 // Function: __libm_pi_by_two_reduce(x) return r, c, and N where
51 // x = N * pi/4 + (r+c) , where |r+c| <= pi/4.
52 // This function is not designed to be used by the
53 // general user.
54 //
55 //*********************************************************************
56 //
57 // Accuracy: Returns double-precision values
58 //
59 //*********************************************************************
60 //
61 // Resources Used:
62 //
63 // Floating-Point Registers:
64 // f8 = Input x, return value r
65 // f9 = return value c
66 // f32-f70
67 //
68 // General Purpose Registers:
69 // r8 = return value N
70 // r34-r64
71 //
72 // Predicate Registers: p6-p14
73 //
74 //*********************************************************************
75 //
76 // IEEE Special Conditions:
77 //
78 // No condions should be raised.
79 //
80 //*********************************************************************
81 //
82 // I. Introduction
83 // ===============
84 //
85 // For the forward trigonometric functions sin, cos, sincos, and
86 // tan, the original algorithms for IA 64 handle arguments up to
87 // 1 ulp less than 2^63 in magnitude. For double-extended arguments x,
88 // |x| >= 2^63, this routine returns N and r_hi, r_lo where
89 //
90 // x is accurately approximated by
91 // 2*K*pi + N * pi/2 + r_hi + r_lo, |r_hi+r_lo| <= pi/4.
92 // CASE = 1 or 2.
93 // CASE is 1 unless |r_hi + r_lo| < 2^(-33).
94 //
95 // The exact value of K is not determined, but that information is
96 // not required in trigonometric function computations.
97 //
98 // We first assume the argument x in question satisfies x >= 2^(63).
99 // In particular, it is positive. Negative x can be handled by symmetry:
100 //
101 // -x is accurately approximated by
102 // -2*K*pi + (-N) * pi/2 - (r_hi + r_lo), |r_hi+r_lo| <= pi/4.
103 //
104 // The idea of the reduction is that
105 //
106 // x * 2/pi = N_big + N + f, |f| <= 1/2
107 //
108 // Moreover, for double extended x, |f| >= 2^(-75). (This is an
109 // non-obvious fact found by enumeration using a special algorithm
110 // involving continued fraction.) The algorithm described below
111 // calculates N and an accurate approximation of f.
112 //
113 // Roughly speaking, an appropriate 256-bit (4 X 64) portion of
114 // 2/pi is multiplied with x to give the desired information.
115 //
116 // II. Representation of 2/PI
117 // ==========================
118 //
119 // The value of 2/pi in binary fixed-point is
120 //
121 // .101000101111100110......
122 //
123 // We store 2/pi in a table, starting at the position corresponding
124 // to bit position 63
125 //
126 // bit position 63 62 ... 0 -1 -2 -3 -4 -5 -6 -7 .... -16576
127 //
128 // 0 0 ... 0 . 1 0 1 0 1 0 1 .... X
129 //
130 // ^
131 // |__ implied binary pt
132 //
133 // III. Algorithm
134 // ==============
135 //
136 // This describes the algorithm in the most natural way using
137 // unsigned interger multiplication. The implementation section
138 // describes how the integer arithmetic is simulated.
139 //
140 // STEP 0. Initialization
141 // ----------------------
142 //
143 // Let the input argument x be
144 //
145 // x = 2^m * ( 1. b_1 b_2 b_3 ... b_63 ), 63 <= m <= 16383.
146 //
147 // The first crucial step is to fetch four 64-bit portions of 2/pi.
148 // To fulfill this goal, we calculate the bit position L of the
149 // beginning of these 256-bit quantity by
150 //
151 // L := 62 - m.
152 //
153 // Note that -16321 <= L <= -1 because 63 <= m <= 16383; and that
154 // the storage of 2/pi is adequate.
155 //
156 // Fetch P_1, P_2, P_3, P_4 beginning at bit position L thus:
157 //
158 // bit position L L-1 L-2 ... L-63
159 //
160 // P_1 = b b b ... b
161 //
162 // each b can be 0 or 1. Also, let P_0 be the two bits correspoding to
163 // bit positions L+2 and L+1. So, when each of the P_j is interpreted
164 // with appropriate scaling, we have
165 //
166 // 2/pi = P_big + P_0 + (P_1 + P_2 + P_3 + P_4) + P_small
167 //
168 // Note that P_big and P_small can be ignored. The reasons are as follow.
169 // First, consider P_big. If P_big = 0, we can certainly ignore it.
170 // Otherwise, P_big >= 2^(L+3). Now,
171 //
172 // P_big * ulp(x) >= 2^(L+3) * 2^(m-63)
173 // >= 2^(65-m + m-63 )
174 // >= 2^2
175 //
176 // Thus, P_big * x is an integer of the form 4*K. So
177 //
178 // x = 4*K * (pi/2) + x*(P_0 + P_1 + P_2 + P_3 + P_4)*(pi/2)
179 // + x*P_small*(pi/2).
180 //
181 // Hence, P_big*x corresponds to information that can be ignored for
182 // trigonometic function evaluation.
183 //
184 // Next, we must estimate the effect of ignoring P_small. The absolute
185 // error made by ignoring P_small is bounded by
186 //
187 // |P_small * x| <= ulp(P_4) * x
188 // <= 2^(L-255) * 2^(m+1)
189 // <= 2^(62-m-255 + m + 1)
190 // <= 2^(-192)
191 //
192 // Since for double-extended precision, x * 2/pi = integer + f,
193 // 0.5 >= |f| >= 2^(-75), the relative error introduced by ignoring
194 // P_small is bounded by 2^(-192+75) <= 2^(-117), which is acceptable.
195 //
196 // Further note that if x is split into x_hi + x_lo where x_lo is the
197 // two bits corresponding to bit positions 2^(m-62) and 2^(m-63); then
198 //
199 // P_0 * x_hi
200 //
201 // is also an integer of the form 4*K; and thus can also be ignored.
202 // Let M := P_0 * x_lo which is a small integer. The main part of the
203 // calculation is really the multiplication of x with the four pieces
204 // P_1, P_2, P_3, and P_4.
205 //
206 // Unless the reduced argument is extremely small in magnitude, it
207 // suffices to carry out the multiplication of x with P_1, P_2, and
208 // P_3. x*P_4 will be carried out and added on as a correction only
209 // when it is found to be needed. Note also that x*P_4 need not be
210 // computed exactly. A straightforward multiplication suffices since
211 // the rounding error thus produced would be bounded by 2^(-3*64),
212 // that is 2^(-192) which is small enough as the reduced argument
213 // is bounded from below by 2^(-75).
214 //
215 // Now that we have four 64-bit data representing 2/pi and a
216 // 64-bit x. We first need to calculate a highly accurate product
217 // of x and P_1, P_2, P_3. This is best understood as integer
218 // multiplication.
219 //
220 //
221 // STEP 1. Multiplication
222 // ----------------------
223 //
224 //
225 // --------- --------- ---------
226 // | P_1 | | P_2 | | P_3 |
227 // --------- --------- ---------
228 //
229 // ---------
230 // X | X |
231 // ---------
232 // ----------------------------------------------------
233 //
234 // --------- ---------
235 // | A_hi | | A_lo |
236 // --------- ---------
237 //
238 //
239 // --------- ---------
240 // | B_hi | | B_lo |
241 // --------- ---------
242 //
243 //
244 // --------- ---------
245 // | C_hi | | C_lo |
246 // --------- ---------
247 //
248 // ====================================================
249 // --------- --------- --------- ---------
250 // | S_0 | | S_1 | | S_2 | | S_3 |
251 // --------- --------- --------- ---------
252 //
253 //
254 //
255 // STEP 2. Get N and f
256 // -------------------
257 //
258 // Conceptually, after the individual pieces S_0, S_1, ..., are obtained,
259 // we have to sum them and obtain an integer part, N, and a fraction, f.
260 // Here, |f| <= 1/2, and N is an integer. Note also that N need only to
261 // be known to module 2^k, k >= 2. In the case when |f| is small enough,
262 // we would need to add in the value x*P_4.
263 //
264 //
265 // STEP 3. Get reduced argument
266 // ----------------------------
267 //
268 // The value f is not yet the reduced argument that we seek. The
269 // equation
270 //
271 // x * 2/pi = 4K + N + f
272 //
273 // says that
274 //
275 // x = 2*K*pi + N * pi/2 + f * (pi/2).
276 //
277 // Thus, the reduced argument is given by
278 //
279 // reduced argument = f * pi/2.
280 //
281 // This multiplication must be performed to extra precision.
282 //
283 // IV. Implementation
284 // ==================
285 //
286 // Step 0. Initialization
287 // ----------------------
288 //
289 // Set sgn_x := sign(x); x := |x|; x_lo := 2 lsb of x.
290 //
291 // In memory, 2/pi is stored contigously as
292 //
293 // 0x00000000 0x00000000 0xA2F....
294 // ^
295 // |__ implied binary bit
296 //
297 // Given x = 2^m * 1.xxxx...xxx; we calculate L := 62 - m. Thus
298 // -1 <= L <= -16321. We fetch from memory 5 integer pieces of data.
299 //
300 // P_0 is the two bits corresponding to bit positions L+2 and L+1
301 // P_1 is the 64-bit starting at bit position L
302 // P_2 is the 64-bit starting at bit position L-64
303 // P_3 is the 64-bit starting at bit position L-128
304 // P_4 is the 64-bit starting at bit position L-192
305 //
306 // For example, if m = 63, P_0 would be 0 and P_1 would look like
307 // 0xA2F...
308 //
309 // If m = 65, P_0 would be the two msb of 0xA, thus, P_0 is 10 in binary.
310 // P_1 in binary would be 1 0 0 0 1 0 1 1 1 1 ....
311 //
312 // Step 1. Multiplication
313 // ----------------------
314 //
315 // At this point, P_1, P_2, P_3, P_4 are integers. They are
316 // supposed to be interpreted as
317 //
318 // 2^(L-63) * P_1;
319 // 2^(L-63-64) * P_2;
320 // 2^(L-63-128) * P_3;
321 // 2^(L-63-192) * P_4;
322 //
323 // Since each of them need to be multiplied to x, we would scale
324 // both x and the P_j's by some convenient factors: scale each
325 // of P_j's up by 2^(63-L), and scale x down by 2^(L-63).
326 //
327 // p_1 := fcvt.xf ( P_1 )
328 // p_2 := fcvt.xf ( P_2 ) * 2^(-64)
329 // p_3 := fcvt.xf ( P_3 ) * 2^(-128)
330 // p_4 := fcvt.xf ( P_4 ) * 2^(-192)
331 // x := replace exponent of x by -1
332 // because 2^m * 1.xxxx...xxx * 2^(L-63)
333 // is 2^(-1) * 1.xxxx...xxx
334 //
335 // We are now faced with the task of computing the following
336 //
337 // --------- --------- ---------
338 // | P_1 | | P_2 | | P_3 |
339 // --------- --------- ---------
340 //
341 // ---------
342 // X | X |
343 // ---------
344 // ----------------------------------------------------
345 //
346 // --------- ---------
347 // | A_hi | | A_lo |
348 // --------- ---------
349 //
350 // --------- ---------
351 // | B_hi | | B_lo |
352 // --------- ---------
353 //
354 // --------- ---------
355 // | C_hi | | C_lo |
356 // --------- ---------
357 //
358 // ====================================================
359 // ----------- --------- --------- ---------
360 // | S_0 | | S_1 | | S_2 | | S_3 |
361 // ----------- --------- --------- ---------
362 // ^ ^
363 // | |___ binary point
364 // |
365 // |___ possibly one more bit
366 //
367 // Let FPSR3 be set to round towards zero with widest precision
368 // and exponent range. Unless an explicit FPSR is given,
369 // round-to-nearest with widest precision and exponent range is
370 // used.
371 //
372 // Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_C := 2^(-65).
373 //
374 // Tmp_C := fmpy.fpsr3( x, p_1 );
375 // If Tmp_C >= sigma_C then
376 // C_hi := Tmp_C;
377 // C_lo := x*p_1 - C_hi ...fma, exact
378 // Else
379 // C_hi := fadd.fpsr3(sigma_C, Tmp_C) - sigma_C
380 // ...subtraction is exact, regardless
381 // ...of rounding direction
382 // C_lo := x*p_1 - C_hi ...fma, exact
383 // End If
384 //
385 // Tmp_B := fmpy.fpsr3( x, p_2 );
386 // If Tmp_B >= sigma_B then
387 // B_hi := Tmp_B;
388 // B_lo := x*p_2 - B_hi ...fma, exact
389 // Else
390 // B_hi := fadd.fpsr3(sigma_B, Tmp_B) - sigma_B
391 // ...subtraction is exact, regardless
392 // ...of rounding direction
393 // B_lo := x*p_2 - B_hi ...fma, exact
394 // End If
395 //
396 // Tmp_A := fmpy.fpsr3( x, p_3 );
397 // If Tmp_A >= sigma_A then
398 // A_hi := Tmp_A;
399 // A_lo := x*p_3 - A_hi ...fma, exact
400 // Else
401 // A_hi := fadd.fpsr3(sigma_A, Tmp_A) - sigma_A
402 // ...subtraction is exact, regardless
403 // ...of rounding direction
404 // A_lo := x*p_3 - A_hi ...fma, exact
405 // End If
406 //
407 // ...Note that C_hi is of integer value. We need only the
408 // ...last few bits. Thus we can ensure C_hi is never a big
409 // ...integer, freeing us from overflow worry.
410 //
411 // Tmp_C := fadd.fpsr3( C_hi, 2^(70) ) - 2^(70);
412 // ...Tmp_C is the upper portion of C_hi
413 // C_hi := C_hi - Tmp_C
414 // ...0 <= C_hi < 2^7
415 //
416 // Step 2. Get N and f
417 // -------------------
418 //
419 // At this point, we have all the components to obtain
420 // S_0, S_1, S_2, S_3 and thus N and f. We start by adding
421 // C_lo and B_hi. This sum together with C_hi gives a good
422 // estimation of N and f.
423 //
424 // A := fadd.fpsr3( B_hi, C_lo )
425 // B := max( B_hi, C_lo )
426 // b := min( B_hi, C_lo )
427 //
428 // a := (B - A) + b ...exact. Note that a is either 0
429 // ...or 2^(-64).
430 //
431 // N := round_to_nearest_integer_value( A );
432 // f := A - N; ...exact because lsb(A) >= 2^(-64)
433 // ...and |f| <= 1/2.
434 //
435 // f := f + a ...exact because a is 0 or 2^(-64);
436 // ...the msb of the sum is <= 1/2
437 // ...lsb >= 2^(-64).
438 //
439 // N := convert to integer format( C_hi + N );
440 // M := P_0 * x_lo;
441 // N := N + M;
442 //
443 // If sgn_x == 1 (that is original x was negative)
444 // N := 2^10 - N
445 // ...this maintains N to be non-negative, but still
446 // ...equivalent to the (negated N) mod 4.
447 // End If
448 //
449 // If |f| >= 2^(-33)
450 //
451 // ...Case 1
452 // CASE := 1
453 // g := A_hi + B_lo;
454 // s_hi := f + g;
455 // s_lo := (f - s_hi) + g;
456 //
457 // Else
458 //
459 // ...Case 2
460 // CASE := 2
461 // A := fadd.fpsr3( A_hi, B_lo )
462 // B := max( A_hi, B_lo )
463 // b := min( A_hi, B_lo )
464 //
465 // a := (B - A) + b ...exact. Note that a is either 0
466 // ...or 2^(-128).
467 //
468 // f_hi := A + f;
469 // f_lo := (f - f_hi) + A;
470 // ...this is exact.
471 // ...f-f_hi is exact because either |f| >= |A|, in which
472 // ...case f-f_hi is clearly exact; or otherwise, 0<|f|<|A|
473 // ...means msb(f) <= msb(A) = 2^(-64) => |f| = 2^(-64).
474 // ...If f = 2^(-64), f-f_hi involves cancellation and is
475 // ...exact. If f = -2^(-64), then A + f is exact. Hence
476 // ...f-f_hi is -A exactly, giving f_lo = 0.
477 //
478 // f_lo := f_lo + a;
479 //
480 // If |f| >= 2^(-50) then
481 // s_hi := f_hi;
482 // s_lo := f_lo;
483 // Else
484 // f_lo := (f_lo + A_lo) + x*p_4
485 // s_hi := f_hi + f_lo
486 // s_lo := (f_hi - s_hi) + f_lo
487 // End If
488 //
489 // End If
490 //
491 // Step 3. Get reduced argument
492 // ----------------------------
493 //
494 // If sgn_x == 0 (that is original x is positive)
495 //
496 // D_hi := Pi_by_2_hi
497 // D_lo := Pi_by_2_lo
498 // ...load from table
499 //
500 // Else
501 //
502 // D_hi := neg_Pi_by_2_hi
503 // D_lo := neg_Pi_by_2_lo
504 // ...load from table
505 // End If
506 //
507 // r_hi := s_hi*D_hi
508 // r_lo := s_hi*D_hi - r_hi ...fma
509 // r_lo := (s_hi*D_lo + r_lo) + s_lo*D_hi
510 //
511 // Return N, r_hi, r_lo
512 //
513 FR_input_X = f8
514 FR_r_hi = f8
515 FR_r_lo = f9
516
517 FR_X = f32
518 FR_N = f33
519 FR_p_1 = f34
520 FR_TWOM33 = f35
521 FR_TWOM50 = f36
522 FR_g = f37
523 FR_p_2 = f38
524 FR_f = f39
525 FR_s_lo = f40
526 FR_p_3 = f41
527 FR_f_abs = f42
528 FR_D_lo = f43
529 FR_p_4 = f44
530 FR_D_hi = f45
531 FR_Tmp2_C = f46
532 FR_s_hi = f47
533 FR_sigma_A = f48
534 FR_A = f49
535 FR_sigma_B = f50
536 FR_B = f51
537 FR_sigma_C = f52
538 FR_b = f53
539 FR_ScaleP2 = f54
540 FR_ScaleP3 = f55
541 FR_ScaleP4 = f56
542 FR_Tmp_A = f57
543 FR_Tmp_B = f58
544 FR_Tmp_C = f59
545 FR_A_hi = f60
546 FR_f_hi = f61
547 FR_RSHF = f62
548 FR_A_lo = f63
549 FR_B_hi = f64
550 FR_a = f65
551 FR_B_lo = f66
552 FR_f_lo = f67
553 FR_N_fix = f68
554 FR_C_hi = f69
555 FR_C_lo = f70
556
557 GR_N = r8
558 GR_Exp_x = r36
559 GR_Temp = r37
560 GR_BIASL63 = r38
561 GR_CASE = r39
562 GR_x_lo = r40
563 GR_sgn_x = r41
564 GR_M = r42
565 GR_BASE = r43
566 GR_LENGTH1 = r44
567 GR_LENGTH2 = r45
568 GR_ASUB = r46
569 GR_P_0 = r47
570 GR_P_1 = r48
571 GR_P_2 = r49
572 GR_P_3 = r50
573 GR_P_4 = r51
574 GR_START = r52
575 GR_SEGMENT = r53
576 GR_A = r54
577 GR_B = r55
578 GR_C = r56
579 GR_D = r57
580 GR_E = r58
581 GR_TEMP1 = r59
582 GR_TEMP2 = r60
583 GR_TEMP3 = r61
584 GR_TEMP4 = r62
585 GR_TEMP5 = r63
586 GR_TEMP6 = r64
587 GR_rshf = r64
588
589 RODATA
590 .align 64
591
592 LOCAL_OBJECT_START(Constants_Bits_of_2_by_pi)
593 data8 0x0000000000000000,0xA2F9836E4E441529
594 data8 0xFC2757D1F534DDC0,0xDB6295993C439041
595 data8 0xFE5163ABDEBBC561,0xB7246E3A424DD2E0
596 data8 0x06492EEA09D1921C,0xFE1DEB1CB129A73E
597 data8 0xE88235F52EBB4484,0xE99C7026B45F7E41
598 data8 0x3991D639835339F4,0x9C845F8BBDF9283B
599 data8 0x1FF897FFDE05980F,0xEF2F118B5A0A6D1F
600 data8 0x6D367ECF27CB09B7,0x4F463F669E5FEA2D
601 data8 0x7527BAC7EBE5F17B,0x3D0739F78A5292EA
602 data8 0x6BFB5FB11F8D5D08,0x56033046FC7B6BAB
603 data8 0xF0CFBC209AF4361D,0xA9E391615EE61B08
604 data8 0x6599855F14A06840,0x8DFFD8804D732731
605 data8 0x06061556CA73A8C9,0x60E27BC08C6B47C4
606 data8 0x19C367CDDCE8092A,0x8359C4768B961CA6
607 data8 0xDDAF44D15719053E,0xA5FF07053F7E33E8
608 data8 0x32C2DE4F98327DBB,0xC33D26EF6B1E5EF8
609 data8 0x9F3A1F35CAF27F1D,0x87F121907C7C246A
610 data8 0xFA6ED5772D30433B,0x15C614B59D19C3C2
611 data8 0xC4AD414D2C5D000C,0x467D862D71E39AC6
612 data8 0x9B0062337CD2B497,0xA7B4D55537F63ED7
613 data8 0x1810A3FC764D2A9D,0x64ABD770F87C6357
614 data8 0xB07AE715175649C0,0xD9D63B3884A7CB23
615 data8 0x24778AD623545AB9,0x1F001B0AF1DFCE19
616 data8 0xFF319F6A1E666157,0x9947FBACD87F7EB7
617 data8 0x652289E83260BFE6,0xCDC4EF09366CD43F
618 data8 0x5DD7DE16DE3B5892,0x9BDE2822D2E88628
619 data8 0x4D58E232CAC616E3,0x08CB7DE050C017A7
620 data8 0x1DF35BE01834132E,0x6212830148835B8E
621 data8 0xF57FB0ADF2E91E43,0x4A48D36710D8DDAA
622 data8 0x425FAECE616AA428,0x0AB499D3F2A6067F
623 data8 0x775C83C2A3883C61,0x78738A5A8CAFBDD7
624 data8 0x6F63A62DCBBFF4EF,0x818D67C12645CA55
625 data8 0x36D9CAD2A8288D61,0xC277C9121426049B
626 data8 0x4612C459C444C5C8,0x91B24DF31700AD43
627 data8 0xD4E5492910D5FDFC,0xBE00CC941EEECE70
628 data8 0xF53E1380F1ECC3E7,0xB328F8C79405933E
629 data8 0x71C1B3092EF3450B,0x9C12887B20AB9FB5
630 data8 0x2EC292472F327B6D,0x550C90A7721FE76B
631 data8 0x96CB314A1679E279,0x4189DFF49794E884
632 data8 0xE6E29731996BED88,0x365F5F0EFDBBB49A
633 data8 0x486CA46742727132,0x5D8DB8159F09E5BC
634 data8 0x25318D3974F71C05,0x30010C0D68084B58
635 data8 0xEE2C90AA4702E774,0x24D6BDA67DF77248
636 data8 0x6EEF169FA6948EF6,0x91B45153D1F20ACF
637 data8 0x3398207E4BF56863,0xB25F3EDD035D407F
638 data8 0x8985295255C06437,0x10D86D324832754C
639 data8 0x5BD4714E6E5445C1,0x090B69F52AD56614
640 data8 0x9D072750045DDB3B,0xB4C576EA17F9877D
641 data8 0x6B49BA271D296996,0xACCCC65414AD6AE2
642 data8 0x9089D98850722CBE,0xA4049407777030F3
643 data8 0x27FC00A871EA49C2,0x663DE06483DD9797
644 data8 0x3FA3FD94438C860D,0xDE41319D39928C70
645 data8 0xDDE7B7173BDF082B,0x3715A0805C93805A
646 data8 0x921110D8E80FAF80,0x6C4BFFDB0F903876
647 data8 0x185915A562BBCB61,0xB989C7BD401004F2
648 data8 0xD2277549F6B6EBBB,0x22DBAA140A2F2689
649 data8 0x768364333B091A94,0x0EAA3A51C2A31DAE
650 data8 0xEDAF12265C4DC26D,0x9C7A2D9756C0833F
651 data8 0x03F6F0098C402B99,0x316D07B43915200C
652 data8 0x5BC3D8C492F54BAD,0xC6A5CA4ECD37A736
653 data8 0xA9E69492AB6842DD,0xDE6319EF8C76528B
654 data8 0x6837DBFCABA1AE31,0x15DFA1AE00DAFB0C
655 data8 0x664D64B705ED3065,0x29BF56573AFF47B9
656 data8 0xF96AF3BE75DF9328,0x3080ABF68C6615CB
657 data8 0x040622FA1DE4D9A4,0xB33D8F1B5709CD36
658 data8 0xE9424EA4BE13B523,0x331AAAF0A8654FA5
659 data8 0xC1D20F3F0BCD785B,0x76F923048B7B7217
660 data8 0x8953A6C6E26E6F00,0xEBEF584A9BB7DAC4
661 data8 0xBA66AACFCF761D02,0xD12DF1B1C1998C77
662 data8 0xADC3DA4886A05DF7,0xF480C62FF0AC9AEC
663 data8 0xDDBC5C3F6DDED01F,0xC790B6DB2A3A25A3
664 data8 0x9AAF009353AD0457,0xB6B42D297E804BA7
665 data8 0x07DA0EAA76A1597B,0x2A12162DB7DCFDE5
666 data8 0xFAFEDB89FDBE896C,0x76E4FCA90670803E
667 data8 0x156E85FF87FD073E,0x2833676186182AEA
668 data8 0xBD4DAFE7B36E6D8F,0x3967955BBF3148D7
669 data8 0x8416DF30432DC735,0x6125CE70C9B8CB30
670 data8 0xFD6CBFA200A4E46C,0x05A0DD5A476F21D2
671 data8 0x1262845CB9496170,0xE0566B0152993755
672 data8 0x50B7D51EC4F1335F,0x6E13E4305DA92E85
673 data8 0xC3B21D3632A1A4B7,0x08D4B1EA21F716E4
674 data8 0x698F77FF2780030C,0x2D408DA0CD4F99A5
675 data8 0x20D3A2B30A5D2F42,0xF9B4CBDA11D0BE7D
676 data8 0xC1DB9BBD17AB81A2,0xCA5C6A0817552E55
677 data8 0x0027F0147F8607E1,0x640B148D4196DEBE
678 data8 0x872AFDDAB6256B34,0x897BFEF3059EBFB9
679 data8 0x4F6A68A82A4A5AC4,0x4FBCF82D985AD795
680 data8 0xC7F48D4D0DA63A20,0x5F57A4B13F149538
681 data8 0x800120CC86DD71B6,0xDEC9F560BF11654D
682 data8 0x6B0701ACB08CD0C0,0xB24855510EFB1EC3
683 data8 0x72953B06A33540C0,0x7BDC06CC45E0FA29
684 data8 0x4EC8CAD641F3E8DE,0x647CD8649B31BED9
685 data8 0xC397A4D45877C5E3,0x6913DAF03C3ABA46
686 data8 0x18465F7555F5BDD2,0xC6926E5D2EACED44
687 data8 0x0E423E1C87C461E9,0xFD29F3D6E7CA7C22
688 data8 0x35916FC5E0088DD7,0xFFE26A6EC6FDB0C1
689 data8 0x0893745D7CB2AD6B,0x9D6ECD7B723E6A11
690 data8 0xC6A9CFF7DF7329BA,0xC9B55100B70DB2E2
691 data8 0x24BA74607DE58AD8,0x742C150D0C188194
692 data8 0x667E162901767A9F,0xBEFDFDEF4556367E
693 data8 0xD913D9ECB9BA8BFC,0x97C427A831C36EF1
694 data8 0x36C59456A8D8B5A8,0xB40ECCCF2D891234
695 data8 0x576F89562CE3CE99,0xB920D6AA5E6B9C2A
696 data8 0x3ECC5F114A0BFDFB,0xF4E16D3B8E2C86E2
697 data8 0x84D4E9A9B4FCD1EE,0xEFC9352E61392F44
698 data8 0x2138C8D91B0AFC81,0x6A4AFBD81C2F84B4
699 data8 0x538C994ECC2254DC,0x552AD6C6C096190B
700 data8 0xB8701A649569605A,0x26EE523F0F117F11
701 data8 0xB5F4F5CBFC2DBC34,0xEEBC34CC5DE8605E
702 data8 0xDD9B8E67EF3392B8,0x17C99B5861BC57E1
703 data8 0xC68351103ED84871,0xDDDD1C2DA118AF46
704 data8 0x2C21D7F359987AD9,0xC0549EFA864FFC06
705 data8 0x56AE79E536228922,0xAD38DC9367AAE855
706 data8 0x3826829BE7CAA40D,0x51B133990ED7A948
707 data8 0x0569F0B265A7887F,0x974C8836D1F9B392
708 data8 0x214A827B21CF98DC,0x9F405547DC3A74E1
709 data8 0x42EB67DF9DFE5FD4,0x5EA4677B7AACBAA2
710 data8 0xF65523882B55BA41,0x086E59862A218347
711 data8 0x39E6E389D49EE540,0xFB49E956FFCA0F1C
712 data8 0x8A59C52BFA94C5C1,0xD3CFC50FAE5ADB86
713 data8 0xC5476243853B8621,0x94792C8761107B4C
714 data8 0x2A1A2C8012BF4390,0x2688893C78E4C4A8
715 data8 0x7BDBE5C23AC4EAF4,0x268A67F7BF920D2B
716 data8 0xA365B1933D0B7CBD,0xDC51A463DD27DDE1
717 data8 0x6919949A9529A828,0xCE68B4ED09209F44
718 data8 0xCA984E638270237C,0x7E32B90F8EF5A7E7
719 data8 0x561408F1212A9DB5,0x4D7E6F5119A5ABF9
720 data8 0xB5D6DF8261DD9602,0x36169F3AC4A1A283
721 data8 0x6DED727A8D39A9B8,0x825C326B5B2746ED
722 data8 0x34007700D255F4FC,0x4D59018071E0E13F
723 data8 0x89B295F364A8F1AE,0xA74B38FC4CEAB2BB
724 LOCAL_OBJECT_END(Constants_Bits_of_2_by_pi)
725
726 LOCAL_OBJECT_START(Constants_Bits_of_pi_by_2)
727 data8 0xC90FDAA22168C234,0x00003FFF
728 data8 0xC4C6628B80DC1CD1,0x00003FBF
729 LOCAL_OBJECT_END(Constants_Bits_of_pi_by_2)
730
731 .section .text
732 .global __libm_pi_by_2_reduce#
733 .proc __libm_pi_by_2_reduce#
734 .align 32
735
736 __libm_pi_by_2_reduce:
737
738 // X is in f8
739 // Place the two-piece result r (r_hi) in f8 and c (r_lo) in f9
740 // N is returned in r8
741
742 { .mfi
743 alloc r34 = ar.pfs,2,34,0,0
744 fsetc.s3 0x00,0x7F // Set sf3 to round to zero, 82-bit prec, td, ftz
745 nop.i 999
746 }
747 { .mfi
748 addl GR_BASE = @ltoff(Constants_Bits_of_2_by_pi#), gp
749 nop.f 999
750 mov GR_BIASL63 = 0x1003E
751 }
752 ;;
753
754
755 // L -1-2-3-4
756 // 0 0 0 0 0. 1 0 1 0
757 // M 0 1 2 .... 63, 64 65 ... 127, 128
758 // ---------------------------------------------
759 // Segment 0. 1 , 2 , 3
760 // START = M - 63 M = 128 becomes 65
761 // LENGTH1 = START & 0x3F 65 become position 1
762 // SEGMENT = shr(START,6) + 1 0 maps to 1, 64 maps to 2,
763 // LENGTH2 = 64 - LENGTH1
764 // Address_BASE = shladd(SEGMENT,3) + BASE
765
766
767 { .mmi
768 getf.exp GR_Exp_x = FR_input_X
769 ld8 GR_BASE = [GR_BASE]
770 mov GR_TEMP5 = 0x0FFFE
771 }
772 ;;
773
774 // Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_A := 2^(-65).
775 { .mmi
776 getf.sig GR_x_lo = FR_input_X
777 mov GR_TEMP6 = 0x0FFBE
778 nop.i 999
779 }
780 ;;
781
782 // Special Code for testing DE arguments
783 // movl GR_BIASL63 = 0x0000000000013FFE
784 // movl GR_x_lo = 0xFFFFFFFFFFFFFFFF
785 // setf.exp FR_X = GR_BIASL63
786 // setf.sig FR_ScaleP3 = GR_x_lo
787 // fmerge.se FR_X = FR_X,FR_ScaleP3
788 // Set sgn_x := sign(x); x := |x|; x_lo := 2 lsb of x.
789 // 2/pi is stored contigously as
790 // 0x00000000 0x00000000.0xA2F....
791 // M = EXP - BIAS ( M >= 63)
792 // Given x = 2^m * 1.xxxx...xxx; we calculate L := 62 - m.
793 // Thus -1 <= L <= -16321.
794 { .mmi
795 setf.exp FR_sigma_B = GR_TEMP5
796 setf.exp FR_sigma_A = GR_TEMP6
797 extr.u GR_M = GR_Exp_x,0,17
798 }
799 ;;
800
801 { .mii
802 and GR_x_lo = 0x03,GR_x_lo
803 sub GR_START = GR_M,GR_BIASL63
804 add GR_BASE = 8,GR_BASE // To effectively add 1 to SEGMENT
805 }
806 ;;
807
808 { .mii
809 and GR_LENGTH1 = 0x3F,GR_START
810 shr.u GR_SEGMENT = GR_START,6
811 nop.i 999
812 }
813 ;;
814
815 { .mmi
816 shladd GR_BASE = GR_SEGMENT,3,GR_BASE
817 sub GR_LENGTH2 = 0x40,GR_LENGTH1
818 cmp.le p6,p7 = 0x2,GR_LENGTH1
819 }
820 ;;
821
822 // P_0 is the two bits corresponding to bit positions L+2 and L+1
823 // P_1 is the 64-bit starting at bit position L
824 // P_2 is the 64-bit starting at bit position L-64
825 // P_3 is the 64-bit starting at bit position L-128
826 // P_4 is the 64-bit starting at bit position L-192
827 // P_1 is made up of Alo and Bhi
828 // P_1 = deposit Alo, position 0, length2 into P_1,position length1
829 // deposit Bhi, position length2, length1 into P_1, position 0
830 // P_2 is made up of Blo and Chi
831 // P_2 = deposit Blo, position 0, length2 into P_2, position length1
832 // deposit Chi, position length2, length1 into P_2, position 0
833 // P_3 is made up of Clo and Dhi
834 // P_3 = deposit Clo, position 0, length2 into P_3, position length1
835 // deposit Dhi, position length2, length1 into P_3, position 0
836 // P_4 is made up of Clo and Dhi
837 // P_4 = deposit Dlo, position 0, length2 into P_4, position length1
838 // deposit Ehi, position length2, length1 into P_4, position 0
839 { .mfi
840 ld8 GR_A = [GR_BASE],8
841 fabs FR_X = FR_input_X
842 (p7) cmp.eq.unc p8,p9 = 0x1,GR_LENGTH1
843 }
844 ;;
845
846 // ld_64 A at Base and increment Base by 8
847 // ld_64 B at Base and increment Base by 8
848 // ld_64 C at Base and increment Base by 8
849 // ld_64 D at Base and increment Base by 8
850 // ld_64 E at Base and increment Base by 8
851 // A/B/C/D
852 // ---------------------
853 // A, B, C, D, and E look like | length1 | length2 |
854 // ---------------------
855 // hi lo
856 { .mlx
857 ld8 GR_B = [GR_BASE],8
858 movl GR_rshf = 0x43e8000000000000 // 1.10000 2^63 for right shift N_fix
859 }
860 ;;
861
862 { .mmi
863 ld8 GR_C = [GR_BASE],8
864 nop.m 999
865 (p8) extr.u GR_Temp = GR_A,63,1
866 }
867 ;;
868
869 // If length1 >= 2,
870 // P_0 = deposit Ahi, position length2, 2 bit into P_0 at position 0.
871 { .mii
872 ld8 GR_D = [GR_BASE],8
873 shl GR_TEMP1 = GR_A,GR_LENGTH1 // MM instruction
874 (p6) shr.u GR_P_0 = GR_A,GR_LENGTH2 // MM instruction
875 }
876 ;;
877
878 { .mii
879 ld8 GR_E = [GR_BASE],-40
880 shl GR_TEMP2 = GR_B,GR_LENGTH1 // MM instruction
881 shr.u GR_P_1 = GR_B,GR_LENGTH2 // MM instruction
882 }
883 ;;
884
885 // Else
886 // Load 16 bit of ASUB from (Base_Address_of_A - 2)
887 // P_0 = ASUB & 0x3
888 // If length1 == 0,
889 // P_0 complete
890 // Else
891 // Deposit element 63 from Ahi and place in element 0 of P_0.
892 // Endif
893 // Endif
894
895 { .mii
896 (p7) ld2 GR_ASUB = [GR_BASE],8
897 shl GR_TEMP3 = GR_C,GR_LENGTH1 // MM instruction
898 shr.u GR_P_2 = GR_C,GR_LENGTH2 // MM instruction
899 }
900 ;;
901
902 { .mii
903 setf.d FR_RSHF = GR_rshf // Form right shift const 1.100 * 2^63
904 shl GR_TEMP4 = GR_D,GR_LENGTH1 // MM instruction
905 shr.u GR_P_3 = GR_D,GR_LENGTH2 // MM instruction
906 }
907 ;;
908
909 { .mmi
910 (p7) and GR_P_0 = 0x03,GR_ASUB
911 (p6) and GR_P_0 = 0x03,GR_P_0
912 shr.u GR_P_4 = GR_E,GR_LENGTH2 // MM instruction
913 }
914 ;;
915
916 { .mmi
917 nop.m 999
918 or GR_P_1 = GR_P_1,GR_TEMP1
919 (p8) and GR_P_0 = 0x1,GR_P_0
920 }
921 ;;
922
923 { .mmi
924 setf.sig FR_p_1 = GR_P_1
925 or GR_P_2 = GR_P_2,GR_TEMP2
926 (p8) shladd GR_P_0 = GR_P_0,1,GR_Temp
927 }
928 ;;
929
930 { .mmf
931 setf.sig FR_p_2 = GR_P_2
932 or GR_P_3 = GR_P_3,GR_TEMP3
933 fmerge.se FR_X = FR_sigma_B,FR_X
934 }
935 ;;
936
937 { .mmi
938 setf.sig FR_p_3 = GR_P_3
939 or GR_P_4 = GR_P_4,GR_TEMP4
940 pmpy2.r GR_M = GR_P_0,GR_x_lo
941 }
942 ;;
943
944 // P_1, P_2, P_3, P_4 are integers. They should be
945 // 2^(L-63) * P_1;
946 // 2^(L-63-64) * P_2;
947 // 2^(L-63-128) * P_3;
948 // 2^(L-63-192) * P_4;
949 // Since each of them need to be multiplied to x, we would scale
950 // both x and the P_j's by some convenient factors: scale each
951 // of P_j's up by 2^(63-L), and scale x down by 2^(L-63).
952 // p_1 := fcvt.xf ( P_1 )
953 // p_2 := fcvt.xf ( P_2 ) * 2^(-64)
954 // p_3 := fcvt.xf ( P_3 ) * 2^(-128)
955 // p_4 := fcvt.xf ( P_4 ) * 2^(-192)
956 // x= Set x's exp to -1 because 2^m*1.x...x *2^(L-63)=2^(-1)*1.x...xxx
957 // --------- --------- ---------
958 // | P_1 | | P_2 | | P_3 |
959 // --------- --------- ---------
960 // ---------
961 // X | X |
962 // ---------
963 // ----------------------------------------------------
964 // --------- ---------
965 // | A_hi | | A_lo |
966 // --------- ---------
967 // --------- ---------
968 // | B_hi | | B_lo |
969 // --------- ---------
970 // --------- ---------
971 // | C_hi | | C_lo |
972 // --------- ---------
973 // ====================================================
974 // ----------- --------- --------- ---------
975 // | S_0 | | S_1 | | S_2 | | S_3 |
976 // ----------- --------- --------- ---------
977 // | |___ binary point
978 // |___ possibly one more bit
979 //
980 // Let FPSR3 be set to round towards zero with widest precision
981 // and exponent range. Unless an explicit FPSR is given,
982 // round-to-nearest with widest precision and exponent range is
983 // used.
984 { .mmi
985 setf.sig FR_p_4 = GR_P_4
986 mov GR_TEMP1 = 0x0FFBF
987 nop.i 999
988 }
989 ;;
990
991 { .mmi
992 setf.exp FR_ScaleP2 = GR_TEMP1
993 mov GR_TEMP2 = 0x0FF7F
994 nop.i 999
995 }
996 ;;
997
998 { .mmi
999 setf.exp FR_ScaleP3 = GR_TEMP2
1000 mov GR_TEMP4 = 0x1003E
1001 nop.i 999
1002 }
1003 ;;
1004
1005 { .mmf
1006 setf.exp FR_sigma_C = GR_TEMP4
1007 mov GR_Temp = 0x0FFDE
1008 fcvt.xuf.s1 FR_p_1 = FR_p_1
1009 }
1010 ;;
1011
1012 { .mfi
1013 setf.exp FR_TWOM33 = GR_Temp
1014 fcvt.xuf.s1 FR_p_2 = FR_p_2
1015 nop.i 999
1016 }
1017 ;;
1018
1019 { .mfi
1020 nop.m 999
1021 fcvt.xuf.s1 FR_p_3 = FR_p_3
1022 nop.i 999
1023 }
1024 ;;
1025
1026 { .mfi
1027 nop.m 999
1028 fcvt.xuf.s1 FR_p_4 = FR_p_4
1029 nop.i 999
1030 }
1031 ;;
1032
1033 // Tmp_C := fmpy.fpsr3( x, p_1 );
1034 // Tmp_B := fmpy.fpsr3( x, p_2 );
1035 // Tmp_A := fmpy.fpsr3( x, p_3 );
1036 // If Tmp_C >= sigma_C then
1037 // C_hi := Tmp_C;
1038 // C_lo := x*p_1 - C_hi ...fma, exact
1039 // Else
1040 // C_hi := fadd.fpsr3(sigma_C, Tmp_C) - sigma_C
1041 // C_lo := x*p_1 - C_hi ...fma, exact
1042 // End If
1043 // If Tmp_B >= sigma_B then
1044 // B_hi := Tmp_B;
1045 // B_lo := x*p_2 - B_hi ...fma, exact
1046 // Else
1047 // B_hi := fadd.fpsr3(sigma_B, Tmp_B) - sigma_B
1048 // B_lo := x*p_2 - B_hi ...fma, exact
1049 // End If
1050 // If Tmp_A >= sigma_A then
1051 // A_hi := Tmp_A;
1052 // A_lo := x*p_3 - A_hi ...fma, exact
1053 // Else
1054 // A_hi := fadd.fpsr3(sigma_A, Tmp_A) - sigma_A
1055 // Exact, regardless ...of rounding direction
1056 // A_lo := x*p_3 - A_hi ...fma, exact
1057 // Endif
1058 { .mfi
1059 nop.m 999
1060 fmpy.s3 FR_Tmp_C = FR_X,FR_p_1
1061 nop.i 999
1062 }
1063 ;;
1064
1065 { .mfi
1066 mov GR_TEMP3 = 0x0FF3F
1067 fmpy.s1 FR_p_2 = FR_p_2,FR_ScaleP2
1068 nop.i 999
1069 }
1070 ;;
1071
1072 { .mmf
1073 setf.exp FR_ScaleP4 = GR_TEMP3
1074 mov GR_TEMP4 = 0x10045
1075 fmpy.s1 FR_p_3 = FR_p_3,FR_ScaleP3
1076 }
1077 ;;
1078
1079 { .mfi
1080 nop.m 999
1081 fadd.s3 FR_C_hi = FR_sigma_C,FR_Tmp_C // For Tmp_C < sigma_C case
1082 nop.i 999
1083 }
1084 ;;
1085
1086 { .mmf
1087 setf.exp FR_Tmp2_C = GR_TEMP4
1088 nop.m 999
1089 fmpy.s3 FR_Tmp_B = FR_X,FR_p_2
1090 }
1091 ;;
1092
1093 { .mfi
1094 addl GR_BASE = @ltoff(Constants_Bits_of_pi_by_2#), gp
1095 fcmp.ge.s1 p12, p9 = FR_Tmp_C,FR_sigma_C
1096 nop.i 999
1097 }
1098 { .mfi
1099 nop.m 999
1100 fmpy.s3 FR_Tmp_A = FR_X,FR_p_3
1101 nop.i 99
1102 }
1103 ;;
1104
1105 { .mfi
1106 ld8 GR_BASE = [GR_BASE]
1107 (p12) mov FR_C_hi = FR_Tmp_C
1108 nop.i 999
1109 }
1110 { .mfi
1111 nop.m 999
1112 (p9) fsub.s1 FR_C_hi = FR_C_hi,FR_sigma_C
1113 nop.i 999
1114 }
1115 ;;
1116
1117
1118
1119 // End If
1120 // Step 3. Get reduced argument
1121 // If sgn_x == 0 (that is original x is positive)
1122 // D_hi := Pi_by_2_hi
1123 // D_lo := Pi_by_2_lo
1124 // Load from table
1125 // Else
1126 // D_hi := neg_Pi_by_2_hi
1127 // D_lo := neg_Pi_by_2_lo
1128 // Load from table
1129 // End If
1130
1131 { .mfi
1132 nop.m 999
1133 fmpy.s1 FR_p_4 = FR_p_4,FR_ScaleP4
1134 nop.i 999
1135 }
1136 { .mfi
1137 nop.m 999
1138 fadd.s3 FR_B_hi = FR_sigma_B,FR_Tmp_B // For Tmp_B < sigma_B case
1139 nop.i 999
1140 }
1141 ;;
1142
1143 { .mfi
1144 nop.m 999
1145 fadd.s3 FR_A_hi = FR_sigma_A,FR_Tmp_A // For Tmp_A < sigma_A case
1146 nop.i 999
1147 }
1148 ;;
1149
1150 { .mfi
1151 nop.m 999
1152 fcmp.ge.s1 p13, p10 = FR_Tmp_B,FR_sigma_B
1153 nop.i 999
1154 }
1155 { .mfi
1156 nop.m 999
1157 fms.s1 FR_C_lo = FR_X,FR_p_1,FR_C_hi
1158 nop.i 999
1159 }
1160 ;;
1161
1162 { .mfi
1163 ldfe FR_D_hi = [GR_BASE],16
1164 fcmp.ge.s1 p14, p11 = FR_Tmp_A,FR_sigma_A
1165 nop.i 999
1166 }
1167 ;;
1168
1169 { .mfi
1170 ldfe FR_D_lo = [GR_BASE]
1171 (p13) mov FR_B_hi = FR_Tmp_B
1172 nop.i 999
1173 }
1174 { .mfi
1175 nop.m 999
1176 (p10) fsub.s1 FR_B_hi = FR_B_hi,FR_sigma_B
1177 nop.i 999
1178 }
1179 ;;
1180
1181 { .mfi
1182 nop.m 999
1183 (p14) mov FR_A_hi = FR_Tmp_A
1184 nop.i 999
1185 }
1186 { .mfi
1187 nop.m 999
1188 (p11) fsub.s1 FR_A_hi = FR_A_hi,FR_sigma_A
1189 nop.i 999
1190 }
1191 ;;
1192
1193 // Note that C_hi is of integer value. We need only the
1194 // last few bits. Thus we can ensure C_hi is never a big
1195 // integer, freeing us from overflow worry.
1196 // Tmp_C := fadd.fpsr3( C_hi, 2^(70) ) - 2^(70);
1197 // Tmp_C is the upper portion of C_hi
1198 { .mfi
1199 nop.m 999
1200 fadd.s3 FR_Tmp_C = FR_C_hi,FR_Tmp2_C
1201 tbit.z p12,p9 = GR_Exp_x, 17
1202 }
1203 ;;
1204
1205 { .mfi
1206 nop.m 999
1207 fms.s1 FR_B_lo = FR_X,FR_p_2,FR_B_hi
1208 nop.i 999
1209 }
1210 { .mfi
1211 nop.m 999
1212 fadd.s3 FR_A = FR_B_hi,FR_C_lo
1213 nop.i 999
1214 }
1215 ;;
1216
1217 { .mfi
1218 nop.m 999
1219 fms.s1 FR_A_lo = FR_X,FR_p_3,FR_A_hi
1220 nop.i 999
1221 }
1222 ;;
1223
1224 { .mfi
1225 nop.m 999
1226 fsub.s1 FR_Tmp_C = FR_Tmp_C,FR_Tmp2_C
1227 nop.i 999
1228 }
1229 ;;
1230
1231 // *******************
1232 // Step 2. Get N and f
1233 // *******************
1234 // We have all the components to obtain
1235 // S_0, S_1, S_2, S_3 and thus N and f. We start by adding
1236 // C_lo and B_hi. This sum together with C_hi estimates
1237 // N and f well.
1238 // A := fadd.fpsr3( B_hi, C_lo )
1239 // B := max( B_hi, C_lo )
1240 // b := min( B_hi, C_lo )
1241 { .mfi
1242 nop.m 999
1243 fmax.s1 FR_B = FR_B_hi,FR_C_lo
1244 nop.i 999
1245 }
1246 ;;
1247
1248 // We use a right-shift trick to get the integer part of A into the rightmost
1249 // bits of the significand by adding 1.1000..00 * 2^63. This operation is good
1250 // if |A| < 2^61, which it is in this case. We are doing this to save a few
1251 // cycles over using fcvt.fx followed by fnorm. The second step of the trick
1252 // is to subtract the same constant to float the rounded integer into a fp reg.
1253
1254 { .mfi
1255 nop.m 999
1256 // N := round_to_nearest_integer_value( A );
1257 fma.s1 FR_N_fix = FR_A, f1, FR_RSHF
1258 nop.i 999
1259 }
1260 ;;
1261
1262 { .mfi
1263 nop.m 999
1264 fmin.s1 FR_b = FR_B_hi,FR_C_lo
1265 nop.i 999
1266 }
1267 { .mfi
1268 nop.m 999
1269 // C_hi := C_hi - Tmp_C ...0 <= C_hi < 2^7
1270 fsub.s1 FR_C_hi = FR_C_hi,FR_Tmp_C
1271 nop.i 999
1272 }
1273 ;;
1274
1275 { .mfi
1276 nop.m 999
1277 // a := (B - A) + b: Exact - note that a is either 0 or 2^(-64).
1278 fsub.s1 FR_a = FR_B,FR_A
1279 nop.i 999
1280 }
1281 ;;
1282
1283 { .mfi
1284 nop.m 999
1285 fms.s1 FR_N = FR_N_fix, f1, FR_RSHF
1286 nop.i 999
1287 }
1288 ;;
1289
1290 { .mfi
1291 nop.m 999
1292 fadd.s1 FR_a = FR_a,FR_b
1293 nop.i 999
1294 }
1295 ;;
1296
1297 // f := A - N; Exact because lsb(A) >= 2^(-64) and |f| <= 1/2.
1298 // N := convert to integer format( C_hi + N );
1299 // M := P_0 * x_lo;
1300 // N := N + M;
1301 { .mfi
1302 nop.m 999
1303 fsub.s1 FR_f = FR_A,FR_N
1304 nop.i 999
1305 }
1306 { .mfi
1307 nop.m 999
1308 fadd.s1 FR_N = FR_N,FR_C_hi
1309 nop.i 999
1310 }
1311 ;;
1312
1313 { .mfi
1314 nop.m 999
1315 (p9) fsub.s1 FR_D_hi = f0, FR_D_hi
1316 nop.i 999
1317 }
1318 { .mfi
1319 nop.m 999
1320 (p9) fsub.s1 FR_D_lo = f0, FR_D_lo
1321 nop.i 999
1322 }
1323 ;;
1324
1325 { .mfi
1326 nop.m 999
1327 fadd.s1 FR_g = FR_A_hi,FR_B_lo // For Case 1, g=A_hi+B_lo
1328 nop.i 999
1329 }
1330 { .mfi
1331 nop.m 999
1332 fadd.s3 FR_A = FR_A_hi,FR_B_lo // For Case 2, A=A_hi+B_lo w/ sf3
1333 nop.i 999
1334 }
1335 ;;
1336
1337 { .mfi
1338 mov GR_Temp = 0x0FFCD // For Case 2, exponent of 2^-50
1339 fmax.s1 FR_B = FR_A_hi,FR_B_lo // For Case 2, B=max(A_hi,B_lo)
1340 nop.i 999
1341 }
1342 ;;
1343
1344 // f = f + a Exact because a is 0 or 2^(-64);
1345 // the msb of the sum is <= 1/2 and lsb >= 2^(-64).
1346 { .mfi
1347 setf.exp FR_TWOM50 = GR_Temp // For Case 2, form 2^-50
1348 fcvt.fx.s1 FR_N = FR_N
1349 nop.i 999
1350 }
1351 { .mfi
1352 nop.m 999
1353 fadd.s1 FR_f = FR_f,FR_a
1354 nop.i 999
1355 }
1356 ;;
1357
1358 { .mfi
1359 nop.m 999
1360 fmin.s1 FR_b = FR_A_hi,FR_B_lo // For Case 2, b=min(A_hi,B_lo)
1361 nop.i 999
1362 }
1363 ;;
1364
1365 { .mfi
1366 nop.m 999
1367 fsub.s1 FR_a = FR_B,FR_A // For Case 2, a=B-A
1368 nop.i 999
1369 }
1370 ;;
1371
1372 { .mfi
1373 nop.m 999
1374 fadd.s1 FR_s_hi = FR_f,FR_g // For Case 1, s_hi=f+g
1375 nop.i 999
1376 }
1377 { .mfi
1378 nop.m 999
1379 fadd.s1 FR_f_hi = FR_A,FR_f // For Case 2, f_hi=A+f
1380 nop.i 999
1381 }
1382 ;;
1383
1384 { .mfi
1385 nop.m 999
1386 fabs FR_f_abs = FR_f
1387 nop.i 999
1388 }
1389 ;;
1390
1391 { .mfi
1392 getf.sig GR_N = FR_N
1393 fsetc.s3 0x7F,0x40 // Reset sf3 to user settings + td
1394 nop.i 999
1395 }
1396 ;;
1397
1398 { .mfi
1399 nop.m 999
1400 fsub.s1 FR_s_lo = FR_f,FR_s_hi // For Case 1, s_lo=f-s_hi
1401 nop.i 999
1402 }
1403 { .mfi
1404 nop.m 999
1405 fsub.s1 FR_f_lo = FR_f,FR_f_hi // For Case 2, f_lo=f-f_hi
1406 nop.i 999
1407 }
1408 ;;
1409
1410 { .mfi
1411 nop.m 999
1412 fmpy.s1 FR_r_hi = FR_s_hi,FR_D_hi // For Case 1, r_hi=s_hi*D_hi
1413 nop.i 999
1414 }
1415 { .mfi
1416 nop.m 999
1417 fadd.s1 FR_a = FR_a,FR_b // For Case 2, a=a+b
1418 nop.i 999
1419 }
1420 ;;
1421
1422
1423 // If sgn_x == 1 (that is original x was negative)
1424 // N := 2^10 - N
1425 // this maintains N to be non-negative, but still
1426 // equivalent to the (negated N) mod 4.
1427 // End If
1428 { .mfi
1429 add GR_N = GR_N,GR_M
1430 fcmp.ge.s1 p13, p10 = FR_f_abs,FR_TWOM33
1431 mov GR_Temp = 0x00400
1432 }
1433 ;;
1434
1435 { .mfi
1436 (p9) sub GR_N = GR_Temp,GR_N
1437 fadd.s1 FR_s_lo = FR_s_lo,FR_g // For Case 1, s_lo=s_lo+g
1438 nop.i 999
1439 }
1440 { .mfi
1441 nop.m 999
1442 fadd.s1 FR_f_lo = FR_f_lo,FR_A // For Case 2, f_lo=f_lo+A
1443 nop.i 999
1444 }
1445 ;;
1446
1447 // a := (B - A) + b Exact.
1448 // Note that a is either 0 or 2^(-128).
1449 // f_hi := A + f;
1450 // f_lo := (f - f_hi) + A
1451 // f_lo=f-f_hi is exact because either |f| >= |A|, in which
1452 // case f-f_hi is clearly exact; or otherwise, 0<|f|<|A|
1453 // means msb(f) <= msb(A) = 2^(-64) => |f| = 2^(-64).
1454 // If f = 2^(-64), f-f_hi involves cancellation and is
1455 // exact. If f = -2^(-64), then A + f is exact. Hence
1456 // f-f_hi is -A exactly, giving f_lo = 0.
1457 // f_lo := f_lo + a;
1458
1459 // If |f| >= 2^(-33)
1460 // Case 1
1461 // CASE := 1
1462 // g := A_hi + B_lo;
1463 // s_hi := f + g;
1464 // s_lo := (f - s_hi) + g;
1465 // Else
1466 // Case 2
1467 // CASE := 2
1468 // A := fadd.fpsr3( A_hi, B_lo )
1469 // B := max( A_hi, B_lo )
1470 // b := min( A_hi, B_lo )
1471
1472 { .mfi
1473 nop.m 999
1474 (p10) fcmp.ge.unc.s1 p14, p11 = FR_f_abs,FR_TWOM50
1475 nop.i 999
1476 }
1477 { .mfi
1478 nop.m 999
1479 (p13) fms.s1 FR_r_lo = FR_s_hi,FR_D_hi,FR_r_hi //For Case 1, r_lo=s_hi*D_hi+r_hi
1480 nop.i 999
1481 }
1482 ;;
1483
1484 // If |f| >= 2^(-50) then
1485 // s_hi := f_hi;
1486 // s_lo := f_lo;
1487 // Else
1488 // f_lo := (f_lo + A_lo) + x*p_4
1489 // s_hi := f_hi + f_lo
1490 // s_lo := (f_hi - s_hi) + f_lo
1491 // End If
1492 { .mfi
1493 nop.m 999
1494 (p14) mov FR_s_hi = FR_f_hi
1495 nop.i 999
1496 }
1497 { .mfi
1498 nop.m 999
1499 (p10) fadd.s1 FR_f_lo = FR_f_lo,FR_a
1500 nop.i 999
1501 }
1502 ;;
1503
1504 { .mfi
1505 nop.m 999
1506 (p14) mov FR_s_lo = FR_f_lo
1507 nop.i 999
1508 }
1509 { .mfi
1510 nop.m 999
1511 (p11) fadd.s1 FR_f_lo = FR_f_lo,FR_A_lo
1512 nop.i 999
1513 }
1514 ;;
1515
1516 { .mfi
1517 nop.m 999
1518 (p11) fma.s1 FR_f_lo = FR_X,FR_p_4,FR_f_lo
1519 nop.i 999
1520 }
1521 ;;
1522
1523 { .mfi
1524 nop.m 999
1525 (p13) fma.s1 FR_r_lo = FR_s_hi,FR_D_lo,FR_r_lo //For Case 1, r_lo=s_hi*D_lo+r_lo
1526 nop.i 999
1527 }
1528 { .mfi
1529 nop.m 999
1530 (p11) fadd.s1 FR_s_hi = FR_f_hi,FR_f_lo
1531 nop.i 999
1532 }
1533 ;;
1534
1535 // r_hi := s_hi*D_hi
1536 // r_lo := s_hi*D_hi - r_hi with fma
1537 // r_lo := (s_hi*D_lo + r_lo) + s_lo*D_hi
1538 { .mfi
1539 nop.m 999
1540 (p10) fmpy.s1 FR_r_hi = FR_s_hi,FR_D_hi
1541 nop.i 999
1542 }
1543 { .mfi
1544 nop.m 999
1545 (p11) fsub.s1 FR_s_lo = FR_f_hi,FR_s_hi
1546 nop.i 999
1547 }
1548 ;;
1549
1550 { .mfi
1551 nop.m 999
1552 (p10) fms.s1 FR_r_lo = FR_s_hi,FR_D_hi,FR_r_hi
1553 nop.i 999
1554 }
1555 { .mfi
1556 nop.m 999
1557 (p11) fadd.s1 FR_s_lo = FR_s_lo,FR_f_lo
1558 nop.i 999
1559 }
1560 ;;
1561
1562 { .mfi
1563 nop.m 999
1564 (p10) fma.s1 FR_r_lo = FR_s_hi,FR_D_lo,FR_r_lo
1565 nop.i 999
1566 }
1567 ;;
1568
1569 // Return N, r_hi, r_lo
1570 // We do not return CASE
1571 { .mfb
1572 nop.m 999
1573 fma.s1 FR_r_lo = FR_s_lo,FR_D_hi,FR_r_lo
1574 br.ret.sptk b0
1575 }
1576 ;;
1577
1578 .endp __libm_pi_by_2_reduce#