]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/ia64/fpu/e_powl.S
2.5-18.1
[thirdparty/glibc.git] / sysdeps / ia64 / fpu / e_powl.S
1 .file "powl.s"
2
3
4 // Copyright (c) 2000 - 2003, Intel Corporation
5 // All rights reserved.
6 //
7 // Contributed 2000 by the Intel Numerics Group, Intel Corporation
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
11 // met:
12 //
13 // * Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
15 //
16 // * Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
19 //
20 // * The name of Intel Corporation may not be used to endorse or promote
21 // products derived from this software without specific prior written
22 // permission.
23
24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 //
36 // Intel Corporation is the author of this code, and requests that all
37 // problem reports or change requests be submitted to it directly at
38 // http://www.intel.com/software/products/opensource/libraries/num.htm.
39 //
40 //*********************************************************************
41 //
42 // Function: powl(x,y), where
43 // y
44 // powl(x,y) = x , for double extended precision x and y values
45 //
46 //*********************************************************************
47 //
48 // History:
49 // 02/02/00 (Hand Optimized)
50 // 04/04/00 Unwind support added
51 // 08/15/00 Bundle added after call to __libm_error_support to properly
52 // set [the previously overwritten] GR_Parameter_RESULT.
53 // 01/22/01 Corrected results for powl(1,inf), powl(1,nan), and
54 // powl(snan,0) to be 1 per C99, not nan. Fixed many flag settings.
55 // 02/06/01 Call __libm_error support if over/underflow when y=2.
56 // 04/17/01 Support added for y close to 1 and x a non-special value.
57 // Shared software under/overflow detection for all paths
58 // 02/07/02 Corrected sf3 setting to disable traps
59 // 05/13/02 Improved performance of all paths
60 // 02/10/03 Reordered header: .section, .global, .proc, .align;
61 // used data8 for long double table values
62 // 04/17/03 Added missing mutex directive
63 // 10/13/03 Corrected .endp names to match .proc names
64 //
65 //*********************************************************************
66 //
67 // Resources Used:
68 //
69 // Floating-Point Registers:
70 // f8 (Input x and Return Value)
71 // f9 (Input y)
72 // f10-f15,f32-f79
73 //
74 // General Purpose Registers:
75 // Locals r14-24,r32-r65
76 // Parameters to __libm_error_support r62,r63,r64,r65
77 //
78 // Predicate Registers: p6-p15
79 //
80 //*********************************************************************
81 //
82 // Special Cases and IEEE special conditions:
83 //
84 // Denormal fault raised on denormal inputs
85 // Overflow exceptions raised when appropriate for pow
86 // Underflow exceptions raised when appropriate for pow
87 // (Error Handling Routine called for overflow and Underflow)
88 // Inexact raised when appropriate by algorithm
89 //
90 // 1. (anything) ** NatVal or (NatVal) ** anything is NatVal
91 // 2. X or Y unsupported or sNaN is qNaN/Invalid
92 // 3. (anything) ** 0 is 1
93 // 4. (anything) ** 1 is itself
94 // 5. (anything except 1) ** qNAN is qNAN
95 // 6. qNAN ** (anything except 0) is qNAN
96 // 7. +-(|x| > 1) ** +INF is +INF
97 // 8. +-(|x| > 1) ** -INF is +0
98 // 9. +-(|x| < 1) ** +INF is +0
99 // 10. +-(|x| < 1) ** -INF is +INF
100 // 11. +-1 ** +-INF is +1
101 // 12. +0 ** (+anything except 0, NAN) is +0
102 // 13. -0 ** (+anything except 0, NAN, odd integer) is +0
103 // 14. +0 ** (-anything except 0, NAN) is +INF/div_0
104 // 15. -0 ** (-anything except 0, NAN, odd integer) is +INF/div_0
105 // 16. -0 ** (odd integer) = -( +0 ** (odd integer) )
106 // 17. +INF ** (+anything except 0,NAN) is +INF
107 // 18. +INF ** (-anything except 0,NAN) is +0
108 // 19. -INF ** (anything except NAN) = -0 ** (-anything)
109 // 20. (-anything) ** (integer) is (-1)**(integer)*(+anything**integer)
110 // 21. (-anything except 0 and inf) ** (non-integer) is qNAN/Invalid
111 // 22. X or Y denorm/unorm and denorm/unorm operand trap is enabled,
112 // generate denorm/unorm fault except if invalid or div_0 raised.
113 //
114 //*********************************************************************
115 //
116 // Algorithm
117 // =========
118 //
119 // Special Cases
120 //
121 // If Y = 2, return X*X.
122 // If Y = 0.5, return sqrt(X).
123 //
124 // Compute log(X) to extra precision.
125 //
126 // ker_log_80( X, logX_hi, logX_lo, Safe );
127 //
128 // ...logX_hi + logX_lo approximates log(X) to roughly 80
129 // ...significant bits of accuracy.
130 //
131 // Compute Y*log(X) to extra precision.
132 //
133 // P_hi := Y * logX_hi
134 // P_lo := Y * logX_hi - P_hi ...using FMA
135 // P_lo := Y * logX_lo + P_lo ...using FMA
136 //
137 // Compute exp(P_hi + P_lo)
138 //
139 // Flag := 2;
140 // Expo_Range := 2; (assuming double-extended power function)
141 // ker_exp_64( P_hi, P_lo, Flag, Expo_Range,
142 // Z_hi, Z_lo, scale, Safe )
143 //
144 // scale := sgn * scale
145 //
146 // If (Safe) then ...result will not over/underflow
147 // return scale*Z_hi + (scale*Z_lo)
148 // quickly
149 // Else
150 // take necessary precaution in computing
151 // scale*Z_hi + (scale*Z_lo)
152 // to set possible exceptions correctly.
153 // End If
154 //
155 // Case_Y_Special
156 //
157 // ...Follow the order of the case checks
158 //
159 // If Y is +-0, return +1 without raising any exception.
160 // If Y is +1, return X without raising any exception.
161 // If Y is qNaN, return Y without exception.
162 // If X is qNaN, return X without exception.
163 //
164 // At this point, X is real and Y is +-inf.
165 // Thus |X| can only be 1, strictly bigger than 1, or
166 // strictly less than 1.
167 //
168 // If |X| < 1, then
169 // return ( Y == +inf? +0 : +inf )
170 // elseif |X| > 1, then
171 // return ( Y == +inf? +0 : +inf )
172 // else
173 // goto Case_Invalid
174 //
175 // Case_X_Special
176 //
177 // ...Follow the order of the case checks
178 // ...Note that Y is real, finite, non-zero, and not +1.
179 //
180 // If X is qNaN, return X without exception.
181 //
182 // If X is +-0,
183 // return ( Y > 0 ? +0 : +inf )
184 //
185 // If X is +inf
186 // return ( Y > 0 ? +inf : +0 )
187 //
188 // If X is -inf
189 // return -0 ** -Y
190 // return ( Y > 0 ? +inf : +0 )
191 //
192 // Case_Invalid
193 //
194 // Return 0 * inf to generate a quiet NaN together
195 // with an invalid exception.
196 //
197 // Implementation
198 // ==============
199 //
200 // We describe the quick branch since this part is important
201 // in reaching the normal case efficiently.
202 //
203 // STAGE 1
204 // -------
205 // This stage contains two threads.
206 //
207 // Stage1.Thread1
208 //
209 // fclass.m X_excep, X_ok = X, (NatVal or s/qNaN) or
210 // +-0, +-infinity
211 //
212 // fclass.nm X_unsupp, X_supp = X, (NatVal or s/qNaN) or
213 // +-(0, unnorm, norm, infinity)
214 //
215 // X_norm := fnorm( X ) with traps disabled
216 //
217 // If (X_excep) goto Filtering (Step 2)
218 // If (X_unsupp) goto Filtering (Step 2)
219 //
220 // Stage1.Thread2
221 // ..............
222 //
223 // fclass.m Y_excep, Y_ok = Y, (NatVal or s/qNaN) or
224 // +-0, +-infinity
225 //
226 // fclass.nm Y_unsupp, Y_supp = Y, (NatVal or s/qNaN) or
227 // +-(0, unnorm, norm, infinity)
228 //
229 // Y_norm := fnorm( Y ) with traps disabled
230 //
231 // If (Y_excep) goto Filtering (Step 2)
232 // If (Y_unsupp) goto Filtering (Step 2)
233 //
234 //
235 // STAGE 2
236 // -------
237 // This stage contains two threads.
238 //
239 // Stage2.Thread1
240 // ..............
241 //
242 // Set X_lt_0 if X < 0 (using fcmp)
243 // sgn := +1.0
244 // If (X_lt_0) goto Filtering (Step 2)
245 //
246 // Stage2.Thread2
247 // ..............
248 //
249 // Set Y_is_1 if Y = +1 (using fcmp)
250 // If (Y_is_1) goto Filtering (Step 2)
251 //
252 // STAGE 3
253 // -------
254 // This stage contains two threads.
255 //
256 //
257 // Stage3.Thread1
258 // ..............
259 //
260 // X := fnorm(X) in prevailing traps
261 //
262 //
263 // Stage3.Thread2
264 // ..............
265 //
266 // Y := fnorm(Y) in prevailing traps
267 //
268 // STAGE 4
269 // -------
270 //
271 // Go to Case_Normal.
272 //
273
274
275 // ************* DO NOT CHANGE ORDER OF THESE TABLES ********************
276
277 // double-extended 1/ln(2)
278 // 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88
279 // 3fff b8aa 3b29 5c17 f0bc
280 // For speed the significand will be loaded directly with a movl and setf.sig
281 // and the exponent will be bias+63 instead of bias+0. Thus subsequent
282 // computations need to scale appropriately.
283 // The constant 2^12/ln(2) is needed for the computation of N. This is also
284 // obtained by scaling the computations.
285 //
286 // Two shifting constants are loaded directly with movl and setf.d.
287 // 1. RSHF_2TO51 = 1.1000..00 * 2^(63-12)
288 // This constant is added to x*1/ln2 to shift the integer part of
289 // x*2^12/ln2 into the rightmost bits of the significand.
290 // The result of this fma is N_signif.
291 // 2. RSHF = 1.1000..00 * 2^(63)
292 // This constant is subtracted from N_signif * 2^(-51) to give
293 // the integer part of N, N_fix, as a floating-point number.
294 // The result of this fms is float_N.
295 RODATA
296
297 .align 16
298 // L_hi, L_lo
299 LOCAL_OBJECT_START(Constants_exp_64_Arg)
300 data8 0xB17217F400000000,0x00003FF2 // L_hi = hi part log(2)/2^12
301 data8 0xF473DE6AF278ECE6,0x00003FD4 // L_lo = lo part log(2)/2^12
302 LOCAL_OBJECT_END(Constants_exp_64_Arg)
303
304 LOCAL_OBJECT_START(Constants_exp_64_A)
305 // Reversed
306 data8 0xAAAAAAABB1B736A0,0x00003FFA
307 data8 0xAAAAAAAB90CD6327,0x00003FFC
308 data8 0xFFFFFFFFFFFFFFFF,0x00003FFD
309 LOCAL_OBJECT_END(Constants_exp_64_A)
310
311 LOCAL_OBJECT_START(Constants_exp_64_P)
312 // Reversed
313 data8 0xD00D6C8143914A8A,0x00003FF2
314 data8 0xB60BC4AC30304B30,0x00003FF5
315 data8 0x888888887474C518,0x00003FF8
316 data8 0xAAAAAAAA8DAE729D,0x00003FFA
317 data8 0xAAAAAAAAAAAAAF61,0x00003FFC
318 data8 0x80000000000004C7,0x00003FFE
319 LOCAL_OBJECT_END(Constants_exp_64_P)
320
321 LOCAL_OBJECT_START(Constants_exp_64_T1)
322 data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29
323 data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5
324 data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC
325 data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D
326 data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA
327 data4 0x3F9EF532,0x3FA0B051,0x3FA27043,0x3FA43516
328 data4 0x3FA5FED7,0x3FA7CD94,0x3FA9A15B,0x3FAB7A3A
329 data4 0x3FAD583F,0x3FAF3B79,0x3FB123F6,0x3FB311C4
330 data4 0x3FB504F3,0x3FB6FD92,0x3FB8FBAF,0x3FBAFF5B
331 data4 0x3FBD08A4,0x3FBF179A,0x3FC12C4D,0x3FC346CD
332 data4 0x3FC5672A,0x3FC78D75,0x3FC9B9BE,0x3FCBEC15
333 data4 0x3FCE248C,0x3FD06334,0x3FD2A81E,0x3FD4F35B
334 data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5
335 data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A
336 data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177
337 data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C
338 LOCAL_OBJECT_END(Constants_exp_64_T1)
339
340 LOCAL_OBJECT_START(Constants_exp_64_T2)
341 data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4
342 data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7
343 data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E
344 data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349
345 data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987
346 data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA
347 data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610
348 data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A
349 data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8
350 data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA
351 data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50
352 data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA
353 data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07
354 data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269
355 data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE
356 data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37
357 LOCAL_OBJECT_END(Constants_exp_64_T2)
358
359 LOCAL_OBJECT_START(Constants_exp_64_W1)
360 data8 0x0000000000000000, 0xBE384454171EC4B4
361 data8 0xBE6947414AA72766, 0xBE5D32B6D42518F8
362 data8 0x3E68D96D3A319149, 0xBE68F4DA62415F36
363 data8 0xBE6DDA2FC9C86A3B, 0x3E6B2E50F49228FE
364 data8 0xBE49C0C21188B886, 0x3E64BFC21A4C2F1F
365 data8 0xBE6A2FBB2CB98B54, 0x3E5DC5DE9A55D329
366 data8 0x3E69649039A7AACE, 0x3E54728B5C66DBA5
367 data8 0xBE62B0DBBA1C7D7D, 0x3E576E0409F1AF5F
368 data8 0x3E6125001A0DD6A1, 0xBE66A419795FBDEF
369 data8 0xBE5CDE8CE1BD41FC, 0xBE621376EA54964F
370 data8 0x3E6370BE476E76EE, 0x3E390D1A3427EB92
371 data8 0x3E1336DE2BF82BF8, 0xBE5FF1CBD0F7BD9E
372 data8 0xBE60A3550CEB09DD, 0xBE5CA37E0980F30D
373 data8 0xBE5C541B4C082D25, 0xBE5BBECA3B467D29
374 data8 0xBE400D8AB9D946C5, 0xBE5E2A0807ED374A
375 data8 0xBE66CB28365C8B0A, 0x3E3AAD5BD3403BCA
376 data8 0x3E526055C7EA21E0, 0xBE442C75E72880D6
377 data8 0x3E58B2BB85222A43, 0xBE5AAB79522C42BF
378 data8 0xBE605CB4469DC2BC, 0xBE589FA7A48C40DC
379 data8 0xBE51C2141AA42614, 0xBE48D087C37293F4
380 data8 0x3E367A1CA2D673E0, 0xBE51BEBB114F7A38
381 data8 0xBE6348E5661A4B48, 0xBDF526431D3B9962
382 data8 0x3E3A3B5E35A78A53, 0xBE46C46C1CECD788
383 data8 0xBE60B7EC7857D689, 0xBE594D3DD14F1AD7
384 data8 0xBE4F9C304C9A8F60, 0xBE52187302DFF9D2
385 data8 0xBE5E4C8855E6D68F, 0xBE62140F667F3DC4
386 data8 0xBE36961B3BF88747, 0x3E602861C96EC6AA
387 data8 0xBE3B5151D57FD718, 0x3E561CD0FC4A627B
388 data8 0xBE3A5217CA913FEA, 0x3E40A3CC9A5D193A
389 data8 0xBE5AB71310A9C312, 0x3E4FDADBC5F57719
390 data8 0x3E361428DBDF59D5, 0x3E5DB5DB61B4180D
391 data8 0xBE42AD5F7408D856, 0x3E2A314831B2B707
392 LOCAL_OBJECT_END(Constants_exp_64_W1)
393
394 LOCAL_OBJECT_START(Constants_exp_64_W2)
395 data8 0x0000000000000000, 0xBE641F2537A3D7A2
396 data8 0xBE68DD57AD028C40, 0xBE5C77D8F212B1B6
397 data8 0x3E57878F1BA5B070, 0xBE55A36A2ECAE6FE
398 data8 0xBE620608569DFA3B, 0xBE53B50EA6D300A3
399 data8 0x3E5B5EF2223F8F2C, 0xBE56A0D9D6DE0DF4
400 data8 0xBE64EEF3EAE28F51, 0xBE5E5AE2367EA80B
401 data8 0x3E47CB1A5FCBC02D, 0xBE656BA09BDAFEB7
402 data8 0x3E6E70C6805AFEE7, 0xBE6E0509A3415EBA
403 data8 0xBE56856B49BFF529, 0x3E66DD3300508651
404 data8 0x3E51165FC114BC13, 0x3E53333DC453290F
405 data8 0x3E6A072B05539FDA, 0xBE47CD877C0A7696
406 data8 0xBE668BF4EB05C6D9, 0xBE67C3E36AE86C93
407 data8 0xBE533904D0B3E84B, 0x3E63E8D9556B53CE
408 data8 0x3E212C8963A98DC8, 0xBE33138F032A7A22
409 data8 0x3E530FA9BC584008, 0xBE6ADF82CCB93C97
410 data8 0x3E5F91138370EA39, 0x3E5443A4FB6A05D8
411 data8 0x3E63DACD181FEE7A, 0xBE62B29DF0F67DEC
412 data8 0x3E65C4833DDE6307, 0x3E5BF030D40A24C1
413 data8 0x3E658B8F14E437BE, 0xBE631C29ED98B6C7
414 data8 0x3E6335D204CF7C71, 0x3E529EEDE954A79D
415 data8 0x3E5D9257F64A2FB8, 0xBE6BED1B854ED06C
416 data8 0x3E5096F6D71405CB, 0xBE3D4893ACB9FDF5
417 data8 0xBDFEB15801B68349, 0x3E628D35C6A463B9
418 data8 0xBE559725ADE45917, 0xBE68C29C042FC476
419 data8 0xBE67593B01E511FA, 0xBE4A4313398801ED
420 data8 0x3E699571DA7C3300, 0x3E5349BE08062A9E
421 data8 0x3E5229C4755BB28E, 0x3E67E42677A1F80D
422 data8 0xBE52B33F6B69C352, 0xBE6B3550084DA57F
423 data8 0xBE6DB03FD1D09A20, 0xBE60CBC42161B2C1
424 data8 0x3E56ED9C78A2B771, 0xBE508E319D0FA795
425 data8 0xBE59482AFD1A54E9, 0xBE2A17CEB07FD23E
426 data8 0x3E68BF5C17365712, 0x3E3956F9B3785569
427 LOCAL_OBJECT_END(Constants_exp_64_W2)
428
429 LOCAL_OBJECT_START(Constants_log_80_P)
430 // P_8, P_7, ..., P_1
431 data8 0xCCCE8B883B1042BC, 0x0000BFFB // P_8
432 data8 0xE38997B7CADC2149, 0x00003FFB // P_7
433 data8 0xFFFFFFFEB1ACB090, 0x0000BFFB // P_6
434 data8 0x9249249806481C81, 0x00003FFC // P_5
435 data8 0x0000000000000000, 0x00000000 // Pad for bank conflicts
436 data8 0xAAAAAAAAAAAAB0EF, 0x0000BFFC // P_4
437 data8 0xCCCCCCCCCCC91416, 0x00003FFC // P_3
438 data8 0x8000000000000000, 0x0000BFFD // P_2
439 data8 0xAAAAAAAAAAAAAAAB, 0x00003FFD // P_1
440 LOCAL_OBJECT_END(Constants_log_80_P)
441
442 LOCAL_OBJECT_START(Constants_log_80_Q)
443 // log2_hi, log2_lo, Q_6, Q_5, Q_4, Q_3, Q_2, Q_1
444 data8 0xB172180000000000,0x00003FFE
445 data8 0x82E308654361C4C6,0x0000BFE2
446 data8 0x92492453A51BE0AF,0x00003FFC
447 data8 0xAAAAAB73A0CFD29F,0x0000BFFC
448 data8 0xCCCCCCCCCCCE3872,0x00003FFC
449 data8 0xFFFFFFFFFFFFB4FB,0x0000BFFC
450 data8 0xAAAAAAAAAAAAAAAB,0x00003FFD
451 data8 0x8000000000000000,0x0000BFFE
452 LOCAL_OBJECT_END(Constants_log_80_Q)
453
454 LOCAL_OBJECT_START(Constants_log_80_Z_G_H_h1)
455 // Z1 - 16 bit fixed, G1 and H1 IEEE single, h1 IEEE double
456 data4 0x00008000,0x3F800000,0x00000000,0x00000000
457 data4 0x00000000,0x00000000,0x00000000,0x00000000
458 data4 0x00007879,0x3F70F0F0,0x3D785196,0x00000000
459 data4 0xEBA0E0D1,0x8B1D330B,0x00003FDA,0x00000000
460 data4 0x000071C8,0x3F638E38,0x3DF13843,0x00000000
461 data4 0x9EADD553,0xE2AF365E,0x00003FE2,0x00000000
462 data4 0x00006BCB,0x3F579430,0x3E2FF9A0,0x00000000
463 data4 0x752F34A2,0xF585FEC3,0x0000BFE3,0x00000000
464 data4 0x00006667,0x3F4CCCC8,0x3E647FD6,0x00000000
465 data4 0x893B03F3,0xF3546435,0x00003FE2,0x00000000
466 data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000
467 data4 0x39CDD2AC,0xBABA62E0,0x00003FE4,0x00000000
468 data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000
469 data4 0x457978A1,0x8718789F,0x00003FE2,0x00000000
470 data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000
471 data4 0x3185E56A,0x9442DF96,0x0000BFE4,0x00000000
472 data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000
473 data4 0x2BBE2CBD,0xCBF9A4BF,0x00003FE4,0x00000000
474 data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000
475 data4 0x852D5935,0xF3537535,0x00003FE3,0x00000000
476 data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000
477 data4 0x46CDF32F,0xA1F1E699,0x0000BFDF,0x00000000
478 data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000
479 data4 0xD8484CE3,0x84A61856,0x00003FE4,0x00000000
480 data4 0x00004925,0x3F124920,0x3F0F4303,0x00000000
481 data4 0xFF28821B,0xC7DD97E0,0x0000BFE2,0x00000000
482 data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000
483 data4 0xEF1FD32F,0xD3C4A887,0x00003FE3,0x00000000
484 data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000
485 data4 0x464C76DA,0x84672BE6,0x00003FE5,0x00000000
486 data4 0x00004211,0x3F042108,0x3F29516A,0x00000000
487 data4 0x18835FB9,0x9A43A511,0x0000BFE5,0x00000000
488 LOCAL_OBJECT_END(Constants_log_80_Z_G_H_h1)
489
490 LOCAL_OBJECT_START(Constants_log_80_Z_G_H_h2)
491 // Z2 - 16 bit fixed, G2 and H2 IEEE single, h2 IEEE double
492 data4 0x00008000,0x3F800000,0x00000000,0x00000000
493 data4 0x00000000,0x00000000,0x00000000,0x00000000
494 data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000
495 data4 0x211398BF,0xAD08B116,0x00003FDB,0x00000000
496 data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000
497 data4 0xC376958E,0xB106790F,0x00003FDE,0x00000000
498 data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000
499 data4 0x79A7679A,0xFD03F242,0x0000BFDA,0x00000000
500 data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000
501 data4 0x05E7AE08,0xF03F81C3,0x0000BFDF,0x00000000
502 data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000
503 data4 0x049EB22F,0xD1B87D3C,0x00003FDE,0x00000000
504 data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000
505 data4 0x3A9E81E0,0xFABC8B95,0x00003FDF,0x00000000
506 data4 0x00007C98,0x3F792FB0,0x3CDCFE47,0x00000000
507 data4 0x7C4B5443,0xF5F3653F,0x00003FDF,0x00000000
508 data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000
509 data4 0xF65A1773,0xE78AB204,0x00003FE0,0x00000000
510 data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000
511 data4 0x7B8EF695,0xDB7CBFFF,0x0000BFE0,0x00000000
512 data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000
513 data4 0xCF773FB3,0xC0241AEA,0x0000BFE0,0x00000000
514 data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000
515 data4 0xC9539FDF,0xFC8F4D48,0x00003FE1,0x00000000
516 data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000
517 data4 0x954665C2,0x9CD035FB,0x0000BFE1,0x00000000
518 data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000
519 data4 0xDD367A30,0xEC9017C7,0x00003FE1,0x00000000
520 data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000
521 data4 0xCB11189C,0xEE6625D3,0x0000BFE1,0x00000000
522 data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000
523 data4 0xBE11C424,0xA49C8DB5,0x0000BFE0,0x00000000
524 LOCAL_OBJECT_END(Constants_log_80_Z_G_H_h2)
525
526 LOCAL_OBJECT_START(Constants_log_80_h3_G_H)
527 // h3 IEEE double extended, H3 and G3 IEEE single
528 data4 0x112666B0,0xAAACAAB1,0x00003FD3,0x3F7FFC00
529 data4 0x9B7FAD21,0x90051030,0x00003FD8,0x3F7FF400
530 data4 0xF4D783C4,0xA6B46F46,0x00003FDA,0x3F7FEC00
531 data4 0x11C6DDCA,0xDA148D88,0x0000BFD8,0x3F7FE400
532 data4 0xCA964D95,0xCE65C1D8,0x0000BFD8,0x3F7FDC00
533 data4 0x23412D13,0x883838EE,0x0000BFDB,0x3F7FD400
534 data4 0x983ED687,0xB7E5CFA1,0x00003FDB,0x3F7FCC08
535 data4 0xE3C3930B,0xDBE23B16,0x0000BFD9,0x3F7FC408
536 data4 0x48AA4DFC,0x9B92F1FC,0x0000BFDC,0x3F7FBC10
537 data4 0xCE9C8F7E,0x9A8CEB15,0x0000BFD9,0x3F7FB410
538 data4 0x0DECE74A,0x8C220879,0x00003FDC,0x3F7FAC18
539 data4 0x2F053150,0xB25CA912,0x0000BFDA,0x3F7FA420
540 data4 0xD9A5BE20,0xA5876555,0x00003FDB,0x3F7F9C20
541 data4 0x2053F087,0xC919BB6E,0x00003FD9,0x3F7F9428
542 data4 0x041E9A77,0xB70BDA79,0x00003FDC,0x3F7F8C30
543 data4 0xEA1C9C30,0xF18A5C08,0x00003FDA,0x3F7F8438
544 data4 0x796D89E5,0xA3790D84,0x0000BFDD,0x3F7F7C40
545 data4 0xA2915A3A,0xE1852369,0x0000BFDD,0x3F7F7448
546 data4 0xA39ED868,0xD803858F,0x00003FDC,0x3F7F6C50
547 data4 0x9417EBB7,0xB2EEE356,0x0000BFDD,0x3F7F6458
548 data4 0x9BB0D07F,0xED5C1F8A,0x0000BFDC,0x3F7F5C68
549 data4 0xE87C740A,0xD6D201A0,0x0000BFDD,0x3F7F5470
550 data4 0x1CA74025,0xE8DEBF5E,0x00003FDC,0x3F7F4C78
551 data4 0x1F34A7EB,0x9A995A97,0x0000BFDC,0x3F7F4488
552 data4 0x359EED97,0x9CB0F742,0x0000BFDA,0x3F7F3C90
553 data4 0xBBC6A1C8,0xD6F833C2,0x0000BFDD,0x3F7F34A0
554 data4 0xE71090EC,0xE1F68F2A,0x00003FDC,0x3F7F2CA8
555 data4 0xC160A74F,0xD1881CF1,0x0000BFDB,0x3F7F24B8
556 data4 0xD78CB5A4,0x9AD05AE2,0x00003FD6,0x3F7F1CC8
557 data4 0x9A77DC4B,0xE658CB8E,0x0000BFDD,0x3F7F14D8
558 data4 0x6BD6D312,0xBA281296,0x00003FDC,0x3F7F0CE0
559 data4 0xF95210D0,0xB478BBEB,0x0000BFDB,0x3F7F04F0
560 data4 0x38800100,0x39400480,0x39A00640,0x39E00C41 // H's start here
561 data4 0x3A100A21,0x3A300F22,0x3A4FF51C,0x3A6FFC1D
562 data4 0x3A87F20B,0x3A97F68B,0x3AA7EB86,0x3AB7E101
563 data4 0x3AC7E701,0x3AD7DD7B,0x3AE7D474,0x3AF7CBED
564 data4 0x3B03E1F3,0x3B0BDE2F,0x3B13DAAA,0x3B1BD766
565 data4 0x3B23CC5C,0x3B2BC997,0x3B33C711,0x3B3BBCC6
566 data4 0x3B43BAC0,0x3B4BB0F4,0x3B53AF6D,0x3B5BA620
567 data4 0x3B639D12,0x3B6B9444,0x3B7393BC,0x3B7B8B6D
568 LOCAL_OBJECT_END(Constants_log_80_h3_G_H)
569
570 GR_sig_inv_ln2 = r14
571 GR_rshf_2to51 = r15
572 GR_exp_2tom51 = r16
573 GR_rshf = r17
574 GR_exp_half = r18
575 GR_sign_mask = r19
576 GR_exp_square_oflow = r20
577 GR_exp_square_uflow = r21
578 GR_exp_ynear1_oflow = r22
579 GR_exp_ynear1_uflow = r23
580 GR_signif_Z = r24
581
582 GR_signexp_x = r32
583
584 GR_exp_x = r33
585
586 GR_Table_Ptr = r34
587
588 GR_Table_Ptr1 = r35
589
590 GR_Index1 = r36
591
592 GR_Index2 = r37
593 GR_Expo_X = r37
594
595 GR_M = r38
596
597 GR_X_0 = r39
598 GR_Mask = r39
599
600 GR_X_1 = r40
601 GR_W1_ptr = r40
602
603 GR_W2_ptr = r41
604 GR_X_2 = r41
605
606 GR_Z_1 = r42
607 GR_M2 = r42
608
609 GR_M1 = r43
610 GR_Z_2 = r43
611
612 GR_N = r44
613 GR_k = r44
614
615 GR_Big_Pos_Exp = r45
616
617 GR_exp_pos_max = r46
618
619 GR_exp_bias_p_k = r47
620
621 GR_Index3 = r48
622 GR_temp = r48
623
624 GR_vsm_expo = r49
625
626 GR_T1_ptr = r50
627 GR_P_ptr1 = r50
628 GR_T2_ptr = r51
629 GR_P_ptr2 = r51
630 GR_N_fix = r52
631 GR_exp_y = r53
632 GR_signif_y = r54
633 GR_signexp_y = r55
634 GR_fraction_y = r55
635 GR_low_order_bit = r56
636 GR_exp_mask = r57
637 GR_exp_bias = r58
638 GR_y_sign = r59
639 GR_table_base = r60
640 GR_ptr_exp_Arg = r61
641 GR_Delta_Exp = r62
642 GR_Special_Exp = r63
643 GR_exp_neg_max = r64
644 GR_Big_Neg_Exp = r65
645
646 //** Registers for unwind support
647
648 GR_SAVE_PFS = r59
649 GR_SAVE_B0 = r60
650 GR_SAVE_GP = r61
651 GR_Parameter_X = r62
652 GR_Parameter_Y = r63
653 GR_Parameter_RESULT = r64
654 GR_Parameter_TAG = r65
655
656 //**
657
658 FR_Input_X = f8
659 FR_Result = f8
660 FR_Input_Y = f9
661
662 FR_Neg = f10
663 FR_P_hi = f10
664 FR_X = f10
665
666 FR_Half = f11
667 FR_h_3 = f11
668 FR_poly_hi = f11
669
670 FR_Sgn = f12
671
672 FR_half_W = f13
673
674 FR_X_cor = f14
675 FR_P_lo = f14
676
677 FR_W = f15
678
679 FR_X_lo = f32
680
681 FR_S = f33
682 FR_W3 = f33
683
684 FR_Y_hi = f34
685 FR_logx_hi = f34
686
687 FR_Z = f35
688 FR_logx_lo = f35
689 FR_GS_hi = f35
690 FR_Y_lo = f35
691
692 FR_r_cor = f36
693 FR_Scale = f36
694
695 FR_G_1 = f37
696 FR_G = f37
697 FR_Wsq = f37
698 FR_temp = f37
699
700 FR_H_1 = f38
701 FR_H = f38
702 FR_W4 = f38
703
704 FR_h = f39
705 FR_h_1 = f39
706 FR_N = f39
707 FR_P_7 = f39
708
709 FR_G_2 = f40
710 FR_P_8 = f40
711 FR_L_hi = f40
712
713 FR_H_2 = f41
714 FR_L_lo = f41
715 FR_A_1 = f41
716
717 FR_h_2 = f42
718
719 FR_W1 = f43
720
721 FR_G_3 = f44
722 FR_P_8 = f44
723 FR_T1 = f44
724
725 FR_log2_hi = f45
726 FR_W2 = f45
727
728 FR_GS_lo = f46
729 FR_T2 = f46
730
731 FR_W_1_p1 = f47
732 FR_H_3 = f47
733
734 FR_float_N = f48
735
736 FR_A_2 = f49
737
738 FR_Q_4 = f50
739 FR_r4 = f50
740
741 FR_Q_3 = f51
742 FR_A_3 = f51
743
744 FR_Q_2 = f52
745 FR_P_2 = f52
746
747 FR_Q_1 = f53
748 FR_P_1 = f53
749 FR_T = f53
750
751 FR_Wp1 = f54
752 FR_Q_5 = f54
753 FR_P_3 = f54
754
755 FR_Q_6 = f55
756
757 FR_log2_lo = f56
758 FR_Two = f56
759
760 FR_Big = f57
761
762 FR_neg_2_mK = f58
763
764 FR_r = f59
765
766 FR_poly_lo = f60
767
768 FR_poly = f61
769
770 FR_P_5 = f62
771 FR_Result_small = f62
772
773 FR_rsq = f63
774
775 FR_Delta = f64
776
777 FR_save_Input_X = f65
778 FR_norm_X = f66
779 FR_norm_Y = f67
780 FR_Y_lo_2 = f68
781
782 FR_P_6 = f69
783 FR_Result_big = f69
784
785 FR_RSHF_2TO51 = f70
786 FR_INV_LN2_2TO63 = f71
787 FR_2TOM51 = f72
788 FR_RSHF = f73
789 FR_TMP1 = f74
790 FR_TMP2 = f75
791 FR_TMP3 = f76
792 FR_Tscale = f77
793 FR_P_4 = f78
794 FR_NBig = f79
795
796
797 .section .text
798 GLOBAL_LIBM_ENTRY(powl)
799 //
800 // Get significand of x. It is the critical path.
801 //
802 { .mfi
803 getf.sig GR_signif_Z = FR_Input_X // Get significand of x
804 fclass.m p11, p12 = FR_Input_X, 0x0b // Test x unorm
805 nop.i 999
806 }
807 { .mfi
808 nop.m 999
809 fnorm.s1 FR_norm_X = FR_Input_X // Normalize x
810 mov GR_exp_half = 0xffff - 1 // Exponent for 0.5
811 }
812 ;;
813
814 { .mfi
815 alloc r32 = ar.pfs,0,30,4,0
816 fclass.m p7, p0 = FR_Input_Y, 0x1E7 // Test y natval, nan, inf, zero
817 mov GR_exp_pos_max = 0x13fff // Max exponent for pos oflow test
818 }
819 { .mfi
820 addl GR_table_base = @ltoff(Constants_exp_64_Arg#), gp // Ptr to tables
821 fnorm.s1 FR_norm_Y = FR_Input_Y // Normalize y
822 mov GR_exp_neg_max = 0x33fff // Max exponent for neg oflow test
823 }
824 ;;
825
826 { .mfi
827 getf.exp GR_signexp_y = FR_Input_Y // Get sign and exp of y
828 (p12) fclass.m p11, p0 = FR_Input_Y, 0x0b // Test y unorm
829 mov GR_sign_mask = 0x20000 // Sign mask
830 }
831 { .mfi
832 ld8 GR_table_base = [GR_table_base] // Get base address for tables
833 fadd.s1 FR_Two = f1, f1 // Form 2.0 for square test
834 mov GR_exp_mask = 0x1FFFF // Exponent mask
835 }
836 ;;
837
838 { .mfi
839 getf.sig GR_signif_y = FR_Input_Y // Get significand of y
840 fclass.m p6, p0 = FR_Input_X, 0x1E7 // Test x natval, nan, inf, zero
841 nop.i 999
842 }
843 ;;
844
845 { .mfi
846 getf.exp GR_signexp_x = FR_Input_X // Get signexp of x
847 fmerge.s FR_save_Input_X = FR_Input_X, FR_Input_X
848 extr.u GR_Index1 = GR_signif_Z, 59, 4 // Extract upper 4 signif bits of x
849 }
850 { .mfb
851 setf.exp FR_Half = GR_exp_half // Load half
852 nop.f 999
853 (p11) br.cond.spnt POWL_DENORM // Branch if x or y denorm/unorm
854 }
855 ;;
856
857 // Return here from POWL_DENORM
858 POWL_COMMON:
859 { .mfi
860 setf.exp FR_Big = GR_exp_pos_max // Form big pos value for oflow test
861 fclass.nm p11, p0 = FR_Input_Y, 0x1FF // Test Y unsupported
862 shl GR_Index1 = GR_Index1,5 // Adjust index1 pointer x 32
863 }
864 { .mfi
865 add GR_Table_Ptr = 0x7c0, GR_table_base // Constants_log_80_Z_G_H_h1
866 fma.s1 FR_Sgn = f1,f1,f0 // Assume result positive
867 mov GR_exp_bias = 0xFFFF // Form exponent bias
868 }
869 ;;
870
871 //
872 // Identify NatVals, NaNs, Infs, and Zeros.
873 //
874 //
875 // Remove sign bit from exponent of y.
876 // Check for x = 1
877 // Branch on Infs, Nans, Zeros, and Natvals
878 // Check to see that exponent < 0
879 //
880 { .mfi
881 setf.exp FR_NBig = GR_exp_neg_max // Form big neg value for oflow test
882 fclass.nm p8, p0 = FR_Input_X, 0x1FF // Test X unsupported
883 and GR_exp_y = GR_exp_mask,GR_signexp_y // Get biased exponent of y
884 }
885 { .mfb
886 add GR_Index1 = GR_Index1,GR_Table_Ptr
887 nop.f 999
888 (p6) br.cond.spnt POWL_64_SPECIAL // Branch if x natval, nan, inf, zero
889 }
890 ;;
891
892 // load Z_1 from Index1
893
894 // There is logic starting here to determine if y is an integer when x < 0.
895 // If 0 < |y| < 1 then clearly y is not an integer.
896 // If |y| > 1, then the significand of y is shifted left by the size of
897 // the exponent of y. This preserves the lsb of the integer part + the
898 // fractional bits. The lsb of the integer can be tested to determine if
899 // the integer is even or odd. The fractional bits can be tested. If zero,
900 // then y is an integer.
901 //
902 { .mfi
903 ld2 GR_Z_1 =[GR_Index1],4 // Load Z_1
904 fmerge.s FR_Z = f0, FR_norm_X // Z = |x|
905 extr.u GR_X_0 = GR_signif_Z, 49, 15 // Extract X_0 from significand
906 }
907 { .mfb
908 cmp.lt p9, p0 = GR_exp_y,GR_exp_bias // Test 0 < |y| < 1
909 nop.f 999
910 (p7) br.cond.spnt POWL_64_SPECIAL // Branch if y natval, nan, inf, zero
911 }
912 ;;
913
914 { .mfb
915 ldfs FR_G_1 = [GR_Index1],4 // Load G_1
916 fcmp.eq.s1 p10, p0 = FR_Input_Y, f1 // Test Y = +1.0
917 (p8) br.cond.spnt POWL_64_UNSUPPORT // Branch if x unsupported
918 }
919 ;;
920
921 //
922 // X_0 = High order 15 bit of Z
923 //
924 { .mfb
925 ldfs FR_H_1 = [GR_Index1],8 // Load H_1
926 (p9) fcmp.lt.unc.s1 p9, p0 = FR_Input_X, f0 // Test x<0, 0 <|y|<1
927 (p11) br.cond.spnt POWL_64_UNSUPPORT // Branch if y unsupported
928 }
929 ;;
930
931 { .mfi
932 ldfe FR_h_1 = [GR_Index1] // Load h_1
933 fcmp.eq.s1 p7, p0 = FR_Input_Y, FR_Two // Test y = 2.0
934 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // X_1 = X_0 * Z_1 (bits 15-30)
935 // Wait 4 cycles to use result
936 }
937 { .mfi
938 add GR_Table_Ptr = 0x9c0, GR_table_base // Constants_log_80_Z_G_H_h2
939 nop.f 999
940 sub GR_exp_y = GR_exp_y,GR_exp_bias // Get true exponent of y
941 }
942 ;;
943
944 //
945 // Branch for (x < 0) and Y not an integer.
946 //
947 { .mfb
948 nop.m 999
949 fcmp.lt.s1 p6, p0 = FR_Input_X, f0 // Test x < 0
950 (p9) br.cond.spnt POWL_64_XNEG // Branch if x < 0, 0 < |y| < 1
951 }
952 ;;
953
954 { .mfi
955 nop.m 999
956 fcmp.eq.s1 p12, p0 = FR_Input_X, f1 // Test x=+1.0
957 nop.i 999
958 }
959 { .mfb
960 nop.m 999
961 fsub.s1 FR_W = FR_Z, f1 // W = Z - 1
962 (p7) br.cond.spnt POWL_64_SQUARE // Branch if y=2
963 }
964 ;;
965
966 { .mfi
967 nop.m 999
968 (p10) fmpy.s0 FR_Result = FR_Input_X, f1 // If y=+1.0, result=x
969 (p6) shl GR_fraction_y= GR_signif_y,GR_exp_y // Get lsb of int + fraction
970 // Wait 4 cycles to use result
971 }
972 ;;
973
974 { .mfi
975 nop.m 999
976 (p12) fma.s0 FR_Result = FR_Input_Y, f0, f1 // If x=1.0, result=1, chk denorm
977 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract index2
978 }
979 ;;
980
981 //
982 // N = exponent of Z
983 //
984 { .mib
985 getf.exp GR_N = FR_Z // Get exponent of Z (also x)
986 shl GR_Index2=GR_Index2,5 // Index2 x 32 bytes
987 (p10) br.ret.spnt b0 // Exit if y=+1.0
988 }
989 ;;
990
991 { .mib
992 add GR_Index2 = GR_Index2, GR_Table_Ptr // Pointer to table 2
993 nop.i 999
994 (p12) br.ret.spnt b0 // Exit if x=+1.0
995 }
996 ;;
997
998 { .mmi
999 ld2 GR_Z_2 =[GR_Index2],4 // Load Z_2
1000 ;;
1001 ldfs FR_G_2 = [GR_Index2],4 // Load G_2
1002 nop.i 999
1003 }
1004 ;;
1005
1006 { .mii
1007 ldfs FR_H_2 = [GR_Index2],8 // Load H_2
1008 (p6) tbit.nz.unc p9, p0 = GR_fraction_y, 63 // Test x<0 and y odd integer
1009 add GR_Table_Ptr = 0xbcc, GR_table_base // Constants_log_80_h3_G_H, G_3
1010 }
1011 ;;
1012
1013 //
1014 // For x < 0 and y odd integer,, set sign = -1.
1015 //
1016 { .mfi
1017 getf.exp GR_M = FR_W // Get signexp of W
1018 nop.f 999
1019 pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // X_2 = X_1 * Z_2 (bits 15-30)
1020 }
1021 { .mfi
1022 ldfe FR_h_2 = [GR_Index2] // Load h_2
1023 (p9) fnma.s1 FR_Sgn = f1, f1, f0 // If x<0, y odd int, result negative
1024 sub GR_N = GR_N, GR_exp_bias // Get true exponent of x = N
1025 }
1026 ;;
1027
1028 { .mfi
1029 add GR_Table_Ptr1 = 0xdc0, GR_table_base // Ptr to H_3
1030 fcmp.eq.s0 p11, p0 = FR_Input_Y, FR_Half // Test y=0.5, also set denorm
1031 (p6) shl GR_fraction_y= GR_fraction_y, 1 // Shift left 1 to get fraction
1032 }
1033 ;;
1034
1035 { .mmb
1036 setf.sig FR_float_N = GR_N
1037 (p6) cmp.ne.unc p8, p0 = GR_fraction_y, r0 // Test x<0 and y not integer
1038 (p8) br.cond.spnt POWL_64_XNEG // Branch if x<0 and y not int
1039 }
1040 ;;
1041
1042 //
1043 // Raise possible denormal operand exception for both X and Y.
1044 // Set pointers in case |x| near 1
1045 // Branch to embedded sqrt(x) if y=0.5
1046 //
1047 { .mfi
1048 add GR_P_ptr1 = 0x6b0, GR_table_base // Constants_log_80_P, P8, NEAR path
1049 fcmp.eq.s0 p12, p0 = FR_Input_X, FR_Input_Y // Dummy to set denormal
1050 add GR_P_ptr2 = 0x700, GR_table_base // Constants_log_80_P, P4, NEAR path
1051 }
1052 { .mfb
1053 cmp.eq p15, p14 = r0, r0 // Assume result safe (no over/under)
1054 fsub.s1 FR_Delta = FR_Input_Y,f1 // Delta = y - 1.0
1055 (p11) br.cond.spnt POWL_64_SQRT // Branch if y=0.5
1056 }
1057 ;;
1058
1059 //
1060 // Computes ln( x ) to extra precision
1061 // Input FR 1: FR_X
1062 // Output FR 2: FR_Y_hi
1063 // Output FR 3: FR_Y_lo
1064 // Output PR 1: PR_Safe
1065 //
1066 { .mfi
1067 and GR_M = GR_exp_mask, GR_M // Mask to get exponent of W
1068 nop.f 999
1069 extr.u GR_Index3 = GR_X_2, 1, 5 // Get index3
1070 }
1071 ;;
1072
1073 { .mmi
1074 shladd GR_Table_Ptr1 = GR_Index3,2,GR_Table_Ptr1 // Ptr to H_3
1075 shladd GR_Index3 = GR_Index3,4,GR_Table_Ptr // Ptr to G_3
1076 sub GR_M = GR_M, GR_exp_bias // Get true exponent of W
1077 }
1078 ;;
1079
1080 { .mib
1081 ldfs FR_G_3 = [GR_Index3],-12 // Load G_3
1082 cmp.gt p7, p14 = -8, GR_M // Test if |x-1| < 2^-8
1083 (p7) br.cond.spnt LOGL80_NEAR // Branch if |x-1| < 2^-8
1084 }
1085 ;;
1086
1087 // Here if |x-1| >= 2^-8
1088 { .mmf
1089 ldfs FR_H_3 = [GR_Table_Ptr1] // Load H_3
1090 nop.m 999
1091 nop.f 999
1092 }
1093 ;;
1094
1095 { .mfi
1096 ldfe FR_h_3 = [GR_Index3] // Load h_3
1097 fmerge.se FR_S = f1,FR_Z // S = merge of 1.0 and signif(Z)
1098 nop.i 999
1099 }
1100 { .mfi
1101 add GR_Table_Ptr = 0x740, GR_table_base // Constants_log_80_Q
1102 fmpy.s1 FR_G = FR_G_1, FR_G_2 // G = G_1 * G_2
1103 nop.i 999
1104 }
1105 ;;
1106
1107 //
1108 // Begin Loading Q's - load log2_hi part
1109 //
1110 { .mfi
1111 ldfe FR_log2_hi = [GR_Table_Ptr],16 // Load log2_hi
1112 fadd.s1 FR_H = FR_H_1, FR_H_2 // H = H_1 + H_2
1113 nop.i 999
1114 };;
1115
1116 //
1117 // h = h_1 + h_2
1118 //
1119 { .mfi
1120 ldfe FR_log2_lo = [GR_Table_Ptr],16 // Load log2_lo
1121 fadd.s1 FR_h = FR_h_1, FR_h_2 // h = h_1 + h_2
1122 nop.i 999
1123 }
1124 ;;
1125
1126 { .mfi
1127 ldfe FR_Q_6 = [GR_Table_Ptr],16 // Load Q_6
1128 fcvt.xf FR_float_N = FR_float_N
1129 nop.i 999
1130 }
1131 ;;
1132
1133 { .mfi
1134 ldfe FR_Q_5 = [GR_Table_Ptr],16 // Load Q_5
1135 nop.f 999
1136 nop.i 999
1137 }
1138 ;;
1139
1140 //
1141 // G = G_1 * G_2 * G_3
1142 //
1143 { .mfi
1144 ldfe FR_Q_4 = [GR_Table_Ptr],16 // Load Q_4
1145 fmpy.s1 FR_G = FR_G, FR_G_3
1146 nop.i 999
1147 }
1148 ;;
1149
1150 //
1151 // H = H_1 + H_2 + H_3
1152 //
1153 { .mfi
1154 ldfe FR_Q_3 = [GR_Table_Ptr],16 // Load Q_3
1155 fadd.s1 FR_H = FR_H, FR_H_3
1156 nop.i 999
1157 }
1158 ;;
1159
1160 //
1161 // Y_lo = poly + Y_lo
1162 //
1163 // h = h_1 + h_2 + h_3
1164 //
1165 { .mfi
1166 ldfe FR_Q_2 = [GR_Table_Ptr],16 // Load Q_2
1167 fadd.s1 FR_h = FR_h, FR_h_3
1168 nop.i 999
1169 }
1170 ;;
1171
1172 //
1173 // GS_hi = G*S
1174 // r = G*S -1
1175 //
1176 { .mfi
1177 ldfe FR_Q_1 = [GR_Table_Ptr],16 // Load Q_1
1178 fmpy.s1 FR_GS_hi = FR_G, FR_S
1179 nop.i 999
1180 }
1181 { .mfi
1182 nop.m 999
1183 fms.s1 FR_r = FR_G, FR_S, f1
1184 nop.i 999
1185 }
1186 ;;
1187
1188 //
1189 // poly_lo = Q_5 + r * Q_6
1190 //
1191 { .mfi
1192 getf.exp GR_Delta_Exp = FR_Delta // Get signexp of y-1 for exp calc
1193 fma.s1 FR_poly_lo = FR_r, FR_Q_6, FR_Q_5
1194 nop.i 999
1195 }
1196 //
1197 // r_cor = GS_hi -1
1198 //
1199 { .mfi
1200 nop.m 999
1201 fsub.s1 FR_r_cor = FR_GS_hi, f1
1202 nop.i 999
1203 }
1204 ;;
1205
1206 //
1207 // GS_lo = G*S - GS_hi
1208 //
1209 { .mfi
1210 nop.m 999
1211 fms.s1 FR_GS_lo = FR_G, FR_S, FR_GS_hi
1212 nop.i 999
1213 }
1214 ;;
1215
1216 //
1217 // rsq = r * r
1218 //
1219 { .mfi
1220 nop.m 999
1221 fmpy.s1 FR_rsq = FR_r, FR_r
1222 nop.i 999
1223 }
1224 //
1225 // G = float_N*log2_hi + H
1226 //
1227 { .mfi
1228 nop.m 999
1229 fma.s1 FR_G = FR_float_N, FR_log2_hi, FR_H
1230 nop.i 999
1231 }
1232 ;;
1233
1234 //
1235 // Y_lo = float_N*log2_lo + h
1236 //
1237 { .mfi
1238 nop.m 999
1239 fma.s1 FR_Y_lo = FR_float_N, FR_log2_lo, FR_h
1240 nop.i 999
1241 }
1242 ;;
1243
1244 //
1245 // poly_lo = Q_4 + r * poly_lo
1246 // r_cor = r_cor - r
1247 //
1248 { .mfi
1249 nop.m 999
1250 fma.s1 FR_poly_lo = FR_r, FR_poly_lo, FR_Q_4
1251 nop.i 999
1252 }
1253 { .mfi
1254 nop.m 999
1255 fsub.s1 FR_r_cor = FR_r_cor, FR_r
1256 nop.i 999
1257 }
1258 ;;
1259
1260 //
1261 // poly_hi = r * Q_2 + Q_1
1262 // Y_hi = G + r
1263 //
1264 { .mfi
1265 nop.m 999
1266 fma.s1 FR_poly = FR_r, FR_Q_2, FR_Q_1
1267 nop.i 999
1268 }
1269 { .mfi
1270 nop.m 999
1271 fadd.s1 FR_Y_hi = FR_G, FR_r
1272 nop.i 999
1273 }
1274 ;;
1275
1276 //
1277 // poly_lo = Q_3 + r * poly_lo
1278 // r_cor = r_cor + GS_lo
1279 //
1280 { .mfi
1281 nop.m 999
1282 fma.s1 FR_poly_lo = FR_r, FR_poly_lo, FR_Q_3
1283 nop.i 999
1284 }
1285 { .mfi
1286 nop.m 999
1287 fadd.s1 FR_r_cor = FR_r_cor, FR_GS_lo
1288 nop.i 999
1289 }
1290 ;;
1291
1292 //
1293 // Y_lo = G - Y_hi
1294 //
1295 { .mfi
1296 nop.m 999
1297 fsub.s1 FR_Y_lo_2 = FR_G, FR_Y_hi
1298 nop.i 999
1299 }
1300 ;;
1301
1302 //
1303 // r_cor = r_cor + Y_lo
1304 // poly = poly_hi + rsq * poly_lo
1305 //
1306 { .mfi
1307 add GR_Table_Ptr = 0x0, GR_table_base // Constants_exp_64_Arg
1308 fadd.s1 FR_r_cor = FR_r_cor, FR_Y_lo
1309 nop.i 999
1310 }
1311 { .mfi
1312 nop.m 999
1313 fma.s1 FR_poly = FR_rsq, FR_poly_lo, FR_poly
1314 nop.i 999
1315 }
1316 ;;
1317
1318 //
1319 // Load L_hi
1320 // Load L_lo
1321 // all long before they are needed.
1322 // They are used in LOGL_RETURN PATH
1323 //
1324 // Y_lo = Y_lo + r
1325 // poly = rsq * poly + r_cor
1326 //
1327 { .mfi
1328 ldfe FR_L_hi = [GR_Table_Ptr],16 // Load L_hi
1329 fadd.s1 FR_Y_lo = FR_Y_lo_2, FR_r
1330 nop.i 999
1331 }
1332 { .mfi
1333 nop.m 999
1334 fma.s1 FR_poly = FR_rsq, FR_poly, FR_r_cor
1335 nop.i 999
1336 }
1337 ;;
1338
1339 { .mfb
1340 ldfe FR_L_lo = [GR_Table_Ptr],16 // Load L_lo
1341 fadd.s1 FR_Y_lo = FR_Y_lo, FR_poly
1342 br.cond.sptk LOGL_RETURN // Branch to common code
1343 }
1344 ;;
1345
1346
1347 LOGL80_NEAR:
1348 // Here if |x-1| < 2^-8
1349 //
1350 // Branch LOGL80_NEAR
1351 //
1352
1353 { .mmf
1354 ldfe FR_P_8 = [GR_P_ptr1],16 // Load P_8
1355 ldfe FR_P_4 = [GR_P_ptr2],16 // Load P_4
1356 fmpy.s1 FR_Wsq = FR_W, FR_W
1357 }
1358 ;;
1359
1360 { .mmi
1361 ldfe FR_P_7 = [GR_P_ptr1],16 // Load P_7
1362 ldfe FR_P_3 = [GR_P_ptr2],16 // Load P_3
1363 nop.i 999
1364 }
1365 ;;
1366
1367 { .mmi
1368 ldfe FR_P_6 = [GR_P_ptr1],16 // Load P_6
1369 ldfe FR_P_2 = [GR_P_ptr2],16 // Load P_2
1370 nop.i 999
1371 }
1372 ;;
1373
1374 { .mmi
1375 ldfe FR_P_5 = [GR_P_ptr1],16 // Load P_5
1376 ldfe FR_P_1 = [GR_P_ptr2],16 // Load P_1
1377 nop.i 999
1378 }
1379 ;;
1380
1381 { .mfi
1382 getf.exp GR_Delta_Exp = FR_Delta // Get signexp of y-1 for exp calc
1383 fmpy.s1 FR_W4 = FR_Wsq, FR_Wsq
1384 nop.i 999
1385 }
1386 { .mfi
1387 add GR_Table_Ptr = 0x0, GR_table_base // Constants_exp_64_Arg
1388 fmpy.s1 FR_W3 = FR_Wsq, FR_W
1389 nop.i 999
1390 }
1391 ;;
1392
1393 { .mfi
1394 nop.m 999
1395 fmpy.s1 FR_half_W = FR_Half, FR_W
1396 nop.i 999
1397 }
1398 ;;
1399
1400 { .mfi
1401 ldfe FR_L_hi = [GR_Table_Ptr],16
1402 fma.s1 FR_poly_lo = FR_W, FR_P_8,FR_P_7
1403 nop.i 999
1404 }
1405 { .mfi
1406 nop.m 999
1407 fma.s1 FR_poly = FR_W, FR_P_4, FR_P_3
1408 nop.i 999
1409 }
1410 ;;
1411
1412 { .mfi
1413 ldfe FR_L_lo = [GR_Table_Ptr],16
1414 fnma.s1 FR_Y_hi = FR_W, FR_half_W, FR_W
1415 nop.i 999
1416 }
1417 ;;
1418
1419 { .mfi
1420 nop.m 999
1421 fma.s1 FR_poly_lo = FR_W, FR_poly_lo, FR_P_6
1422 nop.i 999
1423 }
1424 { .mfi
1425 nop.m 999
1426 fma.s1 FR_poly = FR_W, FR_poly, FR_P_2
1427 nop.i 999
1428 }
1429 ;;
1430
1431 { .mfi
1432 nop.m 999
1433 fsub.s1 FR_Y_lo = FR_W, FR_Y_hi
1434 nop.i 999
1435 }
1436 ;;
1437
1438 { .mfi
1439 nop.m 999
1440 fma.s1 FR_poly_lo = FR_W, FR_poly_lo, FR_P_5
1441 nop.i 999
1442 }
1443 { .mfi
1444 nop.m 999
1445 fma.s1 FR_poly = FR_W, FR_poly, FR_P_1
1446 nop.i 999
1447 }
1448 ;;
1449
1450 { .mfi
1451 nop.m 999
1452 fnma.s1 FR_Y_lo = FR_W, FR_half_W, FR_Y_lo
1453 nop.i 999
1454 }
1455 ;;
1456
1457 { .mfi
1458 nop.m 999
1459 fma.s1 FR_poly = FR_poly_lo, FR_W4, FR_poly
1460 nop.i 999
1461 }
1462 ;;
1463
1464 { .mfi
1465 nop.m 999
1466 fma.s1 FR_Y_lo = FR_poly, FR_W3, FR_Y_lo
1467 nop.i 999
1468 }
1469 ;;
1470
1471
1472 LOGL_RETURN:
1473 // Common code for completion of both logx paths
1474
1475 //
1476 // L_hi, L_lo already loaded.
1477 //
1478 //
1479 // kernel_log_80 computed ln(X)
1480 // and return logX_hi and logX_lo as results.
1481 // PR_pow_Safe set as well.
1482 //
1483 //
1484 // Compute Y * (logX_hi + logX_lo)
1485 // P_hi -> X
1486 // P_lo -> X_cor
1487 // (Manipulate names so that inputs are in
1488 // the place kernel_exp expects them)
1489 //
1490 // This function computes exp( x + x_cor)
1491 // Input FR 1: FR_X
1492 // Input FR 2: FR_X_cor
1493 // Output FR 3: FR_Y_hi
1494 // Output FR 4: FR_Y_lo
1495 // Output FR 5: FR_Scale
1496 // Output PR 1: PR_Safe
1497 //
1498 // P15 is True
1499 //
1500 // Load constants used in computing N using right-shift technique
1501 { .mlx
1502 mov GR_exp_2tom51 = 0xffff-51
1503 movl GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2
1504 }
1505 { .mlx
1506 add GR_Special_Exp = -50,GR_exp_bias
1507 movl GR_rshf_2to51 = 0x4718000000000000 // 1.10000 2^(63+51)
1508 }
1509 ;;
1510
1511 //
1512 // Point to Table of W1s
1513 // Point to Table of W2s
1514 //
1515 { .mmi
1516 add GR_W1_ptr = 0x2b0, GR_table_base // Constants_exp_64_W1
1517 add GR_W2_ptr = 0x4b0, GR_table_base // Constants_exp_64_W2
1518 cmp.le p6,p0= GR_Delta_Exp,GR_Special_Exp
1519 };;
1520
1521 // Form two constants we need
1522 // 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128
1523 // 1.1000..000 * 2^(63+63-12) to right shift int(N) into the significand
1524
1525 { .mfi
1526 setf.sig FR_INV_LN2_2TO63 = GR_sig_inv_ln2 // form 1/ln2 * 2^63
1527 nop.f 999
1528 and GR_Delta_Exp=GR_Delta_Exp,GR_exp_mask // Get exponent of y-1
1529 }
1530 { .mlx
1531 setf.d FR_RSHF_2TO51 = GR_rshf_2to51 // Form const 1.1000 * 2^(63+51)
1532 movl GR_rshf = 0x43e8000000000000 // 1.10000 2^63 for right shift
1533 }
1534 ;;
1535
1536 { .mfi
1537 nop.m 999
1538 fmpy.s1 FR_X_lo = FR_Input_Y, FR_logx_lo // logx_lo is Y_lo
1539 cmp.eq p15, p0= r0, r0 // Set p15, assume safe
1540 };;
1541
1542 { .mmi
1543 setf.exp FR_2TOM51 = GR_exp_2tom51 // Form 2^-51 for scaling float_N
1544 setf.d FR_RSHF = GR_rshf // Form right shift const 1.1000 * 2^63
1545 add GR_Table_Ptr1 = 0x50, GR_table_base // Constants_exp_64_P for
1546 // EXPL_SMALL path
1547 }
1548 ;;
1549
1550 { .mmi
1551 ldfe FR_P_6 = [GR_Table_Ptr1],16 // Load P_6 for EXPL_SMALL path
1552 ;;
1553 ldfe FR_P_5 = [GR_Table_Ptr1],16 // Load P_5 for EXPL_SMALL path
1554 nop.i 999
1555 }
1556 ;;
1557
1558 { .mfi
1559 ldfe FR_P_4 = [GR_Table_Ptr1],16 // Load P_4 for EXPL_SMALL path
1560 fma.s1 FR_P_hi = FR_Input_Y, FR_logx_hi,FR_X_lo // logx_hi ix Y_hi
1561 nop.i 999
1562 }
1563 ;;
1564
1565 { .mmi
1566 ldfe FR_P_3 = [GR_Table_Ptr1],16 // Load P_3 for EXPL_SMALL path
1567 ;;
1568 ldfe FR_P_2 = [GR_Table_Ptr1],16 // Load P_2 for EXPL_SMALL path
1569 nop.i 999
1570 }
1571 ;;
1572
1573 // N = X * Inv_log2_by_2^12
1574 // By adding 1.10...0*2^63 we shift and get round_int(N_signif) in significand.
1575 // We actually add 1.10...0*2^51 to X * Inv_log2 to do the same thing.
1576 { .mfi
1577 ldfe FR_P_1 = [GR_Table_Ptr1] // Load P_1 for EXPL_SMALL path
1578 fma.s1 FR_N = FR_X, FR_INV_LN2_2TO63, FR_RSHF_2TO51
1579 nop.i 999
1580 }
1581 { .mfb
1582 nop.m 999
1583 fms.s1 FR_P_lo= FR_Input_Y, FR_logx_hi, FR_P_hi // P_hi is X
1584 (p6) br.cond.spnt POWL_Y_ALMOST_1 // Branch if |y-1| < 2^-50
1585 }
1586 ;;
1587
1588 { .mmi
1589 getf.exp GR_Expo_X = FR_X
1590 add GR_T1_ptr = 0x0b0, GR_table_base // Constants_exp_64_T1
1591 add GR_T2_ptr = 0x1b0, GR_table_base // Constants_exp_64_T2
1592 }
1593 ;;
1594
1595 // float_N = round_int(N)
1596 // The signficand of N contains the rounded integer part of X * 2^12/ln2,
1597 // as a twos complement number in the lower bits (that is, it may be negative).
1598 // That twos complement number (called N) is put into GR_N_fix.
1599
1600 // Since N is scaled by 2^51, it must be multiplied by 2^-51
1601 // before the shift constant 1.10000 * 2^63 is subtracted to yield float_N.
1602 // Thus, float_N contains the floating point version of N
1603
1604
1605 { .mfi
1606 add GR_Table_Ptr = 0x20, GR_table_base // Constants_exp_64_A
1607 fms.s1 FR_float_N = FR_N, FR_2TOM51, FR_RSHF // Form float_N
1608 nop.i 999
1609 }
1610 // Create low part of Y(ln(x)_hi + ln(x)_lo) as P_lo
1611 { .mfi
1612 mov GR_Big_Pos_Exp = 0x3ffe // 16382, largest safe exponent
1613 fadd.s1 FR_P_lo = FR_P_lo, FR_X_lo
1614 mov GR_Big_Neg_Exp = -0x3ffd // -16381 smallest safe exponent
1615 };;
1616
1617 { .mfi
1618 nop.m 999
1619 fmpy.s1 FR_rsq = FR_X, FR_X // rsq = X*X for EXPL_SMALL path
1620 mov GR_vsm_expo = -70 // Exponent for very small path
1621 }
1622 { .mfi
1623 nop.m 999
1624 fma.s1 FR_poly_lo = FR_P_6, FR_X, FR_P_5 // poly_lo for EXPL_SMALL path
1625 add GR_temp = 0x1,r0 // For tiny signif if small path
1626 }
1627 ;;
1628
1629 //
1630 // If expo_X < -6 goto exp_small
1631 //
1632 { .mmi
1633 getf.sig GR_N_fix = FR_N
1634 ldfe FR_A_3 = [GR_Table_Ptr],16 // Load A_3
1635 and GR_Expo_X = GR_Expo_X, GR_exp_mask // Get exponent of X
1636 }
1637 ;;
1638
1639 { .mfi
1640 ldfe FR_A_2 = [GR_Table_Ptr],16 // Load A_2
1641 nop.f 999
1642 sub GR_Expo_X = GR_Expo_X, GR_exp_bias // Get true exponent of X
1643 }
1644 ;;
1645
1646 //
1647 // If -6 > Expo_X, set P9 and branch
1648 //
1649 { .mfb
1650 cmp.gt p9, p0 = -6, GR_Expo_X
1651 fnma.s1 FR_r = FR_L_hi, FR_float_N, FR_X // r = X - L_hi * float_N
1652 (p9) br.cond.spnt EXPL_SMALL // Branch if |X| < 2^-6
1653 }
1654 ;;
1655
1656 //
1657 // If 14 <= Expo_X, set P10
1658 //
1659 { .mib
1660 cmp.le p10, p0 = 14, GR_Expo_X
1661 nop.i 999
1662 (p10) br.cond.spnt EXPL_HUGE // Branch if |X| >= 2^14
1663 }
1664 ;;
1665
1666 //
1667 // Load single T1
1668 // Load single T2
1669 // W_1_p1 = W_1 + 1
1670 //
1671 { .mmi
1672 nop.m 999
1673 nop.m 999
1674 extr.u GR_M1 = GR_N_fix, 6, 6 // Extract index M_1
1675 }
1676 ;;
1677
1678 //
1679 // k = extr.u(N_fix,0,6)
1680 //
1681 { .mmi
1682 shladd GR_W1_ptr = GR_M1,3,GR_W1_ptr // Point to W1
1683 shladd GR_T1_ptr = GR_M1,2,GR_T1_ptr // Point to T1
1684 extr.u GR_M2 = GR_N_fix, 0, 6 // Extract index M_2
1685 }
1686 ;;
1687
1688 // N_fix is only correct up to 50 bits because of our right shift technique.
1689 // Actually in the normal path we will have restricted K to about 14 bits.
1690 // Somewhat arbitrarily we extract 32 bits.
1691 { .mmi
1692 ldfd FR_W1 = [GR_W1_ptr]
1693 shladd GR_W2_ptr = GR_M2,3,GR_W2_ptr // Point to W2
1694 extr GR_k = GR_N_fix, 12, 32 // Extract k
1695 }
1696 ;;
1697
1698 { .mfi
1699 ldfs FR_T1 = [GR_T1_ptr]
1700 fnma.s1 FR_r = FR_L_lo, FR_float_N, FR_r
1701 shladd GR_T2_ptr = GR_M2,2,GR_T2_ptr // Point to T2
1702 }
1703 { .mfi
1704 add GR_exp_bias_p_k = GR_exp_bias, GR_k
1705 nop.f 999
1706 cmp.gt p14,p15 = GR_k,GR_Big_Pos_Exp
1707 }
1708 ;;
1709
1710 //
1711 // if k < big_neg_exp, set p14 and Safe=False
1712 //
1713 { .mmi
1714 ldfs FR_T2 = [GR_T2_ptr]
1715 (p15) cmp.lt p14,p15 = GR_k,GR_Big_Neg_Exp
1716 nop.i 999
1717 }
1718 ;;
1719
1720 { .mmi
1721 setf.exp FR_Scale = GR_exp_bias_p_k
1722 ldfd FR_W2 = [GR_W2_ptr]
1723 nop.i 999
1724 }
1725 ;;
1726
1727 { .mfi
1728 ldfe FR_A_1 = [GR_Table_Ptr],16
1729 fadd.s1 FR_r = FR_r, FR_X_cor
1730 nop.i 999
1731 }
1732 ;;
1733
1734 { .mfi
1735 nop.m 999
1736 fadd.s1 FR_W_1_p1 = FR_W1, f1
1737 nop.i 999
1738 }
1739 ;;
1740
1741 { .mfi
1742 nop.m 999
1743 fma.s1 FR_poly = FR_r, FR_A_3, FR_A_2
1744 nop.i 999
1745 }
1746 { .mfi
1747 nop.m 999
1748 fmpy.s1 FR_rsq = FR_r, FR_r
1749 nop.i 999
1750 }
1751 ;;
1752
1753 { .mfi
1754 nop.m 999
1755 fmpy.s1 FR_T = FR_T1, FR_T2
1756 nop.i 999
1757 }
1758 ;;
1759
1760 { .mfi
1761 nop.m 999
1762 fma.s1 FR_W = FR_W2, FR_W_1_p1, FR_W1
1763 nop.i 999
1764 }
1765 ;;
1766
1767 { .mfi
1768 nop.m 999
1769 fma.s1 FR_TMP1 = FR_Scale, FR_Sgn, f0
1770 nop.i 999
1771 }
1772 ;;
1773
1774 { .mfi
1775 nop.m 999
1776 fma.s1 FR_poly = FR_r, FR_poly, FR_A_1
1777 nop.i 999
1778 }
1779 ;;
1780
1781 { .mfi
1782 nop.m 999
1783 fma.s1 FR_TMP2 = FR_T, f1, f0 // TMP2 = Y_hi = T
1784 nop.i 999
1785 }
1786 ;;
1787
1788 { .mfi
1789 nop.m 999
1790 fadd.s1 FR_Wp1 = FR_W, f1
1791 nop.i 999
1792 }
1793 ;;
1794
1795 { .mfi
1796 nop.m 999
1797 fma.s1 FR_poly = FR_rsq, FR_poly,FR_r
1798 nop.i 999
1799 }
1800 ;;
1801
1802 { .mfi
1803 nop.m 999
1804 fma.s1 FR_Tscale = FR_T, FR_TMP1, f0 // Scale * Sgn * T
1805 nop.i 999
1806 }
1807 { .mfi
1808 nop.m 999
1809 fma.s1 FR_Y_lo = FR_Wp1, FR_poly, FR_W
1810 nop.i 999
1811 }
1812 ;;
1813
1814 { .mfb
1815 nop.m 999
1816 fmpy.s1 FR_TMP3 = FR_Y_lo, FR_Tscale
1817 br.cond.sptk POWL_64_SHARED
1818 }
1819 ;;
1820
1821
1822 EXPL_SMALL:
1823 // Here if |ylogx| < 2^-6
1824 //
1825 // Begin creating lsb to perturb final result
1826 //
1827 { .mfi
1828 setf.sig FR_temp = GR_temp
1829 fma.s1 FR_poly_lo = FR_poly_lo, FR_X, FR_P_4
1830 cmp.lt p12, p0 = GR_Expo_X, GR_vsm_expo // Test |ylogx| < 2^-70
1831 }
1832 { .mfi
1833 nop.m 999
1834 fma.s1 FR_poly_hi = FR_P_2, FR_X, FR_P_1
1835 nop.i 999
1836 }
1837 ;;
1838
1839 { .mfi
1840 nop.m 999
1841 fmpy.s1 FR_TMP2 = f1, f1
1842 nop.i 999
1843 }
1844 { .mfi
1845 nop.m 999
1846 fmpy.s1 FR_TMP1 = FR_Sgn, f1
1847 nop.i 999
1848 }
1849 ;;
1850
1851 { .mfi
1852 nop.m 999
1853 fmpy.s1 FR_r4 = FR_rsq, FR_rsq
1854 (p12) cmp.eq p15, p0 = r0, r0 // Set safe if |ylogx| < 2^-70
1855 }
1856 { .mfb
1857 nop.m 999
1858 (p12) fmpy.s1 FR_TMP3 = FR_Sgn, FR_X
1859 (p12) br.cond.spnt POWL_64_SHARED // Branch if |ylogx| < 2^-70
1860 }
1861 ;;
1862
1863 { .mfi
1864 nop.m 999
1865 fma.s1 FR_poly_lo = FR_poly_lo, FR_X, FR_P_3
1866 nop.i 999
1867 }
1868 { .mfi
1869 nop.m 999
1870 fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, FR_X
1871 nop.i 999
1872 }
1873 ;;
1874
1875 { .mfi
1876 nop.m 999
1877 fma.s1 FR_Y_lo = FR_poly_lo, FR_r4, FR_poly_hi
1878 nop.i 999
1879 }
1880 ;;
1881
1882 { .mfi
1883 nop.m 999
1884 fmpy.s1 FR_TMP3 = FR_Y_lo, FR_TMP1 // Add sign info
1885 nop.i 999
1886 }
1887 ;;
1888
1889 //
1890 // Toggle on last bit of Y_lo
1891 // Set lsb of Y_lo to 1
1892 //
1893 { .mfi
1894 nop.m 999
1895 for FR_temp = FR_Y_lo,FR_temp
1896 nop.i 999
1897 }
1898 ;;
1899
1900 { .mfb
1901 nop.m 999
1902 fmerge.se FR_TMP3 = FR_TMP3,FR_temp
1903 br.cond.sptk POWL_64_SHARED
1904 }
1905 ;;
1906
1907
1908 EXPL_HUGE:
1909 // Here if |ylogx| >= 2^14
1910 { .mfi
1911 mov GR_temp = 0x0A1DC // If X < 0, exponent -24100
1912 fcmp.gt.s1 p12, p13 = FR_X, f0 // Test X > 0
1913 cmp.eq p14, p15 = r0, r0 // Set Safe to false
1914 }
1915 ;;
1916
1917 { .mmi
1918 (p12) mov GR_Mask = 0x15DC0 // If X > 0, exponent +24000
1919 (p13) mov GR_Mask = 0x0A240 // If X < 0, exponent -24000
1920 nop.i 999
1921 }
1922 ;;
1923
1924 { .mmf
1925 setf.exp FR_TMP2 = GR_Mask // Form Y_hi = TMP2
1926 (p13) setf.exp FR_Y_lo = GR_temp // If X < 0, Y_lo = 2^-24100
1927 (p12) mov FR_Y_lo = f1 // IF X > 0, Y_lo = 1.0
1928 }
1929 ;;
1930
1931 { .mfi
1932 nop.m 999
1933 fmpy.s1 FR_TMP1 = FR_TMP2, FR_Sgn // TMP1 = Y_hi * Sgn
1934 nop.i 999
1935 }
1936 ;;
1937
1938 { .mfb
1939 nop.m 999
1940 fmpy.s1 FR_TMP3 = FR_Y_lo,FR_TMP1 // TMP3 = Y_lo * (Y_hi * Sgn)
1941 br.cond.sptk POWL_64_SHARED
1942 }
1943 ;;
1944
1945 POWL_Y_ALMOST_1:
1946 // Here if delta = |y-1| < 2^-50
1947 //
1948 // x**(1 + delta) = x * e (ln(x)*delta) = x ( 1 + ln(x) * delta)
1949 //
1950 // Computation will be safe for 2^-16381 <= x < 2^16383
1951
1952 { .mfi
1953 mov GR_exp_ynear1_oflow = 0xffff + 16383
1954 fma.s1 FR_TMP1 = FR_Input_X,FR_Delta,f0
1955 and GR_exp_x = GR_exp_mask, GR_signexp_x
1956 }
1957 ;;
1958
1959 { .mfi
1960 cmp.lt p15, p14 = GR_exp_x, GR_exp_ynear1_oflow
1961 fma.s1 FR_TMP2 = FR_logx_hi,f1,FR_X_lo
1962 mov GR_exp_ynear1_uflow = 0xffff - 16381
1963 }
1964 ;;
1965
1966 { .mfb
1967 (p15) cmp.ge p15, p14 = GR_exp_x, GR_exp_ynear1_uflow
1968 fma.s1 FR_TMP3 = FR_Input_X,f1,f0
1969 br.cond.sptk POWL_64_SHARED
1970 };;
1971
1972 POWL_64_SQUARE:
1973 //
1974 // Here if x not zero and y=2.
1975 //
1976 // Setup for multipath code
1977 //
1978 { .mfi
1979 mov GR_exp_square_oflow = 0xffff + 8192 // Exponent where x*x overflows
1980 fmerge.se FR_TMP1 = FR_Input_X, FR_Input_X
1981 and GR_exp_x = GR_exp_mask, GR_signexp_x // Get exponent of x
1982 }
1983 ;;
1984
1985 { .mfi
1986 cmp.lt p15, p14 = GR_exp_x, GR_exp_square_oflow // Decide safe/unsafe
1987 fmerge.se FR_TMP2 = FR_Input_X, FR_Input_X
1988 mov GR_exp_square_uflow = 0xffff - 8191 // Exponent where x*x underflows
1989 }
1990 ;;
1991
1992 { .mfi
1993 (p15) cmp.ge p15, p14 = GR_exp_x, GR_exp_square_uflow // Decide safe/unsafe
1994 fma.s1 FR_TMP3 = f0,f0,f0
1995 nop.i 999
1996 }
1997 ;;
1998
1999 //
2000 // This is the shared path that will set overflow and underflow.
2001 //
2002 POWL_64_SHARED:
2003
2004 //
2005 // Return if no danger of over or underflow.
2006 //
2007 { .mfb
2008 nop.m 999
2009 fma.s0 FR_Result = FR_TMP1, FR_TMP2, FR_TMP3
2010 (p15) br.ret.sptk b0 // Main path return if certain no over/underflow
2011 }
2012 ;;
2013
2014 //
2015 // S0 user supplied status
2016 // S2 user supplied status + WRE + TD (Overflows)
2017 // S2 user supplied status + FZ + TD (Underflows)
2018 //
2019 //
2020 // If (Safe) is true, then
2021 // Compute result using user supplied status field.
2022 // No overflow or underflow here, but perhaps inexact.
2023 // Return
2024 // Else
2025 // Determine if overflow or underflow was raised.
2026 // Fetch +/- overflow threshold for IEEE double extended
2027
2028 { .mfi
2029 nop.m 999
2030 fsetc.s2 0x7F,0x41 // For underflow test, set S2=User+TD+FTZ
2031 nop.i 999
2032 }
2033 ;;
2034
2035 { .mfi
2036 nop.m 999
2037 fma.s2 FR_Result_small = FR_TMP1, FR_TMP2, FR_TMP3
2038 nop.i 999
2039 }
2040 ;;
2041
2042 { .mfi
2043 nop.m 999
2044 fsetc.s2 0x7F,0x42 // For overflow test, set S2=User+TD+WRE
2045 nop.i 999
2046 }
2047 ;;
2048
2049 { .mfi
2050 nop.m 999
2051 fma.s2 FR_Result_big = FR_TMP1, FR_TMP2,FR_TMP3
2052 nop.i 999
2053 }
2054 ;;
2055
2056 { .mfi
2057 nop.m 999
2058 fsetc.s2 0x7F,0x40 // Reset S2=User
2059 nop.i 999
2060 }
2061 ;;
2062
2063 { .mfi
2064 nop.m 999
2065 fclass.m p11, p0 = FR_Result_small, 0x00F // Test small result unorm/zero
2066 nop.i 999
2067 }
2068 ;;
2069
2070 { .mfi
2071 nop.m 999
2072 fcmp.ge.s1 p8, p0 = FR_Result_big , FR_Big // Test >= + oflow threshold
2073 nop.i 999
2074 }
2075 ;;
2076
2077 { .mfb
2078 (p11) mov GR_Parameter_TAG = 19 // Set tag for underflow
2079 fcmp.le.s1 p9, p0 = FR_Result_big, FR_NBig // Test <= - oflow threshold
2080 (p11) br.cond.spnt __libm_error_region // Branch if pow underflowed
2081 }
2082 ;;
2083
2084 { .mfb
2085 (p8) mov GR_Parameter_TAG = 18 // Set tag for overflow
2086 nop.f 999
2087 (p8) br.cond.spnt __libm_error_region // Branch if pow +overflow
2088 }
2089 ;;
2090
2091 { .mbb
2092 (p9) mov GR_Parameter_TAG = 18 // Set tag for overflow
2093 (p9) br.cond.spnt __libm_error_region // Branch if pow -overflow
2094 br.ret.sptk b0 // Branch if result really ok
2095 }
2096 ;;
2097
2098
2099 POWL_64_SPECIAL:
2100 // Here if x or y is NatVal, nan, inf, or zero
2101 { .mfi
2102 nop.m 999
2103 fcmp.eq.s1 p15, p0 = FR_Input_X, f1 // Test x=+1
2104 nop.i 999
2105 }
2106 ;;
2107
2108 { .mfi
2109 nop.m 999
2110 fclass.m p8, p0 = FR_Input_X, 0x143 // Test x natval, snan
2111 nop.i 999
2112 }
2113 ;;
2114
2115 { .mfi
2116 nop.m 999
2117 (p15) fcmp.eq.unc.s0 p6,p0 = FR_Input_Y, f0 // If x=1, flag invalid if y=SNaN
2118 nop.i 999
2119 }
2120 { .mfb
2121 nop.m 999
2122 (p15) fmpy.s0 FR_Result = f1,f1 // If x=1, result=1
2123 (p15) br.ret.spnt b0 // Exit if x=1
2124 }
2125 ;;
2126
2127 { .mfi
2128 nop.m 999
2129 fclass.m p6, p0 = FR_Input_Y, 0x007 // Test y zero
2130 nop.i 999
2131 }
2132 ;;
2133
2134 { .mfi
2135 nop.m 999
2136 fclass.m p9, p0 = FR_Input_Y, 0x143 // Test y natval, snan
2137 nop.i 999
2138 }
2139 ;;
2140
2141 { .mfi
2142 nop.m 999
2143 fclass.m p10, p0 = FR_Input_X, 0x083 // Test x qnan
2144 nop.i 999
2145 }
2146 { .mfi
2147 nop.m 999
2148 (p8) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_X // If x=snan, result=qnan
2149 (p6) cmp.ne p8,p0 = r0,r0 // Don't exit if x=snan, y=0 ==> result=+1
2150 }
2151 ;;
2152
2153 { .mfi
2154 nop.m 999
2155 (p6) fclass.m.unc p15, p0 = FR_Input_X,0x007 // Test x=0, y=0
2156 nop.i 999
2157 }
2158 { .mfb
2159 nop.m 999
2160 (p9) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_X // If y=snan, result=qnan
2161 (p8) br.ret.spnt b0 // Exit if x=snan, y not 0,
2162 // result=qnan
2163 }
2164 ;;
2165
2166 { .mfi
2167 nop.m 999
2168 fcmp.eq.s1 p7, p0 = FR_Input_Y, f1 // Test y +1.0
2169 nop.i 999
2170 }
2171 { .mfb
2172 nop.m 999
2173 (p10) fmpy.s0 FR_Result = FR_Input_X, f0 // If x=qnan, result=qnan
2174 (p9) br.ret.spnt b0 // Exit if y=snan, result=qnan
2175 }
2176 ;;
2177
2178 { .mfi
2179 nop.m 999
2180 (p6) fclass.m.unc p8, p0 = FR_Input_X,0x0C3 // Test x=nan, y=0
2181 nop.i 999
2182 }
2183 ;;
2184
2185 { .mfi
2186 nop.m 999
2187 (p6) fcmp.eq.s0 p9,p0 = FR_Input_X, f0 // If y=0, flag if x denormal
2188 nop.i 999
2189 }
2190 { .mfi
2191 nop.m 999
2192 (p6) fadd.s0 FR_Result = f1, f0 // If y=0, result=1
2193 nop.i 999
2194 }
2195 ;;
2196
2197 { .mfi
2198 nop.m 999
2199 fclass.m p11, p0 = FR_Input_Y, 0x083 // Test y qnan
2200 nop.i 999
2201 }
2202 { .mfb
2203 (p15) mov GR_Parameter_TAG = 20 // Error tag for x=0, y=0
2204 (p7) fmpy.s0 FR_Result = FR_Input_X,f1 // If y=1, result=x
2205 (p15) br.cond.spnt __libm_error_region // Branch if x=0, y=0, result=1
2206 }
2207 ;;
2208
2209 { .mfb
2210 (p8) mov GR_Parameter_TAG = 23 // Error tag for x=nan, y=0
2211 fclass.m p14, p0 = FR_Input_Y, 0x023 // Test y inf
2212 (p8) br.cond.spnt __libm_error_region // Branch if x=snan, y=0,
2213 // result=1
2214 }
2215 ;;
2216
2217 { .mfb
2218 nop.m 999
2219 fclass.m p13, p0 = FR_Input_X, 0x023 // Test x inf
2220 (p6) br.ret.spnt b0 // Exit y=0, x not nan or 0,
2221 // result=1
2222 }
2223 ;;
2224
2225 { .mfb
2226 nop.m 999
2227 (p14) fcmp.eq.unc.s1 p0,p14 = FR_Input_X,f0 // Test x not 0, y=inf
2228 (p7) br.ret.spnt b0 // Exit y=1, x not snan,
2229 // result=x
2230 }
2231 ;;
2232
2233 { .mfb
2234 nop.m 999
2235 (p10) fmpy.s0 FR_Result = FR_Input_Y,FR_Input_X // If x=qnan, y not snan,
2236 // result=qnan
2237 (p10) br.ret.spnt b0 // Exit x=qnan, y not snan,
2238 // result=qnan
2239 }
2240 ;;
2241
2242 { .mfb
2243 nop.m 999
2244 (p11) fmpy.s0 FR_Result = FR_Input_Y,FR_Input_X // If y=qnan, x not nan or 1,
2245 // result=qnan
2246 (p11) br.ret.spnt b0 // Exit y=qnan, x not nan or 1,
2247 // result=qnan
2248 }
2249 ;;
2250
2251 { .mbb
2252 nop.m 999
2253 (p14) br.cond.spnt POWL_64_Y_IS_INF // Branch if y=inf, x not 1 or nan
2254 (p13) br.cond.spnt POWL_64_X_IS_INF // Branch if x=inf, y not 1 or nan
2255 }
2256 ;;
2257
2258
2259 POWL_64_X_IS_ZERO:
2260 // Here if x=0, y not nan or 1 or inf or 0
2261
2262 // There is logic starting here to determine if y is an integer when x = 0.
2263 // If 0 < |y| < 1 then clearly y is not an integer.
2264 // If |y| > 1, then the significand of y is shifted left by the size of
2265 // the exponent of y. This preserves the lsb of the integer part + the
2266 // fractional bits. The lsb of the integer can be tested to determine if
2267 // the integer is even or odd. The fractional bits can be tested. If zero,
2268 // then y is an integer.
2269 //
2270 { .mfi
2271 and GR_exp_y = GR_exp_mask,GR_signexp_y // Get biased exponent of y
2272 nop.f 999
2273 and GR_y_sign = GR_sign_mask,GR_signexp_y // Get sign of y
2274 }
2275 ;;
2276
2277 //
2278 // Maybe y is < 1 already, so
2279 // can never be an integer.
2280 //
2281 { .mfi
2282 cmp.lt p9, p8 = GR_exp_y,GR_exp_bias // Test 0 < |y| < 1
2283 nop.f 999
2284 sub GR_exp_y = GR_exp_y,GR_exp_bias // Get true exponent of y
2285 }
2286 ;;
2287
2288 //
2289 // Shift significand of y looking for nonzero bits
2290 // For y > 1, shift signif_y exp_y bits to the left
2291 // For y < 1, turn on 4 low order bits of significand of y
2292 // so that the fraction will always be non-zero
2293 //
2294 { .mmi
2295 (p9) or GR_exp_y= 0xF,GR_signif_y // Force nonzero fraction if y<1
2296 ;;
2297 nop.m 999
2298 (p8) shl GR_exp_y= GR_signif_y,GR_exp_y // Get lsb of int + fraction
2299 // Wait 4 cycles to use result
2300 }
2301 ;;
2302
2303 { .mmi
2304 nop.m 999
2305 ;;
2306 nop.m 999
2307 nop.i 999
2308 }
2309 ;;
2310
2311 { .mmi
2312 nop.m 999
2313 ;;
2314 nop.m 999
2315 shl GR_fraction_y= GR_exp_y,1 // Shift left 1 to get fraction
2316 }
2317 ;;
2318
2319 //
2320 // Integer part of y shifted off.
2321 // Get y's low even or odd bit - y might not be an int.
2322 //
2323 { .mii
2324 cmp.eq p13,p0 = GR_fraction_y, r0 // Test for y integer
2325 cmp.eq p8,p0 = GR_y_sign, r0 // Test for y > 0
2326 ;;
2327 (p13) tbit.nz.unc p13,p0 = GR_exp_y, 63 // Test if y an odd integer
2328 }
2329 ;;
2330
2331 { .mfi
2332 (p13) cmp.eq.unc p13,p14 = GR_y_sign, r0 // Test y pos odd integer
2333 (p8) fcmp.eq.s0 p12,p0 = FR_Input_Y, f0 // If x=0 and y>0 flag if y denormal
2334 nop.i 999
2335 }
2336 ;;
2337
2338 //
2339 // Return +/-0 when x=+/-0 and y is positive odd integer
2340 //
2341 { .mfb
2342 nop.m 999
2343 (p13) mov FR_Result = FR_Input_X // If x=0, y pos odd int, result=x
2344 (p13) br.ret.spnt b0 // Exit x=0, y pos odd int, result=x
2345 }
2346 ;;
2347
2348 //
2349 // Return +/-inf when x=+/-0 and y is negative odd int
2350 //
2351 { .mfb
2352 (p14) mov GR_Parameter_TAG = 21
2353 (p14) frcpa.s0 FR_Result, p0 = f1, FR_Input_X // Result +-inf, set Z flag
2354 (p14) br.cond.spnt __libm_error_region
2355 }
2356 ;;
2357
2358 //
2359 // Return +0 when x=+/-0 and y positive and not an odd integer
2360 //
2361 { .mfb
2362 nop.m 999
2363 (p8) mov FR_Result = f0 // If x=0, y>0 and not odd integer, result=+0
2364 (p8) br.ret.sptk b0 // Exit x=0, y>0 and not odd integer, result=+0
2365 }
2366 ;;
2367
2368 //
2369 // Return +inf when x=+/-0 and y is negative and not odd int
2370 //
2371 { .mfb
2372 mov GR_Parameter_TAG = 21
2373 frcpa.s0 FR_Result, p10 = f1,f0 // Result +inf, raise Z flag
2374 br.cond.sptk __libm_error_region
2375 }
2376 ;;
2377
2378
2379 POWL_64_X_IS_INF:
2380 //
2381 // Here if x=inf, y not 1 or nan
2382 //
2383 { .mfi
2384 and GR_exp_y = GR_exp_mask,GR_signexp_y // Get biased exponent y
2385 fclass.m p13, p0 = FR_Input_X,0x022 // Test x=-inf
2386 nop.i 999
2387 }
2388 ;;
2389
2390 { .mfi
2391 and GR_y_sign = GR_sign_mask,GR_signexp_y // Get sign of y
2392 fcmp.eq.s0 p9,p0 = FR_Input_Y, f0 // Dummy to set flag if y denorm
2393 nop.i 999
2394 }
2395 ;;
2396
2397 //
2398 // Maybe y is < 1 already, so
2399 // isn't an int.
2400 //
2401 { .mfi
2402 (p13) cmp.lt.unc p9, p8 = GR_exp_y,GR_exp_bias // Test 0 < |y| < 1 if x=-inf
2403 fclass.m p11, p0 = FR_Input_X,0x021 // Test x=+inf
2404 sub GR_exp_y = GR_exp_y,GR_exp_bias // Get true exponent y
2405 }
2406 ;;
2407
2408 //
2409 // Shift significand of y looking for nonzero bits
2410 // For y > 1, shift signif_y exp_y bits to the left
2411 // For y < 1, turn on 4 low order bits of significand of y
2412 // so that the fraction will always be non-zero
2413 //
2414 { .mmi
2415 (p9) or GR_exp_y= 0xF,GR_signif_y // Force nonzero fraction if y<1
2416 ;;
2417 (p11) cmp.eq.unc p14,p12 = GR_y_sign, r0 // Test x=+inf, y>0
2418 (p8) shl GR_exp_y= GR_signif_y,GR_exp_y // Get lsb of int + fraction
2419 // Wait 4 cycles to use result
2420 }
2421 ;;
2422
2423 //
2424 // Return +inf for x=+inf, y > 0
2425 // Return +0 for x=+inf, y < 0
2426 //
2427 { .mfi
2428 nop.m 999
2429 (p12) mov FR_Result = f0 // If x=+inf, y<0, result=+0
2430 nop.i 999
2431 }
2432 { .mfb
2433 nop.m 999
2434 (p14) fma.s0 FR_Result = FR_Input_X,f1,f0 // If x=+inf, y>0, result=+inf
2435 (p11) br.ret.sptk b0 // Exit x=+inf
2436 }
2437 ;;
2438
2439 //
2440 // Here only if x=-inf. Wait until can use result of shl...
2441 //
2442 { .mmi
2443 nop.m 999
2444 ;;
2445 nop.m 999
2446 nop.i 999
2447 }
2448 ;;
2449
2450 { .mfi
2451 cmp.eq p8,p9 = GR_y_sign, r0 // Test y pos
2452 nop.f 999
2453 shl GR_fraction_y = GR_exp_y,1 // Shift left 1 to get fraction
2454 }
2455 ;;
2456
2457 { .mmi
2458 cmp.eq p13,p0 = GR_fraction_y, r0 // Test y integer
2459 ;;
2460 nop.m 999
2461 (p13) tbit.nz.unc p13,p0 = GR_exp_y, 63 // Test y odd integer
2462 }
2463 ;;
2464
2465 //
2466 // Is y even or odd?
2467 //
2468 { .mii
2469 (p13) cmp.eq.unc p14,p10 = GR_y_sign, r0 // Test x=-inf, y pos odd int
2470 (p13) cmp.ne.and p8,p9 = r0,r0 // If y odd int, turn off p8,p9
2471 nop.i 999
2472 }
2473 ;;
2474
2475 //
2476 // Return -0 for x = -inf and y < 0 and odd int.
2477 // Return -Inf for x = -inf and y > 0 and odd int.
2478 //
2479 { .mfi
2480 nop.m 999
2481 (p10) fmerge.ns FR_Result = f0, f0 // If x=-inf, y neg odd int, result=-0
2482 nop.i 999
2483 }
2484 { .mfi
2485 nop.m 999
2486 (p14) fmpy.s0 FR_Result = FR_Input_X,f1 // If x=-inf, y pos odd int, result=-inf
2487 nop.i 999
2488 }
2489 ;;
2490
2491 //
2492 // Return Inf for x = -inf and y > 0 not an odd int.
2493 // Return +0 for x = -inf and y < 0 not an odd int.
2494 //
2495 .pred.rel "mutex",p8,p9
2496 { .mfi
2497 nop.m 999
2498 (p8) fmerge.ns FR_Result = FR_Input_X, FR_Input_X // If x=-inf, y>0 not odd int
2499 // result=+inf
2500 nop.i 999
2501 }
2502 { .mfb
2503 nop.m 999
2504 (p9) fmpy.s0 FR_Result = f0,f0 // If x=-inf, y<0 not odd int
2505 // result=+0
2506 br.ret.sptk b0 // Exit for x=-inf
2507 }
2508 ;;
2509
2510
2511 POWL_64_Y_IS_INF:
2512 // Here if y=inf, x not 1 or nan
2513 //
2514 // For y = +Inf and |x| < 1 returns 0
2515 // For y = +Inf and |x| > 1 returns Inf
2516 // For y = -Inf and |x| < 1 returns Inf
2517 // For y = -Inf and |x| > 1 returns 0
2518 // For y = Inf and |x| = 1 returns 1
2519 //
2520 { .mfi
2521 nop.m 999
2522 fclass.m p8, p0 = FR_Input_Y, 0x021 // Test y=+inf
2523 nop.i 999
2524 }
2525 ;;
2526
2527 { .mfi
2528 nop.m 999
2529 fclass.m p9, p0 = FR_Input_Y, 0x022 // Test y=-inf
2530 nop.i 999
2531 }
2532 ;;
2533
2534 { .mfi
2535 nop.m 999
2536 fabs FR_X = FR_Input_X // Form |x|
2537 nop.i 999
2538 }
2539 ;;
2540
2541 { .mfi
2542 nop.m 999
2543 fcmp.eq.s0 p10,p0 = FR_Input_X, f0 // flag if x denormal
2544 nop.i 999
2545 }
2546 ;;
2547
2548 { .mfi
2549 nop.m 999
2550 (p8) fcmp.lt.unc.s1 p6, p0 = FR_X, f1 // Test y=+inf, |x|<1
2551 nop.i 999
2552 }
2553 ;;
2554
2555 { .mfi
2556 nop.m 999
2557 (p8) fcmp.gt.unc.s1 p7, p0 = FR_X, f1 // Test y=+inf, |x|>1
2558 nop.i 999
2559 }
2560 ;;
2561
2562 { .mfi
2563 nop.m 999
2564 (p9) fcmp.lt.unc.s1 p12, p0 = FR_X, f1 // Test y=-inf, |x|<1
2565 nop.i 999
2566 }
2567 { .mfi
2568 nop.m 999
2569 (p6) fmpy.s0 FR_Result = f0,f0 // If y=+inf, |x|<1, result=+0
2570 nop.i 999
2571 }
2572 ;;
2573
2574 { .mfi
2575 nop.m 999
2576 (p9) fcmp.gt.unc.s1 p13, p0 = FR_X, f1 // Test y=-inf, |x|>1
2577 nop.i 999
2578 }
2579 { .mfi
2580 nop.m 999
2581 (p7) fmpy.s0 FR_Result = FR_Input_Y, f1 // If y=+inf, |x|>1, result=+inf
2582 nop.i 999
2583 }
2584 ;;
2585
2586 { .mfi
2587 nop.m 999
2588 fcmp.eq.s1 p14, p0 = FR_X, f1 // Test y=inf, |x|=1
2589 nop.i 999
2590 }
2591 { .mfi
2592 nop.m 999
2593 (p12) fnma.s0 FR_Result = FR_Input_Y, f1, f0 // If y=-inf, |x|<1, result=+inf
2594 nop.i 999
2595 }
2596 ;;
2597
2598 { .mfi
2599 nop.m 999
2600 (p13) mov FR_Result = f0 // If y=-inf, |x|>1, result=+0
2601 nop.i 999
2602 }
2603 ;;
2604
2605 { .mfb
2606 nop.m 999
2607 (p14) fmpy.s0 FR_Result = f1,f1 // If y=inf, |x|=1, result=+1
2608 br.ret.sptk b0 // Common return for y=inf
2609 }
2610 ;;
2611
2612
2613 // Here if x or y denorm/unorm
2614 POWL_DENORM:
2615 { .mmi
2616 getf.sig GR_signif_Z = FR_norm_X // Get significand of x
2617 ;;
2618 getf.exp GR_signexp_y = FR_norm_Y // Get sign and exp of y
2619 nop.i 999
2620 }
2621 ;;
2622
2623 { .mfi
2624 getf.sig GR_signif_y = FR_norm_Y // Get significand of y
2625 nop.f 999
2626 nop.i 999
2627 }
2628 ;;
2629
2630 { .mib
2631 getf.exp GR_signexp_x = FR_norm_X // Get sign and exp of x
2632 extr.u GR_Index1 = GR_signif_Z, 59, 4 // Extract upper 4 signif bits of x
2633 br.cond.sptk POWL_COMMON // Branch back to main path
2634 }
2635 ;;
2636
2637
2638 POWL_64_UNSUPPORT:
2639 //
2640 // Raise exceptions for specific
2641 // values - pseudo NaN and
2642 // infinities.
2643 // Return NaN and raise invalid
2644 //
2645 { .mfb
2646 nop.m 999
2647 fmpy.s0 FR_Result = FR_Input_X,f0
2648 br.ret.sptk b0
2649 }
2650 ;;
2651
2652 POWL_64_XNEG:
2653 //
2654 // Raise invalid for x < 0 and
2655 // y not an integer
2656 //
2657 { .mfi
2658 nop.m 999
2659 frcpa.s0 FR_Result, p8 = f0, f0
2660 mov GR_Parameter_TAG = 22
2661 }
2662 { .mib
2663 nop.m 999
2664 nop.i 999
2665 br.cond.sptk __libm_error_region
2666 }
2667 ;;
2668
2669 POWL_64_SQRT:
2670 { .mfi
2671 nop.m 999
2672 frsqrta.s0 FR_Result,p10 = FR_save_Input_X
2673 nop.i 999 ;;
2674 }
2675 { .mfi
2676 nop.m 999
2677 (p10) fma.s1 f62=FR_Half,FR_save_Input_X,f0
2678 nop.i 999 ;;
2679 }
2680 { .mfi
2681 nop.m 999
2682 (p10) fma.s1 f63=FR_Result,FR_Result,f0
2683 nop.i 999 ;;
2684 }
2685 { .mfi
2686 nop.m 999
2687 (p10) fnma.s1 f32=f63,f62,FR_Half
2688 nop.i 999 ;;
2689 }
2690 { .mfi
2691 nop.m 999
2692 (p10) fma.s1 f33=f32,FR_Result,FR_Result
2693 nop.i 999 ;;
2694 }
2695 { .mfi
2696 nop.m 999
2697 (p10) fma.s1 f34=f33,f62,f0
2698 nop.i 999 ;;
2699 }
2700 { .mfi
2701 nop.m 999
2702 (p10) fnma.s1 f35=f34,f33,FR_Half
2703 nop.i 999 ;;
2704 }
2705 { .mfi
2706 nop.m 999
2707 (p10) fma.s1 f63=f35,f33,f33
2708 nop.i 999 ;;
2709 }
2710 { .mfi
2711 nop.m 999
2712 (p10) fma.s1 f32=FR_save_Input_X,f63,f0
2713 nop.i 999
2714 }
2715 { .mfi
2716 nop.m 999
2717 (p10) fma.s1 FR_Result=f63,f62,f0
2718 nop.i 999 ;;
2719 }
2720 { .mfi
2721 nop.m 999
2722 (p10) fma.s1 f33=f11,f63,f0
2723 nop.i 999 ;;
2724 }
2725 { .mfi
2726 nop.m 999
2727 (p10) fnma.s1 f34=f32,f32,FR_save_Input_X
2728 nop.i 999
2729 }
2730 { .mfi
2731 nop.m 999
2732 (p10) fnma.s1 f35=FR_Result,f63,FR_Half
2733 nop.i 999 ;;
2734 }
2735 { .mfi
2736 nop.m 999
2737 (p10) fma.s1 f62=f33,f34,f32
2738 nop.i 999
2739 }
2740 { .mfi
2741 nop.m 999
2742 (p10) fma.s1 f63=f33,f35,f33
2743 nop.i 999 ;;
2744 }
2745 { .mfi
2746 nop.m 999
2747 (p10) fnma.s1 f32=f62,f62,FR_save_Input_X
2748 nop.i 999 ;;
2749 }
2750 { .mfb
2751 nop.m 999
2752 (p10) fma.s0 FR_Result=f32,f63,f62
2753 br.ret.sptk b0 // Exit for x > 0, y = 0.5
2754 }
2755 ;;
2756
2757 GLOBAL_LIBM_END(powl)
2758
2759
2760 LOCAL_LIBM_ENTRY(__libm_error_region)
2761 .prologue
2762 { .mfi
2763 add GR_Parameter_Y=-32,sp // Parameter 2 value
2764 nop.f 0
2765 .save ar.pfs,GR_SAVE_PFS
2766 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
2767 }
2768 { .mfi
2769 .fframe 64
2770 add sp=-64,sp // Create new stack
2771 nop.f 0
2772 mov GR_SAVE_GP=gp // Save gp
2773 };;
2774 { .mmi
2775 stfe [GR_Parameter_Y] = FR_Input_Y,16 // Save Parameter 2 on stack
2776 add GR_Parameter_X = 16,sp // Parameter 1 address
2777 .save b0, GR_SAVE_B0
2778 mov GR_SAVE_B0=b0 // Save b0
2779 };;
2780 .body
2781 { .mib
2782 stfe [GR_Parameter_X] = FR_save_Input_X // Store Parameter 1 on stack
2783 add GR_Parameter_RESULT = 0,GR_Parameter_Y
2784 nop.b 0 // Parameter 3 address
2785 }
2786 { .mib
2787 stfe [GR_Parameter_Y] = FR_Result // Store Parameter 3 on stack
2788 add GR_Parameter_Y = -16,GR_Parameter_Y
2789 br.call.sptk b0=__libm_error_support# // Call error handling function
2790 };;
2791 { .mmi
2792 add GR_Parameter_RESULT = 48,sp
2793 nop.m 0
2794 nop.i 0
2795 };;
2796 { .mmi
2797 ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
2798 .restore sp
2799 add sp = 64,sp // Restore stack pointer
2800 mov b0 = GR_SAVE_B0 // Restore return address
2801 };;
2802 { .mib
2803 mov gp = GR_SAVE_GP // Restore gp
2804 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
2805 br.ret.sptk b0 // Return
2806 };;
2807
2808 LOCAL_LIBM_END(__libm_error_region#)
2809 .type __libm_error_support#,@function
2810 .global __libm_error_support#