]>
Commit | Line | Data |
---|---|---|
d5efd131 MF |
1 | .file "tgammaf.s" |
2 | ||
3 | ||
4 | // Copyright (c) 2001 - 2005, Intel Corporation | |
5 | // All rights reserved. | |
6 | // | |
7 | // Contributed 2001 by the Intel Numerics Group, Intel Corporation | |
8 | // | |
9 | // Redistribution and use in source and binary forms, with or without | |
10 | // modification, are permitted provided that the following conditions are | |
11 | // met: | |
12 | // | |
13 | // * Redistributions of source code must retain the above copyright | |
14 | // notice, this list of conditions and the following disclaimer. | |
15 | // | |
16 | // * Redistributions in binary form must reproduce the above copyright | |
17 | // notice, this list of conditions and the following disclaimer in the | |
18 | // documentation and/or other materials provided with the distribution. | |
19 | // | |
20 | // * The name of Intel Corporation may not be used to endorse or promote | |
21 | // products derived from this software without specific prior written | |
22 | // permission. | |
23 | ||
0347518d MF |
24 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
25 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT | |
d5efd131 | 26 | // LIMITED TO,THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
0347518d | 27 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS |
d5efd131 MF |
28 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL, |
29 | // EXEMPLARY,OR CONSEQUENTIAL DAMAGES (INCLUDING,BUT NOT LIMITED TO, | |
0347518d MF |
30 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,DATA,OR |
31 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
d5efd131 | 32 | // OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY OR TORT (INCLUDING |
0347518d MF |
33 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
34 | // SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
35 | // | |
d5efd131 | 36 | // Intel Corporation is the author of this code,and requests that all |
0347518d | 37 | // problem reports or change requests be submitted to it directly at |
d5efd131 MF |
38 | // http://www.intel.com/software/products/opensource/libraries/num.htm. |
39 | // | |
40 | //********************************************************************* | |
41 | // | |
0347518d | 42 | // History: |
d5efd131 MF |
43 | // 11/30/01 Initial version |
44 | // 05/20/02 Cleaned up namespace and sf0 syntax | |
45 | // 02/10/03 Reordered header: .section, .global, .proc, .align | |
46 | // 04/04/03 Changed error codes for overflow and negative integers | |
47 | // 04/10/03 Changed code for overflow near zero handling | |
48 | // 12/16/03 Fixed parameter passing to/from error handling routine | |
49 | // 03/31/05 Reformatted delimiters between data tables | |
50 | // | |
51 | //********************************************************************* | |
52 | // | |
53 | //********************************************************************* | |
54 | // | |
55 | // Function: tgammaf(x) computes the principle value of the GAMMA | |
56 | // function of x. | |
57 | // | |
58 | //********************************************************************* | |
59 | // | |
60 | // Resources Used: | |
61 | // | |
62 | // Floating-Point Registers: f8-f15 | |
63 | // f33-f75 | |
64 | // | |
65 | // General Purpose Registers: | |
66 | // r8-r11 | |
67 | // r14-r29 | |
68 | // r32-r36 | |
69 | // r37-r40 (Used to pass arguments to error handling routine) | |
70 | // | |
71 | // Predicate Registers: p6-p15 | |
72 | // | |
73 | //********************************************************************* | |
74 | // | |
75 | // IEEE Special Conditions: | |
76 | // | |
77 | // tgammaf(+inf) = +inf | |
0347518d MF |
78 | // tgammaf(-inf) = QNaN |
79 | // tgammaf(+/-0) = +/-inf | |
d5efd131 MF |
80 | // tgammaf(x<0, x - integer) = QNaN |
81 | // tgammaf(SNaN) = QNaN | |
82 | // tgammaf(QNaN) = QNaN | |
83 | // | |
84 | //********************************************************************* | |
85 | // | |
86 | // Overview | |
87 | // | |
88 | // The method consists of three cases. | |
0347518d | 89 | // |
d5efd131 MF |
90 | // If 2 <= x < OVERFLOW_BOUNDARY use case tgamma_regular; |
91 | // else if 0 < x < 2 use case tgamma_from_0_to_2; | |
92 | // else if -(i+1) < x < -i, i = 0...43 use case tgamma_negatives; | |
93 | // | |
94 | // Case 2 <= x < OVERFLOW_BOUNDARY | |
95 | // ------------------------------- | |
96 | // Here we use algorithm based on the recursive formula | |
97 | // GAMMA(x+1) = x*GAMMA(x). For that we subdivide interval | |
98 | // [2; OVERFLOW_BOUNDARY] into intervals [8*n; 8*(n+1)] and | |
99 | // approximate GAMMA(x) by polynomial of 22th degree on each | |
100 | // [8*n; 8*n+1], recursive formula is used to expand GAMMA(x) | |
101 | // to [8*n; 8*n+1]. In other words we need to find n, i and r | |
102 | // such that x = 8 * n + i + r where n and i are integer numbers | |
103 | // and r is fractional part of x. So GAMMA(x) = GAMMA(8*n+i+r) = | |
104 | // = (x-1)*(x-2)*...*(x-i)*GAMMA(x-i) = | |
105 | // = (x-1)*(x-2)*...*(x-i)*GAMMA(8*n+r) ~ | |
106 | // ~ (x-1)*(x-2)*...*(x-i)*P12n(r). | |
107 | // | |
108 | // Step 1: Reduction | |
109 | // ----------------- | |
110 | // N = [x] with truncate | |
111 | // r = x - N, note 0 <= r < 1 | |
112 | // | |
113 | // n = N & ~0xF - index of table that contains coefficient of | |
0347518d | 114 | // polynomial approximation |
d5efd131 | 115 | // i = N & 0xF - is used in recursive formula |
0347518d | 116 | // |
d5efd131 MF |
117 | // |
118 | // Step 2: Approximation | |
119 | // --------------------- | |
120 | // We use factorized minimax approximation polynomials | |
121 | // P12n(r) = A12*(r^2+C01(n)*r+C00(n))* | |
122 | // *(r^2+C11(n)*r+C10(n))*...*(r^2+C51(n)*r+C50(n)) | |
123 | // | |
124 | // Step 3: Recursion | |
125 | // ----------------- | |
126 | // In case when i > 0 we need to multiply P12n(r) by product | |
127 | // R(i,x)=(x-1)*(x-2)*...*(x-i). To reduce number of fp-instructions | |
0347518d | 128 | // we can calculate R as follow: |
d5efd131 MF |
129 | // R(i,x) = ((x-1)*(x-2))*((x-3)*(x-4))*...*((x-(i-1))*(x-i)) if i is |
130 | // even or R = ((x-1)*(x-2))*((x-3)*(x-4))*...*((x-(i-2))*(x-(i-1)))* | |
131 | // *(i-1) if i is odd. In both cases we need to calculate | |
132 | // R2(i,x) = (x^2-3*x+2)*(x^2-7*x+12)*...*(x^2+x+2*j*(2*j-1)) = | |
133 | // = ((x^2-x)+2*(1-x))*((x^2-x)+6*(2-x))*...*((x^2-x)+2*(2*j-1)*(j-x)) = | |
134 | // = (RA+2*RB)*(RA+6*(1-RB))*...*(RA+2*(2*j-1)*(j-1+RB)) | |
135 | // where j = 1..[i/2], RA = x^2-x, RB = 1-x. | |
136 | // | |
137 | // Step 4: Reconstruction | |
138 | // ---------------------- | |
139 | // Reconstruction is just simple multiplication i.e. | |
140 | // GAMMA(x) = P12n(r)*R(i,x) | |
141 | // | |
142 | // Case 0 < x < 2 | |
143 | // -------------- | |
144 | // To calculate GAMMA(x) on this interval we do following | |
145 | // if 1.0 <= x < 1.25 than GAMMA(x) = P7(x-1) | |
146 | // if 1.25 <= x < 1.5 than GAMMA(x) = P7(x-x_min) where | |
147 | // x_min is point of local minimum on [1; 2] interval. | |
148 | // if 1.5 <= x < 1.75 than GAMMA(x) = P7(x-1.5) | |
149 | // if 1.75 <= x < 2.0 than GAMMA(x) = P7(x-1.5) | |
0347518d | 150 | // and |
d5efd131 MF |
151 | // if 0 < x < 1 than GAMMA(x) = GAMMA(x+1)/x |
152 | // | |
153 | // Case -(i+1) < x < -i, i = 0...43 | |
154 | // ---------------------------------- | |
155 | // Here we use the fact that GAMMA(-x) = PI/(x*GAMMA(x)*sin(PI*x)) and | |
156 | // so we need to calculate GAMMA(x), sin(PI*x)/PI. Calculation of | |
157 | // GAMMA(x) is described above. | |
158 | // | |
159 | // Step 1: Reduction | |
160 | // ----------------- | |
0347518d MF |
161 | // Note that period of sin(PI*x) is 2 and range reduction for |
162 | // sin(PI*x) is like to range reduction for GAMMA(x) | |
d5efd131 MF |
163 | // i.e rs = x - round(x) and |rs| <= 0.5. |
164 | // | |
165 | // Step 2: Approximation | |
166 | // --------------------- | |
0347518d | 167 | // To approximate sin(PI*x)/PI = sin(PI*(2*n+rs))/PI = |
d5efd131 MF |
168 | // = (-1)^n*sin(PI*rs)/PI Taylor series is used. |
169 | // sin(PI*rs)/PI ~ S17(rs). | |
170 | // | |
171 | // Step 3: Division | |
172 | // ---------------- | |
173 | // To calculate 1/x and 1/(GAMMA(x)*S12(rs)) we use frcpa | |
174 | // instruction with following Newton-Raphson interations. | |
0347518d | 175 | // |
d5efd131 MF |
176 | // |
177 | //********************************************************************* | |
178 | ||
179 | GR_ad_Data = r8 | |
180 | GR_TAG = r8 | |
181 | GR_SignExp = r9 | |
182 | GR_Sig = r10 | |
183 | GR_ArgNz = r10 | |
184 | GR_RqDeg = r11 | |
185 | ||
186 | GR_NanBound = r14 | |
187 | GR_ExpOf025 = r15 | |
188 | GR_ExpOf05 = r16 | |
189 | GR_ad_Co = r17 | |
190 | GR_ad_Ce = r18 | |
191 | GR_TblOffs = r19 | |
192 | GR_Arg = r20 | |
193 | GR_Exp2Ind = r21 | |
194 | GR_TblOffsMask = r21 | |
195 | GR_Offs = r22 | |
196 | GR_OvfNzBound = r23 | |
197 | GR_ZeroResBound = r24 | |
198 | GR_ad_SinO = r25 | |
199 | GR_ad_SinE = r26 | |
200 | GR_Correction = r27 | |
201 | GR_Tbl12Offs = r28 | |
202 | GR_NzBound = r28 | |
203 | GR_ExpOf1 = r29 | |
204 | GR_fpsr = r29 | |
205 | ||
206 | GR_SAVE_B0 = r33 | |
207 | GR_SAVE_PFS = r34 | |
208 | GR_SAVE_GP = r35 | |
209 | GR_SAVE_SP = r36 | |
210 | ||
211 | GR_Parameter_X = r37 | |
212 | GR_Parameter_Y = r38 | |
213 | GR_Parameter_RESULT = r39 | |
214 | GR_Parameter_TAG = r40 | |
215 | ||
216 | ||
217 | FR_X = f10 | |
218 | FR_Y = f1 | |
219 | FR_RESULT = f8 | |
220 | ||
0347518d | 221 | FR_iXt = f11 |
d5efd131 MF |
222 | FR_Xt = f12 |
223 | FR_r = f13 | |
224 | FR_r2 = f14 | |
225 | FR_r4 = f15 | |
226 | ||
227 | FR_C01 = f33 | |
228 | FR_A7 = f33 | |
229 | FR_C11 = f34 | |
230 | FR_A6 = f34 | |
231 | FR_C21 = f35 | |
232 | FR_A5 = f35 | |
233 | FR_C31 = f36 | |
234 | FR_A4 = f36 | |
235 | FR_C41 = f37 | |
236 | FR_A3 = f37 | |
237 | FR_C51 = f38 | |
238 | FR_A2 = f38 | |
239 | ||
240 | FR_C00 = f39 | |
241 | FR_A1 = f39 | |
242 | FR_C10 = f40 | |
243 | FR_A0 = f40 | |
244 | FR_C20 = f41 | |
245 | FR_C30 = f42 | |
246 | FR_C40 = f43 | |
247 | FR_C50 = f44 | |
248 | FR_An = f45 | |
249 | FR_OvfBound = f46 | |
250 | FR_InvAn = f47 | |
251 | ||
252 | FR_Multplr = f48 | |
253 | FR_NormX = f49 | |
254 | FR_X2mX = f50 | |
255 | FR_1mX = f51 | |
256 | FR_Rq0 = f51 | |
257 | FR_Rq1 = f52 | |
258 | FR_Rq2 = f53 | |
259 | FR_Rq3 = f54 | |
260 | ||
261 | FR_Rcp0 = f55 | |
262 | FR_Rcp1 = f56 | |
263 | FR_Rcp2 = f57 | |
264 | ||
265 | FR_InvNormX1 = f58 | |
266 | FR_InvNormX2 = f59 | |
267 | ||
268 | FR_rs = f60 | |
269 | FR_rs2 = f61 | |
270 | ||
271 | FR_LocalMin = f62 | |
272 | FR_10 = f63 | |
273 | ||
274 | FR_05 = f64 | |
275 | ||
276 | FR_S32 = f65 | |
277 | FR_S31 = f66 | |
278 | FR_S01 = f67 | |
279 | FR_S11 = f68 | |
280 | FR_S21 = f69 | |
281 | FR_S00 = f70 | |
282 | FR_S10 = f71 | |
283 | FR_S20 = f72 | |
284 | ||
285 | FR_GAMMA = f73 | |
286 | FR_2 = f74 | |
287 | FR_6 = f75 | |
288 | ||
289 | ||
290 | ||
291 | ||
292 | // Data tables | |
293 | //============================================================== | |
294 | RODATA | |
295 | .align 16 | |
296 | LOCAL_OBJECT_START(tgammaf_data) | |
297 | data8 0x3FDD8B618D5AF8FE // local minimum (0.461632144968362356785) | |
298 | data8 0x4024000000000000 // 10.0 | |
299 | data8 0x3E90FC992FF39E13 // S32 | |
300 | data8 0xBEC144B2760626E2 // S31 | |
301 | // | |
302 | //[2; 8) | |
303 | data8 0x4009EFD1BA0CB3B4 // C01 | |
304 | data8 0x3FFFB35378FF4822 // C11 | |
305 | data8 0xC01032270413B896 // C41 | |
306 | data8 0xC01F171A4C0D6827 // C51 | |
307 | data8 0x40148F8E197396AC // C20 | |
308 | data8 0x401C601959F1249C // C30 | |
309 | data8 0x3EE21AD881741977 // An | |
310 | data8 0x4041852200000000 // overflow boundary (35.04010009765625) | |
311 | data8 0x3FD9CE68F695B198 // C21 | |
312 | data8 0xBFF8C30AC900DA03 // C31 | |
313 | data8 0x400E17D2F0535C02 // C00 | |
314 | data8 0x4010689240F7FAC8 // C10 | |
315 | data8 0x402563147DDCCF8D // C40 | |
316 | data8 0x4033406D0480A21C // C50 | |
317 | // | |
318 | //[8; 16) | |
319 | data8 0x4006222BAE0B793B // C01 | |
320 | data8 0x4002452733473EDA // C11 | |
321 | data8 0xC0010EF3326FDDB3 // C41 | |
322 | data8 0xC01492B817F99C0F // C51 | |
323 | data8 0x40099C905A249B75 // C20 | |
324 | data8 0x4012B972AE0E533D // C30 | |
325 | data8 0x3FE6F6DB91D0D4CC // An | |
326 | data8 0x4041852200000000 // overflow boundary | |
327 | data8 0x3FF545828F7B73C5 // C21 | |
328 | data8 0xBFBBD210578764DF // C31 | |
329 | data8 0x4000542098F53CFC // C00 | |
330 | data8 0x40032C1309AD6C81 // C10 | |
331 | data8 0x401D7331E19BD2E1 // C40 | |
332 | data8 0x402A06807295EF57 // C50 | |
333 | // | |
334 | //[16; 24) | |
335 | data8 0x4000131002867596 // C01 | |
336 | data8 0x3FFAA362D5D1B6F2 // C11 | |
337 | data8 0xBFFCB6985697DB6D // C41 | |
338 | data8 0xC0115BEE3BFC3B3B // C51 | |
339 | data8 0x3FFE62FF83456F73 // C20 | |
340 | data8 0x4007E33478A114C4 // C30 | |
341 | data8 0x41E9B2B73795ED57 // An | |
342 | data8 0x4041852200000000 // overflow boundary | |
343 | data8 0x3FEEB1F345BC2769 // C21 | |
344 | data8 0xBFC3BBE6E7F3316F // C31 | |
345 | data8 0x3FF14E07DA5E9983 // C00 | |
346 | data8 0x3FF53B76BF81E2C0 // C10 | |
347 | data8 0x4014051E0269A3DC // C40 | |
348 | data8 0x40229D4227468EDB // C50 | |
349 | // | |
350 | //[24; 32) | |
351 | data8 0x3FFAF7BD498384DE // C01 | |
352 | data8 0x3FF62AD8B4D1C3D2 // C11 | |
353 | data8 0xBFFABCADCD004C32 // C41 | |
354 | data8 0xC00FADE97C097EC9 // C51 | |
355 | data8 0x3FF6DA9ED737707E // C20 | |
356 | data8 0x4002A29E9E0C782C // C30 | |
357 | data8 0x44329D5B5167C6C3 // An | |
358 | data8 0x4041852200000000 // overflow boundary | |
359 | data8 0x3FE8943CBBB4B727 // C21 | |
360 | data8 0xBFCB39D466E11756 // C31 | |
361 | data8 0x3FE879AF3243D8C1 // C00 | |
362 | data8 0x3FEEC7DEBB14CE1E // C10 | |
363 | data8 0x401017B79BA80BCB // C40 | |
364 | data8 0x401E941DC3C4DE80 // C50 | |
365 | // | |
366 | //[32; 40) | |
367 | data8 0x3FF7ECB3A0E8FE5C // C01 | |
368 | data8 0x3FF3815A8516316B // C11 | |
369 | data8 0xBFF9ABD8FCC000C3 // C41 | |
370 | data8 0xC00DD89969A4195B // C51 | |
371 | data8 0x3FF2E43139CBF563 // C20 | |
372 | data8 0x3FFF96DC3474A606 // C30 | |
373 | data8 0x46AFF4CA9B0DDDF0 // An | |
374 | data8 0x4041852200000000 // overflow boundary | |
375 | data8 0x3FE4CE76DA1B5783 // C21 | |
376 | data8 0xBFD0524DB460BC4E // C31 | |
377 | data8 0x3FE35852DF14E200 // C00 | |
378 | data8 0x3FE8C7610359F642 // C10 | |
379 | data8 0x400BCF750EC16173 // C40 | |
380 | data8 0x401AC14E02EA701C // C50 | |
381 | // | |
382 | //[40; 48) | |
383 | data8 0x3FF5DCE4D8193097 // C01 | |
384 | data8 0x3FF1B0D8C4974FFA // C11 | |
385 | data8 0xBFF8FB450194CAEA // C41 | |
386 | data8 0xC00C9658E030A6C4 // C51 | |
387 | data8 0x3FF068851118AB46 // C20 | |
388 | data8 0x3FFBF7C7BB46BF7D // C30 | |
389 | data8 0x3FF0000000000000 // An | |
390 | data8 0x4041852200000000 // overflow boundary | |
391 | data8 0x3FE231DEB11D847A // C21 | |
392 | data8 0xBFD251ECAFD7E935 // C31 | |
393 | data8 0x3FE0368AE288F6BF // C00 | |
394 | data8 0x3FE513AE4215A70C // C10 | |
395 | data8 0x4008F960F7141B8B // C40 | |
396 | data8 0x40183BA08134397B // C50 | |
397 | // | |
398 | //[1.0; 1.25) | |
399 | data8 0xBFD9909648921868 // A7 | |
400 | data8 0x3FE96FFEEEA8520F // A6 | |
401 | data8 0xBFED0800D93449B8 // A3 | |
402 | data8 0x3FEFA648D144911C // A2 | |
403 | data8 0xBFEE3720F7720B4D // A5 | |
404 | data8 0x3FEF4857A010CA3B // A4 | |
405 | data8 0xBFE2788CCD545AA4 // A1 | |
406 | data8 0x3FEFFFFFFFE9209E // A0 | |
407 | // | |
408 | //[1.25; 1.5) | |
409 | data8 0xBFB421236426936C // A7 | |
410 | data8 0x3FAF237514F36691 // A6 | |
411 | data8 0xBFC0BADE710A10B9 // A3 | |
412 | data8 0x3FDB6C5465BBEF1F // A2 | |
413 | data8 0xBFB7E7F83A546EBE // A5 | |
414 | data8 0x3FC496A01A545163 // A4 | |
415 | data8 0xBDEE86A39D8452EB // A1 | |
416 | data8 0x3FEC56DC82A39AA2 // A0 | |
417 | // | |
418 | //[1.5; 1.75) | |
419 | data8 0xBF94730B51795867 // A7 | |
420 | data8 0x3FBF4203E3816C7B // A6 | |
421 | data8 0xBFE85B427DBD23E4 // A3 | |
422 | data8 0x3FEE65557AB26771 // A2 | |
423 | data8 0xBFD59D31BE3AB42A // A5 | |
424 | data8 0x3FE3C90CC8F09147 // A4 | |
425 | data8 0xBFE245971DF735B8 // A1 | |
426 | data8 0x3FEFFC613AE7FBC8 // A0 | |
427 | // | |
428 | //[1.75; 2.0) | |
429 | data8 0xBF7746A85137617E // A7 | |
430 | data8 0x3FA96E37D09735F3 // A6 | |
431 | data8 0xBFE3C24AC40AC0BB // A3 | |
432 | data8 0x3FEC56A80A977CA5 // A2 | |
433 | data8 0xBFC6F0E707560916 // A5 | |
434 | data8 0x3FDB262D949175BE // A4 | |
435 | data8 0xBFE1C1AEDFB25495 // A1 | |
436 | data8 0x3FEFEE1E644B2022 // A0 | |
437 | // | |
438 | // sin(pi*x)/pi | |
439 | data8 0xC026FB0D377656CC // S01 | |
440 | data8 0x3FFFB15F95A22324 // S11 | |
441 | data8 0x406CE58F4A41C6E7 // S10 | |
442 | data8 0x404453786302C61E // S20 | |
443 | data8 0xC023D59A47DBFCD3 // S21 | |
444 | data8 0x405541D7ABECEFCA // S00 | |
445 | // | |
446 | // 1/An for [40; 48) | |
447 | data8 0xCAA7576DE621FCD5, 0x3F68 | |
448 | LOCAL_OBJECT_END(tgammaf_data) | |
449 | ||
450 | //============================================================== | |
451 | // Code | |
452 | //============================================================== | |
453 | ||
454 | .section .text | |
455 | GLOBAL_LIBM_ENTRY(tgammaf) | |
456 | { .mfi | |
457 | getf.exp GR_SignExp = f8 | |
458 | fma.s1 FR_NormX = f8,f1,f0 | |
459 | addl GR_ad_Data = @ltoff(tgammaf_data), gp | |
460 | } | |
461 | { .mfi | |
462 | mov GR_ExpOf05 = 0xFFFE | |
463 | fcvt.fx.trunc.s1 FR_iXt = f8 // [x] | |
464 | mov GR_Offs = 0 // 2 <= x < 8 | |
465 | };; | |
466 | { .mfi | |
467 | getf.d GR_Arg = f8 | |
468 | fcmp.lt.s1 p14,p15 = f8,f0 | |
469 | mov GR_Tbl12Offs = 0 | |
470 | } | |
471 | { .mfi | |
472 | setf.exp FR_05 = GR_ExpOf05 | |
473 | fma.s1 FR_2 = f1,f1,f1 // 2 | |
474 | mov GR_Correction = 0 | |
475 | };; | |
476 | { .mfi | |
477 | ld8 GR_ad_Data = [GR_ad_Data] | |
478 | fclass.m p10,p0 = f8,0x1E7 // is x NaTVal, NaN, +/-0 or +/-INF? | |
479 | tbit.z p12,p13 = GR_SignExp,16 // p13 if |x| >= 2 | |
480 | } | |
481 | { .mfi | |
482 | mov GR_ExpOf1 = 0xFFFF | |
483 | fcvt.fx.s1 FR_rs = f8 // round(x) | |
484 | and GR_Exp2Ind = 7,GR_SignExp | |
485 | };; | |
486 | .pred.rel "mutex",p14,p15 | |
487 | { .mfi | |
488 | (p15) cmp.eq.unc p11,p0 = GR_ExpOf1,GR_SignExp // p11 if 1 <= x < 2 | |
489 | (p14) fma.s1 FR_1mX = f1,f1,f8 // 1 - |x| | |
490 | mov GR_Sig = 0 // if |x| < 2 | |
491 | } | |
492 | { .mfi | |
493 | (p13) cmp.eq.unc p7,p0 = 2,GR_Exp2Ind | |
494 | (p15) fms.s1 FR_1mX = f1,f1,f8 // 1 - |x| | |
495 | (p13) cmp.eq.unc p8,p0 = 3,GR_Exp2Ind | |
496 | };; | |
497 | .pred.rel "mutex",p7,p8 | |
498 | { .mfi | |
499 | (p7) mov GR_Offs = 0x7 // 8 <= |x| < 16 | |
500 | nop.f 0 | |
501 | (p8) tbit.z.unc p0,p6 = GR_Arg,51 | |
502 | } | |
503 | { .mib | |
504 | (p13) cmp.lt.unc p9,p0 = 3,GR_Exp2Ind | |
505 | (p8) mov GR_Offs = 0xE // 16 <= |x| < 32 | |
506 | // jump if x is NaTVal, NaN, +/-0 or +/-INF? | |
507 | (p10) br.cond.spnt tgammaf_spec_args | |
508 | };; | |
509 | .pred.rel "mutex",p14,p15 | |
510 | .pred.rel "mutex",p6,p9 | |
511 | { .mfi | |
512 | (p9) mov GR_Offs = 0x1C // 32 <= |x| | |
513 | (p14) fma.s1 FR_X2mX = FR_NormX,FR_NormX,FR_NormX // x^2-|x| | |
514 | (p9) tbit.z.unc p0,p8 = GR_Arg,50 | |
515 | } | |
516 | { .mfi | |
517 | ldfpd FR_LocalMin,FR_10 = [GR_ad_Data],16 | |
518 | (p15) fms.s1 FR_X2mX = FR_NormX,FR_NormX,FR_NormX // x^2-|x| | |
519 | (p6) add GR_Offs = 0x7,GR_Offs // 24 <= x < 32 | |
520 | };; | |
521 | .pred.rel "mutex",p8,p12 | |
522 | { .mfi | |
523 | add GR_ad_Ce = 0x50,GR_ad_Data | |
524 | (p15) fcmp.lt.unc.s1 p10,p0 = f8,f1 // p10 if 0 <= x < 1 | |
525 | mov GR_OvfNzBound = 2 | |
526 | } | |
527 | { .mib | |
528 | ldfpd FR_S32,FR_S31 = [GR_ad_Data],16 | |
529 | (p8) add GR_Offs = 0x7,GR_Offs // 40 <= |x| | |
530 | // jump if 1 <= x < 2 | |
531 | (p11) br.cond.spnt tgammaf_from_1_to_2 | |
532 | };; | |
533 | { .mfi | |
534 | shladd GR_ad_Ce = GR_Offs,4,GR_ad_Ce | |
535 | fcvt.xf FR_Xt = FR_iXt // [x] | |
536 | (p13) cmp.eq.unc p7,p0 = r0,GR_Offs // p7 if 2 <= |x| < 8 | |
537 | } | |
538 | { .mfi | |
539 | shladd GR_ad_Co = GR_Offs,4,GR_ad_Data | |
540 | fma.s1 FR_6 = FR_2,FR_2,FR_2 | |
541 | mov GR_ExpOf05 = 0x7FC | |
542 | };; | |
543 | { .mfi | |
544 | (p13) getf.sig GR_Sig = FR_iXt // if |x| >= 2 | |
545 | frcpa.s1 FR_Rcp0,p0 = f1,FR_NormX | |
546 | (p10) shr GR_Arg = GR_Arg,51 | |
547 | } | |
548 | { .mib | |
549 | ldfpd FR_C01,FR_C11 = [GR_ad_Co],16 | |
550 | (p7) mov GR_Correction = 2 | |
551 | // jump if 0 < x < 1 | |
552 | (p10) br.cond.spnt tgammaf_from_0_to_1 | |
553 | };; | |
554 | { .mfi | |
555 | ldfpd FR_C21,FR_C31 = [GR_ad_Ce],16 | |
556 | fma.s1 FR_Rq2 = f1,f1,FR_1mX // 2 - |x| | |
557 | (p14) sub GR_Correction = r0,GR_Correction | |
558 | } | |
559 | { .mfi | |
560 | ldfpd FR_C41,FR_C51 = [GR_ad_Co],16 | |
561 | (p14) fcvt.xf FR_rs = FR_rs | |
562 | (p14) add GR_ad_SinO = 0x3A0,GR_ad_Data | |
563 | };; | |
564 | .pred.rel "mutex",p14,p15 | |
565 | { .mfi | |
566 | ldfpd FR_C00,FR_C10 = [GR_ad_Ce],16 | |
567 | nop.f 0 | |
568 | (p14) sub GR_Sig = GR_Correction,GR_Sig | |
569 | } | |
570 | { .mfi | |
571 | ldfpd FR_C20,FR_C30 = [GR_ad_Co],16 | |
572 | fma.s1 FR_Rq1 = FR_1mX,FR_2,FR_X2mX // (x-1)*(x-2) | |
573 | (p15) sub GR_Sig = GR_Sig,GR_Correction | |
574 | };; | |
575 | { .mfi | |
576 | (p14) ldfpd FR_S01,FR_S11 = [GR_ad_SinO],16 | |
577 | fma.s1 FR_Rq3 = FR_2,f1,FR_1mX // 3 - |x| | |
578 | and GR_RqDeg = 0x6,GR_Sig | |
579 | } | |
580 | { .mfi | |
581 | ldfpd FR_C40,FR_C50 = [GR_ad_Ce],16 | |
582 | (p14) fma.d.s0 FR_X = f0,f0,f8 // set deno flag | |
583 | mov GR_NanBound = 0x30016 // -2^23 | |
584 | };; | |
585 | .pred.rel "mutex",p14,p15 | |
586 | { .mfi | |
587 | (p14) add GR_ad_SinE = 0x3C0,GR_ad_Data | |
588 | (p15) fms.s1 FR_r = FR_NormX,f1,FR_Xt // r = x - [x] | |
589 | cmp.eq p8,p0 = 2,GR_RqDeg | |
590 | } | |
591 | { .mfi | |
592 | ldfpd FR_An,FR_OvfBound = [GR_ad_Co] | |
593 | (p14) fms.s1 FR_r = FR_Xt,f1,FR_NormX // r = |x - [x]| | |
594 | cmp.eq p9,p0 = 4,GR_RqDeg | |
595 | };; | |
596 | .pred.rel "mutex",p8,p9 | |
597 | { .mfi | |
598 | (p14) ldfpd FR_S21,FR_S00 = [GR_ad_SinE],16 | |
599 | (p8) fma.s1 FR_Rq0 = FR_2,f1,FR_1mX // (3-x) | |
600 | tbit.z p0,p6 = GR_Sig,0 | |
601 | } | |
602 | { .mfi | |
603 | (p14) ldfpd FR_S10,FR_S20 = [GR_ad_SinO],16 | |
604 | (p9) fma.s1 FR_Rq0 = FR_2,FR_2,FR_1mX // (5-x) | |
605 | cmp.eq p10,p0 = 6,GR_RqDeg | |
606 | };; | |
607 | { .mfi | |
608 | (p14) getf.s GR_Arg = f8 | |
609 | (p14) fcmp.eq.unc.s1 p13,p0 = FR_NormX,FR_Xt | |
610 | (p14) mov GR_ZeroResBound = 0xC22C // -43 | |
611 | } | |
612 | { .mfi | |
613 | (p14) ldfe FR_InvAn = [GR_ad_SinE] | |
614 | (p10) fma.s1 FR_Rq0 = FR_6,f1,FR_1mX // (7-x) | |
615 | cmp.eq p7,p0 = r0,GR_RqDeg | |
616 | };; | |
617 | { .mfi | |
618 | (p14) cmp.ge.unc p11,p0 = GR_SignExp,GR_NanBound | |
619 | fma.s1 FR_Rq2 = FR_Rq2,FR_6,FR_X2mX // (x-3)*(x-4) | |
620 | (p14) shl GR_ZeroResBound = GR_ZeroResBound,16 | |
621 | } | |
622 | { .mfb | |
623 | (p14) mov GR_OvfNzBound = 0x802 | |
624 | (p14) fms.s1 FR_rs = FR_rs,f1,FR_NormX // rs = round(x) - x | |
625 | // jump if x < -2^23 i.e. x is negative integer | |
626 | (p11) br.cond.spnt tgammaf_singularity | |
627 | };; | |
628 | { .mfi | |
629 | nop.m 0 | |
630 | (p7) fma.s1 FR_Rq1 = f0,f0,f1 | |
631 | (p14) shl GR_OvfNzBound = GR_OvfNzBound,20 | |
632 | } | |
633 | { .mfb | |
634 | nop.m 0 | |
635 | fma.s1 FR_Rq3 = FR_Rq3,FR_10,FR_X2mX // (x-5)*(x-6) | |
636 | // jump if x is negative integer such that -2^23 < x < 0 | |
637 | (p13) br.cond.spnt tgammaf_singularity | |
638 | };; | |
639 | { .mfi | |
640 | nop.m 0 | |
641 | fma.s1 FR_C01 = FR_C01,f1,FR_r | |
642 | (p14) mov GR_ExpOf05 = 0xFFFE | |
643 | } | |
644 | { .mfi | |
645 | (p14) cmp.eq.unc p7,p0 = GR_Arg,GR_OvfNzBound | |
646 | fma.s1 FR_C11 = FR_C11,f1,FR_r | |
647 | (p14) cmp.ltu.unc p11,p0 = GR_Arg,GR_OvfNzBound | |
648 | };; | |
649 | { .mfi | |
650 | nop.m 0 | |
651 | fma.s1 FR_C21 = FR_C21,f1,FR_r | |
652 | (p14) cmp.ltu.unc p9,p0 = GR_ZeroResBound,GR_Arg | |
653 | } | |
654 | { .mfb | |
655 | nop.m 0 | |
656 | fma.s1 FR_C31 = FR_C31,f1,FR_r | |
657 | // jump if argument is close to 0 negative | |
658 | (p11) br.cond.spnt tgammaf_overflow | |
659 | };; | |
660 | { .mfi | |
661 | nop.m 0 | |
662 | fma.s1 FR_C41 = FR_C41,f1,FR_r | |
663 | nop.i 0 | |
664 | } | |
665 | { .mfb | |
666 | nop.m 0 | |
667 | fma.s1 FR_C51 = FR_C51,f1,FR_r | |
668 | // jump if x is negative noninteger such that -2^23 < x < -43 | |
669 | (p9) br.cond.spnt tgammaf_underflow | |
670 | };; | |
671 | { .mfi | |
672 | nop.m 0 | |
673 | (p14) fma.s1 FR_rs2 = FR_rs,FR_rs,f0 | |
0347518d | 674 | nop.i 0 |
d5efd131 MF |
675 | } |
676 | { .mfb | |
677 | nop.m 0 | |
678 | (p14) fma.s1 FR_S01 = FR_rs,FR_rs,FR_S01 | |
679 | // jump if argument is 0x80200000 | |
680 | (p7) br.cond.spnt tgammaf_overflow_near0_bound | |
681 | };; | |
682 | { .mfi | |
0347518d | 683 | nop.m 0 |
d5efd131 | 684 | (p6) fnma.s1 FR_Rq1 = FR_Rq1,FR_Rq0,f0 |
0347518d | 685 | nop.i 0 |
d5efd131 MF |
686 | } |
687 | { .mfi | |
0347518d | 688 | nop.m 0 |
d5efd131 MF |
689 | (p10) fma.s1 FR_Rq2 = FR_Rq2,FR_Rq3,f0 |
690 | and GR_Sig = 0x7,GR_Sig | |
691 | };; | |
692 | { .mfi | |
693 | nop.m 0 | |
694 | fma.s1 FR_C01 = FR_C01,FR_r,FR_C00 | |
695 | nop.i 0 | |
696 | } | |
697 | { .mfi | |
698 | nop.m 0 | |
699 | fma.s1 FR_C11 = FR_C11,FR_r,FR_C10 | |
700 | cmp.eq p6,p7 = r0,GR_Sig // p6 if |x| from one of base intervals | |
701 | };; | |
702 | { .mfi | |
703 | nop.m 0 | |
704 | fma.s1 FR_C21 = FR_C21,FR_r,FR_C20 | |
705 | nop.i 0 | |
706 | } | |
707 | { .mfi | |
708 | nop.m 0 | |
709 | fma.s1 FR_C31 = FR_C31,FR_r,FR_C30 | |
710 | (p7) cmp.lt.unc p9,p0 = 2,GR_RqDeg | |
711 | };; | |
712 | { .mfi | |
713 | nop.m 0 | |
714 | (p14) fma.s1 FR_S11 = FR_rs,FR_rs,FR_S11 | |
715 | nop.i 0 | |
716 | } | |
717 | { .mfi | |
718 | nop.m 0 | |
719 | (p14) fma.s1 FR_S21 = FR_rs,FR_rs,FR_S21 | |
720 | nop.i 0 | |
721 | };; | |
722 | { .mfi | |
723 | nop.m 0 | |
724 | fma.s1 FR_C41 = FR_C41,FR_r,FR_C40 | |
725 | nop.i 0 | |
726 | } | |
727 | { .mfi | |
728 | nop.m 0 | |
729 | (p14) fma.s1 FR_S32 = FR_rs2,FR_S32,FR_S31 | |
730 | nop.i 0 | |
731 | };; | |
732 | { .mfi | |
0347518d | 733 | nop.m 0 |
d5efd131 MF |
734 | (p9) fma.s1 FR_Rq1 = FR_Rq1,FR_Rq2,f0 |
735 | nop.i 0 | |
736 | } | |
737 | { .mfi | |
738 | nop.m 0 | |
739 | fma.s1 FR_C51 = FR_C51,FR_r,FR_C50 | |
0347518d | 740 | nop.i 0 |
d5efd131 MF |
741 | };; |
742 | { .mfi | |
743 | (p14) getf.exp GR_SignExp = FR_rs | |
744 | fma.s1 FR_C01 = FR_C01,FR_C11,f0 | |
0347518d | 745 | nop.i 0 |
d5efd131 MF |
746 | } |
747 | { .mfi | |
748 | nop.m 0 | |
749 | (p14) fma.s1 FR_S01 = FR_S01,FR_rs2,FR_S00 | |
0347518d | 750 | nop.i 0 |
d5efd131 MF |
751 | };; |
752 | { .mfi | |
753 | nop.m 0 | |
754 | fma.s1 FR_C21 = FR_C21,FR_C31,f0 | |
755 | nop.i 0 | |
756 | } | |
757 | { .mfi | |
758 | nop.m 0 | |
759 | // NR-iteration | |
760 | (p14) fnma.s1 FR_InvNormX1 = FR_Rcp0,FR_NormX,f1 | |
761 | nop.i 0 | |
762 | };; | |
763 | { .mfi | |
764 | nop.m 0 | |
765 | (p14) fma.s1 FR_S11 = FR_S11,FR_rs2,FR_S10 | |
0347518d | 766 | (p14) tbit.z.unc p11,p12 = GR_SignExp,17 |
d5efd131 MF |
767 | } |
768 | { .mfi | |
769 | nop.m 0 | |
770 | (p14) fma.s1 FR_S21 = FR_S21,FR_rs2,FR_S20 | |
771 | nop.i 0 | |
772 | };; | |
773 | { .mfi | |
774 | nop.m 0 | |
775 | (p15) fcmp.lt.unc.s1 p0,p13 = FR_NormX,FR_OvfBound | |
776 | nop.i 0 | |
777 | } | |
778 | { .mfi | |
779 | nop.m 0 | |
780 | (p14) fma.s1 FR_S32 = FR_rs2,FR_S32,f0 | |
781 | nop.i 0 | |
782 | };; | |
783 | { .mfi | |
784 | nop.m 0 | |
785 | fma.s1 FR_C41 = FR_C41,FR_C51,f0 | |
786 | nop.i 0 | |
787 | } | |
788 | { .mfi | |
789 | nop.m 0 | |
790 | (p7) fma.s1 FR_An = FR_Rq1,FR_An,f0 | |
0347518d | 791 | nop.i 0 |
d5efd131 MF |
792 | };; |
793 | { .mfb | |
794 | nop.m 0 | |
795 | nop.f 0 | |
796 | // jump if x > 35.04010009765625 | |
797 | (p13) br.cond.spnt tgammaf_overflow | |
798 | };; | |
799 | { .mfi | |
800 | nop.m 0 | |
801 | // NR-iteration | |
802 | (p14) fma.s1 FR_InvNormX1 = FR_Rcp0,FR_InvNormX1,FR_Rcp0 | |
803 | nop.i 0 | |
804 | };; | |
805 | { .mfi | |
806 | nop.m 0 | |
807 | (p14) fma.s1 FR_S01 = FR_S01,FR_S11,f0 | |
808 | nop.i 0 | |
809 | };; | |
810 | { .mfi | |
811 | nop.m 0 | |
812 | (p14) fma.s1 FR_S21 = FR_S21,FR_S32,f0 | |
813 | nop.i 0 | |
814 | };; | |
815 | { .mfi | |
816 | (p14) getf.exp GR_SignExp = FR_NormX | |
817 | fma.s1 FR_C01 = FR_C01,FR_C21,f0 | |
818 | nop.i 0 | |
819 | } | |
820 | { .mfi | |
821 | nop.m 0 | |
822 | fma.s1 FR_C41 = FR_C41,FR_An,f0 | |
823 | (p14) mov GR_ExpOf1 = 0x2FFFF | |
824 | };; | |
825 | { .mfi | |
826 | nop.m 0 | |
827 | // NR-iteration | |
828 | (p14) fnma.s1 FR_InvNormX2 = FR_InvNormX1,FR_NormX,f1 | |
829 | nop.i 0 | |
830 | };; | |
831 | .pred.rel "mutex",p11,p12 | |
832 | { .mfi | |
833 | nop.m 0 | |
834 | (p12) fnma.s1 FR_S01 = FR_S01,FR_S21,f0 | |
835 | nop.i 0 | |
836 | } | |
837 | { .mfi | |
838 | nop.m 0 | |
839 | (p11) fma.s1 FR_S01 = FR_S01,FR_S21,f0 | |
840 | nop.i 0 | |
841 | };; | |
842 | ||
843 | { .mfi | |
0347518d | 844 | nop.m 0 |
d5efd131 MF |
845 | (p14) fma.s1 FR_GAMMA = FR_C01,FR_C41,f0 |
846 | (p14) tbit.z.unc p6,p7 = GR_Sig,0 | |
847 | } | |
848 | { .mfb | |
849 | nop.m 0 | |
850 | (p15) fma.s.s0 f8 = FR_C01,FR_C41,f0 | |
851 | (p15) br.ret.spnt b0 // exit for positives | |
852 | };; | |
853 | .pred.rel "mutex",p11,p12 | |
854 | { .mfi | |
855 | nop.m 0 | |
856 | (p12) fms.s1 FR_S01 = FR_rs,FR_S01,FR_rs | |
857 | nop.i 0 | |
858 | } | |
859 | { .mfi | |
860 | nop.m 0 | |
861 | (p11) fma.s1 FR_S01 = FR_rs,FR_S01,FR_rs | |
862 | nop.i 0 | |
863 | };; | |
864 | { .mfi | |
865 | nop.m 0 | |
866 | // NR-iteration | |
867 | fma.s1 FR_InvNormX2 = FR_InvNormX1,FR_InvNormX2,FR_InvNormX1 | |
868 | cmp.eq p10,p0 = 0x23,GR_Offs | |
869 | };; | |
870 | .pred.rel "mutex",p6,p7 | |
871 | { .mfi | |
872 | nop.m 0 | |
873 | (p6) fma.s1 FR_GAMMA = FR_S01,FR_GAMMA,f0 | |
874 | cmp.gtu p8,p0 = GR_SignExp,GR_ExpOf1 | |
875 | } | |
876 | { .mfi | |
877 | nop.m 0 | |
878 | (p7) fnma.s1 FR_GAMMA = FR_S01,FR_GAMMA,f0 | |
879 | cmp.eq p9,p0 = GR_SignExp,GR_ExpOf1 | |
880 | };; | |
881 | { .mfi | |
882 | nop.m 0 | |
883 | // NR-iteration | |
884 | fnma.s1 FR_InvNormX1 = FR_InvNormX2,FR_NormX,f1 | |
885 | nop.i 0 | |
886 | } | |
887 | { .mfi | |
888 | nop.m 0 | |
889 | (p10) fma.s1 FR_InvNormX2 = FR_InvNormX2,FR_InvAn,f0 | |
890 | nop.i 0 | |
891 | };; | |
892 | { .mfi | |
893 | nop.m 0 | |
894 | frcpa.s1 FR_Rcp0,p0 = f1,FR_GAMMA | |
895 | nop.i 0 | |
896 | };; | |
897 | { .mfi | |
898 | nop.m 0 | |
899 | fms.s1 FR_Multplr = FR_NormX,f1,f1 // x - 1 | |
900 | nop.i 0 | |
901 | };; | |
902 | { .mfi | |
903 | nop.m 0 | |
904 | // NR-iteration | |
905 | fnma.s1 FR_Rcp1 = FR_Rcp0,FR_GAMMA,f1 | |
906 | nop.i 0 | |
907 | };; | |
908 | .pred.rel "mutex",p8,p9 | |
909 | { .mfi | |
910 | nop.m 0 | |
911 | // 1/x or 1/(An*x) | |
912 | (p8) fma.s1 FR_Multplr = FR_InvNormX2,FR_InvNormX1,FR_InvNormX2 | |
913 | nop.i 0 | |
914 | } | |
915 | { .mfi | |
916 | nop.m 0 | |
917 | (p9) fma.s1 FR_Multplr = f1,f1,f0 | |
918 | nop.i 0 | |
919 | };; | |
920 | { .mfi | |
921 | nop.m 0 | |
922 | // NR-iteration | |
923 | fma.s1 FR_Rcp1 = FR_Rcp0,FR_Rcp1,FR_Rcp0 | |
924 | nop.i 0 | |
925 | };; | |
926 | { .mfi | |
927 | nop.m 0 | |
928 | // NR-iteration | |
929 | fnma.s1 FR_Rcp2 = FR_Rcp1,FR_GAMMA,f1 | |
930 | nop.i 0 | |
931 | } | |
932 | { .mfi | |
933 | nop.m 0 | |
934 | // NR-iteration | |
935 | fma.s1 FR_Rcp1 = FR_Rcp1,FR_Multplr,f0 | |
936 | nop.i 0 | |
937 | };; | |
938 | { .mfb | |
939 | nop.m 0 | |
940 | fma.s.s0 f8 = FR_Rcp1,FR_Rcp2,FR_Rcp1 | |
941 | br.ret.sptk b0 | |
942 | };; | |
943 | ||
944 | // here if 0 < x < 1 | |
945 | //-------------------------------------------------------------------- | |
946 | .align 32 | |
947 | tgammaf_from_0_to_1: | |
948 | { .mfi | |
949 | cmp.lt p7,p0 = GR_Arg,GR_ExpOf05 | |
950 | // NR-iteration | |
951 | fnma.s1 FR_Rcp1 = FR_Rcp0,FR_NormX,f1 | |
952 | cmp.eq p8,p0 = GR_Arg,GR_ExpOf05 | |
953 | } | |
954 | { .mfi | |
955 | cmp.gt p9,p0 = GR_Arg,GR_ExpOf05 | |
956 | fma.s1 FR_r = f0,f0,FR_NormX // reduced arg for (0;1) | |
0347518d | 957 | mov GR_ExpOf025 = 0x7FA |
d5efd131 MF |
958 | };; |
959 | { .mfi | |
960 | getf.s GR_ArgNz = f8 | |
961 | fma.d.s0 FR_X = f0,f0,f8 // set deno flag | |
962 | shl GR_OvfNzBound = GR_OvfNzBound,20 | |
963 | } | |
964 | { .mfi | |
965 | (p8) mov GR_Tbl12Offs = 0x80 // 0.5 <= x < 0.75 | |
966 | nop.f 0 | |
967 | (p7) cmp.ge.unc p6,p0 = GR_Arg,GR_ExpOf025 | |
968 | };; | |
969 | .pred.rel "mutex",p6,p9 | |
970 | { .mfi | |
971 | (p9) mov GR_Tbl12Offs = 0xC0 // 0.75 <= x < 1 | |
972 | nop.f 0 | |
973 | (p6) mov GR_Tbl12Offs = 0x40 // 0.25 <= x < 0.5 | |
974 | } | |
975 | { .mfi | |
0347518d | 976 | add GR_ad_Ce = 0x2C0,GR_ad_Data |
d5efd131 MF |
977 | nop.f 0 |
978 | add GR_ad_Co = 0x2A0,GR_ad_Data | |
979 | };; | |
980 | { .mfi | |
981 | add GR_ad_Co = GR_ad_Co,GR_Tbl12Offs | |
982 | nop.f 0 | |
983 | cmp.lt p12,p0 = GR_ArgNz,GR_OvfNzBound | |
984 | } | |
985 | { .mib | |
986 | add GR_ad_Ce = GR_ad_Ce,GR_Tbl12Offs | |
987 | cmp.eq p7,p0 = GR_ArgNz,GR_OvfNzBound | |
988 | // jump if argument is 0x00200000 | |
989 | (p7) br.cond.spnt tgammaf_overflow_near0_bound | |
990 | };; | |
991 | { .mmb | |
992 | ldfpd FR_A7,FR_A6 = [GR_ad_Co],16 | |
993 | ldfpd FR_A5,FR_A4 = [GR_ad_Ce],16 | |
994 | // jump if argument is close to 0 positive | |
0347518d | 995 | (p12) br.cond.spnt tgammaf_overflow |
d5efd131 MF |
996 | };; |
997 | { .mfi | |
998 | ldfpd FR_A3,FR_A2 = [GR_ad_Co],16 | |
999 | // NR-iteration | |
1000 | fma.s1 FR_Rcp1 = FR_Rcp0,FR_Rcp1,FR_Rcp0 | |
1001 | nop.i 0 | |
1002 | } | |
1003 | { .mfb | |
1004 | ldfpd FR_A1,FR_A0 = [GR_ad_Ce],16 | |
1005 | nop.f 0 | |
0347518d | 1006 | br.cond.sptk tgamma_from_0_to_2 |
d5efd131 MF |
1007 | };; |
1008 | ||
1009 | // here if 1 < x < 2 | |
1010 | //-------------------------------------------------------------------- | |
1011 | .align 32 | |
1012 | tgammaf_from_1_to_2: | |
1013 | { .mfi | |
1014 | add GR_ad_Co = 0x2A0,GR_ad_Data | |
1015 | fms.s1 FR_r = f0,f0,FR_1mX | |
1016 | shr GR_TblOffs = GR_Arg,47 | |
1017 | } | |
1018 | { .mfi | |
1019 | add GR_ad_Ce = 0x2C0,GR_ad_Data | |
1020 | nop.f 0 | |
1021 | mov GR_TblOffsMask = 0x18 | |
1022 | };; | |
1023 | { .mfi | |
1024 | nop.m 0 | |
1025 | nop.f 0 | |
0347518d | 1026 | and GR_TblOffs = GR_TblOffs,GR_TblOffsMask |
d5efd131 MF |
1027 | };; |
1028 | { .mfi | |
1029 | shladd GR_ad_Co = GR_TblOffs,3,GR_ad_Co | |
1030 | nop.f 0 | |
1031 | nop.i 0 | |
1032 | } | |
1033 | { .mfi | |
1034 | shladd GR_ad_Ce = GR_TblOffs,3,GR_ad_Ce | |
1035 | nop.f 0 | |
1036 | cmp.eq p6,p7 = 8,GR_TblOffs | |
1037 | };; | |
1038 | { .mmi | |
1039 | ldfpd FR_A7,FR_A6 = [GR_ad_Co],16 | |
1040 | ldfpd FR_A5,FR_A4 = [GR_ad_Ce],16 | |
1041 | nop.i 0 | |
1042 | };; | |
1043 | { .mmi | |
1044 | ldfpd FR_A3,FR_A2 = [GR_ad_Co],16 | |
1045 | ldfpd FR_A1,FR_A0 = [GR_ad_Ce],16 | |
1046 | nop.i 0 | |
1047 | };; | |
1048 | ||
1049 | .align 32 | |
1050 | tgamma_from_0_to_2: | |
1051 | { .mfi | |
1052 | nop.m 0 | |
1053 | (p6) fms.s1 FR_r = FR_r,f1,FR_LocalMin | |
1054 | nop.i 0 | |
1055 | };; | |
1056 | { .mfi | |
1057 | nop.m 0 | |
1058 | // NR-iteration | |
1059 | (p10) fnma.s1 FR_Rcp2 = FR_Rcp1,FR_NormX,f1 | |
1060 | nop.i 0 | |
1061 | };; | |
1062 | { .mfi | |
1063 | nop.m 0 | |
1064 | fms.s1 FR_r2 = FR_r,FR_r,f0 | |
1065 | nop.i 0 | |
1066 | };; | |
1067 | { .mfi | |
1068 | nop.m 0 | |
1069 | fma.s1 FR_A7 = FR_A7,FR_r,FR_A6 | |
1070 | nop.i 0 | |
1071 | } | |
1072 | { .mfi | |
1073 | nop.m 0 | |
1074 | fma.s1 FR_A5 = FR_A5,FR_r,FR_A4 | |
1075 | nop.i 0 | |
1076 | };; | |
1077 | { .mfi | |
1078 | nop.m 0 | |
1079 | fma.s1 FR_A3 = FR_A3,FR_r,FR_A2 | |
1080 | nop.i 0 | |
1081 | } | |
1082 | { .mfi | |
1083 | nop.m 0 | |
1084 | fma.s1 FR_A1 = FR_A1,FR_r,FR_A0 | |
1085 | nop.i 0 | |
1086 | };; | |
1087 | { .mfi | |
1088 | nop.m 0 | |
1089 | // NR-iteration | |
1090 | (p10) fma.s1 FR_Rcp2 = FR_Rcp1,FR_Rcp2,FR_Rcp1 | |
1091 | nop.i 0 | |
1092 | };; | |
1093 | { .mfi | |
1094 | nop.m 0 | |
1095 | fma.s1 FR_A7 = FR_A7,FR_r2,FR_A5 | |
1096 | nop.i 0 | |
1097 | } | |
1098 | { .mfi | |
1099 | nop.m 0 | |
1100 | fma.s1 FR_r4 = FR_r2,FR_r2,f0 | |
1101 | nop.i 0 | |
1102 | };; | |
1103 | { .mfi | |
1104 | nop.m 0 | |
1105 | fma.s1 FR_A3 = FR_A3,FR_r2,FR_A1 | |
1106 | nop.i 0 | |
1107 | };; | |
1108 | { .mfi | |
0347518d | 1109 | nop.m 0 |
d5efd131 MF |
1110 | (p10) fma.s1 FR_GAMMA = FR_A7,FR_r4,FR_A3 |
1111 | nop.i 0 | |
1112 | } | |
1113 | { .mfi | |
0347518d | 1114 | nop.m 0 |
d5efd131 MF |
1115 | (p11) fma.s.s0 f8 = FR_A7,FR_r4,FR_A3 |
1116 | nop.i 0 | |
1117 | };; | |
1118 | { .mfb | |
0347518d | 1119 | nop.m 0 |
d5efd131 MF |
1120 | (p10) fma.s.s0 f8 = FR_GAMMA,FR_Rcp2,f0 |
1121 | br.ret.sptk b0 | |
1122 | };; | |
1123 | ||
1124 | ||
1125 | // overflow | |
1126 | //-------------------------------------------------------------------- | |
1127 | .align 32 | |
1128 | tgammaf_overflow_near0_bound: | |
1129 | .pred.rel "mutex",p14,p15 | |
1130 | { .mfi | |
1131 | mov GR_fpsr = ar.fpsr | |
1132 | nop.f 0 | |
1133 | (p15) mov r8 = 0x7f8 | |
1134 | } | |
1135 | { .mfi | |
1136 | nop.m 0 | |
1137 | nop.f 0 | |
1138 | (p14) mov r8 = 0xff8 | |
1139 | };; | |
1140 | { .mfi | |
1141 | nop.m 0 | |
1142 | nop.f 0 | |
0347518d | 1143 | shl r8 = r8,20 |
d5efd131 MF |
1144 | };; |
1145 | { .mfi | |
1146 | sub r8 = r8,r0,1 | |
1147 | nop.f 0 | |
1148 | extr.u GR_fpsr = GR_fpsr,10,2 // rounding mode | |
1149 | };; | |
1150 | .pred.rel "mutex",p14,p15 | |
1151 | { .mfi | |
1152 | // set p8 to 0 in case of overflow and to 1 otherwise | |
0347518d | 1153 | // for negative arg: |
d5efd131 MF |
1154 | // no overflow if rounding mode either Z or +Inf, i.e. |
1155 | // GR_fpsr > 1 | |
1156 | (p14) cmp.lt p8,p0 = 1,GR_fpsr | |
1157 | nop.f 0 | |
0347518d | 1158 | // for positive arg: |
d5efd131 MF |
1159 | // no overflow if rounding mode either Z or -Inf, i.e. |
1160 | // (GR_fpsr & 1) == 0 | |
1161 | (p15) tbit.z p0,p8 = GR_fpsr,0 | |
1162 | };; | |
1163 | { .mib | |
1164 | (p8) setf.s f8 = r8 // set result to 0x7f7fffff without | |
1165 | // OVERFLOW flag raising | |
1166 | nop.i 0 | |
1167 | (p8) br.ret.sptk b0 | |
1168 | };; | |
1169 | ||
1170 | .align 32 | |
1171 | tgammaf_overflow: | |
1172 | { .mfi | |
1173 | nop.m 0 | |
1174 | nop.f 0 | |
1175 | mov r8 = 0x1FFFE | |
1176 | };; | |
1177 | { .mfi | |
1178 | setf.exp f9 = r8 | |
1179 | fmerge.s FR_X = f8,f8 | |
1180 | nop.i 0 | |
1181 | };; | |
1182 | .pred.rel "mutex",p14,p15 | |
1183 | { .mfi | |
1184 | nop.m 0 | |
1185 | (p14) fnma.s.s0 f8 = f9,f9,f0 // set I,O and -INF result | |
1186 | mov GR_TAG = 261 // overflow | |
1187 | } | |
1188 | { .mfb | |
0347518d | 1189 | nop.m 0 |
d5efd131 MF |
1190 | (p15) fma.s.s0 f8 = f9,f9,f0 // set I,O and +INF result |
1191 | br.cond.sptk tgammaf_libm_err | |
1192 | };; | |
1193 | ||
1194 | // x is negative integer or +/-0 | |
1195 | //-------------------------------------------------------------------- | |
1196 | .align 32 | |
1197 | tgammaf_singularity: | |
1198 | { .mfi | |
1199 | nop.m 0 | |
1200 | fmerge.s FR_X = f8,f8 | |
1201 | mov GR_TAG = 262 // negative | |
1202 | } | |
1203 | { .mfb | |
1204 | nop.m 0 | |
1205 | frcpa.s0 f8,p0 = f0,f0 | |
1206 | br.cond.sptk tgammaf_libm_err | |
1207 | };; | |
1208 | // x is negative noninteger with big absolute value | |
1209 | //-------------------------------------------------------------------- | |
1210 | .align 32 | |
1211 | tgammaf_underflow: | |
1212 | { .mfi | |
1213 | mov r8 = 0x00001 | |
1214 | nop.f 0 | |
1215 | tbit.z p6,p7 = GR_Sig,0 | |
1216 | };; | |
1217 | { .mfi | |
1218 | setf.exp f9 = r8 | |
1219 | nop.f 0 | |
1220 | nop.i 0 | |
1221 | };; | |
1222 | .pred.rel "mutex",p6,p7 | |
1223 | { .mfi | |
1224 | nop.m 0 | |
1225 | (p6) fms.s.s0 f8 = f9,f9,f9 | |
1226 | nop.i 0 | |
1227 | } | |
1228 | { .mfb | |
1229 | nop.m 0 | |
1230 | (p7) fma.s.s0 f8 = f9,f9,f9 | |
1231 | br.ret.sptk b0 | |
1232 | };; | |
1233 | ||
1234 | // x for natval, nan, +/-inf or +/-0 | |
1235 | //-------------------------------------------------------------------- | |
1236 | .align 32 | |
1237 | tgammaf_spec_args: | |
1238 | { .mfi | |
1239 | nop.m 0 | |
1240 | fclass.m p6,p0 = f8,0x1E1 // Test x for natval, nan, +inf | |
1241 | nop.i 0 | |
1242 | };; | |
1243 | { .mfi | |
1244 | nop.m 0 | |
1245 | fclass.m p7,p8 = f8,0x7 // +/-0 | |
1246 | nop.i 0 | |
1247 | };; | |
1248 | { .mfi | |
1249 | nop.m 0 | |
1250 | fmerge.s FR_X = f8,f8 | |
1251 | nop.i 0 | |
1252 | } | |
1253 | { .mfb | |
1254 | nop.m 0 | |
1255 | (p6) fma.s.s0 f8 = f8,f1,f8 | |
1256 | (p6) br.ret.spnt b0 | |
1257 | };; | |
1258 | .pred.rel "mutex",p7,p8 | |
1259 | { .mfi | |
1260 | (p7) mov GR_TAG = 262 // negative | |
1261 | (p7) frcpa.s0 f8,p0 = f1,f8 | |
0347518d | 1262 | nop.i 0 |
d5efd131 MF |
1263 | } |
1264 | { .mib | |
1265 | nop.m 0 | |
1266 | nop.i 0 | |
1267 | (p8) br.cond.spnt tgammaf_singularity | |
1268 | };; | |
1269 | ||
1270 | .align 32 | |
1271 | tgammaf_libm_err: | |
1272 | { .mfi | |
1273 | alloc r32 = ar.pfs,1,4,4,0 | |
1274 | nop.f 0 | |
1275 | mov GR_Parameter_TAG = GR_TAG | |
1276 | };; | |
1277 | ||
1278 | GLOBAL_LIBM_END(tgammaf) | |
1279 | ||
1280 | LOCAL_LIBM_ENTRY(__libm_error_region) | |
1281 | .prologue | |
1282 | { .mfi | |
1283 | add GR_Parameter_Y=-32,sp // Parameter 2 value | |
1284 | nop.f 0 | |
1285 | .save ar.pfs,GR_SAVE_PFS | |
0347518d | 1286 | mov GR_SAVE_PFS=ar.pfs // Save ar.pfs |
d5efd131 MF |
1287 | } |
1288 | { .mfi | |
0347518d | 1289 | .fframe 64 |
d5efd131 MF |
1290 | add sp=-64,sp // Create new stack |
1291 | nop.f 0 | |
1292 | mov GR_SAVE_GP=gp // Save gp | |
1293 | };; | |
1294 | { .mmi | |
1295 | stfs [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack | |
1296 | add GR_Parameter_X = 16,sp // Parameter 1 address | |
0347518d MF |
1297 | .save b0, GR_SAVE_B0 |
1298 | mov GR_SAVE_B0=b0 // Save b0 | |
d5efd131 MF |
1299 | };; |
1300 | .body | |
1301 | { .mib | |
0347518d MF |
1302 | stfs [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack |
1303 | add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address | |
1304 | nop.b 0 | |
d5efd131 MF |
1305 | } |
1306 | { .mib | |
1307 | stfs [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack | |
0347518d | 1308 | add GR_Parameter_Y = -16,GR_Parameter_Y |
d5efd131 MF |
1309 | br.call.sptk b0=__libm_error_support# // Call error handling function |
1310 | };; | |
1311 | { .mmi | |
1312 | nop.m 0 | |
1313 | nop.m 0 | |
1314 | add GR_Parameter_RESULT = 48,sp | |
1315 | };; | |
1316 | { .mmi | |
1317 | ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack | |
1318 | .restore sp | |
1319 | add sp = 64,sp // Restore stack pointer | |
1320 | mov b0 = GR_SAVE_B0 // Restore return address | |
1321 | };; | |
1322 | { .mib | |
0347518d | 1323 | mov gp = GR_SAVE_GP // Restore gp |
d5efd131 MF |
1324 | mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs |
1325 | br.ret.sptk b0 // Return | |
0347518d | 1326 | };; |
d5efd131 MF |
1327 | |
1328 | LOCAL_LIBM_END(__libm_error_region) | |
1329 | .type __libm_error_support#,@function | |
1330 | .global __libm_error_support# |