]>
Commit | Line | Data |
---|---|---|
d5efd131 MF |
1 | .file "libm_lgammal.s" |
2 | ||
3 | ||
4 | // Copyright (c) 2002 - 2005, Intel Corporation | |
5 | // All rights reserved. | |
6 | // | |
d5efd131 MF |
7 | // |
8 | // Redistribution and use in source and binary forms, with or without | |
9 | // modification, are permitted provided that the following conditions are | |
10 | // met: | |
11 | // | |
12 | // * Redistributions of source code must retain the above copyright | |
13 | // notice, this list of conditions and the following disclaimer. | |
14 | // | |
15 | // * Redistributions in binary form must reproduce the above copyright | |
16 | // notice, this list of conditions and the following disclaimer in the | |
17 | // documentation and/or other materials provided with the distribution. | |
18 | // | |
19 | // * The name of Intel Corporation may not be used to endorse or promote | |
20 | // products derived from this software without specific prior written | |
21 | // permission. | |
22 | ||
23 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
24 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT | |
25 | // LIMITED TO,THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
26 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS | |
27 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL, | |
28 | // EXEMPLARY,OR CONSEQUENTIAL DAMAGES (INCLUDING,BUT NOT LIMITED TO, | |
29 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,DATA,OR | |
30 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
31 | // OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY OR TORT (INCLUDING | |
32 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
33 | // SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
34 | // | |
35 | // Intel Corporation is the author of this code,and requests that all | |
36 | // problem reports or change requests be submitted to it directly at | |
37 | // http://www.intel.com/software/products/opensource/libraries/num.htm. | |
38 | // | |
39 | //********************************************************************* | |
40 | // | |
41 | // History: | |
42 | // 03/28/02 Original version | |
43 | // 05/20/02 Cleaned up namespace and sf0 syntax | |
44 | // 08/21/02 Added support of SIGN(GAMMA(x)) calculation | |
45 | // 09/26/02 Algorithm description improved | |
46 | // 10/21/02 Now it returns SIGN(GAMMA(x))=-1 for negative zero | |
47 | // 02/10/03 Reordered header: .section, .global, .proc, .align | |
48 | // 03/31/05 Reformatted delimiters between data tables | |
49 | // | |
50 | //********************************************************************* | |
51 | // | |
52 | // Function: __libm_lgammal(long double x, int* signgam, int szsigngam) | |
53 | // computes the principal value of the logarithm of the GAMMA function | |
54 | // of x. Signum of GAMMA(x) is stored to memory starting at the address | |
55 | // specified by the signgam. | |
56 | // | |
57 | //********************************************************************* | |
58 | // | |
59 | // Resources Used: | |
60 | // | |
61 | // Floating-Point Registers: f8 (Input and Return Value) | |
62 | // f9-f15 | |
63 | // f32-f127 | |
64 | // | |
65 | // General Purpose Registers: | |
66 | // r2, r3, r8-r11, r14-r31 | |
67 | // r32-r65 | |
68 | // r66-r69 (Used to pass arguments to error handling routine) | |
69 | // | |
70 | // Predicate Registers: p6-p15 | |
71 | // | |
72 | //********************************************************************* | |
73 | // | |
74 | // IEEE Special Conditions: | |
75 | // | |
76 | // __libm_lgammal(+inf) = +inf | |
77 | // __libm_lgammal(-inf) = QNaN | |
78 | // __libm_lgammal(+/-0) = +inf | |
79 | // __libm_lgammal(x<0, x - integer) = QNaN | |
80 | // __libm_lgammal(SNaN) = QNaN | |
81 | // __libm_lgammal(QNaN) = QNaN | |
82 | // | |
83 | //********************************************************************* | |
84 | // | |
85 | // ALGORITHM DESCRIPTION | |
86 | // | |
87 | // Below we suppose that there is a log(z) function which takes a long | |
88 | // double argument and returns result as a pair of long double numbers | |
89 | // lnHi and lnLo (such that sum lnHi + lnLo provides ~80 correct bits | |
90 | // of significand). Algorithm description for such log(z) function | |
91 | // see below. | |
92 | // Also, in this algorithm description we use the following notational | |
93 | // conventions: | |
94 | // a) pair A = (Ahi, Alo) means number A represented as sum of Ahi and Alo | |
95 | // b) C = A + B = (Ahi, Alo) + (Bhi, Blo) means multi-precision addition. | |
96 | // The result would be C = (Chi, Clo). Notice, that Clo shouldn't be | |
97 | // equal to Alo + Blo | |
98 | // c) D = A*B = (Ahi, Alo)*(Bhi, Blo) = (Dhi, Dlo) multi-precision | |
99 | // multiplication. | |
100 | // | |
101 | // So, lgammal has the following computational paths: | |
102 | // 1) |x| < 0.5 | |
103 | // P = A1*|x| + A2*|x|^2 + ... + A22*|x|^22 | |
104 | // A1, A2, A3 represented as a sum of two double precision | |
105 | // numbers and multi-precision computations are used for 3 higher | |
106 | // terms of the polynomial. We get polynomial as a sum of two | |
107 | // double extended numbers: P = (Phi, Plo) | |
108 | // 1.1) x > 0 | |
109 | // lgammal(x) = P - log(|x|) = (Phi, Plo) - (lnHi(|x|), lnLo(|x|)) | |
110 | // 1.2) x < 0 | |
111 | // lgammal(x) = -P - log(|x|) - log(sin(Pi*x)/(Pi*x)) | |
112 | // P and log(|x|) are computed by the same way as in 1.1; | |
113 | // - log(sin(Pi*x)/(Pi*x)) is approximated by a polynomial Plnsin. | |
114 | // Plnsin:= fLnSin2*|x|^2 + fLnSin4*|x|^4 + ... + fLnSin36*|x|^36 | |
115 | // The first coefficient of Plnsin is represented as sum of two | |
116 | // double precision numbers (fLnSin2, fLnSin2L). Multi-precision | |
117 | // computations for higher two terms of Plnsin are used. | |
118 | // So, the final result is reconstructed by the following formula | |
119 | // lgammal(x) = (-(Phi, Plo) - (lnHi(|x|), lnLo(|x|))) - | |
120 | // - (PlnsinHi,PlnsinLo) | |
121 | // | |
122 | // 2) 0.5 <= x < 0.75 -> t = x - 0.625 | |
123 | // -0.75 < x <= -0.5 -> t = x + 0.625 | |
124 | // 2.25 <= x < 4.0 -> t = x/2 - 1.5 | |
125 | // 4.0 <= x < 8.0 -> t = x/4 - 1.5 | |
126 | // -0.5 < x <= -0.40625 -> t = x + 0.5 | |
127 | // -2.6005859375 < x <= -2.5 -> t = x + 2.5 | |
128 | // 1.3125 <= x < 1.5625 -> t = x - LOC_MIN, where LOC_MIN is point in | |
129 | // which lgammal has local minimum. Exact | |
130 | // value can be found in the table below, | |
131 | // approximate value is ~1.46 | |
132 | // | |
133 | // lgammal(x) is approximated by the polynomial of 25th degree: P25(t) | |
134 | // P25(t) = A0 + A1*t + ... + A25*t^25 = (Phi, Plo) + t^4*P21(t), | |
135 | // where | |
136 | // (Phi, Plo) is sum of four highest terms of the polynomial P25(t): | |
137 | // (Phi, Plo) = ((A0, A0L) + (A1, A1L)*t) + t^2 *((A2, A2L) + (A3, A3L)*t), | |
138 | // (Ai, AiL) - coefficients represented as pairs of DP numbers. | |
139 | // | |
140 | // P21(t) = (PolC(t)*t^8 + PolD(t))*t^8 + PolE(t), | |
141 | // where | |
142 | // PolC(t) = C21*t^5 + C20*t^4 + ... + C16, | |
143 | // C21 = A25, C20 = A24, ..., C16 = A20 | |
144 | // | |
145 | // PolD(t) = D7*t^7 + D6*t^6 + ... + D0, | |
146 | // D7 = A19, D6 = A18, ..., D0 = A12 | |
147 | // | |
148 | // PolE(t) = E7*t^7 + E6*t^6 + ... + E0, | |
149 | // E7 = A11, E6 = A10, ..., E0 = A4 | |
150 | // | |
151 | // Cis and Dis are represented as double precision numbers, | |
152 | // Eis are represented as double extended numbers. | |
153 | // | |
154 | // 3) 0.75 <= x < 1.3125 -> t = x - 1.0 | |
155 | // 1.5625 <= x < 2.25 -> t = x - 2.0 | |
156 | // lgammal(x) is approximated by the polynomial of 25th degree: P25(t) | |
157 | // P25(t) = A1*t + ... + A25*t^25, and computations are carried out | |
158 | // by similar way as in the previous case | |
159 | // | |
160 | // 4) 10.0 < x <= Overflow Bound ("positive Stirling" range) | |
161 | // lgammal(x) is approximated using Stirling's formula: | |
162 | // lgammal(x) ~ ((x*(lnHi(x) - 1, lnLo(x))) - 0.5*(lnHi(x), lnLo(x))) + | |
163 | // + ((Chi, Clo) + S(1/x)) | |
164 | // where | |
165 | // C = (Chi, Clo) - pair of double precision numbers representing constant | |
166 | // 0.5*ln(2*Pi); | |
167 | // S(1/x) = 1/x * (B2 + B4*(1/x)^2 + ... + B20*(1/x)^18), B2, ..., B20 are | |
168 | // Bernoulli numbers. S is computed in native precision and then added to | |
169 | // Clo; | |
170 | // lnHi(x) - 1 is computed in native precision and the multiprecision | |
171 | // multiplication (x, 0) *(lnHi(x) - 1, lnLo(x)) is used. | |
172 | // | |
173 | // 5) -INF < x <= -2^63, any negative integer < 0 | |
174 | // All numbers in this range are integers -> error handler is called | |
175 | // | |
176 | // 6) -2^63 < x <= -0.75 ("negative Stirling" range), x is "far" from root, | |
177 | // lgammal(-t) for positive t is approximated using the following formula: | |
178 | // lgammal(-t) = -lgammal(t)-log(t)-log(|dT|)+log(sin(Pi*|dT|)/(Pi*|dT|)) | |
179 | // where dT = -t -round_to_nearest_integer(-t) | |
180 | // Last item is approximated by the same polynomial as described in 1.2. | |
181 | // We split the whole range into three subranges due to different ways of | |
182 | // approximation of the first terms. | |
183 | // 6.1) -2^63 < x < -6.0 ("negative Stirling" range) | |
184 | // lgammal(t) is approximated exactly as in #4. The only difference is that | |
185 | // for the -13.0 < x < -6.0 subrange, instead of Bernoulli numbers, we use their | |
186 | // minimax approximation on this range. | |
187 | // log(t), log(|dT|) are approximated by the log routine mentioned above. | |
188 | // 6.2) -6.0 < x <= -0.75, |x + 1|> 2^(-7) | |
189 | // log(t), log(|dT|) are approximated by the log routine mentioned above, | |
190 | // lgammal(t) is approximated by polynomials of the 25th degree similar | |
191 | // to ones from #2. Arguments z of the polynomials are as follows | |
192 | // a) 0.75 <= t < 1.0 - 2^(-7), z = 2*t - 1.5 | |
193 | // b) 1.0 - 2^(-7) < t < 2.0, z = t - 1.5 | |
194 | // c) 2.0 < t < 3.0, z = t/2 - 1.5 | |
195 | // d) 3.0 < t < 4.0, z = t/2 - 1.5. Notice, that range reduction is | |
196 | // the same as in case c) but the set of coefficients is different | |
197 | // e) 4.0 < t < 6.0, z = t/4 - 1.5 | |
198 | // 6.3) |x + 1| <= 2^(-7) | |
199 | // log(1 + (x-1)) is approximated by Taylor series, | |
200 | // log(sin(Pi*|dT|)/(Pi*|dT|)) is still approximated by polynomial but | |
201 | // it has just 4th degree. | |
202 | // log(|dT|) is approximated by the log routine mentioned above. | |
203 | // lgammal(-x) is approximated by polynomial of 8th degree from (-x + 1). | |
204 | // | |
205 | // 7) -20.0 < x < -2.0, x falls in root "neighbourhood". | |
206 | // "Neighbourhood" means that |lgammal(x)| < epsilon, where epsilon is | |
207 | // different for every root (and it is stored in the table), but typically | |
208 | // it is ~ 0.15. There are 35 roots significant from "double extended" | |
209 | // point of view. We split all the roots into two subsets: "left" and "right" | |
210 | // roots. Considering [-(N+1), -N] range we call root as "left" one if it | |
211 | // lies closer to -(N+1) and "right" otherwise. There is no "left" root in | |
212 | // the [-20, -19] range (it exists, but is insignificant for double extended | |
213 | // precision). To determine if x falls in root "neighbourhood" we store | |
214 | // significands of all the 35 roots as well as epsilon values (expressed | |
215 | // by the left and right bound). | |
216 | // In these ranges we approximate lgammal(x) by polynomial series of 19th | |
217 | // degree: | |
218 | // lgammal(x) = P19(t) = A0 + A1*t + ...+ A19*t^19, where t = x - EDP_Root, | |
219 | // EDP_Root is the exact value of the corresponding root rounded to double | |
220 | // extended precision. So, we have 35 different polynomials which make our | |
221 | // table rather big. We may hope that x falls in root "neighbourhood" | |
222 | // quite rarely -> there might be no need for frequent use of different | |
223 | // polynomials. | |
224 | // A0, A1, A2, A3 are represented as pairs of double precision numbers, | |
225 | // A4, A5 are long doubles, and to decrease the size of the table we | |
226 | // keep the rest of coefficients in just double precision | |
227 | // | |
228 | //********************************************************************* | |
229 | // Algorithm for log(X) = (lnHi(X), lnLo(X)) | |
230 | // | |
231 | // ALGORITHM | |
232 | // | |
233 | // Here we use a table lookup method. The basic idea is that in | |
234 | // order to compute logl(Arg) for an argument Arg in [1,2), we | |
235 | // construct a value G such that G*Arg is close to 1 and that | |
236 | // logl(1/G) is obtainable easily from a table of values calculated | |
237 | // beforehand. Thus | |
238 | // | |
239 | // logl(Arg) = logl(1/G) + logl(G*Arg) | |
240 | // = logl(1/G) + logl(1 + (G*Arg - 1)) | |
241 | // | |
242 | // Because |G*Arg - 1| is small, the second term on the right hand | |
243 | // side can be approximated by a short polynomial. We elaborate | |
244 | // this method in four steps. | |
245 | // | |
246 | // Step 0: Initialization | |
247 | // | |
248 | // We need to calculate logl( X ). Obtain N, S_hi such that | |
249 | // | |
250 | // X = 2^N * S_hi exactly | |
251 | // | |
252 | // where S_hi in [1,2) | |
253 | // | |
254 | // Step 1: Argument Reduction | |
255 | // | |
256 | // Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate | |
257 | // | |
258 | // G := G_1 * G_2 * G_3 | |
259 | // r := (G * S_hi - 1) | |
260 | // | |
261 | // These G_j's have the property that the product is exactly | |
262 | // representable and that |r| < 2^(-12) as a result. | |
263 | // | |
264 | // Step 2: Approximation | |
265 | // | |
266 | // | |
267 | // logl(1 + r) is approximated by a short polynomial poly(r). | |
268 | // | |
269 | // Step 3: Reconstruction | |
270 | // | |
271 | // | |
272 | // Finally, logl( X ) is given by | |
273 | // | |
274 | // logl( X ) = logl( 2^N * S_hi ) | |
275 | // ~=~ N*logl(2) + logl(1/G) + logl(1 + r) | |
276 | // ~=~ N*logl(2) + logl(1/G) + poly(r). | |
277 | // | |
278 | // IMPLEMENTATION | |
279 | // | |
280 | // Step 0. Initialization | |
281 | // ---------------------- | |
282 | // | |
283 | // Z := X | |
284 | // N := unbiased exponent of Z | |
285 | // S_hi := 2^(-N) * Z | |
286 | // | |
287 | // Step 1. Argument Reduction | |
288 | // -------------------------- | |
289 | // | |
290 | // Let | |
291 | // | |
292 | // Z = 2^N * S_hi = 2^N * 1.d_1 d_2 d_3 ... d_63 | |
293 | // | |
294 | // We obtain G_1, G_2, G_3 by the following steps. | |
295 | // | |
296 | // | |
297 | // Define X_0 := 1.d_1 d_2 ... d_14. This is extracted | |
298 | // from S_hi. | |
299 | // | |
300 | // Define A_1 := 1.d_1 d_2 d_3 d_4. This is X_0 truncated | |
301 | // to lsb = 2^(-4). | |
302 | // | |
303 | // Define index_1 := [ d_1 d_2 d_3 d_4 ]. | |
304 | // | |
305 | // Fetch Z_1 := (1/A_1) rounded UP in fixed point with | |
306 | // fixed point lsb = 2^(-15). | |
307 | // Z_1 looks like z_0.z_1 z_2 ... z_15 | |
308 | // Note that the fetching is done using index_1. | |
309 | // A_1 is actually not needed in the implementation | |
310 | // and is used here only to explain how the value | |
311 | // Z_1 is defined. | |
312 | // | |
313 | // Fetch G_1 := (1/A_1) truncated to 21 sig. bits. | |
314 | // floating pt. Again, fetching is done using index_1. A_1 | |
315 | // explains how G_1 is defined. | |
316 | // | |
317 | // Calculate X_1 := X_0 * Z_1 truncated to lsb = 2^(-14) | |
318 | // = 1.0 0 0 0 d_5 ... d_14 | |
c0c3f78a | 319 | // This is accomplished by integer multiplication. |
d5efd131 MF |
320 | // It is proved that X_1 indeed always begins |
321 | // with 1.0000 in fixed point. | |
322 | // | |
323 | // | |
324 | // Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1 | |
325 | // truncated to lsb = 2^(-8). Similar to A_1, | |
326 | // A_2 is not needed in actual implementation. It | |
327 | // helps explain how some of the values are defined. | |
328 | // | |
329 | // Define index_2 := [ d_5 d_6 d_7 d_8 ]. | |
330 | // | |
331 | // Fetch Z_2 := (1/A_2) rounded UP in fixed point with | |
332 | // fixed point lsb = 2^(-15). Fetch done using index_2. | |
333 | // Z_2 looks like z_0.z_1 z_2 ... z_15 | |
334 | // | |
335 | // Fetch G_2 := (1/A_2) truncated to 21 sig. bits. | |
336 | // floating pt. | |
337 | // | |
338 | // Calculate X_2 := X_1 * Z_2 truncated to lsb = 2^(-14) | |
339 | // = 1.0 0 0 0 0 0 0 0 d_9 d_10 ... d_14 | |
c0c3f78a | 340 | // This is accomplished by integer multiplication. |
d5efd131 MF |
341 | // It is proved that X_2 indeed always begins |
342 | // with 1.00000000 in fixed point. | |
343 | // | |
344 | // | |
345 | // Define A_3 := 1.0 0 0 0 0 0 0 0 d_9 d_10 d_11 d_12 d_13 1. | |
346 | // This is 2^(-14) + X_2 truncated to lsb = 2^(-13). | |
347 | // | |
348 | // Define index_3 := [ d_9 d_10 d_11 d_12 d_13 ]. | |
349 | // | |
350 | // Fetch G_3 := (1/A_3) truncated to 21 sig. bits. | |
351 | // floating pt. Fetch is done using index_3. | |
352 | // | |
353 | // Compute G := G_1 * G_2 * G_3. | |
354 | // | |
355 | // This is done exactly since each of G_j only has 21 sig. bits. | |
356 | // | |
357 | // Compute | |
358 | // | |
359 | // r := (G*S_hi - 1) | |
360 | // | |
361 | // | |
362 | // Step 2. Approximation | |
363 | // --------------------- | |
364 | // | |
365 | // This step computes an approximation to logl( 1 + r ) where r is the | |
366 | // reduced argument just obtained. It is proved that |r| <= 1.9*2^(-13); | |
367 | // thus logl(1+r) can be approximated by a short polynomial: | |
368 | // | |
369 | // logl(1+r) ~=~ poly = r + Q1 r^2 + ... + Q4 r^5 | |
370 | // | |
371 | // | |
372 | // Step 3. Reconstruction | |
373 | // ---------------------- | |
374 | // | |
375 | // This step computes the desired result of logl(X): | |
376 | // | |
377 | // logl(X) = logl( 2^N * S_hi ) | |
378 | // = N*logl(2) + logl( S_hi ) | |
379 | // = N*logl(2) + logl(1/G) + | |
380 | // logl(1 + G*S_hi - 1 ) | |
381 | // | |
382 | // logl(2), logl(1/G_j) are stored as pairs of (single,double) numbers: | |
383 | // log2_hi, log2_lo, log1byGj_hi, log1byGj_lo. The high parts are | |
384 | // single-precision numbers and the low parts are double precision | |
385 | // numbers. These have the property that | |
386 | // | |
387 | // N*log2_hi + SUM ( log1byGj_hi ) | |
388 | // | |
389 | // is computable exactly in double-extended precision (64 sig. bits). | |
390 | // Finally | |
391 | // | |
392 | // lnHi(X) := N*log2_hi + SUM ( log1byGj_hi ) | |
393 | // lnLo(X) := poly_hi + [ poly_lo + | |
394 | // ( SUM ( log1byGj_lo ) + N*log2_lo ) ] | |
395 | // | |
396 | // | |
397 | //********************************************************************* | |
398 | // General Purpose Registers: symbolic names for the integer registers used below. Some names share one register (r25, r42, r56). | |
399 | // scratch registers (r2-r31) | |
400 | rPolDataPtr = r2 | |
401 | rLnSinDataPtr = r3 | |
402 | rExpX = r8 | |
403 | rSignifX = r9 | |
404 | rDelta = r10 | |
405 | rSignExpX = r11 | |
406 | // GR_ad_z_1 = r14 -- removed: dead binding, GR_ad_z_1 is reassigned to r30 below and nothing uses it in between | |
407 | r17Ones = r15 | |
408 | GR_Index1 = r16 | |
409 | rSignif1andQ = r17 | |
410 | GR_X_0 = r18 | |
411 | GR_X_1 = r19 | |
412 | GR_X_2 = r20 | |
413 | GR_Z_1 = r21 | |
414 | GR_Z_2 = r22 | |
415 | GR_N = r23 | |
416 | rExpHalf = r24 | |
417 | rExp8 = r25 | |
418 | rX0Dx = r25 | |
419 | GR_ad_tbl_1 = r26 | |
420 | GR_ad_tbl_2 = r27 | |
421 | GR_ad_tbl_3 = r28 | |
422 | GR_ad_q = r29 | |
423 | GR_ad_z_1 = r30 | |
424 | GR_ad_z_2 = r31 | |
425 | // stacked registers | |
426 | rPFS_SAVED = r32 | |
427 | GR_ad_z_3 = r33 | |
428 | rSgnGamAddr = r34 | |
429 | rSgnGamSize = r35 | |
430 | rLogDataPtr = r36 | |
431 | rZ1offsett = r37 | |
432 | rTmpPtr = r38 | |
433 | rTmpPtr2 = r39 | |
434 | rTmpPtr3 = r40 | |
435 | rExp2 = r41 | |
436 | rExp2tom7 = r42 | |
437 | rZ625 = r42 | |
438 | rExpOne = r43 | |
439 | rNegSingularity = r44 | |
440 | rXint = r45 | |
441 | rTbl1Addr = r46 | |
442 | rTbl2Addr = r47 | |
443 | rTbl3Addr = r48 | |
444 | rZ2Addr = r49 | |
445 | rRootsAddr = r50 | |
446 | rRootsBndAddr = r51 | |
447 | rRoot = r52 | |
448 | rRightBound = r53 | |
449 | rLeftBound = r54 | |
450 | rSignifDx = r55 | |
451 | rBernulliPtr = r56 | |
452 | rLnSinTmpPtr = r56 | |
453 | rIndex1Dx = r57 | |
454 | rIndexPol = r58 | |
455 | GR_Index3 = r59 | |
456 | GR_Index2 = r60 | |
457 | rSgnGam = r61 | |
458 | rXRnd = r62 | |
459 | ||
460 | GR_SAVE_B0 = r63 | |
461 | GR_SAVE_GP = r64 | |
462 | GR_SAVE_PFS = r65 | |
463 | // output parameters when calling error handling routine | |
464 | GR_Parameter_X = r66 | |
465 | GR_Parameter_Y = r67 | |
466 | GR_Parameter_RESULT = r68 | |
467 | GR_Parameter_TAG = r69 | |
468 | ||
469 | //******************************************************************** | |
470 | // Floating Point Registers: symbolic names for the FP registers used below | |
471 | // CAUTION: registers are scarce, so many names below alias the same physical register, and the code | |
472 | // sometimes makes "unconventional" use of the declared registers | |
473 | // | |
474 | fAbsX = f6 | |
475 | fDelX4 = f6 | |
476 | fSignifX = f7 | |
477 | // macros for error handling routine (FR_X shares f10 with fB4 and fX4 declared below) | |
478 | FR_X = f10 // first argument | |
479 | FR_Y = f1 // second argument (lgammal has just one) | |
480 | FR_RESULT = f8 // result | |
481 | ||
482 | // First 7 Bernoulli numbers (B2 ... B14); each register has further aliases | |
483 | fB2 = f9 | |
484 | fLnDeltaL = f9 | |
485 | fXSqr = f9 | |
486 | fB4 = f10 | |
487 | fX4 = f10 | |
488 | fB6 = f11 | |
489 | fX6 = f11 | |
490 | fB8 = f12 | |
491 | fXSqrL = f12 | |
492 | fB10 = f13 | |
493 | fRes7H = f13 | |
494 | fB12 = f14 | |
495 | fRes7L = f14 | |
496 | fB14 = f15 | |
497 | ||
498 | // stack registers | |
499 | // Polynomial coefficients: A0, ..., A25 (the "L" suffix names the low part of a coefficient pair) | |
500 | fA0 = f32 | |
501 | fA0L = f33 | |
502 | fInvXL = f33 | |
503 | fA1 = f34 | |
504 | fA1L = f35 | |
505 | fA2 = f36 | |
506 | fA2L = f37 | |
507 | fA3 = f38 | |
508 | fA3L = f39 | |
509 | fA4 = f40 | |
510 | fA4L = f41 | |
511 | fRes6H = f41 | |
512 | fA5 = f42 | |
513 | fB2L = f42 | |
514 | fA5L = f43 | |
515 | fMinNegStir = f43 | |
516 | fRes6L = f43 | |
517 | fA6 = f44 | |
518 | fMaxNegStir = f44 | |
519 | fA7 = f45 | |
520 | fLnDeltaH = f45 | |
521 | fA8 = f46 | |
522 | fBrnL = f46 | |
523 | fA9 = f47 | |
524 | fBrnH = f47 | |
525 | fA10 = f48 | |
526 | fRes5L = f48 | |
527 | fA11 = f49 | |
528 | fRes5H = f49 | |
529 | fA12 = f50 | |
530 | fDx6 = f50 | |
531 | fA13 = f51 | |
532 | fDx8 = f51 | |
533 | fA14 = f52 | |
534 | fDx4 = f52 | |
535 | fA15 = f53 | |
536 | fYL = f53 | |
537 | fh3Dx = f53 | |
538 | fA16 = f54 | |
539 | fYH = f54 | |
540 | fH3Dx = f54 | |
541 | fA17 = f55 | |
542 | fResLnDxL = f55 | |
543 | fG3Dx = f55 | |
544 | fA18 = f56 | |
545 | fResLnDxH = f56 | |
546 | fh2Dx = f56 | |
547 | fA19 = f57 | |
548 | fFloatNDx = f57 | |
549 | fA20 = f58 | |
550 | fPolyHiDx = f58 | |
551 | fhDx = f58 | |
552 | fA21 = f59 | |
553 | fRDxCub = f59 | |
554 | fHDx = f59 | |
555 | fA22 = f60 | |
556 | fRDxSq = f60 | |
557 | fGDx = f60 | |
558 | fA23 = f61 | |
559 | fPolyLoDx = f61 | |
560 | fInvX3 = f61 | |
561 | fA24 = f62 | |
562 | fRDx = f62 | |
563 | fInvX8 = f62 | |
564 | fA25 = f63 | |
565 | fInvX4 = f63 | |
566 | fPol = f64 | |
567 | fPolL = f65 | |
568 | // Coefficients of the polynomial approximating ln(sin(Pi*x)/(Pi*x)): even powers |x|^2 ... |x|^36 | |
569 | fLnSin2 = f66 | |
570 | fLnSin2L = f67 | |
571 | fLnSin4 = f68 | |
572 | fLnSin6 = f69 | |
573 | fLnSin8 = f70 | |
574 | fLnSin10 = f71 | |
575 | fLnSin12 = f72 | |
576 | fLnSin14 = f73 | |
577 | fLnSin16 = f74 | |
578 | fLnSin18 = f75 | |
579 | fDelX8 = f75 | |
580 | fLnSin20 = f76 | |
581 | fLnSin22 = f77 | |
582 | fDelX6 = f77 | |
583 | fLnSin24 = f78 | |
584 | fLnSin26 = f79 | |
585 | fLnSin28 = f80 | |
586 | fLnSin30 = f81 | |
587 | fhDelX = f81 | |
588 | fLnSin32 = f82 | |
589 | fLnSin34 = f83 | |
590 | fLnSin36 = f84 | |
591 | fXint = f85 | |
592 | fDxSqr = f85 | |
593 | fRes3L = f86 | |
594 | fRes3H = f87 | |
595 | fRes4H = f88 | |
596 | fRes4L = f89 | |
597 | fResH = f90 | |
598 | fResL = f91 | |
599 | fDx = f92 | |
600 | FR_MHalf = f93 | |
601 | fRes1H = f94 | |
602 | fRes1L = f95 | |
603 | fRes2H = f96 | |
604 | fRes2L = f97 | |
605 | FR_FracX = f98 | |
606 | fRcpX = f99 | |
607 | fLnSinH = f99 | |
608 | fTwo = f100 | |
609 | fMOne = f100 | |
610 | FR_G = f101 | |
611 | FR_H = f102 | |
612 | FR_h = f103 | |
613 | FR_G2 = f104 | |
614 | FR_H2 = f105 | |
615 | FR_poly_lo = f106 | |
616 | FR_poly_hi = f107 | |
617 | FR_h2 = f108 | |
618 | FR_rsq = f109 | |
619 | FR_r = f110 | |
620 | FR_log2_hi = f111 | |
621 | FR_log2_lo = f112 | |
622 | fFloatN = f113 | |
623 | FR_Q4 = f114 | |
624 | FR_G3 = f115 | |
625 | FR_H3 = f116 | |
626 | FR_h3 = f117 | |
627 | FR_Q3 = f118 | |
628 | FR_Q2 = f119 | |
629 | FR_Q1 = f120 | |
630 | fThirteen = f121 | |
631 | fSix = f121 | |
632 | FR_rcub = f121 | |
633 | // Last three Bernoulli numbers (B16, B18, B20) | |
634 | fB16 = f122 | |
635 | fB18 = f123 | |
636 | fB20 = f124 | |
637 | fInvX = f125 | |
638 | fLnSinL = f125 | |
639 | fDxSqrL = f126 | |
640 | fFltIntX = f126 | |
641 | fRoot = f127 | |
642 | fNormDx = f127 | |
643 | ||
644 | // Data tables | |
645 | //============================================================== | |
646 | RODATA | |
647 | // ************* DO NOT CHANGE THE ORDER OF THESE TABLES ************* | |
648 | .align 16 | |
649 | LOCAL_OBJECT_START(lgammal_right_roots_data) | |
650 | // List of all right roots themselves: 18 entries, each a 64-bit significand followed by a data8 holding the sign/exponent word | |
651 | data8 0x9D3FE4B007C360AB, 0x0000C000 // Range [-3, -2] | |
652 | data8 0xC9306DE4F2CD7BEE, 0x0000C000 // Range [-4, -3] | |
653 | data8 0x814273C2CCAC0618, 0x0000C001 // Range [-5, -4] | |
654 | data8 0xA04352BF85B6C865, 0x0000C001 // Range [-6, -5] | |
655 | data8 0xC00B592C4BE4676C, 0x0000C001 // Range [-7, -6] | |
656 | data8 0xE0019FEF6FF0F5BF, 0x0000C001 // Range [-8, -7] | |
657 | data8 0x80001A01459FC9F6, 0x0000C002 // Range [-9, -8] | |
658 | data8 0x900002E3BB47D86D, 0x0000C002 // Range [-10, -9] | |
659 | data8 0xA0000049F93BB992, 0x0000C002 // Range [-11, -10] | |
660 | data8 0xB0000006B9915316, 0x0000C002 // Range [-12, -11] | |
661 | data8 0xC00000008F76C773, 0x0000C002 // Range [-13, -12] | |
662 | data8 0xD00000000B09230A, 0x0000C002 // Range [-14, -13] | |
663 | data8 0xE000000000C9CBA5, 0x0000C002 // Range [-15, -14] | |
664 | data8 0xF0000000000D73FA, 0x0000C002 // Range [-16, -15] | |
665 | data8 0x8000000000006BA0, 0x0000C003 // Range [-17, -16] | |
666 | data8 0x8800000000000655, 0x0000C003 // Range [-18, -17] | |
667 | data8 0x900000000000005A, 0x0000C003 // Range [-19, -18] | |
668 | data8 0x9800000000000005, 0x0000C003 // Range [-20, -19] | |
669 | // List of bounds of the ranges around each right root where the special polynomial approximation is used | |
670 | // Only significands of bounds are actually stored; two per root, the first one being the larger in magnitude | |
671 | data8 0xA000000000000000, 0x9800000000000000 // Bounds for root on [-3, -2] | |
672 | data8 0xCAB88035C5EFBB41, 0xC7E05E31F4B02115 // Bounds for root on [-4, -3] | |
673 | data8 0x817831B899735C72, 0x8114633941B8053A // Bounds for root on [-5, -4] | |
674 | data8 0xA04E8B34C6AA9476, 0xA039B4A42978197B // Bounds for root on [-6, -5] | |
675 | data8 0xC00D3D5E588A78A9, 0xC009BA25F7E858A6 // Bounds for root on [-7, -6] | |
676 | data8 0xE001E54202991EB4, 0xE001648416CE897F // Bounds for root on [-8, -7] | |
677 | data8 0x80001E56D13A6B9F, 0x8000164A3BAD888A // Bounds for root on [-9, -8] | |
678 | data8 0x9000035F0529272A, 0x9000027A0E3D94F0 // Bounds for root on [-10, -9] | |
679 | data8 0xA00000564D705880, 0xA000003F67EA0CC7 // Bounds for root on [-11, -10] | |
680 | data8 0xB0000007D87EE0EF, 0xB0000005C3A122A5 // Bounds for root on [-12, -11] | |
681 | data8 0xC0000000A75FE8B1, 0xC00000007AF818AC // Bounds for root on [-13, -12] | |
682 | data8 0xD00000000CDFFE36, 0xD000000009758BBF // Bounds for root on [-14, -13] | |
683 | data8 0xE000000000EB6D96, 0xE000000000ACF7B2 // Bounds for root on [-15, -14] | |
684 | data8 0xF0000000000FB1F9, 0xF0000000000B87FB // Bounds for root on [-16, -15] | |
685 | data8 0x8000000000007D90, 0x8000000000005C40 // Bounds for root on [-17, -16] | |
686 | data8 0x8800000000000763, 0x880000000000056D // Bounds for root on [-18, -17] | |
687 | data8 0x9000000000000069, 0x900000000000004D // Bounds for root on [-19, -18] | |
688 | data8 0x9800000000000006, 0x9800000000000005 // Bounds for root on [-20, -19] | |
689 | // List of all left roots themselves (kept in this same object despite its "right_roots" name); same entry layout as the right roots | |
690 | data8 0xAFDA0850DEC8065E, 0x0000C000 // Range [-3, -2] | |
691 | data8 0xFD238AA3E17F285C, 0x0000C000 // Range [-4, -3] | |
692 | data8 0x9FBABBD37757E6A2, 0x0000C001 // Range [-5, -4] | |
693 | data8 0xBFF497AC8FA06AFC, 0x0000C001 // Range [-6, -5] | |
694 | data8 0xDFFE5FBB5C377FE8, 0x0000C001 // Range [-7, -6] | |
695 | data8 0xFFFFCBFC0ACE7879, 0x0000C001 // Range [-8, -7] | |
696 | data8 0x8FFFFD1C425E8100, 0x0000C002 // Range [-9, -8] | |
697 | data8 0x9FFFFFB606BDFDCD, 0x0000C002 // Range [-10, -9] | |
698 | data8 0xAFFFFFF9466E9F1B, 0x0000C002 // Range [-11, -10] | |
699 | data8 0xBFFFFFFF70893874, 0x0000C002 // Range [-12, -11] | |
700 | data8 0xCFFFFFFFF4F6DCF6, 0x0000C002 // Range [-13, -12] | |
701 | data8 0xDFFFFFFFFF36345B, 0x0000C002 // Range [-14, -13] | |
702 | data8 0xEFFFFFFFFFF28C06, 0x0000C002 // Range [-15, -14] | |
703 | data8 0xFFFFFFFFFFFF28C0, 0x0000C002 // Range [-16, -15] | |
704 | data8 0x87FFFFFFFFFFF9AB, 0x0000C003 // Range [-17, -16] | |
705 | data8 0x8FFFFFFFFFFFFFA6, 0x0000C003 // Range [-18, -17] | |
706 | data8 0x97FFFFFFFFFFFFFB, 0x0000C003 // Range [-19, -18] | |
707 | data8 0x0000000000000000, 0x00000000 // pad to keep logic in the main path (18th slot: no significant left root on [-20, -19]) | |
708 | // List of bounds of the ranges around each left root where the special polynomial approximation is used | |
709 | // Only significands of bounds are actually stored; 17 pairs, no pad entry for [-20, -19] | |
710 | data8 0xB235880944CC758E, 0xADD2F1A9FBE76C8B // Bounds for root on [-3, -2] | |
711 | data8 0xFD8E7844F307B07C, 0xFCA655C2152BDE4D // Bounds for root on [-4, -3] | |
712 | data8 0x9FC4D876EE546967, 0x9FAEE4AF68BC4292 // Bounds for root on [-5, -4] | |
713 | data8 0xBFF641FFBFCC44F1, 0xBFF2A47919F4BA89 // Bounds for root on [-6, -5] | |
714 | data8 0xDFFE9C803DEFDD59, 0xDFFE18932EB723FE // Bounds for root on [-7, -6] | |
715 | data8 0xFFFFD393FA47AFC3, 0xFFFFC317CF638AE1 // Bounds for root on [-8, -7] | |
716 | data8 0x8FFFFD8840279925, 0x8FFFFC9DCECEEE92 // Bounds for root on [-9, -8] | |
717 | data8 0x9FFFFFC0D34E2AF8, 0x9FFFFFA9619AA3B7 // Bounds for root on [-10, -9] | |
718 | data8 0xAFFFFFFA41C18246, 0xAFFFFFF82025A23C // Bounds for root on [-11, -10] | |
719 | data8 0xBFFFFFFF857ACB4E, 0xBFFFFFFF58032378 // Bounds for root on [-12, -11] | |
720 | data8 0xCFFFFFFFF6934AB8, 0xCFFFFFFFF313EF0A // Bounds for root on [-13, -12] | |
721 | data8 0xDFFFFFFFFF53A9E9, 0xDFFFFFFFFF13B5A5 // Bounds for root on [-14, -13] | |
722 | data8 0xEFFFFFFFFFF482CB, 0xEFFFFFFFFFF03F4F // Bounds for root on [-15, -14] | |
723 | data8 0xFFFFFFFFFFFF482D, 0xFFFFFFFFFFFF03F5 // Bounds for root on [-16, -15] | |
724 | data8 0x87FFFFFFFFFFFA98, 0x87FFFFFFFFFFF896 // Bounds for root on [-17, -16] | |
725 | data8 0x8FFFFFFFFFFFFFB3, 0x8FFFFFFFFFFFFF97 // Bounds for root on [-18, -17] | |
726 | data8 0x97FFFFFFFFFFFFFC, 0x97FFFFFFFFFFFFFB // Bounds for root on [-19, -18] | |
727 | LOCAL_OBJECT_END(lgammal_right_roots_data) | |
728 | ||
729 | LOCAL_OBJECT_START(lgammal_0_Half_data) | |
730 | // Polynomial coefficients for the lgammal(x), 0.0 < |x| < 0.5. A3, A2, A1 (in that order) are (hi, lo) pairs of doubles; A4..A22 are significand + sign/exponent words | |
731 | data8 0xBFD9A4D55BEAB2D6, 0xBC8AA3C097746D1F //A3 | |
732 | data8 0x3FEA51A6625307D3, 0x3C7180E7BD2D0DCC //A2 | |
733 | data8 0xBFE2788CFC6FB618, 0xBC9E9346C4692BCC //A1 | |
734 | data8 0x8A8991563EC1BD13, 0x00003FFD //A4 | |
735 | data8 0xD45CE0BD52C27EF2, 0x0000BFFC //A5 | |
736 | data8 0xADA06587FA2BBD47, 0x00003FFC //A6 | |
737 | data8 0x9381D0ED2194902A, 0x0000BFFC //A7 | |
738 | data8 0x80859B3CF92D4192, 0x00003FFC //A8 | |
739 | data8 0xE4033517C622A946, 0x0000BFFB //A9 | |
740 | data8 0xCD00CE67A51FC82A, 0x00003FFB //A10 | |
741 | data8 0xBA44E2A96C3B5700, 0x0000BFFB //A11 | |
742 | data8 0xAAAD008FA46DBD99, 0x00003FFB //A12 | |
743 | data8 0x9D604AC65A41153D, 0x0000BFFB //A13 | |
744 | data8 0x917CECB864B5A861, 0x00003FFB //A14 | |
745 | data8 0x85A4810EB730FDE4, 0x0000BFFB //A15 | |
746 | data8 0xEF2761C38BD21F77, 0x00003FFA //A16 | |
747 | data8 0xC913043A128367DA, 0x0000BFFA //A17 | |
748 | data8 0x96A29B71FF7AFFAA, 0x00003FFA //A18 | |
749 | data8 0xBB9FFA1A5FE649BB, 0x0000BFF9 //A19 | |
750 | data8 0xB17982CD2DAA0EE3, 0x00003FF8 //A20 | |
751 | data8 0xDE1DDCBFFB9453F0, 0x0000BFF6 //A21 | |
752 | data8 0x87FBF5D7ACD9FA9D, 0x00003FF4 //A22 | |
753 | LOCAL_OBJECT_END(lgammal_0_Half_data) | |
754 | ||
755 | LOCAL_OBJECT_START(Constants_Q) | |
756 | // Six entries, in this order: log2_hi, log2_lo, Q_4, Q_3, Q_2, and Q_1. Each row is four data4 words (significand low half, significand high half, sign/exponent word, zero pad), which looks like the 80-bit extended layout; decoded that way the values are about 0.693147 (ln 2, high part), -1.9e-9 (ln 2, low correction), 1/5, -1/4, 1/3 and -1/2. NOTE(review): confirm the layout against the code that loads this table | |
757 | data4 0x00000000,0xB1721800,0x00003FFE,0x00000000 | |
758 | data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000 | |
759 | data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000 | |
760 | data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000 | |
761 | data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000 | |
762 | data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000 | |
763 | LOCAL_OBJECT_END(Constants_Q) | |
764 | ||
765 | LOCAL_OBJECT_START(Constants_Z_1) | |
766 | // Z1 - 16 bit fixed-point values, one per 32-bit data4 word, 16 entries (i = 0..15). The entries match ceil(2^15 / (1 + i/16)), i.e. reciprocals of 1 + i/16 scaled by 2^15 | |
767 | data4 0x00008000 | |
768 | data4 0x00007879 | |
769 | data4 0x000071C8 | |
770 | data4 0x00006BCB | |
771 | data4 0x00006667 | |
772 | data4 0x00006187 | |
773 | data4 0x00005D18 | |
774 | data4 0x0000590C | |
775 | data4 0x00005556 | |
776 | data4 0x000051EC | |
777 | data4 0x00004EC5 | |
778 | data4 0x00004BDB | |
779 | data4 0x00004925 | |
780 | data4 0x0000469F | |
781 | data4 0x00004445 | |
782 | data4 0x00004211 | |
783 | LOCAL_OBJECT_END(Constants_Z_1) | |
784 | ||
785 | LOCAL_OBJECT_START(Constants_G_H_h1) | |
786 | // G1 and H1 - IEEE single and h1 - IEEE double | |
787 | data4 0x3F800000,0x00000000,0x00000000,0x00000000 | |
788 | data4 0x3F70F0F0,0x3D785196,0x617D741C,0x3DA163A6 | |
789 | data4 0x3F638E38,0x3DF13843,0xCBD3D5BB,0x3E2C55E6 | |
790 | data4 0x3F579430,0x3E2FF9A0,0xD86EA5E7,0xBE3EB0BF | |
791 | data4 0x3F4CCCC8,0x3E647FD6,0x86B12760,0x3E2E6A8C | |
792 | data4 0x3F430C30,0x3E8B3AE7,0x5C0739BA,0x3E47574C | |
793 | data4 0x3F3A2E88,0x3EA30C68,0x13E8AF2F,0x3E20E30F | |
794 | data4 0x3F321640,0x3EB9CEC8,0xF2C630BD,0xBE42885B | |
795 | data4 0x3F2AAAA8,0x3ECF9927,0x97E577C6,0x3E497F34 | |
796 | data4 0x3F23D708,0x3EE47FC5,0xA6B0A5AB,0x3E3E6A6E | |
797 | data4 0x3F1D89D8,0x3EF8947D,0xD328D9BE,0xBDF43E3C | |
798 | data4 0x3F17B420,0x3F05F3A1,0x0ADB090A,0x3E4094C3 | |
799 | data4 0x3F124920,0x3F0F4303,0xFC1FE510,0xBE28FBB2 | |
800 | data4 0x3F0D3DC8,0x3F183EBF,0x10FDE3FA,0x3E3A7895 | |
801 | data4 0x3F088888,0x3F20EC80,0x7CC8C98F,0x3E508CE5 | |
802 | data4 0x3F042108,0x3F29516A,0xA223106C,0xBE534874 | |
803 | LOCAL_OBJECT_END(Constants_G_H_h1) | |
804 | ||
805 | LOCAL_OBJECT_START(Constants_Z_2) | |
806 | // Z2 - 16 bit fixed-point values, one per 32-bit data4 word, 16 entries (i = 0..15). The entries match ceil(2^15 / (1 + i/256)), i.e. reciprocals of 1 + i/256 scaled by 2^15 | |
807 | data4 0x00008000 | |
808 | data4 0x00007F81 | |
809 | data4 0x00007F02 | |
810 | data4 0x00007E85 | |
811 | data4 0x00007E08 | |
812 | data4 0x00007D8D | |
813 | data4 0x00007D12 | |
814 | data4 0x00007C98 | |
815 | data4 0x00007C20 | |
816 | data4 0x00007BA8 | |
817 | data4 0x00007B31 | |
818 | data4 0x00007ABB | |
819 | data4 0x00007A45 | |
820 | data4 0x000079D1 | |
821 | data4 0x0000795D | |
822 | data4 0x000078EB | |
823 | LOCAL_OBJECT_END(Constants_Z_2) | |
824 | ||
825 | LOCAL_OBJECT_START(Constants_G_H_h2) | |
826 | // G2 and H2 - IEEE single and h2 - IEEE double | |
827 | data4 0x3F800000,0x00000000,0x00000000,0x00000000 | |
828 | data4 0x3F7F00F8,0x3B7F875D,0x22C42273,0x3DB5A116 | |
829 | data4 0x3F7E03F8,0x3BFF015B,0x21F86ED3,0x3DE620CF | |
830 | data4 0x3F7D08E0,0x3C3EE393,0x484F34ED,0xBDAFA07E | |
831 | data4 0x3F7C0FC0,0x3C7E0586,0x3860BCF6,0xBDFE07F0 | |
832 | data4 0x3F7B1880,0x3C9E75D2,0xA78093D6,0x3DEA370F | |
833 | data4 0x3F7A2328,0x3CBDC97A,0x72A753D0,0x3DFF5791 | |
834 | data4 0x3F792FB0,0x3CDCFE47,0xA7EF896B,0x3DFEBE6C | |
835 | data4 0x3F783E08,0x3CFC15D0,0x409ECB43,0x3E0CF156 | |
836 | data4 0x3F774E38,0x3D0D874D,0xFFEF71DF,0xBE0B6F97 | |
837 | data4 0x3F766038,0x3D1CF49B,0x5D59EEE8,0xBE080483 | |
838 | data4 0x3F757400,0x3D2C531D,0xA9192A74,0x3E1F91E9 | |
839 | data4 0x3F748988,0x3D3BA322,0xBF72A8CD,0xBE139A06 | |
840 | data4 0x3F73A0D0,0x3D4AE46F,0xF8FBA6CF,0x3E1D9202 | |
841 | data4 0x3F72B9D0,0x3D5A1756,0xBA796223,0xBE1DCCC4 | |
842 | data4 0x3F71D488,0x3D693B9D,0xB6B7C239,0xBE049391 | |
843 | LOCAL_OBJECT_END(Constants_G_H_h2) | |
844 | ||
845 | LOCAL_OBJECT_START(Constants_G_H_h3) | |
846 | // G3 and H3 - IEEE single and h3 - IEEE double | |
847 | data4 0x3F7FFC00,0x38800100,0x562224CD,0x3D355595 | |
848 | data4 0x3F7FF400,0x39400480,0x06136FF6,0x3D8200A2 | |
849 | data4 0x3F7FEC00,0x39A00640,0xE8DE9AF0,0x3DA4D68D | |
850 | data4 0x3F7FE400,0x39E00C41,0xB10238DC,0xBD8B4291 | |
851 | data4 0x3F7FDC00,0x3A100A21,0x3B1952CA,0xBD89CCB8 | |
852 | data4 0x3F7FD400,0x3A300F22,0x1DC46826,0xBDB10707 | |
853 | data4 0x3F7FCC08,0x3A4FF51C,0xF43307DB,0x3DB6FCB9 | |
854 | data4 0x3F7FC408,0x3A6FFC1D,0x62DC7872,0xBD9B7C47 | |
855 | data4 0x3F7FBC10,0x3A87F20B,0x3F89154A,0xBDC3725E | |
856 | data4 0x3F7FB410,0x3A97F68B,0x62B9D392,0xBD93519D | |
857 | data4 0x3F7FAC18,0x3AA7EB86,0x0F21BD9D,0x3DC18441 | |
858 | data4 0x3F7FA420,0x3AB7E101,0x2245E0A6,0xBDA64B95 | |
859 | data4 0x3F7F9C20,0x3AC7E701,0xAABB34B8,0x3DB4B0EC | |
860 | data4 0x3F7F9428,0x3AD7DD7B,0x6DC40A7E,0x3D992337 | |
861 | data4 0x3F7F8C30,0x3AE7D474,0x4F2083D3,0x3DC6E17B | |
862 | data4 0x3F7F8438,0x3AF7CBED,0x811D4394,0x3DAE314B | |
863 | data4 0x3F7F7C40,0x3B03E1F3,0xB08F2DB1,0xBDD46F21 | |
864 | data4 0x3F7F7448,0x3B0BDE2F,0x6D34522B,0xBDDC30A4 | |
865 | data4 0x3F7F6C50,0x3B13DAAA,0xB1F473DB,0x3DCB0070 | |
866 | data4 0x3F7F6458,0x3B1BD766,0x6AD282FD,0xBDD65DDC | |
867 | data4 0x3F7F5C68,0x3B23CC5C,0xF153761A,0xBDCDAB83 | |
868 | data4 0x3F7F5470,0x3B2BC997,0x341D0F8F,0xBDDADA40 | |
869 | data4 0x3F7F4C78,0x3B33C711,0xEBC394E8,0x3DCD1BD7 | |
870 | data4 0x3F7F4488,0x3B3BBCC6,0x52E3E695,0xBDC3532B | |
871 | data4 0x3F7F3C90,0x3B43BAC0,0xE846B3DE,0xBDA3961E | |
872 | data4 0x3F7F34A0,0x3B4BB0F4,0x785778D4,0xBDDADF06 | |
873 | data4 0x3F7F2CA8,0x3B53AF6D,0xE55CE212,0x3DCC3ED1 | |
874 | data4 0x3F7F24B8,0x3B5BA620,0x9E382C15,0xBDBA3103 | |
875 | data4 0x3F7F1CC8,0x3B639D12,0x5C5AF197,0x3D635A0B | |
876 | data4 0x3F7F14D8,0x3B6B9444,0x71D34EFC,0xBDDCCB19 | |
877 | data4 0x3F7F0CE0,0x3B7393BC,0x52CD7ADA,0x3DC74502 | |
878 | data4 0x3F7F04F0,0x3B7B8B6D,0x7D7F2A42,0xBDB68F17 | |
879 | LOCAL_OBJECT_END(Constants_G_H_h3) | |
880 | ||
881 | LOCAL_OBJECT_START(lgammal_data) | |
882 | // Positive overflow value: a 64-bit significand followed by the sign/exponent word. Decoded as an 80-bit extended number it is about 1.444 * 2^16370; presumably the argument bound above which lgammal(x) overflows -- NOTE(review): confirm against the code that loads it | |
883 | data8 0xB8D54C8BFFFDEBF4, 0x00007FF1 | |
884 | LOCAL_OBJECT_END(lgammal_data) | |
885 | ||
886 | LOCAL_OBJECT_START(lgammal_Stirling) | |
887 | // Coefficients needed for Stirling's formula | |
888 | data8 0x3FED67F1C864BEB4 // High part of 0.5*ln(2*Pi) | |
889 | data8 0x3C94D252F2400510 // Low part of 0.5*ln(2*Pi) | |
890 | // | |
891 | // Bernoulli-number terms of Stirling's series (1/12, -1/360, 1/1260, ...) used in Stirling's formula for -2^63 < |x| < -13.0 (NOTE(review): as written this bound on |x| can never hold; presumably -2^63 < x < -13.0 is meant, in line with the x-based range given for the polynomial coefficients later in this object -- confirm) | |
892 | //(B1H, B1L) = 8.3333333333333333333262747254e-02 | |
893 | data8 0x3FB5555555555555, 0x3C55555555555555 | |
894 | data8 0xB60B60B60B60B60B, 0x0000BFF6 //B2 = -2.7777777777777777777777777778e-03 | |
895 | data8 0xD00D00D00D00D00D, 0x00003FF4 //B3 = 7.9365079365079365079365079365e-04 | |
896 | data8 0x9C09C09C09C09C0A, 0x0000BFF4 //B4 = -5.9523809523809523809523809524e-04 | |
897 | data8 0xDCA8F158C7F91AB8, 0x00003FF4 //B5 = 8.4175084175084175084175084175e-04 | |
898 | data8 0xFB5586CCC9E3E410, 0x0000BFF5 //B6 = -1.9175269175269175269175269175e-03 | |
899 | data8 0xD20D20D20D20D20D, 0x00003FF7 //B7 = 6.4102564102564102564102564103e-03 | |
900 | data8 0xF21436587A9CBEE1, 0x0000BFF9 //B8 = -2.9550653594771241830065359477e-02 | |
901 | data8 0xB7F4B1C0F033FFD1, 0x00003FFC //B9 = 1.7964437236883057316493849002e-01 | |
902 | data8 0xB23B3808C0F9CF6E, 0x0000BFFF //B10 = -1.3924322169059011164274322169e+00 | |
903 | // Polynomial coefficients for Stirling's formula, -13.0 < x < -6.0 | |
904 | data8 0x3FB5555555555555, 0x3C4D75060289C58B //A0 | |
905 | data8 0xB60B60B60B0F0876, 0x0000BFF6 //A1 | |
906 | data8 0xD00D00CE54B1256C, 0x00003FF4 //A2 | |
907 | data8 0x9C09BF46B58F75E1, 0x0000BFF4 //A3 | |
908 | data8 0xDCA8483BC91ACC6D, 0x00003FF4 //A4 | |
909 | data8 0xFB3965C939CC9FEE, 0x0000BFF5 //A5 | |
910 | data8 0xD0723ADE3F0BC401, 0x00003FF7 //A6 | |
911 | data8 0xE1ED7434E81F0B73, 0x0000BFF9 //A7 | |
912 | data8 0x8069C6982F993283, 0x00003FFC //A8 | |
913 | data8 0xC271F65BFA5BEE3F, 0x0000BFFD //A9 | |
914 | LOCAL_OBJECT_END(lgammal_Stirling) | |
915 | ||
916 | LOCAL_OBJECT_START(lgammal_lnsin_data) | |
917 | // polynomial approximation of -ln(sin(Pi*x)/(Pi*x)), 0 < x <= 0.5 | |
918 | data8 0x3FFA51A6625307D3, 0x3C81873332FAF94C //A2 | |
919 | data8 0x8A8991563EC241C3, 0x00003FFE //A4 | |
920 | data8 0xADA06588061805DF, 0x00003FFD //A6 | |
921 | data8 0x80859B57C338D0F7, 0x00003FFD //A8 | |
922 | data8 0xCD00F1C2D78754BD, 0x00003FFC //A10 | |
923 | data8 0xAAB56B1D3A1F4655, 0x00003FFC //A12 | |
924 | data8 0x924B6F2FBBED12B1, 0x00003FFC //A14 | |
925 | data8 0x80008E58765F43FC, 0x00003FFC //A16 | |
926 | data8 0x3FBC718EC115E429//A18 | |
927 | data8 0x3FB99CE544FE183E//A20 | |
928 | data8 0x3FB7251C09EAAD89//A22 | |
929 | data8 0x3FB64A970733628C//A24 | |
930 | data8 0x3FAC92D6802A3498//A26 | |
931 | data8 0x3FC47E1165261586//A28 | |
932 | data8 0xBFCA1BAA434750D4//A30 | |
933 | data8 0x3FE460001C4D5961//A32 | |
934 | data8 0xBFE6F06A3E4908AD//A34 | |
935 | data8 0x3FE300889EBB203A//A36 | |
936 | LOCAL_OBJECT_END(lgammal_lnsin_data) | |
937 | ||
938 | LOCAL_OBJECT_START(lgammal_half_3Q_data) | |
939 | // Polynomial coefficients for the lgammal(x), 0.5 <= x < 0.75 | |
940 | data8 0xBFF7A648EE90C62E, 0x3C713F326857E066 // A3, A0L | |
941 | data8 0xBFF73E4B8BA780AE, 0xBCA953BC788877EF // A1, A1L | |
942 | data8 0x403774DCD58D0291, 0xC0415254D5AE6623 // D0, D1 | |
943 | data8 0x40B07213855CBFB0, 0xC0B8855E25D2D229 // C20, C21 | |
944 | data8 0x3FFB359F85FF5000, 0x3C9BAECE6EF9EF3A // A2, A2L | |
945 | data8 0x3FD717D498A3A8CC, 0xBC9088E101CFEDFA // A0, A3L | |
946 | data8 0xAFEF36CC5AEC3FF0, 0x00004002 // E6 | |
947 | data8 0xABE2054E1C34E791, 0x00004001 // E4 | |
948 | data8 0xB39343637B2900D1, 0x00004000 // E2 | |
949 | data8 0xD74FB710D53F58F6, 0x00003FFF // E0 | |
950 | data8 0x4070655963BA4256, 0xC078DA9D263C4EA3 // D6, D7 | |
951 | data8 0x405CD2B6A9B90978, 0xC065B3B9F4F4F171 // D4, D5 | |
952 | data8 0x4049BC2204CF61FF, 0xC05337227E0BA152 // D2, D3 | |
953 | data8 0x4095509A50C07A96, 0xC0A0747949D2FB45 // C18, C19 | |
954 | data8 0x4082ECCBAD709414, 0xC08CD02FB088A702 // C16, C17 | |
955 | data8 0xFFE4B2A61B508DD5, 0x0000C002 // E7 | |
956 | data8 0xF461ADB8AE17E0A5, 0x0000C001 // E5 | |
957 | data8 0xF5BE8B0B90325F20, 0x0000C000 // E3 | |
958 | data8 0x877B275F3FB78DCA, 0x0000C000 // E1 | |
959 | LOCAL_OBJECT_END(lgammal_half_3Q_data) | |
960 | ||
961 | LOCAL_OBJECT_START(lgammal_half_3Q_neg_data) | |
962 | // Polynomial coefficients for the lgammal(x), -0.75 < x <= -0.5 | |
963 | data8 0xC014836EFD94899C, 0x3C9835679663B44F // A3, A0L | |
964 | data8 0xBFF276C7B4FB1875, 0xBC92D3D9FA29A1C0 // A1, A1L | |
965 | data8 0x40C5178F24E1A435, 0xC0D9DE84FBC5D76A // D0, D1 | |
966 | data8 0x41D4D1B236BF6E93, 0xC1EBB0445CE58550 // C20, C21 | |
967 | data8 0x4015718CD67F63D3, 0x3CC5354B6F04B59C // A2, A2L | |
968 | data8 0x3FF554493087E1ED, 0xBCB72715E37B02B9 // A0, A3L | |
969 | data8 0xE4AC7E915FA72229, 0x00004009 // E6 | |
970 | data8 0xA28244206395FCC6, 0x00004007 // E4 | |
971 | data8 0xFB045F19C07B2544, 0x00004004 // E2 | |
972 | data8 0xE5C8A6E6A9BA7D7B, 0x00004002 // E0 | |
973 | data8 0x4143943B55BF5118, 0xC158AC05EA675406 // D6, D7 | |
974 | data8 0x4118F6833D19717C, 0xC12F51A6F375CC80 // D4, D5 | |
975 | data8 0x40F00C209483481C, 0xC103F1DABF750259 // D2, D3 | |
976 | data8 0x4191038F2D8F9E40, 0xC1A413066DA8AE4A // C18, C19 | |
977 | data8 0x4170B537EDD833DE, 0xC1857E79424C61CE // C16, C17 | |
978 | data8 0x8941D8AB4855DB73, 0x0000C00B // E7 | |
979 | data8 0xBB822B131BD2E813, 0x0000C008 // E5 | |
980 | data8 0x852B4C03B83D2D4F, 0x0000C006 // E3 | |
981 | data8 0xC754CA7E2DDC0F1F, 0x0000C003 // E1 | |
982 | LOCAL_OBJECT_END(lgammal_half_3Q_neg_data) | |
983 | ||
984 | LOCAL_OBJECT_START(lgammal_2Q_4_data) | |
985 | // Polynomial coefficients for the lgammal(x), 2.25 <= |x| < 4.0 | |
986 | data8 0xBFCA4D55BEAB2D6F, 0x3C7ABC9DA14141F5 // A3, A0L | |
987 | data8 0x3FFD8773039049E7, 0x3C66CB7957A95BA4 // A1, A1L | |
988 | data8 0x3F45C3CC79E91E7D, 0xBF3A8E5005937E97 // D0, D1 | |
989 | data8 0x3EC951E35E1C9203, 0xBEB030A90026C5DF // C20, C21 | |
990 | data8 0x3FE94699894C1F4C, 0x3C91884D21D123F1 // A2, A2L | |
991 | data8 0x3FE62E42FEFA39EF, 0xBC66480CEB70870F // A0, A3L | |
992 | data8 0xF1C2EAFF0B3A7579, 0x00003FF5 // E6 | |
993 | data8 0xB36AF863926B55A3, 0x00003FF7 // E4 | |
994 | data8 0x9620656185BB44CA, 0x00003FF9 // E2 | |
995 | data8 0xA264558FB0906AFF, 0x00003FFB // E0 | |
996 | data8 0x3F03D59E9666C961, 0xBEF91115893D84A6 // D6, D7 | |
997 | data8 0x3F19333611C46225, 0xBF0F89EB7D029870 // D4, D5 | |
998 | data8 0x3F3055A96B347AFE, 0xBF243B5153E178A8 // D2, D3 | |
999 | data8 0x3ED9A4AEF30C4BB2, 0xBED388138B1CEFF2 // C18, C19 | |
1000 | data8 0x3EEF7945A3C3A254, 0xBEE36F32A938EF11 // C16, C17 | |
1001 | data8 0x9028923F47C82118, 0x0000BFF5 // E7 | |
1002 | data8 0xCE0DAAFB6DC93B22, 0x0000BFF6 // E5 | |
1003 | data8 0xA0D0983B34AC4C8D, 0x0000BFF8 // E3 | |
1004 | data8 0x94D6C50FEB8B0CE7, 0x0000BFFA // E1 | |
1005 | LOCAL_OBJECT_END(lgammal_2Q_4_data) | |
1006 | ||
1007 | LOCAL_OBJECT_START(lgammal_4_8_data) | |
1008 | // Polynomial coefficients for the lgammal(x), 4.0 <= |x| < 8.0 | |
1009 | data8 0xBFD6626BC9B31B54, 0x3CAA53C82493A92B // A3, A0L | |
1010 | data8 0x401B4C420A50AD7C, 0x3C8C6E9929F789A3 // A1, A1L | |
1011 | data8 0x3F49410427E928C2, 0xBF3E312678F8C146 // D0, D1 | |
1012 | data8 0x3ED51065F7CD5848, 0xBED052782A03312F // C20, C21 | |
1013 | data8 0x3FF735973273D5EC, 0x3C831DFC65BF8CCF // A2, A2L | |
1014 | data8 0x401326643C4479C9, 0xBC6FA0498C5548A6 // A0, A3L | |
1015 | data8 0x9382D8B3CD4EB7E3, 0x00003FF6 // E6 | |
1016 | data8 0xE9F92CAD8A85CBCD, 0x00003FF7 // E4 | |
1017 | data8 0xD58389FE38258CEC, 0x00003FF9 // E2 | |
1018 | data8 0x81310136363AE8AA, 0x00003FFC // E0 | |
1019 | data8 0x3F04F0AE38E78570, 0xBEF9E2144BB8F03C // D6, D7 | |
1020 | data8 0x3F1B5E992A6CBC2A, 0xBF10F3F400113911 // D4, D5 | |
1021 | data8 0x3F323EE00AAB7DEE, 0xBF2640FDFA9FB637 // D2, D3 | |
1022 | data8 0x3ED2143EBAFF067A, 0xBEBBDEB92D6FF35D // C18, C19 | |
1023 | data8 0x3EF173A42B69AAA4, 0xBEE78B9951A2EAA5 // C16, C17 | |
1024 | data8 0xAB3CCAC6344E52AA, 0x0000BFF5 // E7 | |
1025 | data8 0x81ACCB8915B16508, 0x0000BFF7 // E5 | |
1026 | data8 0xDA62C7221102C426, 0x0000BFF8 // E3 | |
1027 | data8 0xDF1BD44C4083580A, 0x0000BFFA // E1 | |
1028 | LOCAL_OBJECT_END(lgammal_4_8_data) | |
1029 | ||
1030 | LOCAL_OBJECT_START(lgammal_loc_min_data) | |
1031 | // Polynomial coefficients for the lgammal(x), 1.3125 <= x < 1.5625 | |
1032 | data8 0xBB16C31AB5F1FB71, 0x00003FFF // xMin - point of local minimum | |
1033 | data8 0xBFC2E4278DC6BC23, 0xBC683DA8DDCA9650 // A3, A0L | |
1034 | data8 0x3BD4DB7D0CA61D5F, 0x386E719EDD01D801 // A1, A1L | |
1035 | data8 0x3F4CC72638E1D93F, 0xBF4228EC9953CCB9 // D0, D1 | |
1036 | data8 0x3ED222F97A04613E,0xBED3DDD58095CB6C // C20, C21 | |
1037 | data8 0x3FDEF72BC8EE38AB, 0x3C863AFF3FC48940 // A2, A2L | |
1038 | data8 0xBFBF19B9BCC38A41, 0xBC7425F1BFFC1442// A0, A3L | |
1039 | data8 0x941890032BEB34C3, 0x00003FF6 // E6 | |
1040 | data8 0xC7E701591CE534BC, 0x00003FF7 // E4 | |
1041 | data8 0x93373CBD05138DD4, 0x00003FF9 // E2 | |
1042 | data8 0x845A14A6A81C05D6, 0x00003FFB // E0 | |
1043 | data8 0x3F0F6C4DF6D47A13, 0xBF045DCDB5B49E19 // D6, D7 | |
1044 | data8 0x3F22E23345DDE59C, 0xBF1851159AFB1735 // D4, D5 | |
1045 | data8 0x3F37101EA4022B78, 0xBF2D721E6323AF13 // D2, D3 | |
1046 | data8 0x3EE691EBE82DF09D, 0xBEDD42550961F730 // C18, C19 | |
1047 | data8 0x3EFA793EDE99AD85, 0xBEF14000108E70BE // C16, C17 | |
1048 | data8 0xB7CBC033ACE0C99C, 0x0000BFF5 // E7 | |
1049 | data8 0xF178D1F7B1A45E27, 0x0000BFF6 // E5 | |
1050 | data8 0xA8FCFCA8106F471C, 0x0000BFF8 // E3 | |
1051 | data8 0x864D46FA898A9AD2, 0x0000BFFA // E1 | |
1052 | LOCAL_OBJECT_END(lgammal_loc_min_data) | |
1053 | ||
1054 | LOCAL_OBJECT_START(lgammal_03Q_1Q_data) | |
1055 | // Polynomial coefficients for the lgammal(x), 0.75 <= |x| < 1.3125 | |
1056 | data8 0x3FD151322AC7D848, 0x3C7184DE0DB7B4EE // A4, A2L | |
1057 | data8 0x3FD9A4D55BEAB2D6, 0x3C9E934AAB10845F // A3, A1L | |
1058 | data8 0x3FB111289C381259, 0x3FAFFFCFB32AE18D // D2, D3 | |
1059 | data8 0x3FB3B1D9E0E3E00D, 0x3FB2496F0D3768DF // D0, D1 | |
1060 | data8 0xBA461972C057D439, 0x00003FFB // E6 | |
1061 | data8 0x3FEA51A6625307D3, 0x3C76ABC886A72DA2 // A2, A4L | |
1062 | data8 0x3FA8EFE46B32A70E, 0x3F8F31B3559576B6 // C17, C20 | |
1063 | data8 0xE403383700387D85, 0x00003FFB // E4 | |
1064 | data8 0x9381D0EE74BF7251, 0x00003FFC // E2 | |
1065 | data8 0x3FAA2177A6D28177, 0x3FA4895E65FBD995 // C18, C19 | |
1066 | data8 0x3FAAED2C77DBEE5D, 0x3FA94CA59385512C // D6, D7 | |
1067 | data8 0x3FAE1F522E8A5941, 0x3FAC785EF56DD87E // D4, D5 | |
1068 | data8 0x3FB556AD5FA56F0A, 0x3FA81F416E87C783 // E7, C16 | |
1069 | data8 0xCD00F1C2DC2C9F1E, 0x00003FFB // E5 | |
1070 | data8 0x3FE2788CFC6FB618, 0x3C8E52519B5B17CB // A1, A3L | |
1071 | data8 0x80859B57C3E7F241, 0x00003FFC // E3 | |
1072 | data8 0xADA065880615F401, 0x00003FFC // E1 | |
1073 | data8 0xD45CE0BD530AB50E, 0x00003FFC // E0 | |
1074 | LOCAL_OBJECT_END(lgammal_03Q_1Q_data) | |
1075 | ||
1076 | LOCAL_OBJECT_START(lgammal_13Q_2Q_data) | |
1077 | // Polynomial coefficients for the lgammal(x), 1.5625 <= |x| < 2.25 | |
1078 | data8 0x3F951322AC7D8483, 0x3C71873D88C6539D // A4, A2L | |
1079 | data8 0xBFB13E001A557606, 0x3C56CB907018A101 // A3, A1L | |
1080 | data8 0xBEC11B2EC1E7F6FC, 0x3EB0064ED9824CC7 // D2, D3 | |
1081 | data8 0xBEE3CBC963EC103A, 0x3ED2597A330C107D // D0, D1 | |
1082 | data8 0xBC6F2DEBDFE66F38, 0x0000BFF0 // E6 | |
1083 | data8 0x3FD4A34CC4A60FA6, 0x3C3AFC9BF775E8A0 // A2, A4L | |
1084 | data8 0x3E48B0C542F85B32, 0xBE347F12EAF787AB // C17, C20 | |
1085 | data8 0xE9FEA63B6984FA1E, 0x0000BFF2 // E4 | |
1086 | data8 0x9C562E15FC703BBF, 0x0000BFF5 // E2 | |
1087 | data8 0xBE3C12A50AB0355E, 0xBE1C941626AE4717 // C18, C19 | |
1088 | data8 0xBE7AFA8714342BC4,0x3E69A12D2B7761CB // D6, D7 | |
1089 | data8 0xBE9E25EF1D526730, 0x3E8C762291889B99 // D4, D5 | |
1090 | data8 0x3EF580DCEE754733, 0xBE57C811D070549C // E7, C16 | |
1091 | data8 0xD093D878BE209C98, 0x00003FF1 // E5 | |
1092 | data8 0x3FDB0EE6072093CE, 0xBC6024B9E81281C4 // A1, A3L | |
1093 | data8 0x859B57C31CB77D96, 0x00003FF4 // E3 | |
1094 | data8 0xBD6EB756DB617E8D, 0x00003FF6 // E1 | |
1095 | data8 0xF2027E10C7AF8C38, 0x0000BFF7 // E0 | |
1096 | LOCAL_OBJECT_END(lgammal_13Q_2Q_data) | |
1097 | ||
1098 | LOCAL_OBJECT_START(lgammal_8_10_data) | |
1099 | // Polynomial coefficients for the lgammal(x), 8.0 <= |x| < 10.0 | |
1100 | // Multi Precision terms | |
1101 | data8 0x40312008A3A23E5C, 0x3CE020B4F2E4083A //A1 | |
1102 | data8 0x4025358E82FCB70C, 0x3CD4A5A74AF7B99C //A0 | |
1103 | // Native precision terms | |
1104 | data8 0xF0AA239FFBC616D2, 0x00004000 //A2 | |
1105 | data8 0x96A8EA798FE57D66, 0x0000BFFF //A3 | |
1106 | data8 0x8D501B7E3B9B9BDB, 0x00003FFE //A4 | |
1107 | data8 0x9EE062401F4B1DC2, 0x0000BFFD //A5 | |
1108 | data8 0xC63FD8CD31E93431, 0x00003FFC //A6 | |
1109 | data8 0x8461101709C23C30, 0x0000BFFC //A7 | |
1110 | data8 0xB96D7EA7EF3648B2, 0x00003FFB //A8 | |
1111 | data8 0x86886759D2ACC906, 0x0000BFFB //A9 | |
1112 | data8 0xC894B6E28265B183, 0x00003FFA //A10 | |
1113 | data8 0x98C4348CAD821662, 0x0000BFFA //A11 | |
1114 | data8 0xEC9B092226A94DF2, 0x00003FF9 //A12 | |
1115 | data8 0xB9F169FF9B98CDDC, 0x0000BFF9 //A13 | |
1116 | data8 0x9A3A32BB040894D3, 0x00003FF9 //A14 | |
1117 | data8 0xF9504CCC1003B3C3, 0x0000BFF8 //A15 | |
1118 | LOCAL_OBJECT_END(lgammal_8_10_data) | |
1119 | ||
1120 | LOCAL_OBJECT_START(lgammal_03Q_6_data) | |
1121 | // Polynomial coefficients for the lgammal(x), 0.75 <= |x| < 1.0 | |
1122 | data8 0xBFBC47DCA479E295, 0xBC607E6C1A379D55 //A3 | |
1123 | data8 0x3FCA051C372609ED, 0x3C7B02D73EB7D831 //A0 | |
1124 | data8 0xBFE15FAFA86B04DB, 0xBC3F52EE4A8945B5 //A1 | |
1125 | data8 0x3FD455C4FF28F0BF, 0x3C75F8C6C99F30BB //A2 | |
1126 | data8 0xD2CF04CD934F03E1, 0x00003FFA //A4 | |
1127 | data8 0xDB4ED667E29256E1, 0x0000BFF9 //A5 | |
1128 | data8 0xF155A33A5B6021BF, 0x00003FF8 //A6 | |
1129 | data8 0x895E9B9D386E0338, 0x0000BFF8 //A7 | |
1130 | data8 0xA001BE94B937112E, 0x00003FF7 //A8 | |
1131 | data8 0xBD82846E490ED048, 0x0000BFF6 //A9 | |
1132 | data8 0xE358D24EC30DBB5D, 0x00003FF5 //A10 | |
1133 | data8 0x89C4F3652446B78B, 0x0000BFF5 //A11 | |
1134 | data8 0xA86043E10280193D, 0x00003FF4 //A12 | |
1135 | data8 0xCF3A2FBA61EB7682, 0x0000BFF3 //A13 | |
1136 | data8 0x3F300900CC9200EC //A14 | |
1137 | data8 0xBF23F42264B94AE8 //A15 | |
1138 | data8 0x3F18EEF29895FE73 //A16 | |
1139 | data8 0xBF0F3C4563E3EDFB //A17 | |
1140 | data8 0x3F0387DBBC385056 //A18 | |
1141 | data8 0xBEF81B4004F92900 //A19 | |
1142 | data8 0x3EECA6692A9A5B81 //A20 | |
1143 | data8 0xBEDF61A0059C15D3 //A21 | |
1144 | data8 0x3ECDA9F40DCA0111 //A22 | |
1145 | data8 0xBEB60FE788217BAF //A23 | |
1146 | data8 0x3E9661D795DFC8C6 //A24 | |
1147 | data8 0xBE66C7756A4EDEE5 //A25 | |
1148 | // Polynomial coefficients for the lgammal(x), 1.0 <= |x| < 2.0 | |
1149 | data8 0xBFC1AE55B180726B, 0xBC7DE1BC478453F5 //A3 | |
1150 | data8 0xBFBEEB95B094C191, 0xBC53456FF6F1C9D9 //A0 | |
1151 | data8 0x3FA2AED059BD608A, 0x3C0B65CC647D557F //A1 | |
1152 | data8 0x3FDDE9E64DF22EF2, 0x3C8993939A8BA8E4 //A2 | |
1153 | data8 0xF07C206D6B100CFF, 0x00003FFA //A4 | |
1154 | data8 0xED2CEA9BA52FE7FB, 0x0000BFF9 //A5 | |
1155 | data8 0xFCE51CED52DF3602, 0x00003FF8 //A6 | |
1156 | data8 0x8D45D27872326619, 0x0000BFF8 //A7 | |
1157 | data8 0xA2B78D6BCEBE27F7, 0x00003FF7 //A8 | |
1158 | data8 0xBF6DC0996A895B6F, 0x0000BFF6 //A9 | |
1159 | data8 0xE4B9AD335AF82D79, 0x00003FF5 //A10 | |
1160 | data8 0x8A451880195362A1, 0x0000BFF5 //A11 | |
1161 | data8 0xA8BE35E63089A7A9, 0x00003FF4 //A12 | |
1162 | data8 0xCF7FA175FA11C40C, 0x0000BFF3 //A13 | |
1163 | data8 0x3F300C282FAA3B02 //A14 | |
1164 | data8 0xBF23F6AEBDA68B80 //A15 | |
1165 | data8 0x3F18F6860E2224DD //A16 | |
1166 | data8 0xBF0F542B3CE32F28 //A17 | |
1167 | data8 0x3F039436218C9BF8 //A18 | |
1168 | data8 0xBEF8AE6307677AEC //A19 | |
1169 | data8 0x3EF0B55527B3A211 //A20 | |
1170 | data8 0xBEE576AC995E7605 //A21 | |
1171 | data8 0x3ED102DDC1365D2D //A22 | |
1172 | data8 0xBEC442184F97EA54 //A23 | |
1173 | data8 0x3ED4D2283DFE5FC6 //A24 | |
1174 | data8 0xBECB9219A9B46787 //A25 | |
1175 | // Polynomial coefficients for the lgammal(x), 2.0 <= |x| < 3.0 | |
1176 | data8 0xBFCA4D55BEAB2D6F, 0xBC66F80E5BFD5AF5 //A3 | |
1177 | data8 0x3FE62E42FEFA39EF, 0x3C7ABC9E3B347E3D //A0 | |
1178 | data8 0x3FFD8773039049E7, 0x3C66CB9007C426EA //A1 | |
1179 | data8 0x3FE94699894C1F4C, 0x3C918726EB111663 //A2 | |
1180 | data8 0xA264558FB0906209, 0x00003FFB //A4 | |
1181 | data8 0x94D6C50FEB902ADC, 0x0000BFFA //A5 | |
1182 | data8 0x9620656184243D17, 0x00003FF9 //A6 | |
1183 | data8 0xA0D0983B8BCA910B, 0x0000BFF8 //A7 | |
1184 | data8 0xB36AF8559B222BD3, 0x00003FF7 //A8 | |
1185 | data8 0xCE0DACB3260AE6E5, 0x0000BFF6 //A9 | |
1186 | data8 0xF1C2C0BF0437C7DB, 0x00003FF5 //A10 | |
1187 | data8 0x902A2F2F3AB74A92, 0x0000BFF5 //A11 | |
1188 | data8 0xAE05009B1B2C6E4C, 0x00003FF4 //A12 | |
1189 | data8 0xD5B71F6456D7D4CB, 0x0000BFF3 //A13 | |
1190 | data8 0x3F2F0351D71BC9C6 //A14 | |
1191 | data8 0xBF2B53BC56A3B793 //A15 | |
1192 | data8 0xBF18B12DC6F6B861 //A16 | |
1193 | data8 0xBF43EE6EB5215C2F //A17 | |
1194 | data8 0xBF5474787CDD455E //A18 | |
1195 | data8 0xBF642B503C9C060A //A19 | |
1196 | data8 0xBF6E07D1AA254AA3 //A20 | |
1197 | data8 0xBF71C785443AAEE8 //A21 | |
1198 | data8 0xBF6F67BF81B71052 //A22 | |
1199 | data8 0xBF63E4BCCF4FFABF //A23 | |
1200 | data8 0xBF50067F8C671D5A //A24 | |
1201 | data8 0xBF29C770D680A5AC //A25 | |
1202 | // Polynomial coefficients for the lgammal(x), 4.0 <= |x| < 6.0 | |
1203 | data8 0xBFD6626BC9B31B54, 0xBC85AABE08680902 //A3 | |
1204 | data8 0x401326643C4479C9, 0x3CAA53C26F31E364 //A0 | |
1205 | data8 0x401B4C420A50AD7C, 0x3C8C76D55E57DD8D //A1 | |
1206 | data8 0x3FF735973273D5EC, 0x3C83A0B78E09188A //A2 | |
1207 | data8 0x81310136363AAB6D, 0x00003FFC //A4 | |
1208 | data8 0xDF1BD44C4075C0E6, 0x0000BFFA //A5 | |
1209 | data8 0xD58389FE38D8D664, 0x00003FF9 //A6 | |
1210 | data8 0xDA62C7221D5B5F87, 0x0000BFF8 //A7 | |
1211 | data8 0xE9F92CAD0263E157, 0x00003FF7 //A8 | |
1212 | data8 0x81ACCB8606C165FE, 0x0000BFF7 //A9 | |
1213 | data8 0x9382D8D263D1C2A3, 0x00003FF6 //A10 | |
1214 | data8 0xAB3CCBA4C853B12C, 0x0000BFF5 //A11 | |
1215 | data8 0xCA0818BBCCC59296, 0x00003FF4 //A12 | |
1216 | data8 0xF18912691CBB5BD0, 0x0000BFF3 //A13 | |
1217 | data8 0x3F323EF5D8330339 //A14 | |
1218 | data8 0xBF2641132EA571F7 //A15 | |
1219 | data8 0x3F1B5D9576175CA9 //A16 | |
1220 | data8 0xBF10F56A689C623D //A17 | |
1221 | data8 0x3F04CACA9141A18D //A18 | |
1222 | data8 0xBEFA307AC9B4E85D //A19 | |
1223 | data8 0x3EF4B625939FBE32 //A20 | |
1224 | data8 0xBECEE6AC1420F86F //A21 | |
1225 | data8 0xBE9A95AE2E485964 //A22 | |
1226 | data8 0xBF039EF47F8C09BB //A23 | |
1227 | data8 0xBF05345957F7B7A9 //A24 | |
1228 | data8 0xBEF85AE6385D4CCC //A25 | |
1229 | // Polynomial coefficients for the lgammal(x), 3.0 <= |x| < 4.0 | |
1230 | data8 0xBFCA4D55BEAB2D6F, 0xBC667B20FF46C6A8 //A3 | |
1231 | data8 0x3FE62E42FEFA39EF, 0x3C7ABC9E3B398012 //A0 | |
1232 | data8 0x3FFD8773039049E7, 0x3C66CB9070238D77 //A1 | |
1233 | data8 0x3FE94699894C1F4C, 0x3C91873D8839B1CD //A2 | |
1234 | data8 0xA264558FB0906D7E, 0x00003FFB //A4 | |
1235 | data8 0x94D6C50FEB8AFD72, 0x0000BFFA //A5 | |
1236 | data8 0x9620656185B68F14, 0x00003FF9 //A6 | |
1237 | data8 0xA0D0983B34B7088A, 0x0000BFF8 //A7 | |
1238 | data8 0xB36AF863964AA440, 0x00003FF7 //A8 | |
1239 | data8 0xCE0DAAFB5497AFB8, 0x0000BFF6 //A9 | |
1240 | data8 0xF1C2EAFA79CC2864, 0x00003FF5 //A10 | |
1241 | data8 0x9028922A839572B8, 0x0000BFF5 //A11 | |
1242 | data8 0xAE1E62F870BA0278, 0x00003FF4 //A12 | |
1243 | data8 0xD4726F681E2ABA29, 0x0000BFF3 //A13 | |
1244 | data8 0x3F30559B9A02FADF //A14 | |
1245 | data8 0xBF243ADEB1266CAE //A15 | |
1246 | data8 0x3F19303B6F552603 //A16 | |
1247 | data8 0xBF0F768C288EC643 //A17 | |
1248 | data8 0x3F039D5356C21DE1 //A18 | |
1249 | data8 0xBEF81BCA8168E6BE //A19 | |
1250 | data8 0x3EEC74A53A06AD54 //A20 | |
1251 | data8 0xBEDED52D1A5DACDF //A21 | |
1252 | data8 0x3ECCB4C2C7087342 //A22 | |
1253 | data8 0xBEB4F1FAFDFF5C2F //A23 | |
1254 | data8 0x3E94C80B52D58904 //A24 | |
1255 | data8 0xBE64A328CBE92A27 //A25 | |
1256 | LOCAL_OBJECT_END(lgammal_03Q_6_data) | |
1257 | ||
1258 | LOCAL_OBJECT_START(lgammal_1pEps_data) | |
1259 | // Polynomial coefficients for the lgammal(x), 1 - 2^(-7) <= |x| < 1 + 2^(-7). A1..A4 are stored as a 64-bit significand plus a sign/exponent word, A5..A8 as single IEEE doubles | |
1260 | data8 0x93C467E37DB0C7A5, 0x00003FFE //A1 | |
1261 | data8 0xD28D3312983E9919, 0x00003FFE //A2 | |
1262 | data8 0xCD26AADF559A47E3, 0x00003FFD //A3 | |
1263 | data8 0x8A8991563EC22E81, 0x00003FFD //A4 | |
1264 | data8 0x3FCA8B9C168D52FE //A5 | |
1265 | data8 0x3FC5B40CB0696370 //A6 | |
1266 | data8 0x3FC270AC2229A65D //A7 | |
1267 | data8 0x3FC0110AF10FCBFC //A8 | |
1268 | // Polynomial coefficients for the log1p(x), - 2^(-7) <= |x| < 2^(-7); IEEE doubles listed from the highest-order term (P8) down to P1, with magnitudes 1/9, 1/8, ..., 1/2 and alternating signs | |
1269 | data8 0x3FBC71C71C71C71C //P8 ~ 1/9 | |
1270 | data8 0xBFC0000000000000 //P7 = -1/8 | |
1271 | data8 0x3FC2492492492492 //P6 ~ 1/7 | |
1272 | data8 0xBFC5555555555555 //P5 ~ -1/6 | |
1273 | data8 0x3FC999999999999A //P4 ~ 1/5 | |
1274 | data8 0xBFD0000000000000 //P3 = -1/4 | |
1275 | data8 0x3FD5555555555555 //P2 ~ 1/3 | |
1276 | data8 0xBFE0000000000000 //P1 = -1/2 | |
1277 | // short version of "lnsin" polynomial (even-order terms A2..A8 only, each a 64-bit significand plus a sign/exponent word) | |
1278 | data8 0xD28D3312983E9918, 0x00003FFF //A2 | |
1279 | data8 0x8A8991563EC241B6, 0x00003FFE //A4 | |
1280 | data8 0xADA06588061830A5, 0x00003FFD //A6 | |
1281 | data8 0x80859B57C31CB746, 0x00003FFD //A8 | |
1282 | LOCAL_OBJECT_END(lgammal_1pEps_data) | |
1283 | ||
1284 | LOCAL_OBJECT_START(lgammal_neg2andHalf_data) | |
1285 | // Polynomial coefficients for the lgammal(x), -2.005859375 <= x < -2.5 (NOTE(review): the bounds are inverted as written, since -2.5 < -2.005859375; presumably the interval between -2.5 and -2.005859375 is meant -- confirm which endpoints are inclusive) | |
1286 | data8 0xBF927781D4BB093A, 0xBC511D86D85B7045 // A3, A0L | |
1287 | data8 0x3FF1A68793DEFC15, 0x3C9852AE2DA7DEEF // A1, A1L | |
1288 | data8 0x408555562D45FAFD, 0xBF972CDAFE5FEFAD // D0, D1 | |
1289 | data8 0xC18682331EF492A5, 0xC1845E3E0D29606B // C20, C21 | |
1290 | data8 0x4013141822E16979, 0x3CCF8718B6E75F6C // A2, A2L | |
1291 | data8 0xBFACCBF9F5ED0F15, 0xBBDD1AEB73297401 // A0, A3L | |
1292 | data8 0xCCCDB17423046445, 0x00004006 // E6 | |
1293 | data8 0x800514E230A3A452, 0x00004005 // E4 | |
1294 | data8 0xAAE9A48EC162E76F, 0x00004003 // E2 | |
1295 | data8 0x81D4F88B3F3EA0FC, 0x00004002 // E0 | |
1296 | data8 0x40CF3F3E35238DA0, 0xC0F8B340945F1A7E // D6, D7 | |
1297 | data8 0x40BF89EC0BD609C6, 0xC095897242AEFEE2 // D4, D5 | |
1298 | data8 0x40A2482FF01DBC5C, 0xC02095E275FDCF62 // D2, D3 | |
1299 | data8 0xC1641354F2312A6A, 0xC17B3657F85258E9 // C18, C19 | |
1300 | data8 0xC11F964E9ECBE2C9, 0xC146D7A90F70696C // C16, C17 | |
1301 | data8 0xE7AECDE6AF8EA816, 0x0000BFEF // E7 | |
1302 | data8 0xD711252FEBBE1091, 0x0000BFEB // E5 | |
1303 | data8 0xE648BD10F8C43391, 0x0000BFEF // E3 | |
1304 | data8 0x948A1E78AA00A98D, 0x0000BFF4 // E1 | |
1305 | LOCAL_OBJECT_END(lgammal_neg2andHalf_data) | |
1306 | ||
1307 | LOCAL_OBJECT_START(lgammal_near_neg_half_data) | |
1308 | // Polynomial coefficients for the lgammal(x), -0.5 < x < -0.40625 | |
1309 | data8 0xBFC1AE55B180726C, 0x3C8053CD734E6A1D // A3, A0L | |
1310 | data8 0x3FA2AED059BD608A, 0x3C0CD3D2CDBA17F4 // A1, A1L | |
1311 | data8 0x40855554DBCD1E1E, 0x3F96C51AC2BEE9E1 // D0, D1 | |
1312 | data8 0xC18682331EF4927D, 0x41845E3E0D295DFC // C20, C21 | |
1313 | data8 0x4011DE9E64DF22EF, 0x3CA692B70DAD6B7B // A2, A2L | |
1314 | data8 0x3FF43F89A3F0EDD6, 0xBC4955AED0FA087D // A0, A3L | |
1315 | data8 0xCCCD3F1DF4A2C1DD, 0x00004006 // E6 | |
1316 | data8 0x80028ADE33C7FCD9, 0x00004005 // E4 | |
1317 | data8 0xAACA474E485507EF, 0x00004003 // E2 | |
1318 | data8 0x80F07C206D6B0ECD, 0x00004002 // E0 | |
1319 | data8 0x40CF3F3E33E83056, 0x40F8B340944633D9 // D6, D7 | |
1320 | data8 0x40BF89EC059931F0, 0x409589723307AD20 // D4, D5 | |
1321 | data8 0x40A2482FD0054824, 0x402095CE7F19D011 // D2, D3 | |
1322 | data8 0xC1641354F2313614, 0x417B3657F8525354 // C18, C19 | |
1323 | data8 0xC11F964E9ECFD21C, 0x4146D7A90F701836 // C16, C17 | |
1324 | data8 0x86A9C01F0EA11E5A, 0x0000BFF5 // E7 | |
1325 | data8 0xBF6D8469142881C0, 0x0000BFF6 // E5 | |
1326 | data8 0x8D45D277BA8255F1, 0x0000BFF8 // E3 | |
1327 | data8 0xED2CEA9BA528BCC3, 0x0000BFF9 // E1 | |
1328 | LOCAL_OBJECT_END(lgammal_near_neg_half_data) | |
1329 | ||
1330 | //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
1331 | ////////////// POLYNOMIAL COEFFICIENTS FOR "NEAR ROOTS" RANGES ///////////// | |
1332 | ////////////// THIS PART OF THE TABLE SHOULD BE ACCESSED ONLY RARELY ///////////// | |
1333 | //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
1334 | LOCAL_OBJECT_START(lgammal_right_roots_polynomial_data) | |
1335 | // Polynomial coefficients for right root on [-3, -2] | |
6f65e668 | 1336 | // Lgammal is approximated by polynomial within [-.056244 ; .158208 ] range |
d5efd131 MF |
1337 | data8 0xBBBD5E9DCD11030B, 0xB867411D9FF87DD4 //A0 |
1338 | data8 0x3FF83FE966AF535E, 0x3CAA21235B8A769A //A1 | |
1339 | data8 0x40136EEBB002F55C, 0x3CC3959A6029838E //A2 | |
1340 | data8 0xB4A5302C53C2BEDD, 0x00003FFF //A3 | |
1341 | data8 0x8B8C6BE504F2DA1C, 0x00004002 //A4 | |
1342 | data8 0xB99CFF02593B4D98, 0x00004001 //A5 | |
1343 | data8 0x4038D32F682AA1CF //A6 | |
1344 | data8 0x403809F04EE6C5B5 //A7 | |
1345 | data8 0x40548EAA81634CEE //A8 | |
1346 | data8 0x4059297ADB6BC03D //A9 | |
1347 | data8 0x407286FB8EC5C9DA //A10 | |
1348 | data8 0x407A92E05B744CFB //A11 | |
1349 | data8 0x4091A9D4144258CD //A12 | |
1350 | data8 0x409C4D01D24F367E //A13 | |
1351 | data8 0x40B1871B9A426A83 //A14 | |
1352 | data8 0x40BE51C48BD9A583 //A15 | |
1353 | data8 0x40D2140D0C6153E7 //A16 | |
1354 | data8 0x40E0FB2C989CE4A3 //A17 | |
1355 | data8 0x40E52739AB005641 //A18 | |
1356 | data8 0x41161E3E6DDF503A //A19 | |
1357 | // Polynomial coefficients for right root on [-4, -3] | |
6f65e668 | 1358 | // Lgammal is approximated by polynomial within [-.172797 ; .171573 ] range |
d5efd131 MF |
1359 | data8 0x3C172712B248E42E, 0x38CB8D17801A5D67 //A0 |
1360 | data8 0x401F20A65F2FAC54, 0x3CCB9EA1817A824E //A1 | |
1361 | data8 0x4039D4D2977150EF, 0x3CDA42E149B6276A //A2 | |
1362 | data8 0xE089B8926AE2D9CB, 0x00004005 //A3 | |
1363 | data8 0x933901EBBB586C37, 0x00004008 //A4 | |
1364 | data8 0xCCD319BED1CFA1CD, 0x0000400A //A5 | |
1365 | data8 0x40D293C3F78D3C37 //A6 | |
1366 | data8 0x40FBB97AA0B6DD02 //A7 | |
1367 | data8 0x41251EA3345E5EB9 //A8 | |
1368 | data8 0x415057F65C92E7B0 //A9 | |
1369 | data8 0x41799C865241B505 //A10 | |
1370 | data8 0x41A445209EFE896B //A11 | |
1371 | data8 0x41D02D21880C953B //A12 | |
1372 | data8 0x41F9FFDE8C63E16D //A13 | |
1373 | data8 0x422504DC8302D2BE //A14 | |
1374 | data8 0x425111BF18C95414 //A15 | |
1375 | data8 0x427BCBE74A2B8EF7 //A16 | |
1376 | data8 0x42A7256F59B286F7 //A17 | |
1377 | data8 0x42D462D1586DE61F //A18 | |
1378 | data8 0x42FBB1228D6C5118 //A19 | |
1379 | // Polynomial coefficients for right root on [-5, -4] | |
6f65e668 | 1380 | // Lgammal is approximated by polynomial within [-.163171 ; .161988 ] range |
d5efd131 MF |
1381 | data8 0x3C5840FBAFDEE5BB, 0x38CAC0336E8C490A //A0 |
1382 | data8 0x403ACA5CF4921642, 0x3CCEDCDDA5491E56 //A1 | |
1383 | data8 0x40744415CD813F8E, 0x3CFBFEBC17E39146 //A2 | |
1384 | data8 0xAACD88D954E3E1BD, 0x0000400B //A3 | |
1385 | data8 0xCB68C710D75ED802, 0x0000400F //A4 | |
1386 | data8 0x8130F5AB997277AC, 0x00004014 //A5 | |
1387 | data8 0x41855E3DBF99EBA7 //A6 | |
1388 | data8 0x41CD14FE49C49FC2 //A7 | |
1389 | data8 0x421433DCE281F07D //A8 | |
1390 | data8 0x425C8399C7A92B6F //A9 | |
1391 | data8 0x42A45FBE67840F1A //A10 | |
1392 | data8 0x42ED68D75F9E6C98 //A11 | |
1393 | data8 0x433567291C27E5BE //A12 | |
1394 | data8 0x437F5ED7A9D9FD28 //A13 | |
1395 | data8 0x43C720A65C8AB711 //A14 | |
1396 | data8 0x441120A6C1D40B9B //A15 | |
1397 | data8 0x44596F561F2D1CBE //A16 | |
1398 | data8 0x44A3507DA81D5C01 //A17 | |
1399 | data8 0x44EF06A31E39EEDF //A18 | |
1400 | data8 0x45333774C99F523F //A19 | |
1401 | // Polynomial coefficients for right root on [-6, -5] | |
6f65e668 | 1402 | // Lgammal is approximated by polynomial within [-.156450 ; .156126 ] range |
d5efd131 MF |
1403 | data8 0x3C71B82D6B2B3304, 0x3917186E3C0DC231 //A0 |
1404 | data8 0x405ED72E0829AE02, 0x3C960C25157980EB //A1 | |
1405 | data8 0x40BCECC32EC22F9B, 0x3D5D8335A32F019C //A2 | |
1406 | data8 0x929EC2B1FB931F17, 0x00004012 //A3 | |
1407 | data8 0xD112EF96D37316DE, 0x00004018 //A4 | |
1408 | data8 0x9F00BB9BB13416AB, 0x0000401F //A5 | |
1409 | data8 0x425F7D8D5BDCB223 //A6 | |
1410 | data8 0x42C9A8D00C776CC6 //A7 | |
1411 | data8 0x433557FD8C481424 //A8 | |
1412 | data8 0x43A209221A953EF0 //A9 | |
1413 | data8 0x440EDC98D5618AB7 //A10 | |
1414 | data8 0x447AABD25E367378 //A11 | |
1415 | data8 0x44E73DE20CC3B288 //A12 | |
1416 | data8 0x455465257B4E0BD8 //A13 | |
1417 | data8 0x45C2011532085353 //A14 | |
1418 | data8 0x462FEE4CC191945B //A15 | |
1419 | data8 0x469C63AEEFEF0A7F //A16 | |
1420 | data8 0x4709D045390A3810 //A17 | |
1421 | data8 0x4778D360873C9F64 //A18 | |
1422 | data8 0x47E26965BE9A682A //A19 | |
1423 | // Polynomial coefficients for right root on [-7, -6] | |
6f65e668 | 1424 | // Lgammal is approximated by polynomial within [-.154582 ; .154521 ] range |
d5efd131 MF |
1425 | data8 0x3C75F103A1B00A48, 0x391C041C190C726D //A0 |
1426 | data8 0x40869DE49E3AF2AA, 0x3D1C17E1F813063B //A1 | |
1427 | data8 0x410FCE23484CFD10, 0x3DB6F38C2F11DAB9 //A2 | |
1428 | data8 0xEF281D1E1BE2055A, 0x00004019 //A3 | |
1429 | data8 0xFCE3DA92AC55DFF8, 0x00004022 //A4 | |
1430 | data8 0x8E9EA838A20BD58E, 0x0000402C //A5 | |
1431 | data8 0x4354F21E2FB9E0C9 //A6 | |
1432 | data8 0x43E9500994CD4F09 //A7 | |
1433 | data8 0x447F3A2C23C033DF //A8 | |
1434 | data8 0x45139152656606D8 //A9 | |
1435 | data8 0x45A8D45F8D3BF2E8 //A10 | |
1436 | data8 0x463FD32110E5BFE5 //A11 | |
1437 | data8 0x46D490B3BDBAE0BE //A12 | |
1438 | data8 0x476AC3CAD905DD23 //A13 | |
1439 | data8 0x48018558217AD473 //A14 | |
1440 | data8 0x48970AF371D30585 //A15 | |
1441 | data8 0x492E6273A8BEFFE3 //A16 | |
1442 | data8 0x49C47CC9AE3F1073 //A17 | |
1443 | data8 0x4A5D38E8C35EFF45 //A18 | |
1444 | data8 0x4AF0123E89694CD8 //A19 | |
1445 | // Polynomial coefficients for right root on [-8, -7] | |
6f65e668 | 1446 | // Lgammal is approximated by polynomial within [-.154217 ; .154208 ] range |
d5efd131 MF |
1447 | data8 0xBCD2507D818DDD68, 0xB97F6940EA2871A0 //A0 |
1448 | data8 0x40B3B407AA387BCB, 0x3D6320238F2C43D1 //A1 | |
1449 | data8 0x41683E85DAAFBAC7, 0x3E148D085958EA3A //A2 | |
1450 | data8 0x9F2A95AF1E10A548, 0x00004022 //A3 | |
1451 | data8 0x92F21522F482300E, 0x0000402E //A4 | |
1452 | data8 0x90B51AB03A1F244D, 0x0000403A //A5 | |
1453 | data8 0x44628E1C70EF534F //A6 | |
1454 | data8 0x452393E2BC32D244 //A7 | |
1455 | data8 0x45E5164141F4BA0B //A8 | |
1456 | data8 0x46A712B3A8AF5808 //A9 | |
1457 | data8 0x47698FD36CEDD0F2 //A10 | |
1458 | data8 0x482C9AE6BBAA3637 //A11 | |
1459 | data8 0x48F023821857C8E9 //A12 | |
1460 | data8 0x49B2569053FC106F //A13 | |
1461 | data8 0x4A74F646D5C1604B //A14 | |
1462 | data8 0x4B3811CF5ABA4934 //A15 | |
1463 | data8 0x4BFBB5DD6C84E233 //A16 | |
1464 | data8 0x4CC05021086F637B //A17 | |
1465 | data8 0x4D8450A345B0FB49 //A18 | |
1466 | data8 0x4E43825848865DB2 //A19 | |
1467 | // Polynomial coefficients for right root on [-9, -8] | |
6f65e668 | 1468 | // Lgammal is approximated by polynomial within [-.154160 ; .154158 ] range |
d5efd131 MF |
1469 | data8 0x3CDF4358564F2B46, 0x397969BEE6042F81 //A0 |
1470 | data8 0x40E3B088FED67721, 0x3D82787BA937EE85 //A1 | |
1471 | data8 0x41C83A3893550EF4, 0x3E542ED57E244DA8 //A2 | |
1472 | data8 0x9F003C6DC56E0B8E, 0x0000402B //A3 | |
1473 | data8 0x92BDF64A3213A699, 0x0000403A //A4 | |
1474 | data8 0x9074F503AAD417AF, 0x00004049 //A5 | |
1475 | data8 0x4582843E1313C8CD //A6 | |
1476 | data8 0x467387BD6A7826C1 //A7 | |
1477 | data8 0x4765074E788CF440 //A8 | |
1478 | data8 0x4857004DD9D1E09D //A9 | |
1479 | data8 0x4949792ED7530EAF //A10 | |
1480 | data8 0x4A3C7F089A292ED3 //A11 | |
1481 | data8 0x4B30125BF0AABB86 //A12 | |
1482 | data8 0x4C224175195E307E //A13 | |
1483 | data8 0x4D14DC4C8B32C08D //A14 | |
1484 | data8 0x4E07F1DB2786197E //A15 | |
1485 | data8 0x4EFB8EA1C336DACB //A16 | |
1486 | data8 0x4FF03797EACD0F23 //A17 | |
1487 | data8 0x50E4304A8E68A730 //A18 | |
1488 | data8 0x51D3618FB2EC9F93 //A19 | |
1489 | // Polynomial coefficients for right root on [-10, -9] | |
6f65e668 | 1490 | // Lgammal is approximated by polynomial within [-.154152 ; .154152 ] range |
d5efd131 MF |
1491 | data8 0x3D42F34DA97ECF0C, 0x39FD1256F345B0D0 //A0 |
1492 | data8 0x4116261203919787, 0x3DC12D44055588EB //A1 | |
1493 | data8 0x422EA8F32FB7FE99, 0x3ED849CE4E7B2D77 //A2 | |
1494 | data8 0xE25BAF73477A57B5, 0x00004034 //A3 | |
1495 | data8 0xEB021FD10060504A, 0x00004046 //A4 | |
1496 | data8 0x8220A208EE206C5F, 0x00004059 //A5 | |
1497 | data8 0x46B2C3903EC9DA14 //A6 | |
1498 | data8 0x47D64393744B9C67 //A7 | |
1499 | data8 0x48FAF79CCDC604DD //A8 | |
1500 | data8 0x4A20975DB8061EBA //A9 | |
1501 | data8 0x4B44AB9CBB38DB21 //A10 | |
1502 | data8 0x4C6A032F60094FE9 //A11 | |
1503 | data8 0x4D908103927634B4 //A12 | |
1504 | data8 0x4EB516CA21D30861 //A13 | |
1505 | data8 0x4FDB1BF12C58D318 //A14 | |
1506 | data8 0x510180AAE094A553 //A15 | |
1507 | data8 0x5226A8F2A2D45D57 //A16 | |
1508 | data8 0x534E00B6B0C8B809 //A17 | |
1509 | data8 0x5475022FE21215B2 //A18 | |
1510 | data8 0x5596B02BF6C5E19B //A19 | |
1511 | // Polynomial coefficients for right root on [-11, -10] | |
6f65e668 | 1512 | // Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range |
d5efd131 MF |
1513 | data8 0x3D7AA9C2E2B1029C, 0x3A15FB37578544DB //A0 |
1514 | data8 0x414BAF825A0C91D4, 0x3DFB9DA2CE398747 //A1 | |
1515 | data8 0x4297F3EC8AE0AF03, 0x3F34208B55FB8781 //A2 | |
1516 | data8 0xDD0C97D3197F56DE, 0x0000403E //A3 | |
1517 | data8 0x8F6F3AF7A5499674, 0x00004054 //A4 | |
1518 | data8 0xC68DA1AF6D878EEB, 0x00004069 //A5 | |
1519 | data8 0x47F1E4E1E2197CE0 //A6 | |
1520 | data8 0x494A8A28E597C3EB //A7 | |
1521 | data8 0x4AA4175D0D35D705 //A8 | |
1522 | data8 0x4BFEE6F0AF69E814 //A9 | |
1523 | data8 0x4D580FE7B3DBB3C6 //A10 | |
1524 | data8 0x4EB2ECE60E4608AF //A11 | |
1525 | data8 0x500E04BE3E2B4F24 //A12 | |
1526 | data8 0x5167F9450F0FB8FD //A13 | |
1527 | data8 0x52C342BDE747603F //A14 | |
1528 | data8 0x541F1699D557268C //A15 | |
1529 | data8 0x557927C5F079864E //A16 | |
1530 | data8 0x56D4D10FEEDB030C //A17 | |
1531 | data8 0x5832385DF86AD28A //A18 | |
1532 | data8 0x598898914B4D6523 //A19 | |
1533 | // Polynomial coefficients for right root on [-12, -11] | |
6f65e668 | 1534 | // Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range |
d5efd131 MF |
1535 | data8 0xBD96F61647C58B03, 0xBA3ABB0C2A6C755B //A0 |
1536 | data8 0x418308A82714B70D, 0x3E1088FC6A104C39 //A1 | |
1537 | data8 0x4306A493DD613C39, 0x3FB2341ECBF85741 //A2 | |
1538 | data8 0x8FA8FE98339474AB, 0x00004049 //A3 | |
1539 | data8 0x802CCDF570BA7942, 0x00004062 //A4 | |
1540 | data8 0xF3F748AF11A32890, 0x0000407A //A5 | |
1541 | data8 0x493E3B567EF178CF //A6 | |
1542 | data8 0x4ACED38F651BA362 //A7 | |
1543 | data8 0x4C600B357337F946 //A8 | |
1544 | data8 0x4DF0F71A52B54CCF //A9 | |
1545 | data8 0x4F8229F3B9FA2C70 //A10 | |
1546 | data8 0x5113A4C4979B770E //A11 | |
1547 | data8 0x52A56BC367F298D5 //A12 | |
1548 | data8 0x543785CF31842DC0 //A13 | |
1549 | data8 0x55C9FC37E3E40896 //A14 | |
1550 | data8 0x575CD5D1BA556C82 //A15 | |
1551 | data8 0x58F00A7AD99A9E08 //A16 | |
1552 | data8 0x5A824088688B008D //A17 | |
1553 | data8 0x5C15F75EF7E08EBD //A18 | |
1554 | data8 0x5DA462EA902F0C90 //A19 | |
1555 | // Polynomial coefficients for right root on [-13, -12] | |
6f65e668 | 1556 | // Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range |
d5efd131 MF |
1557 | data8 0x3DC3191752ACFC9D, 0x3A26CB6629532DBF //A0 |
1558 | data8 0x41BC8CFC051191BD, 0x3E68A84DA4E62AF2 //A1 | |
1559 | data8 0x43797926294A0148, 0x400F345FF3723CFF //A2 | |
1560 | data8 0xF26D2AF700B82625, 0x00004053 //A3 | |
1561 | data8 0xA238B24A4B1F7B15, 0x00004070 //A4 | |
1562 | data8 0xE793B5C0A41A264F, 0x0000408C //A5 | |
1563 | data8 0x4A9585BDDACE863D //A6 | |
1564 | data8 0x4C6075953448088A //A7 | |
1565 | data8 0x4E29B2F38D1FC670 //A8 | |
1566 | data8 0x4FF4619B079C440F //A9 | |
1567 | data8 0x51C05DAE118D8AD9 //A10 | |
1568 | data8 0x538A8C7F87326AD4 //A11 | |
1569 | data8 0x5555B6937588DAB3 //A12 | |
1570 | data8 0x5721E1F8B6E6A7DB //A13 | |
1571 | data8 0x58EDA1D7A77DD6E5 //A14 | |
1572 | data8 0x5AB8A9616B7DC9ED //A15 | |
1573 | data8 0x5C84942AA209ED17 //A16 | |
1574 | data8 0x5E518FC34C6F54EF //A17 | |
1575 | data8 0x601FB3F17BCCD9A0 //A18 | |
1576 | data8 0x61E61128D512FE97 //A19 | |
1577 | // Polynomial coefficients for right root on [-14, -13] | |
6f65e668 | 1578 | // Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range |
d5efd131 MF |
1579 | data8 0xBE170D646421B3F5, 0xBAAD95F79FCB5097 //A0 |
1580 | data8 0x41F7328CBFCD9AC7, 0x3E743B8B1E8AEDB1 //A1 | |
1581 | data8 0x43F0D0FA2DBDA237, 0x40A0422D6A227B55 //A2 | |
1582 | data8 0x82082DF2D32686CC, 0x0000405F //A3 | |
1583 | data8 0x8D64EE9B42E68B43, 0x0000407F //A4 | |
1584 | data8 0xA3FFD82E08C5F1F1, 0x0000409F //A5 | |
1585 | data8 0x4BF8C49D99123454 //A6 | |
1586 | data8 0x4DFEC79DDF11342F //A7 | |
1587 | data8 0x50038615A892F6BD //A8 | |
1588 | data8 0x520929453DB32EF1 //A9 | |
1589 | data8 0x54106A7808189A7F //A10 | |
1590 | data8 0x5615A302D03C207B //A11 | |
1591 | data8 0x581CC175AA736F5E //A12 | |
1592 | data8 0x5A233E071147C017 //A13 | |
1593 | data8 0x5C29E81917243F22 //A14 | |
1594 | data8 0x5E3184B0B5AC4707 //A15 | |
1595 | data8 0x6037C11DE62D8388 //A16 | |
1596 | data8 0x6240787C4B1C9D6C //A17 | |
1597 | data8 0x6448289235E80977 //A18 | |
1598 | data8 0x664B5352C6C3449E //A19 | |
1599 | // Polynomial coefficients for right root on [-15, -14] | |
6f65e668 | 1600 | // Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range |
d5efd131 MF |
1601 | data8 0x3E562C2E34A9207D, 0x3ADC00DA3DFF7A83 //A0 |
1602 | data8 0x42344C3B2F0D90AB, 0x3EB8A2E979F24536 //A1 | |
1603 | data8 0x4469BFFF28B50D07, 0x41181E3D05C1C294 //A2 | |
1604 | data8 0xAE38F64DCB24D9F8, 0x0000406A //A3 | |
1605 | data8 0xA5C3F52C1B350702, 0x0000408E //A4 | |
1606 | data8 0xA83BC857BCD67A1B, 0x000040B2 //A5 | |
1607 | data8 0x4D663B4727B4D80A //A6 | |
1608 | data8 0x4FA82C965B0F7788 //A7 | |
1609 | data8 0x51EAD58C02908D95 //A8 | |
1610 | data8 0x542E427970E073D8 //A9 | |
1611 | data8 0x56714644C558A818 //A10 | |
1612 | data8 0x58B3EC2040C77BAE //A11 | |
1613 | data8 0x5AF72AE6A83D45B1 //A12 | |
1614 | data8 0x5D3B214F611F5D12 //A13 | |
1615 | data8 0x5F7FF5E49C54E92A //A14 | |
1616 | data8 0x61C2E917AB765FB2 //A15 | |
1617 | data8 0x64066FD70907B4C1 //A16 | |
1618 | data8 0x664B3998D60D0F9B //A17 | |
1619 | data8 0x689178710782FA8B //A18 | |
1620 | data8 0x6AD14A66C1C7BEC3 //A19 | |
1621 | // Polynomial coefficients for right root on [-16, -15] | |
6f65e668 | 1622 | // Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range |
d5efd131 MF |
1623 | data8 0xBE6D7E7192615BAE, 0xBB0137677D7CC719 //A0 |
1624 | data8 0x4273077763F6628C, 0x3F09250FB8FC8EC9 //A1 | |
1625 | data8 0x44E6A1BF095B1AB3, 0x4178D5A74F6CB3B3 //A2 | |
1626 | data8 0x8F8E0D5060FCC76E, 0x00004076 //A3 | |
1627 | data8 0x800CC1DCFF092A63, 0x0000409E //A4 | |
1628 | data8 0xF3AB0BA9D14CDA72, 0x000040C5 //A5 | |
1629 | data8 0x4EDE3000A2F6D54F //A6 | |
1630 | data8 0x515EC613B9C8E241 //A7 | |
1631 | data8 0x53E003309FEEEA96 //A8 | |
1632 | data8 0x5660ED908D7C9A90 //A9 | |
1633 | data8 0x58E21E9B517B1A50 //A10 | |
1634 | data8 0x5B639745E4374EE2 //A11 | |
1635 | data8 0x5DE55BB626B2075D //A12 | |
1636 | data8 0x606772B7506BA747 //A13 | |
1637 | data8 0x62E9E581AB2E057B //A14 | |
1638 | data8 0x656CBAD1CF85D396 //A15 | |
1639 | data8 0x67EFF4EBD7989872 //A16 | |
1640 | data8 0x6A722D2B19B7E2F9 //A17 | |
1641 | data8 0x6CF5DEB3073B0743 //A18 | |
1642 | data8 0x6F744AC11550B93A //A19 | |
1643 | // Polynomial coefficients for right root on [-17, -16] | |
6f65e668 | 1644 | // Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range |
d5efd131 MF |
1645 | data8 0xBEDCC6291188207E, 0xBB872E3FDD48F5B7 //A0 |
1646 | data8 0x42B3076EE7525EF9, 0x3F6687A5038CA81C //A1 | |
1647 | data8 0x4566A1AAD96EBCB5, 0x421F0FEDFBF548D2 //A2 | |
1648 | data8 0x8F8D4D3DE9850DBA, 0x00004082 //A3 | |
1649 | data8 0x800BDD6DA2CE1859, 0x000040AE //A4 | |
1650 | data8 0xF3A8EC4C9CDC1CE5, 0x000040D9 //A5 | |
1651 | data8 0x505E2FAFDB812628 //A6 | |
1652 | data8 0x531EC5B3A7508719 //A7 | |
1653 | data8 0x55E002F77E99B628 //A8 | |
1654 | data8 0x58A0ED4C9B4DAE54 //A9 | |
1655 | data8 0x5B621E4A8240F90C //A10 | |
1656 | data8 0x5E2396E5C8849814 //A11 | |
1657 | data8 0x60E55B43D8C5CE71 //A12 | |
1658 | data8 0x63A7722F5D45D01D //A13 | |
1659 | data8 0x6669E4E010DCE45A //A14 | |
1660 | data8 0x692CBA120D5E78F6 //A15 | |
1661 | data8 0x6BEFF4045350B22E //A16 | |
1662 | data8 0x6EB22C9807C21819 //A17 | |
1663 | data8 0x7175DE20D04617C4 //A18 | |
1664 | data8 0x74344AB87C6D655F //A19 | |
1665 | // Polynomial coefficients for right root on [-18, -17] | |
6f65e668 | 1666 | // Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range |
d5efd131 MF |
1667 | data8 0xBF28AEEE7B61D77C, 0xBBDBBB5FC57ABF79 //A0 |
1668 | data8 0x42F436F56B3B8A0C, 0x3FA43EE3C5C576E9 //A1 | |
1669 | data8 0x45E98A22535D115D, 0x42984678BE78CC48 //A2 | |
1670 | data8 0xAC176F3775E6FCFC, 0x0000408E //A3 | |
1671 | data8 0xA3114F53A9FEB922, 0x000040BE //A4 | |
1672 | data8 0xA4D168A8334ABF41, 0x000040EE //A5 | |
1673 | data8 0x51E5B0E7EC7182BB //A6 | |
1674 | data8 0x54E77D67B876EAB6 //A7 | |
1675 | data8 0x57E9F7C30C09C4B6 //A8 | |
1676 | data8 0x5AED29B0488614CA //A9 | |
1677 | data8 0x5DF09486F87E79F9 //A10 | |
1678 | data8 0x60F30B199979654E //A11 | |
1679 | data8 0x63F60E02C7DCCC5F //A12 | |
1680 | data8 0x66F9B8A00EB01684 //A13 | |
1681 | data8 0x69FE2D3ED0700044 //A14 | |
1682 | data8 0x6D01C8363C7DCC84 //A15 | |
1683 | data8 0x700502B29C2F06E3 //A16 | |
1684 | data8 0x730962B4500F4A61 //A17 | |
1685 | data8 0x76103C6ED099192A //A18 | |
1686 | data8 0x79100C7132CFD6E3 //A19 | |
1687 | // Polynomial coefficients for right root on [-19, -18] | |
6f65e668 | 1688 | // Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range |
d5efd131 MF |
1689 | data8 0x3F3C19A53328A0C3, 0x3BE04ADC3FBE1458 //A0 |
1690 | data8 0x4336C16C16C16C19, 0x3FE58CE3AC4A7C28 //A1 | |
1691 | data8 0x46702E85C0898B70, 0x432C922E412CEC6E //A2 | |
1692 | data8 0xF57B99A1C034335D, 0x0000409A //A3 | |
1693 | data8 0x82EC9634223DF909, 0x000040CF //A4 | |
1694 | data8 0x94F66D7557E2EA60, 0x00004103 //A5 | |
1695 | data8 0x5376118B79AE34D0 //A6 | |
1696 | data8 0x56BAE7106D52E548 //A7 | |
1697 | data8 0x5A00BD48CC8E25AB //A8 | |
1698 | data8 0x5D4529722821B493 //A9 | |
1699 | data8 0x608B1654AF31BBC1 //A10 | |
1700 | data8 0x63D182CC98AEA859 //A11 | |
1701 | data8 0x6716D43D5EEB05E8 //A12 | |
1702 | data8 0x6A5DF884FC172E1C //A13 | |
1703 | data8 0x6DA3CA7EBB97976B //A14 | |
1704 | data8 0x70EA416D0BE6D2EF //A15 | |
1705 | data8 0x743176C31EBB65F2 //A16 | |
1706 | data8 0x7777C401A8715CF9 //A17 | |
1707 | data8 0x7AC1110C6D350440 //A18 | |
1708 | data8 0x7E02D0971CF84865 //A19 | |
1709 | // Polynomial coefficients for right root on [-20, -19] | |
6f65e668 | 1710 | // Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range |
d5efd131 MF |
1711 | data8 0xBFAB767F9BE21803, 0xBC5ACEF5BB1BD8B5 //A0 |
1712 | data8 0x4379999999999999, 0x4029241C7F5914C8 //A1 | |
1713 | data8 0x46F47AE147AE147A, 0x43AC2979B64B9D7E //A2 | |
1714 | data8 0xAEC33E1F67152993, 0x000040A7 //A3 | |
1715 | data8 0xD1B71758E219616F, 0x000040DF //A4 | |
1716 | data8 0x8637BD05AF6CF468, 0x00004118 //A5 | |
1717 | data8 0x55065E9F80F293DE //A6 | |
1718 | data8 0x588EADA78C44EE66 //A7 | |
1719 | data8 0x5C15798EE22DEF09 //A8 | |
1720 | data8 0x5F9E8ABFD644FA63 //A9 | |
1721 | data8 0x6325FD7FE29BD7CD //A10 | |
1722 | data8 0x66AFFC5C57E1F802 //A11 | |
1723 | data8 0x6A3774CD7D5C0181 //A12 | |
1724 | data8 0x6DC152724DE2A6FE //A13 | |
1725 | data8 0x7149BB138EB3D0C2 //A14 | |
1726 | data8 0x74D32FF8A70896C2 //A15 | |
1727 | data8 0x785D3749F9C72BD7 //A16 | |
1728 | data8 0x7BE5CCF65EBC4E40 //A17 | |
1729 | data8 0x7F641A891B5FC652 //A18 | |
1730 | data8 0x7FEFFFFFFFFFFFFF //A19 | |
1731 | LOCAL_OBJECT_END(lgammal_right_roots_polynomial_data) | |
1732 | ||
1733 | LOCAL_OBJECT_START(lgammal_left_roots_polynomial_data) | |
1734 | // Polynomial coefficients for left root on [-3, -2] | |
6f65e668 | 1735 | // Lgammal is approximated by polynomial within [.084641 ; -.059553 ] range |
d5efd131 MF |
1736 | data8 0xBC0844590979B82E, 0xB8BC7CE8CE2ECC3B //A0 |
1737 | data8 0xBFFEA12DA904B18C, 0xBC91A6B2BAD5EF6E //A1 | |
1738 | data8 0x4023267F3C265A51, 0x3CD7055481D03AED //A2 | |
1739 | data8 0xA0C2D618645F8E00, 0x0000C003 //A3 | |
1740 | data8 0xFA8256664F8CD2BE, 0x00004004 //A4 | |
1741 | data8 0xC2C422C103F57158, 0x0000C006 //A5 | |
1742 | data8 0x4084373F7CC70AF5 //A6 | |
1743 | data8 0xC0A12239BDD6BB95 //A7 | |
1744 | data8 0x40BDBA65E2709397 //A8 | |
1745 | data8 0xC0DA2D2504DFB085 //A9 | |
1746 | data8 0x40F758173CA5BF3C //A10 | |
1747 | data8 0xC11506C65C267E72 //A11 | |
1748 | data8 0x413318EE3A6B05FC //A12 | |
1749 | data8 0xC1517767F247DA98 //A13 | |
1750 | data8 0x41701237B4754D73 //A14 | |
1751 | data8 0xC18DB8A03BC5C3D8 //A15 | |
1752 | data8 0x41AB80953AC14A07 //A16 | |
1753 | data8 0xC1C9B7B76638D0A4 //A17 | |
1754 | data8 0x41EA727E3033E2D9 //A18 | |
1755 | data8 0xC20812C297729142 //A19 | |
1756 | // | |
1757 | // Polynomial coefficients for left root on [-4, -3] | |
6f65e668 | 1758 | // Lgammal is approximated by polynomial within [.147147 ; -.145158 ] range |
d5efd131 MF |
1759 | data8 0xBC3130AE5C4F54DB, 0xB8ED23294C13398A //A0 |
1760 | data8 0xC034B99D966C5646, 0xBCE2E5FE3BC3DBB9 //A1 | |
1761 | data8 0x406F76DEAE0436BD, 0x3D14974DDEC057BD //A2 | |
1762 | data8 0xE929ACEA5979BE96, 0x0000C00A //A3 | |
1763 | data8 0xF47C14F8A0D52771, 0x0000400E //A4 | |
1764 | data8 0x88B7BC036937481C, 0x0000C013 //A5 | |
1765 | data8 0x4173E8F3AB9FC266 //A6 | |
1766 | data8 0xC1B7DBBE062FB11B //A7 | |
1767 | data8 0x41FD2F76DE7A47A7 //A8 | |
1768 | data8 0xC242225FE53B124D //A9 | |
1769 | data8 0x4286D12AE2FBFA30 //A10 | |
1770 | data8 0xC2CCFFC267A3C4C0 //A11 | |
1771 | data8 0x431294E10008E014 //A12 | |
1772 | data8 0xC357FAC8C9A2DF6A //A13 | |
1773 | data8 0x439F2190AB9FAE01 //A14 | |
1774 | data8 0xC3E44C1D8E8C67C3 //A15 | |
1775 | data8 0x442A8901105D5A38 //A16 | |
1776 | data8 0xC471C4421E908C3A //A17 | |
1777 | data8 0x44B92CD4D59D6D17 //A18 | |
1778 | data8 0xC4FB3A078B5247FA //A19 | |
1779 | // Polynomial coefficients for left root on [-5, -4] | |
6f65e668 | 1780 | // Lgammal is approximated by polynomial within [.155671 ; -.155300 ] range |
d5efd131 MF |
1781 | data8 0xBC57BF3C6E8A94C1, 0xB902FB666934AC9E //A0 |
1782 | data8 0xC05D224A3EF9E41F, 0xBCF6F5713913E440 //A1 | |
1783 | data8 0x40BB533C678A3955, 0x3D688E53E3C72538 //A2 | |
1784 | data8 0x869FBFF732E99B84, 0x0000C012 //A3 | |
1785 | data8 0xBA9537AD61392DEC, 0x00004018 //A4 | |
1786 | data8 0x89EAE8B1DEA06B05, 0x0000C01F //A5 | |
1787 | data8 0x425A8C5C53458D3C //A6 | |
1788 | data8 0xC2C5068B3ED6509B //A7 | |
1789 | data8 0x4330FFA575E99B4E //A8 | |
1790 | data8 0xC39BEC12DDDF7669 //A9 | |
1791 | data8 0x44073825725F74F9 //A10 | |
1792 | data8 0xC47380EBCA299047 //A11 | |
1793 | data8 0x44E084DD9B666437 //A12 | |
1794 | data8 0xC54C2DA6BF787ACF //A13 | |
1795 | data8 0x45B82D65C8D6FA42 //A14 | |
1796 | data8 0xC624D62113FE950A //A15 | |
1797 | data8 0x469200CC19B45016 //A16 | |
1798 | data8 0xC6FFDDC6DD938E2E //A17 | |
1799 | data8 0x476DD7C07184B9F9 //A18 | |
1800 | data8 0xC7D554A30085C052 //A19 | |
1801 | // Polynomial coefficients for left root on [-6, -5] | |
6f65e668 | 1802 | // Lgammal is approximated by polynomial within [.157425 ; -.157360 ] range |
d5efd131 MF |
1803 | data8 0x3C9E20A87C8B79F1, 0x39488BE34B2427DB //A0 |
1804 | data8 0xC08661F6A43A5E12, 0xBD3D912526D759CC //A1 | |
1805 | data8 0x410F79DCB794F270, 0x3DB9BEE7CD3C1BF5 //A2 | |
1806 | data8 0xEB7404450D0005DB, 0x0000C019 //A3 | |
1807 | data8 0xF7AE9846DFE4D4AB, 0x00004022 //A4 | |
1808 | data8 0x8AF535855A95B6DA, 0x0000C02C //A5 | |
1809 | data8 0x43544D54E9FE240E //A6 | |
1810 | data8 0xC3E8684E40CE6CFC //A7 | |
1811 | data8 0x447DF44C1D803454 //A8 | |
1812 | data8 0xC512AC305439B2BA //A9 | |
1813 | data8 0x45A79226AF79211A //A10 | |
1814 | data8 0xC63E0DFF7244893A //A11 | |
1815 | data8 0x46D35216C3A83AF3 //A12 | |
1816 | data8 0xC76903BE0C390E28 //A13 | |
1817 | data8 0x48004A4DECFA4FD5 //A14 | |
1818 | data8 0xC8954FBD243DB8BE //A15 | |
1819 | data8 0x492BF3A31EB18DDA //A16 | |
1820 | data8 0xC9C2C6A864521F3A //A17 | |
1821 | data8 0x4A5AB127C62E8DA1 //A18 | |
1822 | data8 0xCAECF60EF3183C57 //A19 | |
1823 | // Polynomial coefficients for left root on [-7, -6] | |
6f65e668 | 1824 | // Lgammal is approximated by polynomial within [.157749 ; -.157739 ] range |
d5efd131 MF |
1825 | data8 0x3CC9B9E8B8D551D6, 0x3961813C8E1E10DB //A0 |
1826 | data8 0xC0B3ABF7A5CEA91F, 0xBD55638D4BCB4CC4 //A1 | |
1827 | data8 0x4168349A25504236, 0x3E0287ECE50CCF76 //A2 | |
1828 | data8 0x9EC8ED6E4C219E67, 0x0000C022 //A3 | |
1829 | data8 0x9279EB1B799A3FF3, 0x0000402E //A4 | |
1830 | data8 0x90213EF8D9A5DBCF, 0x0000C03A //A5 | |
1831 | data8 0x4462775E857FB71C //A6 | |
1832 | data8 0xC52377E70B45FDBF //A7 | |
1833 | data8 0x45E4F3D28EDA8C28 //A8 | |
1834 | data8 0xC6A6E85571BD2D0B //A9 | |
1835 | data8 0x47695BB17E74DF74 //A10 | |
1836 | data8 0xC82C5AC0ED6A662F //A11 | |
1837 | data8 0x48EFF8159441C2E3 //A12 | |
1838 | data8 0xC9B22602C1B68AE5 //A13 | |
1839 | data8 0x4A74BA8CE7B34100 //A14 | |
1840 | data8 0xCB37C7E208482E4B //A15 | |
1841 | data8 0x4BFB5A1D57352265 //A16 | |
1842 | data8 0xCCC01CB3021212FF //A17 | |
1843 | data8 0x4D841613AC3431D1 //A18 | |
1844 | data8 0xCE431C9E9EE43AD9 //A19 | |
1845 | // Polynomial coefficients for left root on [-8, -7] | |
6f65e668 | 1846 | // Lgammal is approximated by polynomial within [.157799 ; -.157798 ] range |
d5efd131 MF |
1847 | data8 0xBCF9C7A33AD9478C, 0xB995B0470F11E5ED //A0 |
1848 | data8 0xC0E3AF76FE4C2F8B, 0xBD8DBCD503250511 //A1 | |
1849 | data8 0x41C838E76CAAF0D5, 0x3E5D79F5E2E069C3 //A2 | |
1850 | data8 0x9EF345992B262CE0, 0x0000C02B //A3 | |
1851 | data8 0x92AE0292985FD559, 0x0000403A //A4 | |
1852 | data8 0x90615420C08F7D8C, 0x0000C049 //A5 | |
1853 | data8 0x45828139342CEEB7 //A6 | |
1854 | data8 0xC67384066C31E2D3 //A7 | |
1855 | data8 0x476502BC4DAC2C35 //A8 | |
1856 | data8 0xC856FAADFF22ADC6 //A9 | |
1857 | data8 0x49497243255AB3CE //A10 | |
1858 | data8 0xCA3C768489520F6B //A11 | |
1859 | data8 0x4B300D1EA47AF838 //A12 | |
1860 | data8 0xCC223B0508AC620E //A13 | |
1861 | data8 0x4D14D46583338CD8 //A14 | |
1862 | data8 0xCE07E7A87AA068E4 //A15 | |
1863 | data8 0x4EFB811AD2F8BEAB //A16 | |
1864 | data8 0xCFF0351B51508523 //A17 | |
1865 | data8 0x50E4364CCBF53100 //A18 | |
1866 | data8 0xD1D33CFD0BF96FA6 //A19 | |
1867 | // Polynomial coefficients for left root on [-9, -8] | |
6f65e668 | 1868 | // Lgammal is approximated by polynomial within [.157806 ; -.157806 ] range |
d5efd131 MF |
1869 | data8 0x3D333E4438B1B9D4, 0x39E7B956B83964C1 //A0 |
1870 | data8 0xC11625EDFC63DCD8, 0xBDCF39625709EFAC //A1 | |
1871 | data8 0x422EA8C150480F16, 0x3EC16ED908AB7EDD //A2 | |
1872 | data8 0xE2598725E2E11646, 0x0000C034 //A3 | |
1873 | data8 0xEAFF2346DE3EBC98, 0x00004046 //A4 | |
1874 | data8 0x821E90DE12A0F05F, 0x0000C059 //A5 | |
1875 | data8 0x46B2C334AE5366FE //A6 | |
1876 | data8 0xC7D64314B43191B6 //A7 | |
1877 | data8 0x48FAF6ED5899E01B //A8 | |
1878 | data8 0xCA2096E4472AF37D //A9 | |
1879 | data8 0x4B44AAF49FB7E4C8 //A10 | |
1880 | data8 0xCC6A02469F2BD920 //A11 | |
1881 | data8 0x4D9080626D2EFC07 //A12 | |
1882 | data8 0xCEB515EDCF0695F7 //A13 | |
1883 | data8 0x4FDB1AC69BF36960 //A14 | |
1884 | data8 0xD1017F8274339270 //A15 | |
1885 | data8 0x5226A684961BAE2F //A16 | |
1886 | data8 0xD34E085C088404A5 //A17 | |
1887 | data8 0x547511892FF8960E //A18 | |
1888 | data8 0xD5968FA3B1ED67A9 //A19 | |
1889 | // Polynomial coefficients for left root on [-10, -9] | |
6f65e668 | 1890 | // Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range |
d5efd131 MF |
1891 | data8 0xBD355818A2B42BA2, 0xB9B7320B6A0D61EA //A0 |
1892 | data8 0xC14BAF7DA5F3770E, 0xBDE64AF9A868F719 //A1 | |
1893 | data8 0x4297F3E8791F9CD3, 0x3F2A553E59B4835E //A2 | |
1894 | data8 0xDD0C5F7E551BD13C, 0x0000C03E //A3 | |
1895 | data8 0x8F6F0A3B2EB08BBB, 0x00004054 //A4 | |
1896 | data8 0xC68D4D5AD230BA08, 0x0000C069 //A5 | |
1897 | data8 0x47F1E4D8C35D1A3E //A6 | |
1898 | data8 0xC94A8A191DB0A466 //A7 | |
1899 | data8 0x4AA4174F65FE6AE8 //A8 | |
1900 | data8 0xCBFEE6D90F94E9DD //A9 | |
1901 | data8 0x4D580FD3438BE16C //A10 | |
1902 | data8 0xCEB2ECD456D50224 //A11 | |
1903 | data8 0x500E049F7FE64546 //A12 | |
1904 | data8 0xD167F92D9600F378 //A13 | |
1905 | data8 0x52C342AE2B43261A //A14 | |
1906 | data8 0xD41F15DEEDA4B67E //A15 | |
1907 | data8 0x55792638748AFB7D //A16 | |
1908 | data8 0xD6D4D760074F6E6B //A17 | |
1909 | data8 0x5832469D58ED3FA9 //A18 | |
1910 | data8 0xD988769F3DC76642 //A19 | |
1911 | // Polynomial coefficients for left root on [-11, -10] | |
6f65e668 | 1912 | // Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range |
d5efd131 MF |
1913 | data8 0xBDA050601F39778A, 0xBA0D4D1CE53E8241 //A0 |
1914 | data8 0xC18308A7D8EA4039, 0xBE370C379D3EAD41 //A1 | |
1915 | data8 0x4306A49380644E6C, 0x3FBBB143C0E7B5C8 //A2 | |
1916 | data8 0x8FA8FB233E4AA6D2, 0x0000C049 //A3 | |
1917 | data8 0x802CC9D8AEAC207D, 0x00004062 //A4 | |
1918 | data8 0xF3F73EE651A37A13, 0x0000C07A //A5 | |
1919 | data8 0x493E3B550A7B9568 //A6 | |
1920 | data8 0xCACED38DAA060929 //A7 | |
1921 | data8 0x4C600B346BAB3BC6 //A8 | |
1922 | data8 0xCDF0F719193E3D26 //A9 | |
1923 | data8 0x4F8229F24528B151 //A10 | |
1924 | data8 0xD113A4C2D32FBBE2 //A11 | |
1925 | data8 0x52A56BC13DC4474D //A12 | |
1926 | data8 0xD43785CFAF5E3CE3 //A13 | |
1927 | data8 0x55C9FC3EA5941202 //A14 | |
1928 | data8 0xD75CD545A3341AF5 //A15 | |
1929 | data8 0x58F009911F77C282 //A16 | |
1930 | data8 0xDA8246294D210BEC //A17 | |
1931 | data8 0x5C1608AAC32C3A8E //A18 | |
1932 | data8 0xDDA446E570A397D5 //A19 | |
1933 | // Polynomial coefficients for left root on [-12, -11] | |
6f65e668 | 1934 | // Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range |
d5efd131 MF |
1935 | data8 0x3DEACBB3081C502E, 0x3A8AA6F01DEDF745 //A0 |
1936 | data8 0xC1BC8CFBFB0A9912, 0xBE6556B6504A2AE6 //A1 | |
1937 | data8 0x43797926206941D7, 0x40289A9644C2A216 //A2 | |
1938 | data8 0xF26D2A78446D0839, 0x0000C053 //A3 | |
1939 | data8 0xA238B1D937FFED38, 0x00004070 //A4 | |
1940 | data8 0xE793B4F6DE470538, 0x0000C08C //A5 | |
1941 | data8 0x4A9585BDC44DC45D //A6 | |
1942 | data8 0xCC60759520342C47 //A7 | |
1943 | data8 0x4E29B2F3694C0404 //A8 | |
1944 | data8 0xCFF4619AE7B6BBAB //A9 | |
1945 | data8 0x51C05DADF52B89E8 //A10 | |
1946 | data8 0xD38A8C7F48819A4A //A11 | |
1947 | data8 0x5555B6932D687860 //A12 | |
1948 | data8 0xD721E1FACB6C1B5B //A13 | |
1949 | data8 0x58EDA1E2677C8F91 //A14 | |
1950 | data8 0xDAB8A8EC523C1F71 //A15 | |
1951 | data8 0x5C84930133F30411 //A16 | |
1952 | data8 0xDE51952FDFD1EC49 //A17 | |
1953 | data8 0x601FCCEC1BBD25F1 //A18 | |
1954 | data8 0xE1E5F2D76B610920 //A19 | |
1955 | // Polynomial coefficients for left root on [-13, -12] | |
6f65e668 | 1956 | // Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range |
d5efd131 MF |
1957 | data8 0xBE01612F373268ED, 0xBA97B7A18CDF103B //A0 |
1958 | data8 0xC1F7328CBF7A4FAC, 0xBE89A25A6952F481 //A1 | |
1959 | data8 0x43F0D0FA2DBDA237, 0x40A0422EC1CE6084 //A2 | |
1960 | data8 0x82082DF2D32686C5, 0x0000C05F //A3 | |
1961 | data8 0x8D64EE9B42E68B36, 0x0000407F //A4 | |
1962 | data8 0xA3FFD82E08C630C9, 0x0000C09F //A5 | |
1963 | data8 0x4BF8C49D99123466 //A6 | |
1964 | data8 0xCDFEC79DDF1119ED //A7 | |
1965 | data8 0x50038615A892D242 //A8 | |
1966 | data8 0xD20929453DC8B537 //A9 | |
1967 | data8 0x54106A78083BA1EE //A10 | |
1968 | data8 0xD615A302C69E27B2 //A11 | |
1969 | data8 0x581CC175870FF16F //A12 | |
1970 | data8 0xDA233E0979E12B74 //A13 | |
1971 | data8 0x5C29E822BC568C80 //A14 | |
1972 | data8 0xDE31845DB5340FBC //A15 | |
1973 | data8 0x6037BFC6D498D5F9 //A16 | |
1974 | data8 0xE2407D92CD613E82 //A17 | |
1975 | data8 0x64483B9B62367EB7 //A18 | |
1976 | data8 0xE64B2DC830E8A799 //A19 | |
1977 | // Polynomial coefficients for left root on [-14, -13] | |
6f65e668 | 1978 | // Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range |
d5efd131 MF |
1979 | data8 0x3E563D0B930B371F, 0x3AE779957E14F012 //A0 |
1980 | data8 0xC2344C3B2F083767, 0xBEC0B7769AA3DD66 //A1 | |
1981 | data8 0x4469BFFF28B50D07, 0x41181E3F13ED2401 //A2 | |
1982 | data8 0xAE38F64DCB24D9EE, 0x0000C06A //A3 | |
1983 | data8 0xA5C3F52C1B3506F2, 0x0000408E //A4 | |
1984 | data8 0xA83BC857BCD6BA92, 0x0000C0B2 //A5 | |
1985 | data8 0x4D663B4727B4D81A //A6 | |
1986 | data8 0xCFA82C965B0F62E9 //A7 | |
1987 | data8 0x51EAD58C02905B71 //A8 | |
1988 | data8 0xD42E427970FA56AD //A9 | |
1989 | data8 0x56714644C57D8476 //A10 | |
1990 | data8 0xD8B3EC2037EC95F2 //A11 | |
1991 | data8 0x5AF72AE68BBA5B3D //A12 | |
1992 | data8 0xDD3B2152C67AA6B7 //A13 | |
1993 | data8 0x5F7FF5F082861B8B //A14 | |
1994 | data8 0xE1C2E8BE125A5B7A //A15 | |
1995 | data8 0x64066E92FE9EBE7D //A16 | |
1996 | data8 0xE64B4201CDF9F138 //A17 | |
1997 | data8 0x689186351E58AA88 //A18 | |
1998 | data8 0xEAD132A585DFC60A //A19 | |
1999 | // Polynomial coefficients for left root on [-15, -14] | |
6f65e668 | 2000 | // Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range |
d5efd131 MF |
2001 | data8 0xBE6D7DDE12700AC1, 0xBB1E025BF1667FB5 //A0 |
2002 | data8 0xC273077763F60AD5, 0xBF2A1698184C7A9A //A1 | |
2003 | data8 0x44E6A1BF095B1AB3, 0x4178D5AE8A4A2874 //A2 | |
2004 | data8 0x8F8E0D5060FCC767, 0x0000C076 //A3 | |
2005 | data8 0x800CC1DCFF092A57, 0x0000409E //A4 | |
2006 | data8 0xF3AB0BA9D14D37D1, 0x0000C0C5 //A5 | |
2007 | data8 0x4EDE3000A2F6D565 //A6 | |
2008 | data8 0xD15EC613B9C8C800 //A7 | |
2009 | data8 0x53E003309FEECCAA //A8 | |
2010 | data8 0xD660ED908D8B15C4 //A9 | |
2011 | data8 0x58E21E9B51A1C4AE //A10 | |
2012 | data8 0xDB639745DB82210D //A11 | |
2013 | data8 0x5DE55BB60C68FCF6 //A12 | |
2014 | data8 0xE06772BA3FCA23C6 //A13 | |
2015 | data8 0x62E9E58B4F702C31 //A14 | |
2016 | data8 0xE56CBA49B071ABE2 //A15 | |
2017 | data8 0x67EFF31E4F2BA36A //A16 | |
2018 | data8 0xEA7232C8804F32C3 //A17 | |
2019 | data8 0x6CF5EFEE929A0928 //A18 | |
2020 | data8 0xEF742EE03EC3E8FF //A19 | |
2021 | // Polynomial coefficients for left root on [-16, -15] | |
6f65e668 | 2022 | // Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range |
d5efd131 MF |
2023 | data8 0xBEDCC628FEAC7A1B, 0xBB80582C8BEBB198 //A0 |
2024 | data8 0xC2B3076EE752595E, 0xBF5388F55AFAE53E //A1 | |
2025 | data8 0x4566A1AAD96EBCB5, 0x421F0FEFE2444293 //A2 | |
2026 | data8 0x8F8D4D3DE9850DB2, 0x0000C082 //A3 | |
2027 | data8 0x800BDD6DA2CE184C, 0x000040AE //A4 | |
2028 | data8 0xF3A8EC4C9CDC7A43, 0x0000C0D9 //A5 | |
2029 | data8 0x505E2FAFDB81263F //A6 | |
2030 | data8 0xD31EC5B3A7506CD9 //A7 | |
2031 | data8 0x55E002F77E999810 //A8 | |
2032 | data8 0xD8A0ED4C9B5C2900 //A9 | |
2033 | data8 0x5B621E4A8267C401 //A10 | |
2034 | data8 0xDE2396E5BFCFDA7A //A11 | |
2035 | data8 0x60E55B43BE6F9A79 //A12 | |
2036 | data8 0xE3A772324C7405FA //A13 | |
2037 | data8 0x6669E4E9B7E57A2D //A14 | |
2038 | data8 0xE92CB989F8A8FB37 //A15 | |
2039 | data8 0x6BEFF2368849A36E //A16 | |
2040 | data8 0xEEB23234FE191D55 //A17 | |
2041 | data8 0x7175EF5D1080B105 //A18 | |
2042 | data8 0xF4342ED7B1B7BE31 //A19 | |
2043 | // Polynomial coefficients for left root on [-17, -16] | |
6f65e668 | 2044 | // Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range |
d5efd131 MF |
2045 | data8 0xBF28AEEE7B58C790, 0xBBC4448DE371FA0A //A0 |
2046 | data8 0xC2F436F56B3B89B1, 0xBF636755245AC63A //A1 | |
2047 | data8 0x45E98A22535D115D, 0x4298467DA93DB784 //A2 | |
2048 | data8 0xAC176F3775E6FCF2, 0x0000C08E //A3 | |
2049 | data8 0xA3114F53A9FEB908, 0x000040BE //A4 | |
2050 | data8 0xA4D168A8334AFE5A, 0x0000C0EE //A5 | |
2051 | data8 0x51E5B0E7EC7182CF //A6 | |
2052 | data8 0xD4E77D67B876D6B4 //A7 | |
2053 | data8 0x57E9F7C30C098C83 //A8 | |
2054 | data8 0xDAED29B0489EF7A7 //A9 | |
2055 | data8 0x5DF09486F8A524B8 //A10 | |
2056 | data8 0xE0F30B19910A2393 //A11 | |
2057 | data8 0x63F60E02AB3109F4 //A12 | |
2058 | data8 0xE6F9B8A3431854D5 //A13 | |
2059 | data8 0x69FE2D4A6D94218E //A14 | |
2060 | data8 0xED01C7E272A73560 //A15 | |
2061 | data8 0x7005017D82B186B6 //A16 | |
2062 | data8 0xF3096A81A69BD8AE //A17 | |
2063 | data8 0x76104951BAD67D5C //A18 | |
2064 | data8 0xF90FECC99786FD5B //A19 | |
2065 | // Polynomial coefficients for left root on [-18, -17] | |
6f65e668 | 2066 | // Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range |
d5efd131 MF |
2067 | data8 0x3F3C19A53328E26A, 0x3BE238D7BA036B3B //A0 |
2068 | data8 0xC336C16C16C16C13, 0xBFEACE245DEC56F3 //A1 | |
2069 | data8 0x46702E85C0898B70, 0x432C922B64FD1DA4 //A2 | |
2070 | data8 0xF57B99A1C0343350, 0x0000C09A //A3 | |
2071 | data8 0x82EC9634223DF90D, 0x000040CF //A4 | |
2072 | data8 0x94F66D7557E3237D, 0x0000C103 //A5 | |
2073 | data8 0x5376118B79AE34D6 //A6 | |
2074 | data8 0xD6BAE7106D52CE49 //A7 | |
2075 | data8 0x5A00BD48CC8E11AB //A8 | |
2076 | data8 0xDD4529722833E2DF //A9 | |
2077 | data8 0x608B1654AF5F46AF //A10 | |
2078 | data8 0xE3D182CC90D8723F //A11 | |
2079 | data8 0x6716D43D46706AA0 //A12 | |
2080 | data8 0xEA5DF888C5B428D3 //A13 | |
2081 | data8 0x6DA3CA85888931A6 //A14 | |
2082 | data8 0xF0EA40EF2AC7E070 //A15 | |
2083 | data8 0x743175D1A251AFCD //A16 | |
2084 | data8 0xF777CB6E2B550D73 //A17 | |
2085 | data8 0x7AC11E468A134A51 //A18 | |
2086 | data8 0xFE02B6BDD0FC40AA //A19 | |
2087 | // Polynomial coefficients for left root on [-19, -18] | |
6f65e668 | 2088 | // Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range |
d5efd131 MF |
2089 | data8 0xBFAB767F9BE217FC, 0xBC4A5541CE0D8D0D //A0 |
2090 | data8 0xC379999999999999, 0xC01A84981B490BE8 //A1 | |
2091 | data8 0x46F47AE147AE147A, 0x43AC2987BBC466EB //A2 | |
2092 | data8 0xAEC33E1F67152987, 0x0000C0A7 //A3 | |
2093 | data8 0xD1B71758E2196153, 0x000040DF //A4 | |
2094 | data8 0x8637BD05AF6D420E, 0x0000C118 //A5 | |
2095 | data8 0x55065E9F80F293B2 //A6 | |
2096 | data8 0xD88EADA78C44BFA7 //A7 | |
2097 | data8 0x5C15798EE22EC6CD //A8 | |
2098 | data8 0xDF9E8ABFD67895CF //A9 | |
2099 | data8 0x6325FD7FE13B0DE0 //A10 | |
2100 | data8 0xE6AFFC5C3DE70858 //A11 | |
2101 | data8 0x6A3774CE81C70D43 //A12 | |
2102 | data8 0xEDC1527412D8129F //A13 | |
2103 | data8 0x7149BABCDA8B7A72 //A14 | |
2104 | data8 0xF4D330AD49071BB5 //A15 | |
2105 | data8 0x785D4046F4C5F1FD //A16 | |
2106 | data8 0xFBE59BFEDBA73FAF //A17 | |
2107 | data8 0x7F64BEF2B2EC8DA1 //A18 | |
2108 | data8 0xFFEFFFFFFFFFFFFF //A19 | |
2109 | LOCAL_OBJECT_END(lgammal_left_roots_polynomial_data) | |
2110 | ||
2111 | ||
2112 | //============================================================== | |
2113 | // Code | |
2114 | //============================================================== | |
2115 | ||
2116 | .section .text | |
2117 | GLOBAL_LIBM_ENTRY(__libm_lgammal) | |
2118 | { .mfi | |
2119 | getf.exp rSignExpX = f8 | |
2120 | // Test x for NaTVal, NaN, +/-0, +/-INF, denormals | |
2121 | fclass.m p6,p0 = f8,0x1EF | |
2122 | addl r17Ones = 0x1FFFF, r0 // exponent mask | |
2123 | } | |
2124 | { .mfi | |
2125 | addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp | |
2126 | fcvt.fx.s1 fXint = f8 // Convert arg to int (int repres. in FR) | |
2127 | adds rDelta = 0x3FC, r0 | |
2128 | } | |
2129 | ;; | |
2130 | { .mfi | |
2131 | getf.sig rSignifX = f8 | |
2132 | fcmp.lt.s1 p15, p14 = f8, f0 | |
2133 | shl rDelta = rDelta, 20 // single precision 1.5 | |
2134 | } | |
2135 | { .mfi | |
2136 | ld8 GR_ad_z_1 = [GR_ad_z_1]// get pointer to Constants_Z_1 | |
2137 | fma.s1 fTwo = f1, f1, f1 // 2.0 | |
2138 | addl rExp8 = 0x10002, r0 // exponent of 8.0 | |
2139 | } | |
2140 | ;; | |
2141 | { .mfi | |
2142 | alloc rPFS_SAVED = ar.pfs, 0, 34, 4, 0 // get some registers | |
2143 | fmerge.s fAbsX = f1, f8 // |x| | |
2144 | and rExpX = rSignExpX, r17Ones // mask sign bit | |
2145 | } | |
2146 | { .mib | |
2147 | addl rExpHalf = 0xFFFE, r0 // exponent of 0.5 | |
2148 | addl rExp2 = 0x10000, r0 // exponent of 2.0 | |
2149 | // branch out if x is NaTVal, NaN, +/-0, +/-INF, or denormalized number | |
2150 | (p6) br.cond.spnt lgammal_spec | |
2151 | } | |
2152 | ;; | |
2153 | _deno_back_to_main_path: | |
2154 | { .mfi | |
2155 | // Point to Constants_G_H_h1 | |
2156 | add rTbl1Addr = 0x040, GR_ad_z_1 | |
2157 | frcpa.s1 fRcpX, p0 = f1, f8 // initial approximation of 1/x | |
2158 | extr.u GR_Index1 = rSignifX, 59, 4 | |
2159 | } | |
2160 | { .mib | |
2161 | (p14) cmp.ge.unc p8, p0 = rExpX, rExp8 // p8 = 1 if x >= 8.0 | |
2162 | adds rZ625 = 0x3F2, r0 | |
2163 | (p8) br.cond.spnt lgammal_big_positive // branch out if x >= 8.0 | |
2164 | } | |
2165 | ;; | |
2166 | { .mfi | |
2167 | shladd rZ1offsett = GR_Index1, 2, GR_ad_z_1 // Point to Z_1 | |
2168 | fmerge.se fSignifX = f1, f8 // significand of x | |
2169 | // Get high 15 bits of significand | |
2170 | extr.u GR_X_0 = rSignifX, 49, 15 | |
2171 | } | |
2172 | { .mib | |
2173 | cmp.lt.unc p9, p0 = rExpX, rExpHalf // p9 = 1 if |x| < 0.5 | |
2174 | // set p11 if 2 <= x < 4 | |
2175 | (p14) cmp.eq.unc p11, p0 = rExpX, rExp2 | |
2176 | (p9) br.cond.spnt lgammal_0_half // branch out if |x| < 0.5 | |
2177 | } | |
2178 | ;; | |
2179 | { .mfi | |
2180 | ld4 GR_Z_1 = [rZ1offsett] // Load Z_1 | |
2181 | fms.s1 fA5L = f1, f1, f8 // for 0.75 <= x < 1.3125 path | |
2182 | shl rZ625 = rZ625, 20 // single precision 0.625 | |
2183 | } | |
2184 | { .mib | |
2185 | setf.s FR_MHalf = rDelta | |
2186 | // set p10 if x >= 4.0 | |
2187 | (p14) cmp.gt.unc p10, p0 = rExpX, rExp2 | |
2188 | // branch to special path for 4.0 <= x < 8 | |
2189 | (p10) br.cond.spnt lgammal_4_8 | |
2190 | } | |
2191 | ;; | |
2192 | { .mfi | |
2193 | // for 1.3125 <= x < 1.5625 path | |
2194 | addl rPolDataPtr= @ltoff(lgammal_loc_min_data),gp | |
2195 | // argument of polynomial approximation for 1.5625 <= x < 2.25 | |
2196 | fms.s1 fB4 = f8, f1, fTwo | |
2197 | cmp.eq p12, p0 = rExpX, rExpHalf | |
2198 | } | |
2199 | { .mib | |
2200 | addl rExpOne = 0xFFFF, r0 // exponent of 1.0 | |
2201 | // set p11 if significand of x >= 1.125 | |
2202 | (p11) cmp.le p11, p0 = 2, GR_Index1 | |
2203 | (p11) br.cond.spnt lgammal_2Q_4 | |
2204 | } | |
2205 | ;; | |
2206 | { .mfi | |
2207 | // point to xMin for 1.3125 <= x < 1.5625 path | |
2208 | ld8 rPolDataPtr = [rPolDataPtr] | |
2209 | fcvt.xf fFltIntX = fXint // RTN(x) | |
2210 | (p14) cmp.eq.unc p13, p7 = rExpX, rExpOne // p13 set if 1.0 <= x < 2.0 | |
2211 | } | |
2212 | { .mib | |
2213 | setf.s FR_FracX = rZ625 | |
2214 | // set p12 if |x| < 0.75 | |
2215 | (p12) cmp.gt.unc p12, p0 = 8, GR_Index1 | |
2216 | // branch out to special path for |x| < 0.75 | |
2217 | (p12) br.cond.spnt lgammal_half_3Q | |
2218 | } | |
2219 | ;; | |
2220 | .pred.rel "mutex", p7, p13 | |
2221 | { .mfi | |
2222 | getf.sig rXRnd = fXint // integer part of the input value | |
2223 | fnma.s1 fInvX = f8, fRcpX, f1 // start of 1st NR iteration | |
2224 | // Get bits 30-15 of X_0 * Z_1 | |
2225 | pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 | |
2226 | } | |
2227 | { .mib | |
2228 | (p7) cmp.eq p6, p0 = rExpX, rExp2 // p6 set if 2.0 <= x < 2.25 | |
2229 | (p13) cmp.le p6, p0 = 9, GR_Index1 | |
2230 | // branch to special path 1.5625 <= x < 2.25 | |
2231 | (p6) br.cond.spnt lgammal_13Q_2Q | |
2232 | } | |
2233 | ;; | |
2234 | // | |
2235 | // For performance, don't use result of pmpyshr2.u for 4 cycles. | |
2236 | // | |
2237 | { .mfi | |
2238 | shladd GR_ad_tbl_1 = GR_Index1, 4, rTbl1Addr // Point to G_1 | |
2239 | fma.s1 fSix = fTwo, fTwo, fTwo // 6.0 | |
2240 | add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_Q | |
2241 | } | |
2242 | { .mib | |
2243 | add rTmpPtr3 = -0x50, GR_ad_z_1 | |
2244 | (p13) cmp.gt p7, p0 = 5, GR_Index1 | |
2245 | // branch to special path 0.75 <= x < 1.3125 | |
2246 | (p7) br.cond.spnt lgammal_03Q_1Q | |
2247 | } | |
2248 | ;; | |
2249 | { .mfi | |
2250 | add rTmpPtr = 8, GR_ad_tbl_1 | |
2251 | fma.s1 fRoot = f8, f1, f1 // x + 1 | |
2252 | // Absolute value of int arg. Will be used as index in table with roots | |
2253 | sub rXRnd = r0, rXRnd | |
2254 | } | |
2255 | { .mib | |
2256 | ldfe fA5L = [rPolDataPtr], 16 // xMin | |
2257 | addl rNegSingularity = 0x3003E, r0 | |
2258 | (p14) br.cond.spnt lgammal_loc_min | |
2259 | } | |
2260 | ;; | |
2261 | { .mfi | |
2262 | ldfps FR_G, FR_H = [GR_ad_tbl_1], 8 // Load G_1, H_1 | |
2263 | nop.f 0 | |
2264 | add rZ2Addr = 0x140, GR_ad_z_1 // Point to Constants_Z_2 | |
2265 | } | |
2266 | { .mib | |
2267 | ldfd FR_h = [rTmpPtr] // Load h_1 | |
2268 | // If arg is less or equal to -2^63 | |
2269 | cmp.geu.unc p8,p0 = rSignExpX, rNegSingularity | |
2270 | // Singularity for x < -2^63 since all such arguments are integers | |
2271 | // branch to special code which deals with singularity | |
2272 | (p8) br.cond.spnt lgammal_singularity | |
2273 | } | |
2274 | ;; | |
2275 | { .mfi | |
2276 | ldfe FR_log2_hi = [GR_ad_q], 32 // Load log2_hi | |
2277 | nop.f 0 | |
2278 | extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1 | |
2279 | } | |
2280 | { .mfi | |
2281 | ldfe FR_log2_lo = [rTmpPtr3], 32 // Load log2_lo | |
2282 | fms.s1 fDx = f8, f1, fFltIntX // x - RTN(x) | |
2283 | // index in table with roots and bounds | |
2284 | adds rXint = -2, rXRnd | |
2285 | } | |
2286 | ;; | |
2287 | { .mfi | |
2288 | ldfe FR_Q4 = [GR_ad_q], 32 // Load Q4 | |
2289 | nop.f 0 | |
2290 | // set p12 if x may be close to negative root: -19.5 < x < -2.0 | |
2291 | cmp.gtu p12, p0 = 18, rXint | |
2292 | } | |
2293 | { .mfi | |
2294 | shladd GR_ad_z_2 = GR_Index2, 2, rZ2Addr // Point to Z_2 | |
2295 | fma.s1 fRcpX = fInvX, fRcpX, fRcpX // end of 1st NR iteration | |
2296 | // Point to Constants_G_H_h2 | |
2297 | add rTbl2Addr = 0x180, GR_ad_z_1 | |
2298 | } | |
2299 | ;; | |
2300 | { .mfi | |
2301 | shladd GR_ad_tbl_2 = GR_Index2, 4, rTbl2Addr // Point to G_2 | |
2302 | // set p9 if x is integer and negative | |
2303 | fcmp.eq.s1 p9, p0 = f8,fFltIntX | |
2304 | // Point to Constants_G_H_h3 | |
2305 | add rTbl3Addr = 0x280, GR_ad_z_1 | |
2306 | } | |
2307 | { .mfi | |
2308 | ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2 | |
2309 | nop.f 0 | |
2310 | sub GR_N = rExpX, rExpHalf, 1 | |
2311 | } | |
2312 | ;; | |
2313 | { .mfi | |
2314 | ldfe FR_Q3 = [rTmpPtr3], 32 // Load Q3 | |
2315 | nop.f 0 | |
2316 | // Point to lnsin polynomial coefficients | |
2317 | adds rLnSinDataPtr = 864, rTbl3Addr | |
2318 | } | |
2319 | { .mfi | |
2320 | ldfe FR_Q2 = [GR_ad_q],32 // Load Q2 | |
2321 | nop.f 0 | |
2322 | add rTmpPtr = 8, GR_ad_tbl_2 | |
2323 | } | |
2324 | ;; | |
2325 | { .mfi | |
2326 | ldfe FR_Q1 = [rTmpPtr3] // Load Q1 | |
2327 | fcmp.lt.s1 p0, p15 = fAbsX, fSix // p15 is set when x < -6.0 | |
2328 | // point to table with roots and bounds | |
2329 | adds rRootsBndAddr = -1296, GR_ad_z_1 | |
2330 | } | |
2331 | { .mfb | |
2332 | // Put integer N into rightmost significand | |
2333 | setf.sig fFloatN = GR_N | |
2334 | fma.s1 fThirteen = fSix, fTwo, f1 // 13.0 | |
2335 | // Singularity if -2^63 < x < 0 and x is integer | |
2336 | // branch to special code which deals with singularity | |
2337 | (p9) br.cond.spnt lgammal_singularity | |
2338 | } | |
2339 | ;; | |
2340 | { .mfi | |
2341 | ldfps FR_G2, FR_H2 = [GR_ad_tbl_2] // Load G_2, H_2 | |
2342 | // y = |x|/2^(exponent(x)) - 1.5 | |
2343 | fms.s1 FR_FracX = fSignifX, f1, FR_MHalf | |
2344 | // Get bits 30-15 of X_1 * Z_2 | |
2345 | pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 | |
2346 | } | |
2347 | { .mfi | |
2348 | ldfd FR_h2 = [rTmpPtr] // Load h_2 | |
2349 | fma.s1 fDxSqr = fDx, fDx, f0 // deltaX^2 | |
2350 | adds rTmpPtr3 = 128, rLnSinDataPtr | |
2351 | } | |
2352 | ;; | |
2353 | // | |
2354 | // For performance, don't use result of pmpyshr2.u for 4 cycles. | |
2355 | // | |
2356 | { .mfi | |
2357 | getf.exp rRoot = fRoot // sign and biased exponent of (x + 1) | |
2358 | nop.f 0 | |
2359 | // set p6 if -4 < x <= -2 | |
2360 | cmp.eq p6, p0 = rExpX, rExp2 | |
2361 | } | |
2362 | { .mfi | |
2363 | ldfpd fLnSin2, fLnSin2L = [rLnSinDataPtr], 16 | |
2364 | fnma.s1 fInvX = f8, fRcpX, f1 // start of 2nd NR iteration | |
2365 | sub rIndexPol = rExpX, rExpHalf // index of polynom | |
2366 | } | |
2367 | ;; | |
2368 | { .mfi | |
2369 | ldfe fLnSin4 = [rLnSinDataPtr], 96 | |
2370 | // p10 is set if x is potential "right" root | |
2371 | // p11 set for possible "left" root | |
2372 | fcmp.lt.s1 p10, p11 = fDx, f0 | |
2373 | shl rIndexPol = rIndexPol, 6 // (i*16)*4 | |
2374 | } | |
2375 | { .mfi | |
2376 | ldfpd fLnSin18, fLnSin20 = [rTmpPtr3], 16 | |
2377 | nop.f 0 | |
2378 | mov rExp2tom7 = 0x0fff8 // Exponent of 2^-7 | |
2379 | } | |
2380 | ;; | |
2381 | { .mfi | |
2382 | getf.sig rSignifDx = fDx // Get significand of x - RTN(x) | |
2383 | nop.f 0 | |
2384 | // set p6 if -4 < x <= -3.0 | |
2385 | (p6) cmp.le.unc p6, p0 = 0x8, GR_Index1 | |
2386 | } | |
2387 | { .mfi | |
2388 | ldfpd fLnSin22, fLnSin24 = [rTmpPtr3], 16 | |
2389 | nop.f 0 | |
2390 | // mask sign bit in the exponent of (x + 1) | |
2391 | and rRoot = rRoot, r17Ones | |
2392 | } | |
2393 | ;; | |
2394 | { .mfi | |
2395 | ldfe fLnSin16 = [rLnSinDataPtr], -80 | |
2396 | nop.f 0 | |
2397 | extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2 | |
2398 | } | |
2399 | { .mfi | |
2400 | ldfpd fLnSin26, fLnSin28 = [rTmpPtr3], 16 | |
2401 | nop.f 0 | |
2402 | and rXRnd = 1, rXRnd | |
2403 | } | |
2404 | ;; | |
2405 | { .mfi | |
2406 | shladd GR_ad_tbl_3 = GR_Index3, 4, rTbl3Addr // Point to G_3 | |
2407 | fms.s1 fDxSqrL = fDx, fDx, fDxSqr // low part of deltaX^2 | |
2408 | // potential "left" root | |
2409 | (p11) adds rRootsBndAddr = 560, rRootsBndAddr | |
2410 | } | |
2411 | { .mib | |
2412 | ldfpd fLnSin30, fLnSin32 = [rTmpPtr3], 16 | |
2413 | // set p7 if |x+1| < 2^-7 | |
2414 | cmp.lt p7, p0 = rRoot, rExp2tom7 | |
2415 | // branch to special path for |x+1| < 2^-7 | |
2416 | (p7) br.cond.spnt _closeToNegOne | |
2417 | } | |
2418 | ;; | |
2419 | { .mfi | |
2420 | ldfps FR_G3, FR_H3 = [GR_ad_tbl_3], 8 // Load G_3, H_3 | |
2421 | fcmp.lt.s1 p14, p0 = fAbsX, fThirteen // set p14 if x > -13.0 | |
2422 | // base address of polynomial on range [-6.0, -0.75] | |
2423 | adds rPolDataPtr = 3440, rTbl3Addr | |
2424 | } | |
2425 | { .mfi | |
2426 | // (i*64)*4 + (i*64) = i*320 - offset of polynomial on range [-6.0, -0.75] | |
2427 | shladd rTmpPtr = rIndexPol, 2, rIndexPol | |
2428 | fma.s1 fXSqr = FR_FracX, FR_FracX, f0 // y^2 | |
2429 | // point to left "near root" bound | |
2430 | (p12) shladd rRootsBndAddr = rXint, 4, rRootsBndAddr | |
2431 | } | |
2432 | ;; | |
2433 | { .mfi | |
2434 | ldfpd fLnSin34, fLnSin36 = [rTmpPtr3], 16 | |
2435 | fma.s1 fRcpX = fInvX, fRcpX, fRcpX // end of 2nd NR iteration | |
2436 | // add special offset if -4 < x <= -3.0 | |
2437 | (p6) adds rPolDataPtr = 640, rPolDataPtr | |
2438 | } | |
2439 | { .mfi | |
2440 | // point to right "near root" bound | |
2441 | adds rTmpPtr2 = 8, rRootsBndAddr | |
2442 | fnma.s1 fMOne = f1, f1, f0 // -1.0 | |
2443 | // Point to Bernoulli numbers | |
2444 | adds rBernulliPtr = 544, rTbl3Addr | |
2445 | } | |
2446 | ;; | |
2447 | { .mfi | |
2448 | // left bound of "near root" range | |
2449 | (p12) ld8 rLeftBound = [rRootsBndAddr] | |
2450 | fmerge.se fNormDx = f1, fDx // significand of DeltaX | |
2451 | // base address + offset for polynomial coeff. on range [-6.0, -0.75] | |
2452 | add rPolDataPtr = rPolDataPtr, rTmpPtr | |
2453 | } | |
2454 | { .mfi | |
2455 | // right bound of "near root" range | |
2456 | (p12) ld8 rRightBound = [rTmpPtr2] | |
2457 | fcvt.xf fFloatN = fFloatN | |
2458 | // special "Bernoulli" numbers for Stirling's formula for -13 < x < -6 | |
2459 | (p14) adds rBernulliPtr = 160, rBernulliPtr | |
2460 | } | |
2461 | ;; | |
2462 | { .mfi | |
2463 | ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3 | |
2464 | fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2 | |
2465 | adds rTmpPtr3 = -160, rTmpPtr3 | |
2466 | } | |
2467 | { .mfb | |
2468 | adds rTmpPtr = 80, rPolDataPtr | |
2469 | fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2 | |
2470 | // p15 is set if -2^63 < x < -6.0 and x is not an integer | |
2471 | // branch to path with implementation using Stirling's formula for neg. x | |
2472 | (p15) br.cond.spnt _negStirling | |
2473 | } | |
2474 | ;; | |
2475 | { .mfi | |
2476 | ldfpd fA3, fA3L = [rPolDataPtr], 16 // A3 | |
2477 | fma.s1 fDelX4 = fDxSqr, fDxSqr, f0 // deltaX^4 | |
2478 | // Get high 4 bits of signif | |
2479 | extr.u rIndex1Dx = rSignifDx, 59, 4 | |
2480 | } | |
2481 | { .mfi | |
2482 | ldfe fA5 = [rTmpPtr], -16 // A5 | |
2483 | fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2 | |
2484 | adds rLnSinTmpPtr = 16, rLnSinDataPtr | |
2485 | } | |
2486 | ;; | |
2487 | { .mfi | |
2488 | ldfpd fA0, fA0L = [rPolDataPtr], 16 // A0 | |
2489 | fma.s1 fLnSin20 = fLnSin20, fDxSqr, fLnSin18 | |
2490 | // Get high 15 bits of significand | |
2491 | extr.u rX0Dx = rSignifDx, 49, 15 | |
2492 | } | |
2493 | { .mfi | |
2494 | ldfe fA4 = [rTmpPtr], 192 // A4 | |
2495 | fms.s1 fXSqrL = FR_FracX, FR_FracX, fXSqr // low part of y^2 | |
2496 | shladd GR_ad_z_1 = rIndex1Dx, 2, GR_ad_z_1 // Point to Z_1 | |
2497 | } | |
2498 | ;; | |
2499 | { .mfi | |
2500 | ldfpd fA1, fA1L = [rPolDataPtr], 16 // A1 | |
2501 | fma.s1 fX4 = fXSqr, fXSqr, f0 // y^4 | |
2502 | adds rTmpPtr2 = 32, rTmpPtr | |
2503 | } | |
2504 | { .mfi | |
2505 | ldfpd fA18, fA19 = [rTmpPtr], 16 // A18, A19 | |
2506 | fma.s1 fLnSin24 = fLnSin24, fDxSqr, fLnSin22 | |
2507 | nop.i 0 | |
2508 | } | |
2509 | ;; | |
2510 | { .mfi | |
2511 | ldfe fLnSin6 = [rLnSinDataPtr], 32 | |
2512 | fma.s1 fLnSin28 = fLnSin28, fDxSqr, fLnSin26 | |
2513 | nop.i 0 | |
2514 | } | |
2515 | { .mfi | |
2516 | ldfe fLnSin8 = [rLnSinTmpPtr], 32 | |
2517 | nop.f 0 | |
2518 | nop.i 0 | |
2519 | } | |
2520 | ;; | |
2521 | { .mfi | |
2522 | ldfpd fA20, fA21 = [rTmpPtr], 16 // A20, A21 | |
2523 | fma.s1 fLnSin32 = fLnSin32, fDxSqr, fLnSin30 | |
2524 | nop.i 0 | |
2525 | } | |
2526 | { .mfi | |
2527 | ldfpd fA22, fA23 = [rTmpPtr2], 16 // A22, A23 | |
2528 | fma.s1 fB20 = f1, f1, FR_MHalf // 2.5 | |
2529 | (p12) cmp.ltu.unc p6, p0 = rSignifX, rLeftBound | |
2530 | } | |
2531 | ;; | |
2532 | { .mfi | |
2533 | ldfpd fA2, fA2L = [rPolDataPtr], 16 // A2 | |
2534 | fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3 | |
2535 | // set p6 if x falls in "near root" range | |
2536 | (p6) cmp.geu.unc p6, p0 = rSignifX, rRightBound | |
2537 | } | |
2538 | { .mfb | |
2539 | adds rTmpPtr3 = -64, rTmpPtr | |
2540 | fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3 | |
2541 | // branch to special path if x falls in "near root" range | |
2542 | (p6) br.cond.spnt _negRoots | |
2543 | } | |
2544 | ;; | |
2545 | { .mfi | |
2546 | ldfpd fA24, fA25 = [rTmpPtr2], 16 // A24, A25 | |
2547 | fma.s1 fLnSin36 = fLnSin36, fDxSqr, fLnSin34 | |
2548 | (p11) cmp.eq.unc p7, p0 = 1,rXint // p7 set if -3.0 < x < -2.5 | |
2549 | } | |
2550 | { .mfi | |
2551 | adds rTmpPtr = -48, rTmpPtr | |
2552 | fma.s1 fLnSin20 = fLnSin20, fDxSqr, fLnSin16 | |
2553 | addl rDelta = 0x5338, r0 // significand of -2.605859375 | |
2554 | } | |
2555 | ;; | |
2556 | { .mfi | |
2557 | getf.exp GR_N = fDx // Get N = exponent of DeltaX | |
2558 | fma.s1 fX6 = fX4, fXSqr, f0 // y^6 | |
2559 | // p7 set if -2.605859375 <= x < -2.5 | |
2560 | (p7) cmp.gt.unc p7, p0 = rDelta, GR_X_0 | |
2561 | } | |
2562 | { .mfb | |
2563 | ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1 | |
2564 | fma.s1 fDelX8 = fDelX4, fDelX4, f0 // deltaX^8 | |
2565 | // branch to special path for -2.605859375 <= x < -2.5 | |
2566 | (p7) br.cond.spnt _neg2andHalf | |
2567 | } | |
2568 | ;; | |
2569 | { .mfi | |
2570 | ldfpd fA14, fA15 = [rTmpPtr3], 16 // A14, A15 | |
2571 | fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3 | |
2572 | adds rTmpPtr2 = 128 , rPolDataPtr | |
2573 | } | |
2574 | { .mfi | |
2575 | ldfpd fA16, fA17 = [rTmpPtr], 16 // A16, A17 | |
2576 | fma.s1 fLnSin28 = fLnSin28, fDelX4, fLnSin24 | |
2577 | adds rPolDataPtr = 144 , rPolDataPtr | |
2578 | } | |
2579 | ;; | |
2580 | { .mfi | |
2581 | ldfe fLnSin10 = [rLnSinDataPtr], 32 | |
2582 | fma.s1 fRes1H = fA3, FR_FracX, f0 // (A3*y)hi | |
2583 | and GR_N = GR_N, r17Ones // mask sign bit | |
2584 | } | |
2585 | { .mfi | |
2586 | ldfe fLnSin12 = [rLnSinTmpPtr] | |
2587 | fma.s1 fDelX6 = fDxSqr, fDelX4, f0 // DeltaX^6 | |
2588 | shladd GR_ad_tbl_1 = rIndex1Dx, 4, rTbl1Addr // Point to G_1 | |
2589 | } | |
2590 | ;; | |
2591 | { .mfi | |
2592 | ldfe fA13 = [rPolDataPtr], -32 // A13 | |
2593 | fma.s1 fA4 = fA5, FR_FracX, fA4 // A5*y + A4 | |
2594 | // Get bits 30-15 of X_0 * Z_1 | |
2595 | pmpyshr2.u GR_X_1 = rX0Dx, GR_Z_1, 15 | |
2596 | } | |
2597 | { .mfi | |
2598 | ldfe fA12 = [rTmpPtr2], -32 // A12 | |
2599 | fms.s1 FR_r = FR_G, fSignifX, f1 // r = G * S_hi - 1 | |
2600 | sub GR_N = GR_N, rExpHalf, 1 // unbiased exponent of DeltaX | |
2601 | } | |
2602 | ;; | |
2603 | // | |
2604 | // For performance, don't use result of pmpyshr2.u for 4 cycles. | |
2605 | // | |
2606 | .pred.rel "mutex",p10,p11 | |
2607 | { .mfi | |
2608 | ldfe fA11 = [rPolDataPtr], -32 // A11 | |
2609 | // High part of log(|x|) = Y_hi = N * log2_hi + H | |
2610 | fma.s1 fResH = fFloatN, FR_log2_hi, FR_H | |
2611 | (p10) cmp.eq p8, p9 = rXRnd, r0 | |
2612 | } | |
2613 | { .mfi | |
2614 | ldfe fA10 = [rTmpPtr2], -32 // A10 | |
2615 | fma.s1 fRes6H = fA1, FR_FracX, f0 // (A1*y)hi | |
2616 | (p11) cmp.eq p9, p8 = rXRnd, r0 | |
2617 | } | |
2618 | ;; | |
2619 | { .mfi | |
2620 | ldfe fA9 = [rPolDataPtr], -32 // A9 | |
2621 | fma.s1 fB14 = fLnSin6, fDxSqr, f0 // (LnSin6*deltaX^2)hi | |
2622 | cmp.eq p6, p7 = 4, rSgnGamSize | |
2623 | } | |
2624 | { .mfi | |
2625 | ldfe fA8 = [rTmpPtr2], -32 // A8 | |
2626 | fma.s1 fA18 = fA19, FR_FracX, fA18 | |
2627 | nop.i 0 | |
2628 | } | |
2629 | ;; | |
2630 | { .mfi | |
2631 | ldfe fA7 = [rPolDataPtr] // A7 | |
2632 | fma.s1 fA23 = fA23, FR_FracX, fA22 | |
2633 | nop.i 0 | |
2634 | } | |
2635 | { .mfi | |
2636 | ldfe fA6 = [rTmpPtr2] // A6 | |
2637 | fma.s1 fA21 = fA21, FR_FracX, fA20 | |
2638 | nop.i 0 | |
2639 | } | |
2640 | ;; | |
2641 | { .mfi | |
2642 | ldfe fLnSin14 = [rLnSinDataPtr] | |
2643 | fms.s1 fRes1L = fA3, FR_FracX, fRes1H // delta((A3*y)hi) | |
2644 | extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1 | |
2645 | } | |
2646 | { .mfi | |
2647 | setf.sig fFloatNDx = GR_N | |
2648 | fadd.s1 fPol = fRes1H, fA2 // (A3*y + A2)hi | |
2649 | nop.i 0 | |
2650 | } | |
2651 | ;; | |
2652 | { .mfi | |
2653 | ldfps FR_G, FR_H = [GR_ad_tbl_1], 8 // Load G_1, H_1 | |
2654 | fma.s1 fRes2H = fA4, fXSqr, f0 // ((A5 + A4*y)*y^2)hi | |
2655 | nop.i 0 | |
2656 | } | |
2657 | { .mfi | |
2658 | shladd GR_ad_z_2 = GR_Index2, 2, rZ2Addr // Point to Z_2 | |
2659 | fma.s1 fA25 = fA25, FR_FracX, fA24 | |
2660 | shladd GR_ad_tbl_2 = GR_Index2, 4, rTbl2Addr // Point to G_2 | |
2661 | } | |
2662 | ;; | |
2663 | .pred.rel "mutex",p8,p9 | |
2664 | { .mfi | |
2665 | ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2 | |
2666 | fms.s1 fRes6L = fA1, FR_FracX, fRes6H // delta((A1*y)hi) | |
2667 | // sign of GAMMA(x) is negative | |
2668 | (p8) adds rSgnGam = -1, r0 | |
2669 | } | |
2670 | { .mfi | |
2671 | adds rTmpPtr = 8, GR_ad_tbl_2 | |
2672 | fadd.s1 fRes3H = fRes6H, fA0 // (A1*y + A0)hi | |
2673 | // sign of GAMMA(x) is positive | |
2674 | (p9) adds rSgnGam = 1, r0 | |
2675 | } | |
2676 | ;; | |
2677 | { .mfi | |
2678 | ldfps FR_G2, FR_H2 = [GR_ad_tbl_2] // Load G_2, H_2 | |
2679 | // (LnSin6*deltaX^2 + LnSin4)hi | |
2680 | fadd.s1 fLnSinH = fB14, fLnSin4 | |
2681 | nop.i 0 | |
2682 | } | |
2683 | { .mfi | |
2684 | ldfd FR_h2 = [rTmpPtr] // Load h_2 | |
2685 | fms.s1 fB16 = fLnSin6, fDxSqr, fB14 // delta(LnSin6*deltaX^2) | |
2686 | nop.i 0 | |
2687 | } | |
2688 | ;; | |
2689 | { .mfi | |
2690 | ldfd fhDelX = [GR_ad_tbl_1] // Load h_1 | |
2691 | fma.s1 fA21 = fA21, fXSqr, fA18 | |
2692 | nop.i 0 | |
2693 | } | |
2694 | { .mfi | |
2695 | nop.m 0 | |
2696 | fma.s1 fLnSin36 = fLnSin36, fDelX4, fLnSin32 | |
2697 | nop.i 0 | |
2698 | } | |
2699 | ;; | |
2700 | { .mfi | |
2701 | nop.m 0 | |
2702 | fma.s1 fRes1L = fA3L, FR_FracX, fRes1L // (A3*y)lo | |
2703 | // Get bits 30-15 of X_1 * Z_2 | |
2704 | pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 | |
2705 | } | |
2706 | { .mfi | |
2707 | nop.m 0 | |
2708 | fsub.s1 fPolL = fA2, fPol | |
2709 | nop.i 0 | |
2710 | } | |
2711 | ;; | |
2712 | // | |
2713 | // For performance, don't use result of pmpyshr2.u for 4 cycles. | |
2714 | // | |
2715 | { .mfi | |
2716 | nop.m 0 | |
2717 | // delta(((A5 + A4*y)*y^2)hi) | |
2718 | fms.s1 fRes2L = fA4, fXSqr, fRes2H | |
2719 | nop.i 0 | |
2720 | } | |
2721 | { .mfi | |
2722 | nop.m 0 | |
2723 | // (((A5 + A4*y)*y^2) + A3*y + A2)hi | |
2724 | fadd.s1 fRes4H = fRes2H, fPol | |
2725 | nop.i 0 | |
2726 | } | |
2727 | ;; | |
2728 | { .mfi | |
2729 | // store signgam if size of variable is 4 bytes | |
2730 | (p6) st4 [rSgnGamAddr] = rSgnGam | |
2731 | fma.s1 fRes6L = fA1L, FR_FracX, fRes6L // (A1*y)lo | |
2732 | nop.i 0 | |
2733 | } | |
2734 | { .mfi | |
2735 | // store signgam if size of variable is 8 bytes | |
2736 | (p7) st8 [rSgnGamAddr] = rSgnGam | |
2737 | fsub.s1 fRes3L = fA0, fRes3H | |
2738 | nop.i 0 | |
2739 | } | |
2740 | ;; | |
2741 | { .mfi | |
2742 | nop.m 0 | |
2743 | fsub.s1 fLnSinL = fLnSin4, fLnSinH | |
2744 | nop.i 0 | |
2745 | } | |
2746 | { .mfi | |
2747 | nop.m 0 | |
2748 | // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2)hi | |
2749 | fma.s1 fB18 = fLnSinH, fDxSqr, f0 | |
2750 | nop.i 0 | |
2751 | } | |
2752 | ;; | |
2753 | { .mfi | |
2754 | adds rTmpPtr = 8, rTbl3Addr | |
2755 | fma.s1 fB16 = fLnSin6, fDxSqrL, fB16 // (LnSin6*deltaX^2)lo | |
2756 | extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2 | |
2757 | } | |
2758 | { .mfi | |
2759 | nop.m 0 | |
2760 | fma.s1 fA25 = fA25, fXSqr, fA23 | |
2761 | nop.i 0 | |
2762 | } | |
2763 | ;; | |
2764 | { .mfi | |
2765 | shladd GR_ad_tbl_3 = GR_Index3, 4, rTbl3Addr // Point to G_3 | |
2766 | fadd.s1 fPolL = fPolL, fRes1H | |
2767 | nop.i 0 | |
2768 | } | |
2769 | { .mfi | |
2770 | shladd rTmpPtr = GR_Index3, 4, rTmpPtr // Point to G_3 | |
2771 | fadd.s1 fRes1L = fRes1L, fA2L // (A3*y)lo + A2lo | |
2772 | nop.i 0 | |
2773 | } | |
2774 | ;; | |
2775 | { .mfi | |
2776 | ldfps FR_G3, FR_H3 = [GR_ad_tbl_3] // Load G_3, H_3 | |
2777 | fma.s1 fRes2L = fA4, fXSqrL, fRes2L // ((A5 + A4*y)*y^2)lo | |
2778 | nop.i 0 | |
2779 | } | |
2780 | { .mfi | |
2781 | ldfd FR_h3 = [rTmpPtr] // Load h_3 | |
2782 | fsub.s1 fRes4L = fPol, fRes4H | |
2783 | nop.i 0 | |
2784 | } | |
2785 | ;; | |
2786 | { .mfi | |
2787 | nop.m 0 | |
2788 | // ((((A5 + A4*y)*y^2) + A3*y + A2)*y^2)hi | |
2789 | fma.s1 fRes7H = fRes4H, fXSqr, f0 | |
2790 | nop.i 0 | |
2791 | } | |
2792 | { .mfi | |
2793 | nop.m 0 | |
2794 | fma.s1 fA15 = fA15, FR_FracX, fA14 | |
2795 | nop.i 0 | |
2796 | } | |
2797 | ;; | |
2798 | { .mfi | |
2799 | nop.m 0 | |
2800 | fadd.s1 fRes3L = fRes3L, fRes6H | |
2801 | nop.i 0 | |
2802 | } | |
2803 | { .mfi | |
2804 | nop.m 0 | |
2805 | fadd.s1 fRes6L = fRes6L, fA0L // (A1*y)lo + A0lo | |
2806 | nop.i 0 | |
2807 | } | |
2808 | ;; | |
2809 | { .mfi | |
2810 | nop.m 0 | |
2811 | fadd.s1 fLnSinL = fLnSinL, fB14 | |
2812 | ||
2813 | nop.i 0 | |
2814 | } | |
2815 | { .mfi | |
2816 | nop.m 0 | |
2817 | // delta((LnSin6*deltaX^2 + LnSin4)*deltaX^2) | |
2818 | fms.s1 fB20 = fLnSinH, fDxSqr, fB18 | |
2819 | nop.i 0 | |
2820 | } | |
2821 | ;; | |
2822 | { .mfi | |
2823 | nop.m 0 | |
2824 | fadd.s1 fPolL = fPolL, fRes1L // (A3*y + A2)lo | |
2825 | ||
2826 | nop.i 0 | |
2827 | } | |
2828 | { .mfi | |
2829 | nop.m 0 | |
2830 | // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2 + LnSin2)hi | |
2831 | fadd.s1 fLnSin6 = fB18, fLnSin2 | |
2832 | nop.i 0 | |
2833 | } | |
2834 | ;; | |
2835 | { .mfi | |
2836 | nop.m 0 | |
2837 | fadd.s1 fRes4L = fRes4L, fRes2H | |
2838 | nop.i 0 | |
2839 | } | |
2840 | { .mfi | |
2841 | nop.m 0 | |
2842 | fma.s1 fA17 = fA17, FR_FracX, fA16 | |
2843 | nop.i 0 | |
2844 | } | |
2845 | ;; | |
2846 | { .mfi | |
2847 | nop.m 0 | |
2848 | // delta(((((A5 + A4*y)*y^2) + A3*y + A2)*y^2) | |
2849 | fms.s1 fRes7L = fRes4H, fXSqr, fRes7H | |
2850 | nop.i 0 | |
2851 | } | |
2852 | { .mfi | |
2853 | nop.m 0 | |
2854 | fadd.s1 fPol = fRes7H, fRes3H | |
2855 | nop.i 0 | |
2856 | } | |
2857 | ;; | |
2858 | { .mfi | |
2859 | nop.m 0 | |
2860 | fadd.s1 fRes3L = fRes3L, fRes6L // (A1*y + A0)lo | |
2861 | nop.i 0 | |
2862 | } | |
2863 | { .mfi | |
2864 | nop.m 0 | |
2865 | fma.s1 fA25 = fA25, fX4, fA21 | |
2866 | nop.i 0 | |
2867 | } | |
2868 | ;; | |
2869 | { .mfi | |
2870 | nop.m 0 | |
2871 | // (LnSin6*deltaX^2 + LnSin4)lo | |
2872 | fadd.s1 fLnSinL = fLnSinL, fB16 | |
2873 | nop.i 0 | |
2874 | } | |
2875 | { .mfi | |
2876 | nop.m 0 | |
2877 | fma.s1 fB20 = fLnSinH, fDxSqrL, fB20 | |
2878 | nop.i 0 | |
2879 | } | |
2880 | ;; | |
2881 | { .mfi | |
2882 | nop.m 0 | |
2883 | fsub.s1 fLnSin4 = fLnSin2, fLnSin6 | |
2884 | nop.i 0 | |
2885 | } | |
2886 | { .mfi | |
2887 | nop.m 0 | |
2888 | // (((LnSin6*deltaX^2 + LnSin4)*deltaX^2 + LnSin2)*DeltaX^2)hi | |
2889 | fma.s1 fLnSinH = fLnSin6, fDxSqr, f0 | |
2890 | nop.i 0 | |
2891 | } | |
2892 | ;; | |
2893 | { .mfi | |
2894 | nop.m 0 | |
2895 | // ((A5 + A4*y)*y^2)lo + (A3*y + A2)lo | |
2896 | fadd.s1 fRes2L = fRes2L, fPolL | |
2897 | nop.i 0 | |
2898 | } | |
2899 | { .mfi | |
2900 | nop.m 0 | |
2901 | fma.s1 fA17 = fA17, fXSqr, fA15 | |
2902 | nop.i 0 | |
2903 | } | |
2904 | ;; | |
2905 | { .mfi | |
2906 | nop.m 0 | |
2907 | // ((((A5 + A4*y)*y^2) + A3*y + A2)*y^2)lo | |
2908 | fma.s1 fRes7L = fRes4H, fXSqrL, fRes7L | |
2909 | nop.i 0 | |
2910 | } | |
2911 | { .mfi | |
2912 | nop.m 0 | |
2913 | fsub.s1 fPolL = fRes3H, fPol | |
2914 | nop.i 0 | |
2915 | } | |
2916 | ;; | |
2917 | { .mfi | |
2918 | nop.m 0 | |
2919 | fma.s1 fA13 = fA13, FR_FracX, fA12 | |
2920 | nop.i 0 | |
2921 | } | |
2922 | { .mfi | |
2923 | nop.m 0 | |
2924 | fma.s1 fA11 = fA11, FR_FracX, fA10 | |
2925 | nop.i 0 | |
2926 | } | |
2927 | ;; | |
2928 | { .mfi | |
2929 | nop.m 0 | |
2930 | // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2)lo | |
2931 | fma.s1 fB20 = fLnSinL, fDxSqr, fB20 | |
2932 | nop.i 0 | |
2933 | } | |
2934 | { .mfi | |
2935 | nop.m 0 | |
2936 | fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2 | |
2937 | nop.i 0 | |
2938 | } | |
2939 | ;; | |
2940 | { .mfi | |
2941 | nop.m 0 | |
2942 | fadd.s1 fLnSin4 = fLnSin4, fB18 | |
2943 | nop.i 0 | |
2944 | } | |
2945 | { .mfi | |
2946 | nop.m 0 | |
2947 | fms.s1 fLnSinL = fLnSin6, fDxSqr, fLnSinH | |
2948 | nop.i 0 | |
2949 | } | |
2950 | ;; | |
2951 | { .mfi | |
2952 | nop.m 0 | |
2953 | // (((A5 + A4*y)*y^2) + A3*y + A2)lo | |
2954 | fadd.s1 fRes4L = fRes4L, fRes2L | |
2955 | nop.i 0 | |
2956 | } | |
2957 | { .mfi | |
2958 | nop.m 0 | |
2959 | fadd.s1 fhDelX = fhDelX, FR_h2 // h = h_1 + h_2 | |
2960 | nop.i 0 | |
2961 | } | |
2962 | ;; | |
2963 | { .mfi | |
2964 | nop.m 0 | |
2965 | fadd.s1 fRes7L = fRes7L, fRes3L | |
2966 | nop.i 0 | |
2967 | } | |
2968 | { .mfi | |
2969 | nop.m 0 | |
2970 | fadd.s1 fPolL = fPolL, fRes7H | |
2971 | nop.i 0 | |
2972 | } | |
2973 | ;; | |
2974 | { .mfi | |
2975 | nop.m 0 | |
2976 | fcvt.xf fFloatNDx = fFloatNDx | |
2977 | nop.i 0 | |
2978 | } | |
2979 | { .mfi | |
2980 | nop.m 0 | |
2981 | fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2 | |
2982 | nop.i 0 | |
2983 | } | |
2984 | ;; | |
2985 | { .mfi | |
2986 | nop.m 0 | |
2987 | fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3 | |
2988 | nop.i 0 | |
2989 | } | |
2990 | { .mfi | |
2991 | nop.m 0 | |
2992 | // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2)lo + (LnSin2)lo | |
2993 | fadd.s1 fLnSin2L = fLnSin2L, fB20 | |
2994 | nop.i 0 | |
2995 | } | |
2996 | ;; | |
2997 | { .mfi | |
2998 | nop.m 0 | |
2999 | fma.s1 fA25 = fA25, fX4, fA17 | |
3000 | nop.i 0 | |
3001 | } | |
3002 | { .mfi | |
3003 | nop.m 0 | |
3004 | fma.s1 fA13 = fA13, fXSqr, fA11 | |
3005 | nop.i 0 | |
3006 | } | |
3007 | ;; | |
3008 | { .mfi | |
3009 | nop.m 0 | |
3010 | fma.s1 fA9 = fA9, FR_FracX, fA8 | |
3011 | nop.i 0 | |
3012 | } | |
3013 | { .mfi | |
3014 | nop.m 0 | |
3015 | fma.s1 fA7 = fA7, FR_FracX, fA6 | |
3016 | nop.i 0 | |
3017 | } | |
3018 | ;; | |
3019 | { .mfi | |
3020 | nop.m 0 | |
3021 | fma.s1 fLnSin36 = fLnSin36, fDelX8, fLnSin28 | |
3022 | nop.i 0 | |
3023 | } | |
3024 | { .mfi | |
3025 | nop.m 0 | |
3026 | fma.s1 fLnSin14 = fLnSin14, fDxSqr, fLnSin12 | |
3027 | nop.i 0 | |
3028 | } | |
3029 | ;; | |
3030 | { .mfi | |
3031 | nop.m 0 | |
3032 | fma.s1 fLnSin10 = fLnSin10, fDxSqr, fLnSin8 | |
3033 | nop.i 0 | |
3034 | } | |
3035 | { .mfi | |
3036 | nop.m 0 | |
3037 | fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3 | |
3038 | nop.i 0 | |
3039 | } | |
3040 | ;; | |
3041 | { .mfi | |
3042 | nop.m 0 | |
3043 | fms.s1 fRDx = FR_G, fNormDx, f1 // r = G * S_hi - 1 | |
3044 | nop.i 0 | |
3045 | } | |
3046 | { .mfi | |
3047 | nop.m 0 | |
3048 | // poly_lo = r * Q4 + Q3 | |
3049 | fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 | |
3050 | nop.i 0 | |
3051 | } | |
3052 | ;; | |
3053 | { .mfi | |
3054 | nop.m 0 | |
3055 | fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r | |
3056 | nop.i 0 | |
3057 | } | |
3058 | { .mfi | |
3059 | nop.m 0 | |
3060 | // ((((A5 + A4*y)*y^2) + A3*y + A2)*y^2)lo + (A1*y + A0)lo | |
3061 | fma.s1 fRes7L = fRes4L, fXSqr, fRes7L | |
3062 | nop.i 0 | |
3063 | } | |
3064 | ;; | |
3065 | { .mfi | |
3066 | nop.m 0 | |
3067 | fma.s1 fA25 = fA25, fX4, fA13 | |
3068 | nop.i 0 | |
3069 | } | |
3070 | { .mfi | |
3071 | nop.m 0 | |
3072 | fma.s1 fA9 = fA9, fXSqr, fA7 | |
3073 | nop.i 0 | |
3074 | } | |
3075 | ;; | |
3076 | { .mfi | |
3077 | nop.m 0 | |
3078 | // h = N * log2_lo + h | |
3079 | fma.s1 FR_h = fFloatN, FR_log2_lo, FR_h | |
3080 | nop.i 0 | |
3081 | } | |
3082 | { .mfi | |
3083 | nop.m 0 | |
3084 | fadd.s1 fhDelX = fhDelX, FR_h3 // h = (h_1 + h_2) + h_3 | |
3085 | nop.i 0 | |
3086 | } | |
3087 | ;; | |
3088 | { .mfi | |
3089 | nop.m 0 | |
3090 | fma.s1 fLnSin36 = fLnSin36, fDelX6, fLnSin20 | |
3091 | nop.i 0 | |
3092 | } | |
3093 | { .mfi | |
3094 | nop.m 0 | |
3095 | fma.s1 fLnSin14 = fLnSin14, fDelX4, fLnSin10 | |
3096 | nop.i 0 | |
3097 | } | |
3098 | ;; | |
3099 | { .mfi | |
3100 | nop.m 0 | |
3101 | // poly_lo = r * Q4 + Q3 | |
3102 | fma.s1 fPolyLoDx = fRDx, FR_Q4, FR_Q3 | |
3103 | nop.i 0 | |
3104 | } | |
3105 | { .mfi | |
3106 | nop.m 0 | |
3107 | fmpy.s1 fRDxSq = fRDx, fRDx // rsq = r * r | |
3108 | nop.i 0 | |
3109 | } | |
3110 | ;; | |
3111 | { .mfi | |
3112 | nop.m 0 | |
3113 | // Y_hi = N * log2_hi + H | |
3114 | fma.s1 fResLnDxH = fFloatNDx, FR_log2_hi, FR_H | |
3115 | nop.i 0 | |
3116 | } | |
3117 | { .mfi | |
3118 | nop.m 0 | |
3119 | fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3 | |
3120 | nop.i 0 | |
3121 | } | |
3122 | ;; | |
3123 | { .mfi | |
3124 | nop.m 0 | |
3125 | fma.s1 fA9 = fA25, fX4, fA9 | |
3126 | nop.i 0 | |
3127 | } | |
3128 | { .mfi | |
3129 | nop.m 0 | |
3130 | fadd.s1 fPolL = fPolL, fRes7L | |
3131 | nop.i 0 | |
3132 | } | |
3133 | ;; | |
3134 | { .mfi | |
3135 | nop.m 0 | |
3136 | fadd.s1 fLnSin4 = fLnSin4, fLnSin2L | |
3137 | nop.i 0 | |
3138 | } | |
3139 | { .mfi | |
3140 | nop.m 0 | |
3141 | // h = N * log2_lo + h | |
3142 | fma.s1 fhDelX = fFloatNDx, FR_log2_lo, fhDelX | |
3143 | nop.i 0 | |
3144 | } | |
3145 | ;; | |
3146 | { .mfi | |
3147 | nop.m 0 | |
3148 | fma.s1 fLnSin36 = fLnSin36, fDelX8, fLnSin14 | |
3149 | nop.i 0 | |
3150 | } | |
3151 | { .mfi | |
3152 | nop.m 0 | |
3153 | // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2 + LnSin2)lo | |
3154 | fma.s1 fLnSinL = fLnSin6, fDxSqrL, fLnSinL | |
3155 | nop.i 0 | |
3156 | } | |
3157 | ;; | |
3158 | { .mfi | |
3159 | nop.m 0 | |
3160 | // poly_lo = poly_lo * r + Q2 | |
3161 | fma.s1 fPolyLoDx = fPolyLoDx, fRDx, FR_Q2 | |
3162 | nop.i 0 | |
3163 | } | |
3164 | { .mfi | |
3165 | nop.m 0 | |
3166 | fma.s1 fRDxCub = fRDxSq, fRDx, f0 // rcub = r^3 | |
3167 | nop.i 0 | |
3168 | } | |
3169 | ;; | |
3170 | { .mfi | |
3171 | nop.m 0 | |
3172 | famax.s0 fRes5H = fPol, fResH | |
3173 | nop.i 0 | |
3174 | } | |
3175 | { .mfi | |
3176 | nop.m 0 | |
3177 | // High part of (lgammal(|x|) + log(|x|)) | |
3178 | fadd.s1 fRes1H = fPol, fResH | |
3179 | nop.i 0 | |
3180 | } | |
3181 | ;; | |
3182 | { .mfi | |
3183 | nop.m 0 | |
3184 | // poly_lo = poly_lo * r + Q2 | |
3185 | fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 | |
3186 | nop.i 0 | |
3187 | } | |
3188 | { .mfi | |
3189 | nop.m 0 | |
3190 | fma.s1 fPolL = fA9, fX6, fPolL // P25lo | |
3191 | nop.i 0 | |
3192 | } | |
3193 | ;; | |
3194 | ||
3195 | { .mfi | |
3196 | nop.m 0 | |
3197 | famin.s0 fRes5L = fPol, fResH | |
3198 | nop.i 0 | |
3199 | } | |
3200 | { .mfi | |
3201 | nop.m 0 | |
3202 | // High part of -(LnSin + log(|DeltaX|)) | |
3203 | fnma.s1 fRes2H = fResLnDxH, f1, fLnSinH | |
3204 | nop.i 0 | |
3205 | } | |
3206 | ;; | |
3207 | ||
3208 | { .mfi | |
3209 | nop.m 0 | |
3210 | // (((LnSin6*deltaX^2 + LnSin4)*deltaX^2 + LnSin2)*DeltaX^2)lo | |
3211 | fma.s1 fLnSinL = fLnSin4, fDxSqr, fLnSinL | |
3212 | nop.i 0 | |
3213 | } | |
3214 | { .mfi | |
3215 | nop.m 0 | |
3216 | fma.s1 fLnSin36 = fLnSin36, fDelX6, f0 | |
3217 | nop.i 0 | |
3218 | } | |
3219 | ;; | |
3220 | { .mfi | |
3221 | nop.m 0 | |
3222 | // poly_hi = Q1 * rsq + r | |
3223 | fma.s1 fPolyHiDx = FR_Q1, fRDxSq, fRDx | |
3224 | nop.i 0 | |
3225 | } | |
3226 | { .mfi | |
3227 | nop.m 0 | |
3228 | // poly_lo = poly_lo*r^3 + h | |
3229 | fma.s1 fPolyLoDx = fPolyLoDx, fRDxCub, fhDelX | |
3230 | nop.i 0 | |
3231 | } | |
3232 | ;; | |
3233 | { .mfi | |
3234 | nop.m 0 | |
3235 | fsub.s1 fRes1L = fRes5H, fRes1H | |
3236 | nop.i 0 | |
3237 | } | |
3238 | { .mfi | |
3239 | nop.m 0 | |
3240 | // -(lgammal(|x|) + log(|x|))hi | |
3241 | fnma.s1 fRes1H = fRes1H, f1, f0 | |
3242 | ||
3243 | nop.i 0 | |
3244 | } | |
3245 | ;; | |
3246 | { .mfi | |
3247 | nop.m 0 | |
3248 | // poly_hi = Q1 * rsq + r | |
3249 | fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r | |
3250 | nop.i 0 | |
3251 | } | |
3252 | { .mfi | |
3253 | nop.m 0 | |
3254 | // poly_lo = poly_lo*r^3 + h | |
3255 | fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h | |
3256 | nop.i 0 | |
3257 | } | |
3258 | ;; | |
3259 | { .mfi | |
3260 | nop.m 0 | |
3261 | fms.s1 fRes2L = fResLnDxH, fMOne, fRes2H | |
3262 | nop.i 0 | |
3263 | } | |
3264 | ;; | |
3265 | { .mfi | |
3266 | nop.m 0 | |
3267 | fma.s1 fLnSinL = fLnSin36, fDxSqr, fLnSinL | |
3268 | nop.i 0 | |
3269 | } | |
3270 | { .mfi | |
3271 | nop.m 0 | |
3272 | // Y_lo = poly_hi + poly_lo | |
3273 | fadd.s1 fResLnDxL = fPolyHiDx, fPolyLoDx | |
3274 | nop.i 0 | |
3275 | } | |
3276 | ;; | |
3277 | { .mfi | |
3278 | nop.m 0 | |
3279 | fadd.s1 fRes1L = fRes1L, fRes5L | |
3280 | nop.i 0 | |
3281 | } | |
3282 | { .mfi | |
3283 | nop.m 0 | |
3284 | // high part of the final result | |
3285 | fadd.s1 fYH = fRes2H, fRes1H | |
3286 | nop.i 0 | |
3287 | } | |
3288 | ;; | |
3289 | { .mfi | |
3290 | nop.m 0 | |
3291 | // Y_lo = poly_hi + poly_lo | |
3292 | fadd.s1 fResL = FR_poly_hi, FR_poly_lo | |
3293 | nop.i 0 | |
3294 | } | |
3295 | ;; | |
3296 | { .mfi | |
3297 | nop.m 0 | |
3298 | famax.s0 fRes4H = fRes2H, fRes1H | |
3299 | nop.i 0 | |
3300 | } | |
3301 | ;; | |
3302 | { .mfi | |
3303 | nop.m 0 | |
3304 | famin.s0 fRes4L = fRes2H, fRes1H | |
3305 | nop.i 0 | |
3306 | } | |
3307 | ;; | |
3308 | { .mfi | |
3309 | nop.m 0 | |
3310 | // (LnSin)lo + (log(|DeltaX|))lo | |
3311 | fsub.s1 fLnSinL = fLnSinL, fResLnDxL | |
3312 | nop.i 0 | |
3313 | } | |
3314 | { .mfi | |
3315 | nop.m 0 | |
3316 | fadd.s1 fRes2L = fRes2L, fLnSinH | |
3317 | nop.i 0 | |
3318 | } | |
3319 | ;; | |
3320 | { .mfi | |
3321 | nop.m 0 | |
3322 | //(lgammal(|x|))lo + (log(|x|))lo | |
3323 | fadd.s1 fPolL = fResL, fPolL | |
3324 | nop.i 0 | |
3325 | } | |
3326 | ;; | |
3327 | { .mfi | |
3328 | nop.m 0 | |
3329 | fsub.s1 fYL = fRes4H, fYH | |
3330 | nop.i 0 | |
3331 | } | |
3332 | ;; | |
3333 | { .mfi | |
3334 | nop.m 0 | |
3335 | // Low part of -(LnSin + log(|DeltaX|)) | |
3336 | fadd.s1 fRes2L = fRes2L, fLnSinL | |
3337 | nop.i 0 | |
3338 | } | |
3339 | { .mfi | |
3340 | nop.m 0 | |
3341 | // Low part of (lgammal(|x|) + log(|x|)) | |
3342 | fadd.s1 fRes1L = fRes1L, fPolL | |
3343 | nop.i 0 | |
3344 | } | |
3345 | ;; | |
3346 | { .mfi | |
3347 | nop.m 0 | |
3348 | fadd.s1 fYL = fYL, fRes4L | |
3349 | nop.i 0 | |
3350 | } | |
3351 | { .mfi | |
3352 | nop.m 0 | |
3353 | fsub.s1 fRes2L = fRes2L, fRes1L | |
3354 | nop.i 0 | |
3355 | } | |
3356 | ;; | |
3357 | { .mfi | |
3358 | nop.m 0 | |
3359 | // low part of the final result | |
3360 | fadd.s1 fYL = fYL, fRes2L | |
3361 | nop.i 0 | |
3362 | } | |
3363 | ;; | |
3364 | { .mfb | |
3365 | nop.m 0 | |
3366 | // final result for -6.0 < x <= -0.75, non-integer, "far" from roots | |
3367 | fma.s0 f8 = fYH, f1, fYL | |
3368 | // exit here for -6.0 < x <= -0.75, non-integer, "far" from roots | |
3369 | br.ret.sptk b0 | |
3370 | } | |
3371 | ;; | |
3372 | ||
3373 | // here if |x+1| < 2^(-7); deltaX = x+1 is in fDx, result f8 = lnSin - log(|x|) - lgammal(|x|) - log(|deltaX|) | |
3374 | .align 32 | |
3375 | _closeToNegOne: | |
3376 | { .mfi | |
3377 | getf.exp GR_N = fDx // Get N = sign+exponent field of deltaX (sign masked, bias removed below) | |
3378 | fmerge.se fAbsX = f1, fDx // sign/exponent of 1.0 with significand of deltaX (S_hi used for r below) | |
3379 | // Get high 4 bits of significand of deltaX | |
3380 | extr.u rIndex1Dx = rSignifDx, 59, 4 | |
3381 | } | |
3382 | { .mfi | |
3383 | addl rPolDataPtr= @ltoff(lgammal_1pEps_data),gp | |
3384 | fma.s1 fA0L = fDxSqr, fDxSqr, f0 // deltaX^4 | |
3385 | // sign of GAMMA is positive if p10 is set to 1 | |
3386 | (p10) adds rSgnGam = 1, r0 | |
3387 | } | |
3388 | ;; | |
3389 | { .mfi | |
3390 | shladd GR_ad_z_1 = rIndex1Dx, 2, GR_ad_z_1 // Point to Z_1 | |
3391 | fnma.s1 fResL = fDx, f1, f0 // -(x+1) | |
3392 | // Get high 15 bits of significand | |
3393 | extr.u GR_X_0 = rSignifDx, 49, 15 | |
3394 | } | |
3395 | { .mfi | |
3396 | ld8 rPolDataPtr = [rPolDataPtr] | |
3397 | nop.f 0 | |
3398 | shladd GR_ad_tbl_1 = rIndex1Dx, 4, rTbl1Addr // Point to G_1 | |
3399 | } | |
3400 | ;; | |
3401 | { .mfi | |
3402 | ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1 | |
3403 | nop.f 0 | |
3404 | and GR_N = GR_N, r17Ones // mask sign bit | |
3405 | } | |
3406 | { .mfi | |
3407 | adds rTmpPtr = 8, GR_ad_tbl_1 | |
3408 | nop.f 0 | |
3409 | cmp.eq p6, p7 = 4, rSgnGamSize | |
3410 | } | |
3411 | ;; | |
3412 | { .mfi | |
3413 | ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1 | |
3414 | nop.f 0 | |
3415 | adds rTmpPtr2 = 96, rPolDataPtr | |
3416 | } | |
3417 | { .mfi | |
3418 | ldfd FR_h = [rTmpPtr] // Load h_1 | |
3419 | nop.f 0 | |
3420 | // unbiased exponent of deltaX | |
3421 | sub GR_N = GR_N, rExpHalf, 1 | |
3422 | } | |
3423 | ;; | |
3424 | { .mfi | |
3425 | adds rTmpPtr3 = 192, rPolDataPtr | |
3426 | nop.f 0 | |
3427 | // sign of GAMMA is negative if p11 is set to 1 | |
3428 | (p11) adds rSgnGam = -1, r0 | |
3429 | } | |
3430 | { .mfi | |
3431 | ldfe fA1 = [rPolDataPtr], 16 // A1 | |
3432 | nop.f 0 | |
3433 | nop.i 0 | |
3434 | } | |
3435 | ;; | |
3436 | {.mfi | |
3437 | ldfe fA2 = [rPolDataPtr], 16 // A2 | |
3438 | nop.f 0 | |
3439 | // Get bits 30-15 of X_0 * Z_1 | |
3440 | pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 | |
3441 | } | |
3442 | { .mfi | |
3443 | ldfpd fA20, fA19 = [rTmpPtr2], 16 // P8, P7 | |
3444 | nop.f 0 | |
3445 | nop.i 0 | |
3446 | } | |
3447 | ;; | |
3448 | // | |
3449 | // For performance, don't use result of pmpyshr2.u for 4 cycles. | |
3450 | // | |
3451 | { .mfi | |
3452 | ldfe fA3 = [rPolDataPtr], 16 // A3 | |
3453 | nop.f 0 | |
3454 | nop.i 0 | |
3455 | } | |
3456 | { .mfi | |
3457 | ldfpd fA18, fA17 = [rTmpPtr2], 16 // P6, P5 | |
3458 | nop.f 0 | |
3459 | nop.i 0 | |
3460 | } | |
3461 | ;; | |
3462 | { .mfi | |
3463 | ldfe fA4 = [rPolDataPtr], 16 // A4 | |
3464 | nop.f 0 | |
3465 | nop.i 0 | |
3466 | } | |
3467 | { .mfi | |
3468 | ldfpd fA16, fA15 = [rTmpPtr2], 16 // P4, P3 | |
3469 | nop.f 0 | |
3470 | nop.i 0 | |
3471 | } | |
3472 | ;; | |
3473 | { .mfi | |
3474 | ldfpd fA5L, fA6 = [rPolDataPtr], 16 // A5, A6 | |
3475 | nop.f 0 | |
3476 | nop.i 0 | |
3477 | } | |
3478 | { .mfi | |
3479 | ldfpd fA14, fA13 = [rTmpPtr2], 16 // P2, P1 | |
3480 | nop.f 0 | |
3481 | nop.i 0 | |
3482 | } | |
3483 | ;; | |
3484 | { .mfi | |
3485 | ldfpd fA7, fA8 = [rPolDataPtr], 16 // A7, A8 | |
3486 | nop.f 0 | |
3487 | extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1 | |
3488 | } | |
3489 | { .mfi | |
3490 | ldfe fLnSin2 = [rTmpPtr2], 16 | |
3491 | nop.f 0 | |
3492 | nop.i 0 | |
3493 | } | |
3494 | ;; | |
3495 | { .mfi | |
3496 | shladd GR_ad_z_2 = GR_Index2, 2, rZ2Addr // Point to Z_2 | |
3497 | nop.f 0 | |
3498 | shladd GR_ad_tbl_2 = GR_Index2, 4, rTbl2Addr // Point to G_2 | |
3499 | } | |
3500 | { .mfi | |
3501 | ldfe fLnSin4 = [rTmpPtr2], 32 | |
3502 | nop.f 0 | |
3503 | nop.i 0 | |
3504 | } | |
3505 | ;; | |
3506 | { .mfi | |
3507 | ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2 | |
3508 | nop.f 0 | |
3509 | adds rTmpPtr = 8, GR_ad_tbl_2 | |
3510 | } | |
3511 | { .mfi | |
3512 | // Put integer N into rightmost significand | |
3513 | setf.sig fFloatN = GR_N | |
3514 | nop.f 0 | |
3515 | nop.i 0 | |
3516 | } | |
3517 | ;; | |
3518 | { .mfi | |
3519 | ldfe fLnSin6 = [rTmpPtr3] | |
3520 | nop.f 0 | |
3521 | nop.i 0 | |
3522 | } | |
3523 | { .mfi | |
3524 | ldfe fLnSin8 = [rTmpPtr2] | |
3525 | nop.f 0 | |
3526 | nop.i 0 | |
3527 | } | |
3528 | ;; | |
3529 | { .mfi | |
3530 | ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2 | |
3531 | nop.f 0 | |
3532 | nop.i 0 | |
3533 | } | |
3534 | { .mfi | |
3535 | ldfd FR_h2 = [rTmpPtr] // Load h_2 | |
3536 | nop.f 0 | |
3537 | nop.i 0 | |
3538 | } | |
3539 | ;; | |
3540 | { .mfi | |
3541 | // store signgam if size of variable is 4 bytes | |
3542 | (p6) st4 [rSgnGamAddr] = rSgnGam | |
3543 | fma.s1 fResH = fA20, fResL, fA19 // log(|x|) polynomial: P8*(-deltaX) + P7 | |
3544 | // Get bits 30-15 of X_1 * Z_2 | |
3545 | pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 | |
3546 | } | |
3547 | { .mfi | |
3548 | // store signgam if size of variable is 8 bytes | |
3549 | (p7) st8 [rSgnGamAddr] = rSgnGam | |
3550 | fma.s1 fA2 = fA2, fDx, fA1 // lgammal(|x|) polynomial: A2*deltaX + A1 | |
3551 | nop.i 0 | |
3552 | } | |
3553 | ;; | |
3554 | // | |
3555 | // For performance, don't use result of pmpyshr2.u for 4 cycles. | |
3556 | // | |
3557 | { .mfi | |
3558 | nop.m 0 | |
3559 | fma.s1 fA18 = fA18, fResL, fA17 // log(|x|) polynomial: P6*(-deltaX) + P5 | |
3560 | nop.i 0 | |
3561 | } | |
3562 | ;; | |
3563 | { .mfi | |
3564 | nop.m 0 | |
3565 | fma.s1 fA16 = fA16, fResL, fA15 // log(|x|) polynomial: P4*(-deltaX) + P3 | |
3566 | nop.i 0 | |
3567 | } | |
3568 | { .mfi | |
3569 | nop.m 0 | |
3570 | fma.s1 fA4 = fA4, fDx, fA3 // lgammal(|x|) polynomial: A4*deltaX + A3 | |
3571 | nop.i 0 | |
3572 | } | |
3573 | ;; | |
3574 | { .mfi | |
3575 | nop.m 0 | |
3576 | fma.s1 fA14 = fA14, fResL, fA13 // log(|x|) polynomial: P2*(-deltaX) + P1 | |
3577 | nop.i 0 | |
3578 | } | |
3579 | { .mfi | |
3580 | nop.m 0 | |
3581 | fma.s1 fA6 = fA6, fDx, fA5L // lgammal(|x|) polynomial: A6*deltaX + A5 | |
3582 | nop.i 0 | |
3583 | } | |
3584 | ;; | |
3585 | { .mfi | |
3586 | nop.m 0 | |
3587 | fma.s1 fPol = fA8, fDx, fA7 // lgammal(|x|) polynomial: A8*deltaX + A7 | |
3588 | extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2 | |
3589 | } | |
3590 | ;; | |
3591 | { .mfi | |
3592 | shladd GR_ad_tbl_3 = GR_Index3, 4, rTbl3Addr // Point to G_3 | |
3593 | // low-order terms of lnsin polynomial: LnSin4*deltaX^2 + LnSin2 | |
3594 | fma.s1 fRes3L = fLnSin4, fDxSqr, fLnSin2 | |
3595 | nop.i 0 | |
3596 | } | |
3597 | ;; | |
3598 | { .mfi | |
3599 | ldfps FR_G3, FR_H3 = [GR_ad_tbl_3], 8 // Load G_3, H_3 | |
3600 | fcvt.xf fFloatN = fFloatN // N as FP number | |
3601 | nop.i 0 | |
3602 | } | |
3603 | { .mfi | |
3604 | nop.m 0 | |
3605 | fma.s1 fResH = fResH, fDxSqr, fA18 // High-order terms (P8..P5) of log(|x|) polynomial | |
3606 | nop.i 0 | |
3607 | } | |
3608 | ;; | |
3609 | { .mfi | |
3610 | ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3 | |
3611 | fma.s1 fA4 = fA4, fDxSqr, fA2 // Low-order terms (A4..A1) of lgammal(|x|) polynomial | |
3612 | nop.i 0 | |
3613 | } | |
3614 | { .mfi | |
3615 | nop.m 0 | |
3616 | // high-order terms of lnsin polynomial: LnSin8*deltaX^2 + LnSin6 | |
3617 | fma.s1 fRes3H = fLnSin8, fDxSqr, fLnSin6 | |
3618 | nop.i 0 | |
3619 | } | |
3620 | ;; | |
3621 | { .mfi | |
3622 | nop.m 0 | |
3623 | fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2 | |
3624 | nop.i 0 | |
3625 | } | |
3626 | { .mfi | |
3627 | nop.m 0 | |
3628 | fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2 | |
3629 | nop.i 0 | |
3630 | } | |
3631 | ;; | |
3632 | { .mfi | |
3633 | nop.m 0 | |
3634 | fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2 | |
3635 | nop.i 0 | |
3636 | } | |
3637 | { .mfi | |
3638 | nop.m 0 | |
3639 | fma.s1 fA16 = fA16, fDxSqr, fA14 // Low-order terms (P4..P1) of log(|x|) polynomial | |
3640 | nop.i 0 | |
3641 | } | |
3642 | ;; | |
3643 | { .mfi | |
3644 | nop.m 0 | |
3645 | fma.s1 fPol = fPol, fDxSqr, fA6 // High-order terms (A8..A5) of lgammal(|x|) polynomial | |
3646 | nop.i 0 | |
3647 | } | |
3648 | ;; | |
3649 | { .mfi | |
3650 | nop.m 0 | |
3651 | fma.s1 fResH = fResH, fA0L, fA16 // (log(|x|) + deltaX)/deltaX^2 | |
3652 | nop.i 0 | |
3653 | } | |
3654 | ;; | |
3655 | { .mfi | |
3656 | nop.m 0 | |
3657 | fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3 | |
3658 | nop.i 0 | |
3659 | } | |
3660 | { .mfi | |
3661 | nop.m 0 | |
3662 | fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3 | |
3663 | nop.i 0 | |
3664 | } | |
3665 | ;; | |
3666 | { .mfi | |
3667 | nop.m 0 | |
3668 | fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3 | |
3669 | nop.i 0 | |
3670 | } | |
3671 | ;; | |
3672 | { .mfi | |
3673 | nop.m 0 | |
3674 | fma.s1 fResH = fResH, fDxSqr, fResL // log(|x|) | |
3675 | nop.i 0 | |
3676 | } | |
3677 | { .mfi | |
3678 | nop.m 0 | |
3679 | fma.s1 fPol = fPol, fA0L, fA4 // lgammal(|x|)/deltaX | |
3680 | nop.i 0 | |
3681 | } | |
3682 | ;; | |
3683 | { .mfi | |
3684 | nop.m 0 | |
3685 | fms.s1 FR_r = FR_G, fAbsX, f1 // r = G * S_hi - 1 | |
3686 | nop.i 0 | |
3687 | } | |
3688 | { .mfi | |
3689 | nop.m 0 | |
3690 | // high part of log(deltaX)= Y_hi = N * log2_hi + H | |
3691 | fma.s1 fRes4H = fFloatN, FR_log2_hi, FR_H | |
3692 | nop.i 0 | |
3693 | } | |
3694 | ;; | |
3695 | { .mfi | |
3696 | nop.m 0 | |
3697 | // h = N * log2_lo + h | |
3698 | fma.s1 FR_h = fFloatN, FR_log2_lo, FR_h | |
3699 | nop.i 0 | |
3700 | } | |
3701 | ;; | |
3702 | { .mfi | |
3703 | nop.m 0 | |
3704 | fma.s1 fResH = fPol, fDx, fResH // lgammal(|x|) + log(|x|) | |
3705 | nop.i 0 | |
3706 | } | |
3707 | { .mfi | |
3708 | nop.m 0 | |
3709 | // lnsin/deltaX^2 | |
3710 | fma.s1 fRes3H = fRes3H, fA0L, fRes3L | |
3711 | nop.i 0 | |
3712 | } | |
3713 | ;; | |
3714 | { .mfi | |
3715 | nop.m 0 | |
3716 | // poly_lo = r * Q4 + Q3 | |
3717 | fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 | |
3718 | nop.i 0 | |
3719 | } | |
3720 | { .mfi | |
3721 | nop.m 0 | |
3722 | fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r | |
3723 | nop.i 0 | |
3724 | } | |
3725 | ;; | |
3726 | { .mfi | |
3727 | nop.m 0 | |
3728 | // lnSin - log(|x|) - lgammal(|x|) | |
3729 | fms.s1 fResH = fRes3H, fDxSqr, fResH | |
3730 | nop.i 0 | |
3731 | } | |
3732 | ;; | |
3733 | |
3734 | { .mfi | |
3735 | nop.m 0 | |
3736 | // poly_lo = poly_lo * r + Q2 | |
3737 | fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 | |
3738 | nop.i 0 | |
3739 | } | |
3740 | { .mfi | |
3741 | nop.m 0 | |
3742 | fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3 | |
3743 | nop.i 0 | |
3744 | } | |
3745 | ;; | |
3746 | |
3747 | { .mfi | |
3748 | nop.m 0 | |
3749 | // poly_hi = Q1 * rsq + r | |
3750 | fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r | |
3751 | nop.i 0 | |
3752 | } | |
3753 | ;; | |
3754 | |
3755 | { .mfi | |
3756 | nop.m 0 | |
3757 | // poly_lo = poly_lo*r^3 + h | |
3758 | fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h | |
3759 | nop.i 0 | |
3760 | } | |
3761 | ;; | |
3762 | |
3763 | { .mfi | |
3764 | nop.m 0 | |
3765 | // low part of log(|deltaX|) = Y_lo = poly_hi + poly_lo | |
3766 | fadd.s1 fRes4L = FR_poly_hi, FR_poly_lo | |
3767 | nop.i 0 | |
3768 | } | |
3769 | ;; | |
3770 | { .mfi | |
3771 | nop.m 0 | |
3772 | fsub.s1 fResH = fResH, fRes4L | |
3773 | nop.i 0 | |
3774 | } | |
3775 | ;; | |
3776 | { .mfb | |
3777 | nop.m 0 | |
3778 | // final result for |x+1|< 2^(-7) path | |
3779 | fsub.s0 f8 = fResH, fRes4H | |
3780 | // exit for |x+1|< 2^(-7) path | |
3781 | br.ret.sptk b0 | |
3782 | } | |
3783 | ;; | |
3784 | ||
3785 | ||
3786 | // here if -2^63 < x < -6.0 and x is not an integer | |
3787 | // Also we are going to filter out cases when x falls in | |
3788 | // range which is "close enough" to negative root. This case | |
3789 | // may occur only for -19.5 < x since other roots of lgamma are | |
3790 | // insignificant from double extended point of view (they are closer | |
3791 | // to RTN(x) than one ulp(x)). | |
3792 | .align 32 | |
3793 | _negStirling: | |
3794 | { .mfi | |
3795 | ldfe fLnSin6 = [rLnSinDataPtr], 32 | |
3796 | fnma.s1 fInvX = f8, fRcpX, f1 // start of 3rd NR iteration | |
3797 | // Get high 4 bits of significand of deltaX | |
3798 | extr.u rIndex1Dx = rSignifDx, 59, 4 | |
3799 | } | |
3800 | { .mfi | |
3801 | ldfe fLnSin8 = [rTmpPtr3], 32 | |
3802 | fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2 | |
3803 | (p12) cmp.ltu.unc p6, p0 = rSignifX, rLeftBound | |
3804 | } | |
3805 | ;; | |
3806 | { .mfi | |
3807 | ldfe fLnSin10 = [rLnSinDataPtr], 32 | |
3808 | fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3 | |
3809 | // Get high 15 bits of significand | |
3810 | extr.u GR_X_0 = rSignifDx, 49, 15 | |
3811 | } | |
3812 | { .mfi | |
3813 | shladd GR_ad_z_1 = rIndex1Dx, 2, GR_ad_z_1 // Point to Z_1 | |
3814 | fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3 | |
3815 | // set p6 if x falls in "near root" range | |
3816 | (p6) cmp.geu.unc p6, p0 = rSignifX, rRightBound | |
3817 | } | |
3818 | ;; | |
3819 | { .mfi | |
3820 | getf.exp GR_N = fDx // Get N = exponent of x | |
3821 | fma.s1 fDx4 = fDxSqr, fDxSqr, f0 // deltaX^4 | |
3822 | adds rTmpPtr = 96, rBernulliPtr | |
3823 | } | |
3824 | { .mfb | |
3825 | ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1 | |
3826 | fma.s1 fLnSin34 = fLnSin34, fDxSqr, fLnSin32 | |
3827 | // branch to special path if x falls in "near root" range | |
3828 | (p6) br.cond.spnt _negRoots | |
3829 | } | |
3830 | ;; | |
3831 | .pred.rel "mutex",p10,p11 | |
3832 | { .mfi | |
3833 | ldfe fLnSin12 = [rTmpPtr3] | |
3834 | fma.s1 fLnSin26 = fLnSin26, fDxSqr, fLnSin24 | |
3835 | (p10) cmp.eq p8, p9 = rXRnd, r0 | |
3836 | } | |
3837 | { .mfi | |
3838 | ldfe fLnSin14 = [rLnSinDataPtr] | |
3839 | fma.s1 fLnSin30 = fLnSin30, fDxSqr, fLnSin28 | |
3840 | (p11) cmp.eq p9, p8 = rXRnd, r0 | |
3841 | } | |
3842 | ;; | |
3843 | { .mfi | |
3844 | ldfpd fB2, fB2L = [rBernulliPtr], 16 | |
3845 | fma.s1 fLnSin18 = fLnSin18, fDxSqr, fLnSin16 | |
3846 | shladd GR_ad_tbl_1 = rIndex1Dx, 4, rTbl1Addr // Point to G_1 | |
3847 | ||
3848 | } | |
3849 | { .mfi | |
3850 | ldfe fB14 = [rTmpPtr], 16 | |
3851 | fma.s1 fLnSin22 = fLnSin22, fDxSqr, fLnSin20 | |
3852 | and GR_N = GR_N, r17Ones // mask sign bit | |
3853 | } | |
3854 | ;; | |
3855 | { .mfi | |
3856 | ldfe fB4 = [rBernulliPtr], 16 | |
3857 | fma.s1 fInvX = fInvX, fRcpX, fRcpX // end of 3rd NR iteration | |
3858 | // Get bits 30-15 of X_0 * Z_1 | |
3859 | pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 | |
3860 | } | |
3861 | { .mfi | |
3862 | ldfe fB16 = [rTmpPtr], 16 | |
3863 | fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3 | |
3864 | adds rTmpPtr2 = 8, GR_ad_tbl_1 | |
3865 | } | |
3866 | ;; | |
3867 | // | |
3868 | // For performance, don't use result of pmpyshr2.u for 4 cycles. | |
3869 | // | |
3870 | { .mfi | |
3871 | ldfe fB6 = [rBernulliPtr], 16 | |
3872 | fms.s1 FR_r = FR_G, fSignifX, f1 // r = G * S_hi - 1 | |
3873 | adds rTmpPtr3 = -48, rTmpPtr | |
3874 | } | |
3875 | { .mfi | |
3876 | ldfe fB18 = [rTmpPtr], 16 | |
3877 | // High part of the log(|x|) = Y_hi = N * log2_hi + H | |
3878 | fma.s1 fResH = fFloatN, FR_log2_hi, FR_H | |
3879 | sub GR_N = GR_N, rExpHalf, 1 // unbiased exponent of deltaX | |
3880 | } | |
3881 | ;; | |
3882 | .pred.rel "mutex",p8,p9 | |
3883 | { .mfi | |
3884 | ldfe fB8 = [rBernulliPtr], 16 | |
3885 | fma.s1 fLnSin36 = fLnSin36, fDx4, fLnSin34 | |
3886 | // sign of GAMMA(x) is negative | |
3887 | (p8) adds rSgnGam = -1, r0 | |
3888 | } | |
3889 | { .mfi | |
3890 | ldfe fB20 = [rTmpPtr], -160 | |
3891 | fma.s1 fRes5H = fLnSin4, fDxSqr, f0 | |
3892 | // sign of GAMMA(x) is positive | |
3893 | (p9) adds rSgnGam = 1, r0 | |
3894 | ||
3895 | } | |
3896 | ;; | |
3897 | { .mfi | |
3898 | ldfe fB10 = [rBernulliPtr], 16 | |
3899 | fma.s1 fLnSin30 = fLnSin30, fDx4, fLnSin26 | |
3900 | (p14) adds rTmpPtr = -160, rTmpPtr | |
3901 | } | |
3902 | { .mfi | |
3903 | ldfe fB12 = [rTmpPtr3], 16 | |
3904 | fma.s1 fDx8 = fDx4, fDx4, f0 // deltaX^8 | |
3905 | cmp.eq p6, p7 = 4, rSgnGamSize | |
3906 | } | |
3907 | ;; | |
3908 | { .mfi | |
3909 | ldfps fGDx, fHDx = [GR_ad_tbl_1], 8 // Load G_1, H_1 | |
3910 | fma.s1 fDx6 = fDx4, fDxSqr, f0 // deltaX^6 | |
3911 | extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1 | |
3912 | } | |
3913 | { .mfi | |
3914 | ldfd fhDx = [rTmpPtr2] // Load h_1 | |
3915 | fma.s1 fLnSin22 = fLnSin22, fDx4, fLnSin18 | |
3916 | nop.i 0 | |
3917 | } | |
3918 | ;; | |
3919 | { .mfi | |
3920 | // Load two parts of C | |
3921 | ldfpd fRes1H, fRes1L = [rTmpPtr], 16 | |
3922 | fma.s1 fRcpX = fInvX, fInvX, f0 // (1/x)^2 | |
3923 | shladd GR_ad_tbl_2 = GR_Index2, 4, rTbl2Addr // Point to G_2 | |
3924 | } | |
3925 | { .mfi | |
3926 | shladd GR_ad_z_2 = GR_Index2, 2, rZ2Addr // Point to Z_2 | |
3927 | fma.s1 FR_h = fFloatN, FR_log2_lo, FR_h// h = N * log2_lo + h | |
3928 | nop.i 0 | |
3929 | } | |
3930 | ;; | |
3931 | { .mfi | |
3932 | ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2 | |
3933 | fnma.s1 fInvXL = f8, fInvX, f1 // relative error of 1/x | |
3934 | nop.i 0 | |
3935 | } | |
3936 | { .mfi | |
3937 | adds rTmpPtr2 = 8, GR_ad_tbl_2 | |
3938 | fma.s1 fLnSin8 = fLnSin8, fDxSqr, fLnSin6 | |
3939 | nop.i 0 | |
3940 | } | |
3941 | ;; | |
3942 | { .mfi | |
3943 | ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2 | |
3944 | // poly_lo = r * Q4 + Q3 | |
3945 | fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 | |
3946 | nop.i 0 | |
3947 | } | |
3948 | { .mfi | |
3949 | ldfd fh2Dx = [rTmpPtr2] // Load h_2 | |
3950 | fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r | |
3951 | nop.i 0 | |
3952 | } | |
3953 | ;; | |
3954 | { .mfi | |
3955 | nop.m 0 | |
3956 | fma.s1 fA1L = fB2, fInvX, f0 // (B2*(1/x))hi | |
3957 | nop.i 0 | |
3958 | } | |
3959 | { .mfi | |
3960 | // Put integer N into rightmost significand | |
3961 | setf.sig fFloatNDx = GR_N | |
3962 | fms.s1 fRes4H = fResH, f1, f1 // ln(|x|)hi - 1 | |
3963 | nop.i 0 | |
3964 | } | |
3965 | ;; | |
3966 | { .mfi | |
3967 | nop.m 0 | |
3968 | fadd.s1 fRes2H = fRes5H, fLnSin2//(lnSin4*DeltaX^2 + lnSin2)hi | |
3969 | // Get bits 30-15 of X_1 * Z_2 | |
3970 | pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 | |
3971 | } | |
3972 | { .mfi | |
3973 | nop.m 0 | |
3974 | fms.s1 fRes5L = fLnSin4, fDxSqr, fRes5H | |
3975 | nop.i 0 | |
3976 | } | |
3977 | ;; | |
3978 | // | |
3979 | // For performance, don't use result of pmpyshr2.u for 4 cycles. | |
3980 | // | |
3981 | { .mfi | |
3982 | nop.m 0 | |
3983 | fma.s1 fInvX4 = fRcpX, fRcpX, f0 // (1/x)^4 | |
3984 | nop.i 0 | |
3985 | } | |
3986 | { .mfi | |
3987 | nop.m 0 | |
3988 | fma.s1 fB6 = fB6, fRcpX, fB4 | |
3989 | nop.i 0 | |
3990 | } | |
3991 | ;; | |
3992 | { .mfi | |
3993 | // store signgam if size of variable is 4 bytes | |
3994 | (p6) st4 [rSgnGamAddr] = rSgnGam | |
3995 | fma.s1 fB18 = fB18, fRcpX, fB16 | |
3996 | nop.i 0 | |
3997 | } | |
3998 | { .mfi | |
3999 | // store signgam if size of variable is 8 bytes | |
4000 | (p7) st8 [rSgnGamAddr] = rSgnGam | |
4001 | fma.s1 fInvXL = fInvXL, fInvX, f0 // low part of 1/x | |
4002 | nop.i 0 | |
4003 | } | |
4004 | ;; | |
4005 | { .mfi | |
4006 | nop.m 0 | |
4007 | // poly_lo = poly_lo * r + Q2 | |
4008 | fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 | |
4009 | nop.i 0 | |
4010 | } | |
4011 | { .mfi | |
4012 | nop.m 0 | |
4013 | fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3 | |
4014 | nop.i 0 | |
4015 | } | |
4016 | ;; | |
4017 | { .mfi | |
4018 | nop.m 0 | |
4019 | fma.s1 fRes3H = fRes4H, f8, f0 // (-|x|*(ln(|x|)-1))hi | |
4020 | extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2 | |
4021 | } | |
4022 | { .mfi | |
4023 | nop.m 0 | |
4024 | // poly_hi = Q1 * rsq + r | |
4025 | fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r | |
4026 | nop.i 0 | |
4027 | } | |
4028 | ;; | |
4029 | { .mfi | |
4030 | shladd GR_ad_tbl_3 = GR_Index3, 4, rTbl3Addr // Point to G_3 | |
4031 | fms.s1 fA2L = fB2, fInvX, fA1L // delta(B2*(1/x)) | |
4032 | nop.i 0 | |
4033 | } | |
4034 | { .mfi | |
4035 | nop.m 0 | |
4036 | fnma.s1 fBrnH = fRes1H, f1, fA1L // (-C - S(1/x))hi | |
4037 | nop.i 0 | |
4038 | } | |
4039 | ;; | |
4040 | { .mfi | |
4041 | ldfps fG3Dx, fH3Dx = [GR_ad_tbl_3],8 // Load G_3, H_3 | |
4042 | fma.s1 fInvX8 = fInvX4, fInvX4, f0 // (1/x)^8 | |
4043 | nop.i 0 | |
4044 | } | |
4045 | { .mfi | |
4046 | nop.m 0 | |
4047 | fma.s1 fB10 = fB10, fRcpX, fB8 | |
4048 | nop.i 0 | |
4049 | } | |
4050 | ;; | |
4051 | ||
4052 | { .mfi | |
4053 | ldfd fh3Dx = [GR_ad_tbl_3] // Load h_3 | |
4054 | fma.s1 fB20 = fB20, fInvX4, fB18 | |
4055 | nop.i 0 | |
4056 | } | |
4057 | { .mfi | |
4058 | nop.m 0 | |
4059 | fma.s1 fB14 = fB14, fRcpX, fB12 | |
4060 | nop.i 0 | |
4061 | } | |
4062 | ;; | |
4063 | { .mfi | |
4064 | nop.m 0 | |
4065 | fma.s1 fLnSin36 = fLnSin36, fDx8, fLnSin30 | |
4066 | nop.i 0 | |
4067 | } | |
4068 | { .mfi | |
4069 | nop.m 0 | |
4070 | fma.s1 fLnSin12 = fLnSin12, fDxSqr, fLnSin10 | |
4071 | nop.i 0 | |
4072 | } | |
4073 | ;; | |
4074 | { .mfi | |
4075 | nop.m 0 | |
4076 | fsub.s1 fRes2L = fLnSin2, fRes2H | |
4077 | nop.i 0 | |
4078 | } | |
4079 | { .mfi | |
4080 | nop.m 0 | |
4081 | fma.s1 fPol = fRes2H, fDxSqr, f0 // high part of LnSin | |
4082 | nop.i 0 | |
4083 | } | |
4084 | ;; | |
4085 | { .mfi | |
4086 | nop.m 0 | |
4087 | fnma.s1 fResH = fResH, FR_MHalf, fResH // -0.5*ln(|x|)hi | |
4088 | nop.i 0 | |
4089 | } | |
4090 | { .mfi | |
4091 | nop.m 0 | |
4092 | fmpy.s1 fGDx = fGDx, FR_G2 // G = G_1 * G_2 | |
4093 | nop.i 0 | |
4094 | } | |
4095 | ;; | |
4096 | { .mfi | |
4097 | nop.m 0 | |
4098 | // poly_lo = poly_lo*r^3 + h | |
4099 | fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h | |
4100 | nop.i 0 | |
4101 | } | |
4102 | { .mfi | |
4103 | nop.m 0 | |
4104 | // B2lo*(1/x)hi+ delta(B2*(1/x)) | |
4105 | fma.s1 fA2L = fB2L, fInvX, fA2L | |
4106 | nop.i 0 | |
4107 | } | |
4108 | ;; | |
4109 | { .mfi | |
4110 | nop.m 0 | |
4111 | fma.s1 fB20 = fB20, fInvX4, fB14 | |
4112 | nop.i 0 | |
4113 | } | |
4114 | { .mfi | |
4115 | nop.m 0 | |
4116 | fma.s1 fB10 = fB10, fInvX4, fB6 | |
4117 | nop.i 0 | |
4118 | } | |
4119 | ;; | |
4120 | { .mfi | |
4121 | nop.m 0 | |
4122 | fcvt.xf fFloatNDx = fFloatNDx | |
4123 | nop.i 0 | |
4124 | } | |
4125 | { .mfi | |
4126 | nop.m 0 | |
4127 | fma.s1 fLnSin14 = fLnSin14, fDx4, fLnSin12 | |
4128 | nop.i 0 | |
4129 | } | |
4130 | ;; | |
4131 | { .mfi | |
4132 | nop.m 0 | |
4133 | fma.s1 fLnSin36 = fLnSin36, fDx8, fLnSin22 | |
4134 | nop.i 0 | |
4135 | } | |
4136 | { .mfi | |
4137 | nop.m 0 | |
4138 | fms.s1 fRes3L = fRes4H, f8, fRes3H // delta(-|x|*(ln(|x|)-1)) | |
4139 | nop.i 0 | |
4140 | } | |
4141 | ;; | |
4142 | { .mfi | |
4143 | nop.m 0 | |
4144 | fmpy.s1 fGDx = fGDx, fG3Dx // G = (G_1 * G_2) * G_3 | |
4145 | nop.i 0 | |
4146 | } | |
4147 | { .mfi | |
4148 | nop.m 0 | |
4149 | // (-|x|*(ln(|x|)-1) - 0.5ln(|x|))hi | |
4150 | fadd.s1 fRes4H = fRes3H, fResH | |
4151 | nop.i 0 | |
4152 | } | |
4153 | ;; | |
4154 | { .mfi | |
4155 | nop.m 0 | |
4156 | fma.s1 fA2L = fInvXL, fB2, fA2L //(B2*(1/x))lo | |
4157 | nop.i 0 | |
4158 | } | |
4159 | { .mfi | |
4160 | nop.m 0 | |
4161 | // low part of log(|x|) = Y_lo = poly_hi + poly_lo | |
4162 | fadd.s1 fResL = FR_poly_hi, FR_poly_lo | |
4163 | nop.i 0 | |
4164 | } | |
4165 | ;; | |
4166 | { .mfi | |
4167 | nop.m 0 | |
4168 | fma.s1 fB20 = fB20, fInvX8, fB10 | |
4169 | nop.i 0 | |
4170 | } | |
4171 | { .mfi | |
4172 | nop.m 0 | |
4173 | fma.s1 fInvX3 = fInvX, fRcpX, f0 // (1/x)^3 | |
4174 | nop.i 0 | |
4175 | } | |
4176 | ;; | |
4177 | { .mfi | |
4178 | nop.m 0 | |
4179 | fadd.s1 fHDx = fHDx, FR_H2 // H = H_1 + H_2 | |
4180 | nop.i 0 | |
4181 | } | |
4182 | { .mfi | |
4183 | nop.m 0 | |
4184 | fadd.s1 fRes5L = fRes5L, fLnSin2L | |
4185 | nop.i 0 | |
4186 | } | |
4187 | ;; | |
4188 | { .mfi | |
4189 | nop.m 0 | |
4190 | fadd.s1 fRes2L = fRes2L, fRes5H | |
4191 | nop.i 0 | |
4192 | } | |
4193 | { .mfi | |
4194 | nop.m 0 | |
4195 | fadd.s1 fhDx = fhDx, fh2Dx // h = h_1 + h_2 | |
4196 | nop.i 0 | |
4197 | } | |
4198 | ;; | |
4199 | { .mfi | |
4200 | nop.m 0 | |
4201 | fms.s1 fBrnL = fRes1H, fMOne, fBrnH | |
4202 | nop.i 0 | |
4203 | } | |
4204 | { .mfi | |
4205 | nop.m 0 | |
4206 | fms.s1 FR_r = fGDx, fNormDx, f1 // r = G * S_hi - 1 | |
4207 | nop.i 0 | |
4208 | } | |
4209 | ;; | |
4210 | { .mfi | |
4211 | nop.m 0 | |
4212 | fma.s1 fRes3L = fResL, f8 , fRes3L // (-|x|*(ln(|x|)-1))lo | |
4213 | nop.i 0 | |
4214 | } | |
4215 | { .mfi | |
4216 | nop.m 0 | |
4217 | fsub.s1 fRes4L = fRes3H, fRes4H | |
4218 | nop.i 0 | |
4219 | } | |
4220 | ;; | |
4221 | { .mfi | |
4222 | nop.m 0 | |
4223 | // low part of "Bernoulli" polynomial | |
4224 | fma.s1 fB20 = fB20, fInvX3, fA2L | |
4225 | nop.i 0 | |
4226 | } | |
4227 | { .mfi | |
4228 | nop.m 0 | |
4229 | fnma.s1 fResL = fResL, FR_MHalf, fResL // -0.5*ln(|x|)lo | |
4230 | nop.i 0 | |
4231 | } | |
4232 | ;; | |
4233 | { .mfi | |
4234 | nop.m 0 | |
4235 | fadd.s1 fHDx = fHDx, fH3Dx // H = (H_1 + H_2) + H_3 | |
4236 | nop.i 0 | |
4237 | } | |
4238 | { .mfi | |
4239 | nop.m 0 | |
4240 | fms.s1 fPolL = fRes2H, fDxSqr, fPol | |
4241 | nop.i 0 | |
4242 | } | |
4243 | ;; | |
4244 | { .mfi | |
4245 | nop.m 0 | |
4246 | fadd.s1 fhDx = fhDx, fh3Dx // h = (h_1 + h_2) + h_3 | |
4247 | nop.i 0 | |
4248 | } | |
4249 | { .mfi | |
4250 | nop.m 0 | |
4251 | // (-|x|*(ln(|x|)-1) - 0.5ln(|x|) - C - S(1/x))hi | |
4252 | fadd.s1 fB14 = fRes4H, fBrnH | |
4253 | nop.i 0 | |
4254 | } | |
4255 | ;; | |
4256 | { .mfi | |
4257 | nop.m 0 | |
4258 | // poly_lo = r * Q4 + Q3 | |
4259 | fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 | |
4260 | nop.i 0 | |
4261 | } | |
4262 | { .mfi | |
4263 | nop.m 0 | |
4264 | fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r | |
4265 | nop.i 0 | |
4266 | } | |
4267 | ;; | |
4268 | { .mfi | |
4269 | nop.m 0 | |
4270 | fadd.s1 fRes4L = fRes4L, fResH | |
4271 | nop.i 0 | |
4272 | } | |
4273 | { .mfi | |
4274 | nop.m 0 | |
4275 | fadd.s1 fBrnL = fBrnL, fA1L | |
4276 | nop.i 0 | |
4277 | } | |
4278 | ;; | |
4279 | { .mfi | |
4280 | nop.m 0 | |
4281 | // (-|x|*(ln(|x|)-1))lo + (-0.5ln(|x|))lo | |
4282 | fadd.s1 fRes3L = fRes3L, fResL | |
4283 | nop.i 0 | |
4284 | } | |
4285 | { .mfi | |
4286 | nop.m 0 | |
4287 | fnma.s1 fB20 = fRes1L, f1, fB20 // -Clo - S(1/x)lo | |
4288 | nop.i 0 | |
4289 | } | |
4290 | ;; | |
4291 | { .mfi | |
4292 | nop.m 0 | |
4293 | fadd.s1 fRes2L = fRes2L, fRes5L // (lnSin4*DeltaX^2 + lnSin2)lo | |
4294 | nop.i 0 | |
4295 | } | |
4296 | { .mfi | |
4297 | nop.m 0 | |
4298 | fma.s1 fPolL = fDxSqrL, fRes2H, fPolL | |
4299 | nop.i 0 | |
4300 | } | |
4301 | ;; | |
4302 | { .mfi | |
4303 | nop.m 0 | |
4304 | fma.s1 fLnSin14 = fLnSin14, fDx4, fLnSin8 | |
4305 | nop.i 0 | |
4306 | } | |
4307 | { .mfi | |
4308 | nop.m 0 | |
4309 | fma.s1 fLnSin36 = fLnSin36, fDx8, f0 | |
4310 | nop.i 0 | |
4311 | } | |
4312 | ;; | |
4313 | { .mfi | |
4314 | nop.m 0 | |
4315 | // poly_lo = poly_lo * r + Q2 | |
4316 | fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 | |
4317 | nop.i 0 | |
4318 | } | |
4319 | { .mfi | |
4320 | nop.m 0 | |
4321 | fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3 | |
4322 | nop.i 0 | |
4323 | } | |
4324 | ;; | |
4325 | { .mfi | |
4326 | nop.m 0 | |
4327 | // poly_hi = Q1 * rsq + r | |
4328 | fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r | |
4329 | nop.i 0 | |
4330 | } | |
4331 | { .mfi | |
4332 | nop.m 0 | |
4333 | fsub.s1 fB12 = fRes4H, fB14 | |
4334 | nop.i 0 | |
4335 | } | |
4336 | ;; | |
4337 | { .mfi | |
4338 | nop.m 0 | |
4339 | // (-|x|*(ln(|x|)-1) - 0.5ln(|x|))lo | |
4340 | fadd.s1 fRes4L = fRes4L, fRes3L | |
4341 | nop.i 0 | |
4342 | } | |
4343 | { .mfi | |
4344 | nop.m 0 | |
4345 | fadd.s1 fBrnL = fBrnL, fB20 // (-C - S(1/x))lo | |
4346 | nop.i 0 | |
4347 | } | |
4348 | ;; | |
4349 | { .mfi | |
4350 | nop.m 0 | |
4351 | // high part of log(|DeltaX|) = Y_hi = N * log2_hi + H | |
4352 | fma.s1 fLnDeltaH = fFloatNDx, FR_log2_hi, fHDx | |
4353 | nop.i 0 | |
4354 | } | |
4355 | { .mfi | |
4356 | nop.m 0 | |
4357 | // h = N * log2_lo + h | |
4358 | fma.s1 fhDx = fFloatNDx, FR_log2_lo, fhDx | |
4359 | nop.i 0 | |
4360 | } | |
4361 | ;; | |
4362 | { .mfi | |
4363 | nop.m 0 | |
4364 | fma.s1 fPolL = fRes2L, fDxSqr, fPolL | |
4365 | nop.i 0 | |
4366 | } | |
4367 | { .mfi | |
4368 | nop.m 0 | |
4369 | fma.s1 fLnSin14 = fLnSin36, fDxSqr, fLnSin14 | |
4370 | nop.i 0 | |
4371 | } | |
4372 | ;; | |
4373 | { .mfi | |
4374 | nop.m 0 | |
4375 | // (-|x|*(ln(|x|)-1) - 0.5ln(|x|))lo + (- C - S(1/x))lo | |
4376 | fadd.s1 fBrnL = fBrnL, fRes4L | |
4377 | nop.i 0 | |
4378 | } | |
4379 | { .mfi | |
4380 | nop.m 0 | |
4381 | fadd.s1 fB12 = fB12, fBrnH | |
4382 | nop.i 0 | |
4383 | } | |
4384 | ;; | |
4385 | { .mfi | |
4386 | nop.m 0 | |
4387 | // poly_lo = poly_lo*r^3 + h | |
4388 | fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, fhDx | |
4389 | nop.i 0 | |
4390 | } | |
4391 | { .mfi | |
4392 | nop.m 0 | |
4393 | fnma.s1 fRes1H = fLnDeltaH, f1, fPol//(-ln(|DeltaX|) + LnSin)hi | |
4394 | nop.i 0 | |
4395 | } | |
4396 | ;; | |
4397 | { .mfi | |
4398 | nop.m 0 | |
4399 | fma.s1 fPolL = fDxSqrL, fRes2L, fPolL | |
4400 | nop.i 0 | |
4401 | } | |
4402 | { .mfi | |
4403 | nop.m 0 | |
4404 | fma.s1 fLnSin36 = fLnSin14, fDx6, f0 | |
4405 | nop.i 0 | |
4406 | } | |
4407 | ;; | |
4408 | { .mfi | |
4409 | nop.m 0 | |
4410 | // (-|x|*(ln(|x|)-1) - 0.5ln(|x|) - C - S(1/x))lo | |
4411 | fadd.s1 fB12 = fB12, fBrnL | |
4412 | nop.i 0 | |
4413 | } | |
4414 | ;; | |
4415 | { .mfi | |
4416 | nop.m 0 | |
4417 | // low part of log(|DeltaX|) = Y_lo = poly_hi + poly_lo | |
4418 | fadd.s1 fLnDeltaL= FR_poly_hi, FR_poly_lo | |
4419 | nop.i 0 | |
4420 | } | |
4421 | { .mfi | |
4422 | nop.m 0 | |
4423 | fms.s1 fRes1L = fLnDeltaH, fMOne, fRes1H | |
4424 | nop.i 0 | |
4425 | } | |
4426 | ;; | |
4427 | { .mfi | |
4428 | nop.m 0 | |
4429 | fadd.s1 fPolL = fPolL, fLnSin36 | |
4430 | nop.i 0 | |
4431 | } | |
4432 | { .mfi | |
4433 | nop.m 0 | |
4434 | //(-|x|*(ln(|x|)-1)-0.5ln(|x|) - C - S(1/x))hi + (-ln(|DeltaX|) + LnSin)hi | |
4435 | fadd.s1 f8 = fRes1H, fB14 | |
4436 | nop.i 0 | |
4437 | } | |
4438 | ;; | |
4439 | { .mfi | |
4440 | nop.m 0 | |
4441 | //max((-|x|*(ln(|x|)-1)-0.5ln(|x|) - C - S(1/x))hi, | |
4442 | // (-ln(|DeltaX|) + LnSin)hi) | |
4443 | famax.s1 fMaxNegStir = fRes1H, fB14 | |
4444 | nop.i 0 | |
4445 | } | |
4446 | { .mfi | |
4447 | nop.m 0 | |
4448 | //min((-|x|*(ln(|x|)-1)-0.5ln(|x|) - C - S(1/x))hi, | |
4449 | // (-ln(|DeltaX|) + LnSin)hi) | |
4450 | famin.s1 fMinNegStir = fRes1H, fB14 | |
4451 | nop.i 0 | |
4452 | } | |
4453 | ;; | |
4454 | { .mfi | |
4455 | nop.m 0 | |
4456 | fadd.s1 fRes1L = fRes1L, fPol | |
4457 | nop.i 0 | |
4458 | } | |
4459 | { .mfi | |
4460 | nop.m 0 | |
4461 | // (-ln(|DeltaX|))lo + (LnSin)lo | |
4462 | fnma.s1 fPolL = fLnDeltaL, f1, fPolL | |
4463 | nop.i 0 | |
4464 | } | |
4465 | ;; | |
4466 | { .mfi | |
4467 | nop.m 0 | |
4468 | fsub.s1 f9 = fMaxNegStir, f8 // delta1 | |
4469 | nop.i 0 | |
4470 | } | |
4471 | ;; | |
4472 | { .mfi | |
4473 | nop.m 0 | |
4474 | fadd.s1 fRes1L = fRes1L, fPolL // (-ln(|DeltaX|) + LnSin)lo | |
4475 | nop.i 0 | |
4476 | } | |
4477 | ;; | |
4478 | { .mfi | |
4479 | nop.m 0 | |
4480 | fadd.s1 f9 = f9, fMinNegStir | |
4481 | nop.i 0 | |
4482 | } | |
4483 | ;; | |
4484 | { .mfi | |
4485 | nop.m 0 | |
4486 | fadd.s1 fRes1L = fRes1L, fB12 | |
4487 | nop.i 0 | |
4488 | } | |
4489 | ;; | |
4490 | { .mfi | |
4491 | // low part of the result | |
4492 | fadd.s1 f9 = f9, fRes1L | |
4493 | nop.i 0 | |
4494 | } | |
4495 | ;; | |
4496 | { .mfb | |
4497 | nop.m 0 | |
4498 | // final result for -2^63 < x < -6.0 path | |
4499 | fma.s0 f8 = f8, f1, f9 | |
4500 | // exit here for -2^63 < x < -6.0 path | |
4501 | br.ret.sptk b0 | |
4502 | } | |
4503 | ;; | |
4504 | ||
4505 | // _negRoots: reached when x falls in the neighbourhood of a negative root of lgammal(x). | |
4506 | // "Neighbourhood" typically means that |lgammal(x)| < 0.17; on the [-3.0,-2.0] range | |
4507 | // |lgammal(x)| has even smaller magnitude. | |
4508 | // In: f8 = x, rXint = index of the root, p10 set for a "right" root, p11 set for a "left" root; | |
4509 | // the code also reads rRootsBndAddr, rXRnd, rSgnGamSize and rSgnGamAddr (set up before this label). | |
4510 | // Out: f8 = lgammal(x); the sign of GAMMA(x) (+1 or -1) is stored to [rSgnGamAddr]; returns through b0. | |
4511 | // Method: lgammal(x) is approximated by a polynomial of 19th degree in (x - root), | |
4512 | // A0 + A1*(x-root) + ... + A19*(x-root)^19; A0..A2 are handled as hi/lo pairs, the A3..A19 tail | |
4513 | // is evaluated with plain fma.s1 and folded into the low part. Each root owns a 208-byte coefficient entry. | |
4514 | .align 32 | |
4515 | _negRoots: | |
4516 | { .mfi | |
4517 | addl rPolDataPtr= @ltoff(lgammal_right_roots_polynomial_data),gp | |
4518 | nop.f 0 | |
4519 | shl rTmpPtr2 = rXint, 7 // i*128 = (i*16)*8, where i = rXint | |
4520 | } | |
4521 | { .mfi | |
4522 | adds rRootsAddr = -288, rRootsBndAddr | |
4523 | nop.f 0 | |
4524 | nop.i 0 | |
4525 | } | |
4526 | ;; | |
4527 | { .mfi | |
4528 | ldfe fRoot = [rRootsAddr] // FP representation of root, read from rRootsBndAddr - 288 | |
4529 | nop.f 0 | |
4530 | shl rTmpPtr = rXint, 6 // i*64 = (i*16)*4 | |
4531 | } | |
4532 | { .mfi | |
4533 | (p11) adds rTmpPtr2 = 3536, rTmpPtr2 | |
4534 | nop.f 0 | |
4535 | nop.i 0 | |
4536 | } | |
4537 | ;; | |
4538 | { .mfi | |
4539 | ld8 rPolDataPtr = [rPolDataPtr] | |
4540 | nop.f 0 | |
4541 | shladd rTmpPtr = rXint, 4, rTmpPtr // i*80 = (i*16) + (i*16)*4 | |
4542 | } | |
4543 | { .mfi | |
4544 | adds rTmpPtr3 = 32, rTmpPtr2 | |
4545 | nop.f 0 | |
4546 | nop.i 0 | |
4547 | } | |
4548 | ;; | |
4549 | .pred.rel "mutex",p10,p11 | |
4550 | { .mfi | |
4551 | add rTmpPtr3 = rTmpPtr, rTmpPtr3 | |
4552 | nop.f 0 | |
4553 | (p10) cmp.eq p8, p9 = rXRnd, r0 | |
4554 | } | |
4555 | { .mfi | |
4556 | // entry offset = (i*16) + (i*16)*4 + (i*16)*8 = i*208 (plus 3536 when p11 is set) | |
4557 | add rTmpPtr = rTmpPtr, rTmpPtr2 | |
4558 | nop.f 0 | |
4559 | (p11) cmp.eq p9, p8 = rXRnd, r0 | |
4560 | } | |
4561 | ;; | |
4562 | { .mfi | |
4563 | add rTmpPtr2 = rPolDataPtr, rTmpPtr3 | |
4564 | nop.f 0 | |
4565 | nop.i 0 | |
4566 | } | |
4567 | { .mfi | |
4568 | add rPolDataPtr = rPolDataPtr, rTmpPtr // table begin + entry offset | |
4569 | nop.f 0 | |
4570 | nop.i 0 | |
4571 | } | |
4572 | ;; | |
4573 | { .mfi | |
4574 | ldfpd fA0, fA0L = [rPolDataPtr], 16 // A0, A0L (entry + 0) | |
4575 | nop.f 0 | |
4576 | adds rTmpPtr = 112, rTmpPtr2 | |
4577 | } | |
4578 | { .mfi | |
4579 | ldfpd fA2, fA2L = [rTmpPtr2], 16 // A2, A2L (entry + 32) | |
4580 | nop.f 0 | |
4581 | cmp.eq p12, p13 = 4, rSgnGamSize | |
4582 | } | |
4583 | ;; | |
4584 | { .mfi | |
4585 | ldfpd fA1, fA1L = [rPolDataPtr], 16 // A1, A1L (entry + 16) | |
4586 | nop.f 0 | |
4587 | nop.i 0 | |
4588 | } | |
4589 | { .mfi | |
4590 | ldfe fA3 = [rTmpPtr2], 128 // A3 (entry + 48) | |
4591 | nop.f 0 | |
4592 | nop.i 0 | |
4593 | } | |
4594 | ;; | |
4595 | { .mfi | |
4596 | ldfpd fA12, fA13 = [rTmpPtr], 16 // A12, A13 (entry + 144) | |
4597 | nop.f 0 | |
4598 | adds rTmpPtr3 = 64, rPolDataPtr | |
4599 | } | |
4600 | { .mfi | |
4601 | ldfpd fA16, fA17 = [rTmpPtr2], 16 // A16, A17 (entry + 176) | |
4602 | nop.f 0 | |
4603 | adds rPolDataPtr = 32, rPolDataPtr | |
4604 | } | |
4605 | ;; | |
4606 | .pred.rel "mutex",p8,p9 | |
4607 | { .mfi | |
4608 | ldfpd fA14, fA15 = [rTmpPtr], 16 // A14, A15 (entry + 160) | |
4609 | nop.f 0 | |
4610 | // sign of GAMMA(x) is negative | |
4611 | (p8) adds rSgnGam = -1, r0 | |
4612 | } | |
4613 | { .mfi | |
4614 | ldfpd fA18, fA19 = [rTmpPtr2], 16 // A18, A19 (entry + 192) | |
4615 | nop.f 0 | |
4616 | // sign of GAMMA(x) is positive | |
4617 | (p9) adds rSgnGam = 1, r0 | |
4618 | } | |
4619 | ;; | |
4620 | { .mfi | |
4621 | ldfe fA4 = [rPolDataPtr], 16 // A4 (entry + 64) | |
4622 | nop.f 0 | |
4623 | nop.i 0 | |
4624 | } | |
4625 | { .mfi | |
4626 | ldfpd fA6, fA7 = [rTmpPtr3], 16 // A6, A7 (entry + 96) | |
4627 | nop.f 0 | |
4628 | nop.i 0 | |
4629 | } | |
4630 | ;; | |
4631 | { .mfi | |
4632 | ldfe fA5 = [rPolDataPtr], 16 // A5 (entry + 80) | |
4633 | // p6 is set if x equals the (rounded) root exactly | |
4634 | fcmp.eq.s1 p6, p0 = f8, fRoot | |
4635 | nop.i 0 | |
4636 | } | |
4637 | { .mfi | |
4638 | ldfpd fA8, fA9 = [rTmpPtr3], 16 // A8, A9 (entry + 112) | |
4639 | fms.s1 FR_FracX = f8, f1, fRoot | |
4640 | nop.i 0 | |
4641 | } | |
4642 | ;; | |
4643 | { .mfi | |
4644 | // store signgam if size of variable is 4 bytes | |
4645 | (p12) st4 [rSgnGamAddr] = rSgnGam | |
4646 | nop.f 0 | |
4647 | nop.i 0 | |
4648 | } | |
4649 | { .mfb | |
4650 | // store signgam if size of variable is 8 bytes | |
4651 | (p13) st8 [rSgnGamAddr] = rSgnGam | |
4652 | // answer if x equals the (rounded) root exactly: A0 + A0L | |
4653 | (p6) fadd.s0 f8 = fA0, fA0L | |
4654 | // exit if x equals the (rounded) root exactly | |
4655 | (p6) br.ret.spnt b0 | |
4656 | } | |
4657 | ;; | |
4658 | { .mmf | |
4659 | ldfpd fA10, fA11 = [rTmpPtr3], 16 // A10, A11 (entry + 128) | |
4660 | nop.m 0 | |
4661 | nop.f 0 | |
4662 | } | |
4663 | ;; | |
4664 | { .mfi | |
4665 | nop.m 0 | |
4666 | fma.s1 fResH = fA2, FR_FracX, f0 // (A2*x)hi; from here on "x" in comments means FR_FracX = x - root | |
4667 | nop.i 0 | |
4668 | } | |
4669 | { .mfi | |
4670 | nop.m 0 | |
4671 | fma.s1 fA4L = FR_FracX, FR_FracX, f0 // x^2 | |
4672 | nop.i 0 | |
4673 | } | |
4674 | ;; | |
4675 | { .mfi | |
4676 | nop.m 0 | |
4677 | fma.s1 fA17 = fA17, FR_FracX, fA16 | |
4678 | nop.i 0 | |
4679 | } | |
4680 | {.mfi | |
4681 | nop.m 0 | |
4682 | fma.s1 fA13 = fA13, FR_FracX, fA12 | |
4683 | nop.i 0 | |
4684 | } | |
4685 | ;; | |
4686 | { .mfi | |
4687 | nop.m 0 | |
4688 | fma.s1 fA19 = fA19, FR_FracX, fA18 | |
4689 | nop.i 0 | |
4690 | } | |
4691 | {.mfi | |
4692 | nop.m 0 | |
4693 | fma.s1 fA15 = fA15, FR_FracX, fA14 | |
4694 | nop.i 0 | |
4695 | } | |
4696 | ;; | |
4697 | {.mfi | |
4698 | nop.m 0 | |
4699 | fma.s1 fPol = fA7, FR_FracX, fA6 | |
4700 | nop.i 0 | |
4701 | } | |
4702 | ;; | |
4703 | {.mfi | |
4704 | nop.m 0 | |
4705 | fma.s1 fA9 = fA9, FR_FracX, fA8 | |
4706 | nop.i 0 | |
4707 | } | |
4708 | ;; | |
4709 | { .mfi | |
4710 | nop.m 0 | |
4711 | fms.s1 fResL = fA2, FR_FracX, fResH // delta(A2*x): A2*x minus its rounded value fResH | |
4712 | nop.i 0 | |
4713 | } | |
4714 | {.mfi | |
4715 | nop.m 0 | |
4716 | fadd.s1 fRes1H = fResH, fA1 // (A2*x + A1)hi | |
4717 | nop.i 0 | |
4718 | } | |
4719 | ;; | |
4720 | { .mfi | |
4721 | nop.m 0 | |
4722 | fma.s1 fA11 = fA11, FR_FracX, fA10 | |
4723 | nop.i 0 | |
4724 | } | |
4725 | {.mfi | |
4726 | nop.m 0 | |
4727 | fma.s1 fA5L = fA4L, fA4L, f0 // x^4 | |
4728 | nop.i 0 | |
4729 | } | |
4730 | ;; | |
4731 | { .mfi | |
4732 | nop.m 0 | |
4733 | fma.s1 fA19 = fA19, fA4L, fA17 | |
4734 | nop.i 0 | |
4735 | } | |
4736 | {.mfi | |
4737 | nop.m 0 | |
4738 | fma.s1 fA15 = fA15, fA4L, fA13 | |
4739 | nop.i 0 | |
4740 | } | |
4741 | ;; | |
4742 | { .mfi | |
4743 | nop.m 0 | |
4744 | fma.s1 fPol = fPol, FR_FracX, fA5 | |
4745 | nop.i 0 | |
4746 | } | |
4747 | {.mfi | |
4748 | nop.m 0 | |
4749 | fma.s1 fA3L = fA4L, FR_FracX, f0 // x^3 | |
4750 | nop.i 0 | |
4751 | } | |
4752 | ;; | |
4753 | { .mfi | |
4754 | nop.m 0 | |
4755 | // delta(A2*x) + A2L*x = (A2*x)lo | |
4756 | fma.s1 fResL = fA2L, FR_FracX, fResL | |
4757 | nop.i 0 | |
4758 | } | |
4759 | {.mfi | |
4760 | nop.m 0 | |
4761 | fsub.s1 fRes1L = fA1, fRes1H | |
4762 | nop.i 0 | |
4763 | } | |
4764 | ;; | |
4765 | { .mfi | |
4766 | nop.m 0 | |
4767 | fma.s1 fA11 = fA11, fA4L, fA9 | |
4768 | nop.i 0 | |
4769 | } | |
4770 | {.mfi | |
4771 | nop.m 0 | |
4772 | fma.s1 fA19 = fA19, fA5L, fA15 | |
4773 | nop.i 0 | |
4774 | } | |
4775 | ;; | |
4776 | {.mfi | |
4777 | nop.m 0 | |
4778 | fma.s1 fPol = fPol, FR_FracX, fA4 | |
4779 | nop.i 0 | |
4780 | } | |
4781 | ;; | |
4782 | { .mfi | |
4783 | nop.m 0 | |
4784 | fadd.s1 fResL = fResL, fA1L // (A2*x)lo + A1L | |
4785 | nop.i 0 | |
4786 | } | |
4787 | {.mfi | |
4788 | nop.m 0 | |
4789 | fadd.s1 fRes1L = fRes1L, fResH | |
4790 | nop.i 0 | |
4791 | } | |
4792 | ;; | |
4793 | { .mfi | |
4794 | nop.m 0 | |
4795 | fma.s1 fRes2H = fRes1H, FR_FracX, f0 // ((A2*x + A1)*x)hi | |
4796 | nop.i 0 | |
4797 | } | |
4798 | ;; | |
4799 | {.mfi | |
4800 | nop.m 0 | |
4801 | fma.s1 fA19 = fA19, fA5L, fA11 | |
4802 | nop.i 0 | |
4803 | } | |
4804 | ;; | |
4805 | {.mfi | |
4806 | nop.m 0 | |
4807 | fma.s1 fPol = fPol, FR_FracX, fA3 | |
4808 | nop.i 0 | |
4809 | } | |
4810 | ;; | |
4811 | { .mfi | |
4812 | nop.m 0 | |
4813 | fadd.s1 fRes1L = fRes1L, fResL // (A2*x + A1)lo | |
4814 | nop.i 0 | |
4815 | } | |
4816 | ;; | |
4817 | { .mfi | |
4818 | nop.m 0 | |
4819 | // delta((A2*x + A1)*x): product minus its rounded value fRes2H | |
4820 | fms.s1 fRes2L = fRes1H, FR_FracX, fRes2H | |
4821 | nop.i 0 | |
4822 | } | |
4823 | {.mfi | |
4824 | nop.m 0 | |
4825 | fadd.s1 fRes3H = fRes2H, fA0 // ((A2*x + A1)*x + A0)hi | |
4826 | nop.i 0 | |
4827 | } | |
4828 | ;; | |
4829 | { .mfi | |
4830 | nop.m 0 | |
4831 | fma.s1 fA19 = fA19, fA5L, f0 | |
4832 | nop.i 0 | |
4833 | } | |
4834 | |
4835 | ;; | |
4836 | { .mfi | |
4837 | nop.m 0 | |
4838 | fma.s1 fRes2L = fRes1L, FR_FracX, fRes2L // ((A2*x + A1)*x)lo | |
4839 | nop.i 0 | |
4840 | } | |
4841 | {.mfi | |
4842 | nop.m 0 | |
4843 | fsub.s1 fRes3L = fRes2H, fRes3H | |
4844 | nop.i 0 | |
4845 | } | |
4846 | ;; | |
4847 | {.mfi | |
4848 | nop.m 0 | |
4849 | fma.s1 fPol = fA19, FR_FracX, fPol | |
4850 | nop.i 0 | |
4851 | } | |
4852 | ;; | |
4853 | { .mfi | |
4854 | nop.m 0 | |
4855 | fadd.s1 fRes3L = fRes3L, fA0 | |
4856 | nop.i 0 | |
4857 | } | |
4858 | {.mfi | |
4859 | nop.m 0 | |
4860 | fadd.s1 fRes2L = fRes2L, fA0L // ((A2*x + A1)*x)lo + A0L | |
4861 | nop.i 0 | |
4862 | } | |
4863 | ;; | |
4864 | { .mfi | |
4865 | nop.m 0 | |
4866 | fadd.s1 fRes3L = fRes3L, fRes2L // (((A2*x + A1)*x) + A0)lo | |
4867 | nop.i 0 | |
4868 | } | |
4869 | ;; | |
4870 | {.mfi | |
4871 | nop.m 0 | |
4872 | fma.s1 fRes3L = fPol, fA3L, fRes3L | |
4873 | nop.i 0 | |
4874 | } | |
4875 | ;; | |
4876 | { .mfb | |
4877 | nop.m 0 | |
4878 | // final result for arguments which are close to negative roots: hi + (lo + x^3*(A3 + ... + A19*x^16)) | |
4879 | fma.s0 f8 = fRes3H, f1, fRes3L | |
4880 | // exit here for arguments which are close to negative roots | |
4881 | br.ret.sptk b0 | |
4882 | } | |
4883 | ;; | |
4884 | ||
4885 | // here if |x| < 0.5 | |
4886 | .align 32 | |
4887 | lgammal_0_half: | |
4888 | { .mfi | |
4889 | ld4 GR_Z_1 = [rZ1offsett] // Load Z_1 | |
4890 | fma.s1 fA4L = f8, f8, f0 // x^2 | |
4891 | addl rPolDataPtr = @ltoff(lgammal_0_Half_data), gp | |
4892 | } | |
4893 | { .mfi | |
4894 | shladd GR_ad_tbl_1 = GR_Index1, 4, rTbl1Addr// Point to G_1 | |
4895 | nop.f 0 | |
4896 | addl rLnSinDataPtr = @ltoff(lgammal_lnsin_data), gp | |
4897 | } | |
4898 | ;; | |
4899 | { .mfi | |
4900 | ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1 | |
4901 | nop.f 0 | |
4902 | // Point to Constants_Z_2 | |
4903 | add GR_ad_z_2 = 0x140, GR_ad_z_1 | |
4904 | } | |
4905 | { .mfi | |
4906 | add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_Q | |
4907 | nop.f 0 | |
4908 | // Point to Constants_G_H_h2 | |
4909 | add GR_ad_tbl_2 = 0x180, GR_ad_z_1 | |
4910 | } | |
4911 | ;; | |
4912 | { .mfi | |
4913 | ld8 rPolDataPtr = [rPolDataPtr] | |
4914 | nop.f 0 | |
4915 | // Point to Constants_G_H_h3 | |
4916 | add GR_ad_tbl_3 = 0x280, GR_ad_z_1 | |
4917 | } | |
4918 | { .mfi | |
4919 | ldfd FR_h = [GR_ad_tbl_1] // Load h_1 | |
4920 | nop.f 0 | |
4921 | sub GR_N = rExpX, rExpHalf, 1 | |
4922 | } | |
4923 | ;; | |
4924 | { .mfi | |
4925 | ld8 rLnSinDataPtr = [rLnSinDataPtr] | |
4926 | nop.f 0 | |
4927 | // Get bits 30-15 of X_0 * Z_1 | |
4928 | pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 | |
4929 | } | |
4930 | { .mfi | |
4931 | ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi | |
4932 | nop.f 0 | |
4933 | sub GR_N = r0, GR_N | |
4934 | } | |
4935 | ;; | |
4936 | // | |
4937 | // For performance, don't use result of pmpyshr2.u for 4 cycles. | |
4938 | // | |
4939 | { .mfi | |
4940 | ldfe FR_log2_lo = [GR_ad_q], 16 // Load log2_lo | |
4941 | nop.f 0 | |
4942 | add rTmpPtr2 = 320, rPolDataPtr | |
4943 | } | |
4944 | { .mfi | |
4945 | add rTmpPtr = 32, rPolDataPtr | |
4946 | nop.f 0 | |
4947 | // exponent of 0.25 | |
4948 | adds rExp2 = -1, rExpHalf | |
4949 | } | |
4950 | ;; | |
4951 | { .mfi | |
4952 | ldfpd fA3, fA3L = [rPolDataPtr], 16 // A3 | |
4953 | fma.s1 fA5L = fA4L, fA4L, f0 // x^4 | |
4954 | nop.i 0 | |
4955 | } | |
4956 | { .mfi | |
4957 | ldfpd fA1, fA1L = [rTmpPtr], 16 // A1 | |
4958 | fms.s1 fB8 = f8, f8, fA4L // x^2 - <x^2> | |
4959 | // set p6 if -0.5 < x <= -0.25 | |
4960 | (p15) cmp.eq.unc p6, p0 = rExpX, rExp2 | |
4961 | } | |
4962 | ;; | |
4963 | { .mfi | |
4964 | ldfpd fA2, fA2L = [rPolDataPtr], 16 // A2 | |
4965 | nop.f 0 | |
4966 | // set p6 if -0.5 < x <= -0.40625 | |
4967 | (p6) cmp.le.unc p6, p0 = 10, GR_Index1 | |
4968 | } | |
4969 | { .mfi | |
4970 | ldfe fA21 = [rTmpPtr2], -16 // A21 | |
4971 | // Put integer N into rightmost significand | |
4972 | nop.f 0 | |
4973 | adds rTmpPtr = 240, rTmpPtr | |
4974 | } | |
4975 | ;; | |
4976 | { .mfi | |
4977 | setf.sig fFloatN = GR_N | |
4978 | nop.f 0 | |
4979 | extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1 | |
4980 | } | |
4981 | { .mfi | |
4982 | ldfe FR_Q4 = [GR_ad_q], 16 // Load Q4 | |
4983 | nop.f 0 | |
4984 | adds rPolDataPtr = 304, rPolDataPtr | |
4985 | } | |
4986 | ;; | |
4987 | { .mfi | |
4988 | ldfe fA20 = [rTmpPtr2], -32 // A20 | |
4989 | nop.f 0 | |
4990 | shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2 | |
4991 | } | |
4992 | { .mfi | |
4993 | ldfe fA19 = [rTmpPtr], -32 // A19 | |
4994 | nop.f 0 | |
4995 | shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2// Point to G_2 | |
4996 | } | |
4997 | ;; | |
4998 | { .mfi | |
4999 | ldfe fA17 = [rTmpPtr], -32 // A17 | |
5000 | nop.f 0 | |
5001 | adds rTmpPtr3 = 8, GR_ad_tbl_2 | |
5002 | } | |
5003 | { .mfb | |
5004 | ldfe fA18 = [rTmpPtr2], -32 // A18 | |
5005 | nop.f 0 | |
5006 | // branch to special path for -0.5 < x <= -0.40625 | |
5007 | (p6) br.cond.spnt lgammal_near_neg_half | |
5008 | } | |
5009 | ;; | |
5010 | { .mmf | |
5011 | ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2 | |
5012 | ldfe fA15 = [rTmpPtr], -32 // A15 | |
5013 | fma.s1 fB20 = fA5L, fA5L, f0 // x^8 | |
5014 | } | |
5015 | ;; | |
5016 | { .mmf | |
5017 | ldfe fA16 = [rTmpPtr2], -32 // A16 | |
5018 | ldfe fA13 = [rTmpPtr], -32 // A13 | |
5019 | fms.s1 fB16 = fA4L, fA4L, fA5L | |
5020 | } | |
5021 | ;; | |
5022 | { .mmf | |
5023 | ldfps FR_G2, FR_H2 = [GR_ad_tbl_2], 8 // Load G_2, H_2 | |
5024 | ldfd FR_h2 = [rTmpPtr3] // Load h_2 | |
5025 | fmerge.s fB10 = f8, fA5L // sign(x) * x^4 | |
5026 | } | |
5027 | ;; | |
5028 | { .mmi | |
5029 | ldfe fA14 = [rTmpPtr2], -32 // A14 | |
5030 | ldfe fA11 = [rTmpPtr], -32 // A11 | |
5031 | // Get bits 30-15 of X_1 * Z_2 | |
5032 | pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 | |
5033 | } | |
5034 | ;; | |
5035 | // | |
5036 | // For performance, don't use result of pmpyshr2.u for 4 cycles. | |
5037 | // | |
5038 | { .mfi | |
5039 | ldfe fA12 = [rTmpPtr2], -32 // A12 | |
5040 | fma.s1 fRes4H = fA3, fAbsX, f0 | |
5041 | adds rTmpPtr3 = 16, GR_ad_q | |
5042 | } | |
5043 | { .mfi | |
5044 | ldfe fA9 = [rTmpPtr], -32 // A9 | |
5045 | nop.f 0 | |
5046 | nop.i 0 | |
5047 | } | |
5048 | ;; | |
5049 | { .mmf | |
5050 | ldfe fA10 = [rTmpPtr2], -32 // A10 | |
5051 | ldfe fA7 = [rTmpPtr], -32 // A7 | |
5052 | fma.s1 fB18 = fB20, fB20, f0 // x^16 | |
5053 | } | |
5054 | ;; | |
5055 | { .mmf | |
5056 | ldfe fA8 = [rTmpPtr2], -32 // A8 | |
5057 | ldfe fA22 = [rPolDataPtr], 16 // A22 | |
5058 | fcvt.xf fFloatN = fFloatN | |
5059 | } | |
5060 | ;; | |
5061 | { .mfi | |
5062 | ldfe fA5 = [rTmpPtr], -32 // A5 | |
5063 | fma.s1 fA21 = fA21, fAbsX, fA20 // v16 | |
5064 | extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2 | |
5065 | } | |
5066 | { .mfi | |
5067 | ldfe fA6 = [rTmpPtr2], -32 // A6 | |
5068 | nop.f 0 | |
5069 | nop.i 0 | |
5070 | } | |
5071 | ;; | |
5072 | { .mmf | |
5073 | // Point to G_3 | |
5074 | shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 | |
5075 | ldfe fA4 = [rTmpPtr2], -32 // A4 | |
5076 | fma.s1 fA19 = fA19, fAbsX, fA18 // v13 | |
5077 | } | |
5078 | ;; | |
5079 | .pred.rel "mutex",p14,p15 | |
5080 | { .mfi | |
5081 | ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3 | |
5082 | fms.s1 fRes4L = fA3, fAbsX, fRes4H | |
5083 | (p14) adds rSgnGam = 1, r0 | |
5084 | } | |
5085 | { .mfi | |
5086 | cmp.eq p6, p7 = 4, rSgnGamSize | |
5087 | fadd.s1 fRes2H = fRes4H, fA2 | |
5088 | (p15) adds rSgnGam = -1, r0 | |
5089 | } | |
5090 | ;; | |
5091 | ||
5092 | { .mfi | |
5093 | ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3 | |
5094 | fma.s1 fA17 = fA17, fAbsX, fA16 // v12 | |
5095 | nop.i 0 | |
5096 | } | |
5097 | ;; | |
5098 | { .mfi | |
5099 | ldfe FR_Q3 = [GR_ad_q], 32 // Load Q3 | |
5100 | fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2 | |
5101 | nop.i 0 | |
5102 | } | |
5103 | { .mfi | |
5104 | ldfe FR_Q2 = [rTmpPtr3], 16 // Load Q2 | |
5105 | fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2 | |
5106 | nop.i 0 | |
5107 | } | |
5108 | ;; | |
5109 | { .mfi | |
5110 | ldfe FR_Q1 = [GR_ad_q] // Load Q1 | |
5111 | fma.s1 fA15 = fA15, fAbsX, fA14 // v8 | |
5112 | nop.i 0 | |
5113 | } | |
5114 | { .mfi | |
5115 | adds rTmpPtr3 = 32, rLnSinDataPtr | |
5116 | fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2 | |
5117 | nop.i 0 | |
5118 | } | |
5119 | ;; | |
5120 | { .mmf | |
5121 | ldfpd fLnSin2, fLnSin2L = [rLnSinDataPtr], 16 | |
5122 | ldfe fLnSin6 = [rTmpPtr3], 32 | |
5123 | fma.s1 fA13 = fA13, fAbsX, fA12 // v7 | |
5124 | ||
5125 | } | |
5126 | ;; | |
5127 | { .mfi | |
5128 | ldfe fLnSin4 = [rLnSinDataPtr], 32 | |
5129 | fma.s1 fRes4L = fA3L, fAbsX, fRes4L | |
5130 | nop.i 0 | |
5131 | } | |
5132 | { .mfi | |
5133 | ldfe fLnSin10 = [rTmpPtr3], 32 | |
5134 | fsub.s1 fRes2L = fA2, fRes2H | |
5135 | nop.i 0 | |
5136 | } | |
5137 | ;; | |
5138 | { .mfi | |
5139 | ldfe fLnSin8 = [rLnSinDataPtr], 32 | |
5140 | fma.s1 fResH = fRes2H, fAbsX, f0 | |
5141 | nop.i 0 | |
5142 | } | |
5143 | { .mfi | |
5144 | ldfe fLnSin14 = [rTmpPtr3], 32 | |
5145 | fma.s1 fA22 = fA22, fA4L, fA21 // v15 | |
5146 | nop.i 0 | |
5147 | } | |
5148 | ;; | |
5149 | { .mfi | |
5150 | ldfe fLnSin12 = [rLnSinDataPtr], 32 | |
5151 | fma.s1 fA9 = fA9, fAbsX, fA8 // v4 | |
5152 | nop.i 0 | |
5153 | } | |
5154 | { .mfi | |
5155 | ldfd fLnSin18 = [rTmpPtr3], 16 | |
5156 | fma.s1 fA11 = fA11, fAbsX, fA10 // v5 | |
5157 | nop.i 0 | |
5158 | } | |
5159 | ;; | |
5160 | { .mfi | |
5161 | ldfe fLnSin16 = [rLnSinDataPtr], 24 | |
5162 | fma.s1 fA19 = fA19, fA4L, fA17 // v11 | |
5163 | nop.i 0 | |
5164 | } | |
5165 | { .mfi | |
5166 | ldfd fLnSin22 = [rTmpPtr3], 16 | |
5167 | fma.s1 fPolL = fA7, fAbsX, fA6 | |
5168 | nop.i 0 | |
5169 | } | |
5170 | ;; | |
5171 | { .mfi | |
5172 | ldfd fLnSin20 = [rLnSinDataPtr], 16 | |
5173 | fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3 | |
5174 | nop.i 0 | |
5175 | } | |
5176 | { .mfi | |
5177 | ldfd fLnSin26 = [rTmpPtr3], 16 | |
5178 | fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3 | |
5179 | nop.i 0 | |
5180 | } | |
5181 | ;; | |
5182 | { .mfi | |
5183 | ldfd fLnSin24 = [rLnSinDataPtr], 16 | |
5184 | fadd.s1 fRes2L = fRes2L, fRes4H | |
5185 | nop.i 0 | |
5186 | } | |
5187 | { .mfi | |
5188 | ldfd fLnSin30 = [rTmpPtr3], 16 | |
5189 | fadd.s1 fA2L = fA2L, fRes4L | |
5190 | nop.i 0 | |
5191 | } | |
5192 | ;; | |
5193 | { .mfi | |
5194 | ldfd fLnSin28 = [rLnSinDataPtr], 16 | |
5195 | fms.s1 fResL = fRes2H, fAbsX, fResH | |
5196 | nop.i 0 | |
5197 | } | |
5198 | { .mfi | |
5199 | ldfd fLnSin34 = [rTmpPtr3], 8 | |
5200 | fadd.s1 fRes2H = fResH, fA1 | |
5201 | nop.i 0 | |
5202 | } | |
5203 | ;; | |
5204 | { .mfi | |
5205 | ldfd fLnSin32 = [rLnSinDataPtr] | |
5206 | fma.s1 fA11 = fA11, fA4L, fA9 // v3 | |
5207 | nop.i 0 | |
5208 | } | |
5209 | { .mfi | |
5210 | ldfd fLnSin36 = [rTmpPtr3] | |
5211 | fma.s1 fA15 = fA15, fA4L, fA13 // v6 | |
5212 | nop.i 0 | |
5213 | } | |
5214 | ;; | |
5215 | ||
5216 | { .mfi | |
5217 | // store signgam if size of variable is 4 bytes | |
5218 | (p6) st4 [rSgnGamAddr] = rSgnGam | |
5219 | fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3 | |
5220 | nop.i 0 | |
5221 | } | |
5222 | { .mfi | |
5223 | // store signgam if size of variable is 8 bytes | |
5224 | (p7) st8 [rSgnGamAddr] = rSgnGam | |
5225 | fma.s1 fA5 = fA5, fAbsX, fA4 | |
5226 | nop.i 0 | |
5227 | } | |
5228 | ;; | |
5229 | { .mfi | |
5230 | nop.m 0 | |
5231 | fms.s1 FR_r = FR_G, fSignifX, f1 // r = G * S_hi - 1 | |
5232 | nop.i 0 | |
5233 | } | |
5234 | { .mfi | |
5235 | nop.m 0 | |
5236 | // High part of the log(|x|): Y_hi = N * log2_hi + H | |
5237 | fms.s1 FR_log2_hi = fFloatN, FR_log2_hi, FR_H | |
5238 | nop.i 0 | |
5239 | } | |
5240 | ;; | |
5241 | { .mfi | |
5242 | nop.m 0 | |
5243 | fadd.s1 fA3L = fRes2L, fA2L | |
5244 | nop.i 0 | |
5245 | } | |
5246 | { .mfi | |
5247 | nop.m 0 | |
5248 | fma.s1 fA22 = fA22, fA5L, fA19 | |
5249 | nop.i 0 | |
5250 | } | |
5251 | ;; | |
5252 | { .mfi | |
5253 | nop.m 0 | |
5254 | fsub.s1 fRes2L = fA1, fRes2H | |
5255 | nop.i 0 | |
5256 | } | |
5257 | { .mfi | |
5258 | nop.m 0 | |
5259 | fma.s1 fRes3H = fRes2H, f8, f0 | |
5260 | nop.i 0 | |
5261 | } | |
5262 | ;; | |
5263 | { .mfi | |
5264 | nop.m 0 | |
5265 | fma.s1 fA15 = fA15, fA5L, fA11 // v2 | |
5266 | nop.i 0 | |
5267 | } | |
5268 | { .mfi | |
5269 | nop.m 0 | |
5270 | fma.s1 fLnSin18 = fLnSin18, fA4L, fLnSin16 | |
5271 | nop.i 0 | |
5272 | } | |
5273 | ;; | |
5274 | { .mfi | |
5275 | nop.m 0 | |
5276 | // h = N * log2_lo + h | |
5277 | fms.s1 FR_h = fFloatN, FR_log2_lo, FR_h | |
5278 | nop.i 0 | |
5279 | } | |
5280 | { .mfi | |
5281 | nop.m 0 | |
5282 | fma.s1 fPolL = fPolL, fA4L, fA5 | |
5283 | nop.i 0 | |
5284 | } | |
5285 | ;; | |
5286 | { .mfi | |
5287 | nop.m 0 | |
5288 | // poly_lo = r * Q4 + Q3 | |
5289 | fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 | |
5290 | nop.i 0 | |
5291 | } | |
5292 | { .mfi | |
5293 | nop.m 0 | |
5294 | fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r | |
5295 | nop.i 0 | |
5296 | } | |
5297 | ;; | |
5298 | { .mfi | |
5299 | nop.m 0 | |
5300 | fma.s1 fResL = fA3L, fAbsX, fResL | |
5301 | nop.i 0 | |
5302 | } | |
5303 | { .mfi | |
5304 | nop.m 0 | |
5305 | fma.s1 fLnSin30 = fLnSin30, fA4L, fLnSin28 | |
5306 | nop.i 0 | |
5307 | } | |
5308 | ;; | |
5309 | { .mfi | |
5310 | nop.m 0 | |
5311 | fadd.s1 fRes2L = fRes2L, fResH | |
5312 | nop.i 0 | |
5313 | } | |
5314 | { .mfi | |
5315 | nop.m 0 | |
5316 | fms.s1 fRes3L = fRes2H, f8, fRes3H | |
5317 | nop.i 0 | |
5318 | } | |
5319 | ;; | |
5320 | { .mfi | |
5321 | nop.m 0 | |
5322 | fadd.s1 fRes1H = fRes3H, FR_log2_hi | |
5323 | nop.i 0 | |
5324 | } | |
5325 | { .mfi | |
5326 | nop.m 0 | |
5327 | fma.s1 fPol = fB20, fA22, fA15 | |
5328 | nop.i 0 | |
5329 | } | |
5330 | ;; | |
5331 | { .mfi | |
5332 | nop.m 0 | |
5333 | fma.s1 fLnSin34 = fLnSin34, fA4L, fLnSin32 | |
5334 | nop.i 0 | |
5335 | } | |
5336 | { .mfi | |
5337 | nop.m 0 | |
5338 | fma.s1 fLnSin14 = fLnSin14, fA4L, fLnSin12 | |
5339 | nop.i 0 | |
5340 | } | |
5341 | ;; | |
5342 | ||
5343 | { .mfi | |
5344 | nop.m 0 | |
5345 | // poly_lo = poly_lo * r + Q2 | |
5346 | fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 | |
5347 | nop.i 0 | |
5348 | } | |
5349 | { .mfi | |
5350 | nop.m 0 | |
5351 | fnma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3 | |
5352 | nop.i 0 | |
5353 | } | |
5354 | ;; | |
5355 | { .mfi | |
5356 | nop.m 0 | |
5357 | // poly_hi = Q1 * rsq + r | |
5358 | fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r | |
5359 | nop.i 0 | |
5360 | } | |
5361 | { .mfi | |
5362 | nop.m 0 | |
5363 | fadd.s1 fA1L = fA1L, fResL | |
5364 | nop.i 0 | |
5365 | } | |
5366 | ;; | |
5367 | ||
5368 | { .mfi | |
5369 | nop.m 0 | |
5370 | fma.s1 fLnSin22 = fLnSin22, fA4L, fLnSin20 | |
5371 | nop.i 0 | |
5372 | } | |
5373 | { .mfi | |
5374 | nop.m 0 | |
5375 | fma.s1 fLnSin26 = fLnSin26, fA4L, fLnSin24 | |
5376 | nop.i 0 | |
5377 | } | |
5378 | ;; | |
5379 | ||
5380 | { .mfi | |
5381 | nop.m 0 | |
5382 | fsub.s1 fRes1L = FR_log2_hi, fRes1H | |
5383 | nop.i 0 | |
5384 | } | |
5385 | { .mfi | |
5386 | nop.m 0 | |
5387 | fma.s1 fPol = fPol, fA5L, fPolL | |
5388 | nop.i 0 | |
5389 | } | |
5390 | ;; | |
5391 | { .mfi | |
5392 | nop.m 0 | |
5393 | fma.s1 fLnSin34 = fLnSin36, fA5L, fLnSin34 | |
5394 | nop.i 0 | |
5395 | } | |
5396 | { .mfi | |
5397 | nop.m 0 | |
5398 | fma.s1 fLnSin18 = fLnSin18, fA5L, fLnSin14 | |
5399 | nop.i 0 | |
5400 | } | |
5401 | ;; | |
5402 | { .mfi | |
5403 | nop.m 0 | |
5404 | fma.s1 fLnSin6 = fLnSin6, fA4L, fLnSin4 | |
5405 | nop.i 0 | |
5406 | } | |
5407 | { .mfi | |
5408 | nop.m 0 | |
5409 | fma.s1 fLnSin10 = fLnSin10, fA4L, fLnSin8 | |
5410 | nop.i 0 | |
5411 | } | |
5412 | ;; | |
5413 | { .mfi | |
5414 | nop.m 0 | |
5415 | // poly_hi = Q1 * rsq + r | |
5416 | fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r | |
5417 | nop.i 0 | |
5418 | } | |
5419 | { .mfi | |
5420 | nop.m 0 | |
5421 | fadd.s1 fRes2L = fRes2L, fA1L | |
5422 | nop.i 0 | |
5423 | } | |
5424 | ;; | |
5425 | { .mfi | |
5426 | nop.m 0 | |
5427 | // poly_lo = poly_lo*r^3 + h | |
5428 | fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h | |
5429 | nop.i 0 | |
5430 | } | |
5431 | { .mfi | |
5432 | nop.m 0 | |
5433 | fma.s1 fB2 = fLnSin2, fA4L, f0 | |
5434 | nop.i 0 | |
5435 | } | |
5436 | ;; | |
5437 | { .mfi | |
5438 | nop.m 0 | |
5439 | fadd.s1 fRes1L = fRes1L, fRes3H | |
5440 | nop.i 0 | |
5441 | } | |
5442 | { .mfi | |
5443 | nop.m 0 | |
5444 | fma.s1 fPol = fPol, fB10, f0 | |
5445 | nop.i 0 | |
5446 | } | |
5447 | ;; | |
5448 | { .mfi | |
5449 | nop.m 0 | |
5450 | fma.s1 fLnSin26 = fLnSin26, fA5L, fLnSin22 | |
5451 | nop.i 0 | |
5452 | } | |
5453 | { .mfi | |
5454 | nop.m 0 | |
5455 | fma.s1 fLnSin34 = fLnSin34, fA5L, fLnSin30 | |
5456 | nop.i 0 | |
5457 | } | |
5458 | ;; | |
5459 | { .mfi | |
5460 | nop.m 0 | |
5461 | fma.s1 fLnSin10 = fLnSin10, fA5L, fLnSin6 | |
5462 | nop.i 0 | |
5463 | } | |
5464 | { .mfi | |
5465 | nop.m 0 | |
5466 | fma.s1 fLnSin2L = fLnSin2L, fA4L, f0 | |
5467 | nop.i 0 | |
5468 | } | |
5469 | ;; | |
5470 | ||
5471 | { .mfi | |
5472 | nop.m 0 | |
5473 | fma.s1 fRes3L = fRes2L, f8, fRes3L | |
5474 | nop.i 0 | |
5475 | } | |
5476 | ;; | |
5477 | { .mfi | |
5478 | nop.m 0 | |
5479 | // Y_lo = poly_hi + poly_lo | |
5480 | fsub.s1 FR_log2_lo = FR_poly_lo, FR_poly_hi | |
5481 | nop.i 0 | |
5482 | } | |
5483 | { .mfi | |
5484 | nop.m 0 | |
5485 | fms.s1 fB4 = fLnSin2, fA4L, fB2 | |
5486 | nop.i 0 | |
5487 | } | |
5488 | ;; | |
5489 | { .mfi | |
5490 | nop.m 0 | |
5491 | fadd.s1 fRes2H = fRes1H, fPol | |
5492 | nop.i 0 | |
5493 | } | |
5494 | ;; | |
5495 | { .mfi | |
5496 | nop.m 0 | |
5497 | fma.s1 fLnSin34 = fLnSin34, fB20, fLnSin26 | |
5498 | nop.i 0 | |
5499 | } | |
5500 | ;; | |
5501 | { .mfi | |
5502 | nop.m 0 | |
5503 | fma.s1 fLnSin18 = fLnSin18, fB20, fLnSin10 | |
5504 | nop.i 0 | |
5505 | } | |
5506 | { .mfi | |
5507 | nop.m 0 | |
5508 | fma.s1 fLnSin2L = fB8, fLnSin2, fLnSin2L | |
5509 | nop.i 0 | |
5510 | } | |
5511 | ;; | |
5512 | ||
5513 | { .mfi | |
5514 | nop.m 0 | |
5515 | fadd.s1 FR_log2_lo = FR_log2_lo, fRes3L | |
5516 | nop.i 0 | |
5517 | } | |
5518 | ;; | |
5519 | { .mfi | |
5520 | nop.m 0 | |
5521 | fsub.s1 fRes2L = fRes1H, fRes2H | |
5522 | nop.i 0 | |
5523 | } | |
5524 | ;; | |
5525 | { .mfi | |
5526 | nop.m 0 | |
5527 | fma.s1 fB6 = fLnSin34, fB18, fLnSin18 | |
5528 | nop.i 0 | |
5529 | } | |
5530 | { .mfi | |
5531 | nop.m 0 | |
5532 | fadd.s1 fB4 = fLnSin2L, fB4 | |
5533 | nop.i 0 | |
5534 | } | |
5535 | ;; | |
5536 | ||
5537 | { .mfi | |
5538 | nop.m 0 | |
5539 | fadd.s1 fRes1L = fRes1L, FR_log2_lo | |
5540 | nop.i 0 | |
5541 | } | |
5542 | ;; | |
5543 | { .mfi | |
5544 | nop.m 0 | |
5545 | fadd.s1 fRes2L = fRes2L, fPol | |
5546 | nop.i 0 | |
5547 | } | |
5548 | ;; | |
5549 | { .mfi | |
5550 | nop.m 0 | |
5551 | fma.s1 fB12 = fB6, fA5L, f0 | |
5552 | nop.i 0 | |
5553 | } | |
5554 | ;; | |
5555 | { .mfi | |
5556 | nop.m 0 | |
5557 | fadd.s1 fRes2L = fRes2L, fRes1L | |
5558 | nop.i 0 | |
5559 | } | |
5560 | ;; | |
5561 | ||
5562 | { .mfi | |
5563 | nop.m 0 | |
5564 | fms.s1 fB14 = fB6, fA5L, fB12 | |
5565 | nop.i 0 | |
5566 | } | |
5567 | { .mfb | |
5568 | nop.m 0 | |
5569 | fadd.s1 fLnSin30 = fB2, fB12 | |
5570 | // branch out if x is negative | |
5571 | (p15) br.cond.spnt _O_Half_neg | |
5572 | } | |
5573 | ;; | |
5574 | { .mfb | |
5575 | nop.m 0 | |
5576 | // sign(x)*Pol(|x|) - log(|x|) | |
5577 | fma.s0 f8 = fRes2H, f1, fRes2L | |
5578 | // it's an answer already for positive x | |
5579 | // exit if 0 < x < 0.5 | |
5580 | br.ret.sptk b0 | |
5581 | } | |
5582 | ;; | |
5583 | ||
5584 | // here if x is negative and |x| < 0.5: adds fLnSin30 (= fB2 + fB12) and its low-order corrections to the pair fRes2H/fRes2L, which is already the complete answer on the positive-x path | |
5585 | .align 32 | |
5586 | _O_Half_neg: | |
5587 | { .mfi | |
5588 | nop.m 0 | |
5589 | fma.s1 fB14 = fB16, fB6, fB14 | |
5590 | nop.i 0 | |
5591 | } | |
5592 | { .mfi | |
5593 | nop.m 0 | |
5594 | fsub.s1 fLnSin16 = fB2, fLnSin30 | |
5595 | nop.i 0 | |
5596 | } | |
5597 | ;; | |
5598 | { .mfi | |
5599 | nop.m 0 | |
5600 | fadd.s1 fResH = fLnSin30, fRes2H | |
5601 | nop.i 0 | |
5602 | } | |
5603 | ;; | |
5604 | { .mfi | |
5605 | nop.m 0 | |
5606 | fadd.s1 fLnSin16 = fLnSin16, fB12 | |
5607 | nop.i 0 | |
5608 | } | |
5609 | { .mfi | |
5610 | nop.m 0 | |
5611 | fadd.s1 fB4 = fB14, fB4 | |
5612 | nop.i 0 | |
5613 | } | |
5614 | ;; | |
5615 | { .mfi | |
5616 | nop.m 0 | |
5617 | fadd.s1 fLnSin16 = fB4, fLnSin16 | |
5618 | nop.i 0 | |
5619 | } | |
5620 | { .mfi | |
5621 | nop.m 0 | |
5622 | fsub.s1 fResL = fRes2H, fResH | |
5623 | nop.i 0 | |
5624 | } | |
5625 | ;; | |
5626 | { .mfi | |
5627 | nop.m 0 | |
5628 | fadd.s1 fResL = fResL, fLnSin30 | |
5629 | nop.i 0 | |
5630 | } | |
5631 | { .mfi | |
5632 | nop.m 0 | |
5633 | fadd.s1 fLnSin16 = fLnSin16, fRes2L | |
5634 | nop.i 0 | |
5635 | } | |
5636 | ;; | |
5637 | { .mfi | |
5638 | nop.m 0 | |
5639 | fadd.s1 fResL = fResL, fLnSin16 | |
5640 | nop.i 0 | |
5641 | } | |
5642 | ;; | |
5643 | { .mfb | |
5644 | nop.m 0 | |
5645 | // final result for -0.5 < x < 0: f8 = fResH + fResL, the only operation here issued with the .s0 completer | |
5646 | fma.s0 f8 = fResH, f1, fResL | |
5647 | // exit for -0.5 < x < 0 | |
5648 | br.ret.sptk b0 | |
5649 | } | |
5650 | ;; | |
5651 | ||
5652 | // here if x >= 8.0; gamma(x) is positive here, so rSgnGam = 1 is stored to signgam and the result is returned in f8 | |
5653 | // there are two computational paths: | |
5654 | // 1) For x > 10.0 Stirling's formula is used: x*(ln(x)-1) - 0.5*ln(x) + C + S(1/x), accumulated as hi/lo pairs (branches to lgammal_overflow when x exceeds the loaded threshold) | |
5655 | // 2) Polynomial approximation for 8.0 <= x <= 10.0 (handled at lgammal_8_10) | |
5656 | .align 32 | |
5657 | lgammal_big_positive: | |
5658 | { .mfi | |
5659 | addl rPolDataPtr = @ltoff(lgammal_data), gp | |
5660 | fmerge.se fSignifX = f1, f8 | |
5661 | // Get high 15 bits of significand | |
5662 | extr.u GR_X_0 = rSignifX, 49, 15 | |
5663 | } | |
5664 | {.mfi | |
5665 | shladd rZ1offsett = GR_Index1, 2, GR_ad_z_1 // Point to Z_1 | |
5666 | fnma.s1 fInvX = f8, fRcpX, f1 // start of 1st NR iteration | |
5667 | adds rSignif1andQ = 0x5, r0 | |
5668 | } | |
5669 | ;; | |
5670 | {.mfi | |
5671 | ld4 GR_Z_1 = [rZ1offsett] // Load Z_1 | |
5672 | nop.f 0 | |
5673 | shl rSignif1andQ = rSignif1andQ, 61 // significand of 1.25 | |
5674 | } | |
5675 | { .mfi | |
5676 | cmp.eq p8, p0 = rExpX, rExp8 // p8 = 1 if 8.0 <= x < 16 | |
5677 | nop.f 0 | |
5678 | adds rSgnGam = 1, r0 // gamma is positive at this range | |
5679 | } | |
5680 | ;; | |
5681 | { .mfi | |
5682 | shladd GR_ad_tbl_1 = GR_Index1, 4, rTbl1Addr// Point to G_1 | |
5683 | nop.f 0 | |
5684 | add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_Q | |
5685 | } | |
5686 | { .mlx | |
5687 | ld8 rPolDataPtr = [rPolDataPtr] | |
5688 | movl rDelta = 0x3FF2000000000000 | |
5689 | } | |
5690 | ;; | |
5691 | { .mfi | |
5692 | ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1 | |
5693 | nop.f 0 | |
5694 | add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2 | |
5695 | } | |
5696 | { .mfi | |
5697 | // Point to Constants_G_H_h2 | |
5698 | add GR_ad_tbl_2 = 0x180, GR_ad_z_1 | |
5699 | nop.f 0 | |
5700 | // p8 = 1 if 8.0 <= x <= 10.0 | |
5701 | (p8) cmp.leu.unc p8, p0 = rSignifX, rSignif1andQ | |
5702 | } | |
5703 | ;; | |
5704 | { .mfi | |
5705 | ldfd FR_h = [GR_ad_tbl_1] // Load h_1 | |
5706 | nop.f 0 | |
5707 | // Get bits 30-15 of X_0 * Z_1 | |
5708 | pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 | |
5709 | } | |
5710 | { .mfb | |
5711 | (p8) setf.d FR_MHalf = rDelta | |
5712 | nop.f 0 | |
5713 | (p8) br.cond.spnt lgammal_8_10 // branch out if 8.0 <= x <= 10.0; FR_MHalf then holds rDelta = 0x3FF2000000000000 (1.125 in DP) | |
5714 | } | |
5715 | ;; | |
5716 | // | |
5717 | // For performance, don't use result of pmpyshr2.u for 4 cycles. | |
5718 | // | |
5719 | { .mfi | |
5720 | ldfe fA1 = [rPolDataPtr], 16 // Load overflow threshold | |
5721 | fma.s1 fRcpX = fInvX, fRcpX, fRcpX // end of 1st NR iteration | |
5722 | // Point to Constants_G_H_h3 | |
5723 | add GR_ad_tbl_3 = 0x280, GR_ad_z_1 | |
5724 | } | |
5725 | { .mlx | |
5726 | nop.m 0 | |
5727 | movl rDelta = 0xBFE0000000000000 // -0.5 in DP | |
5728 | } | |
5729 | ;; | |
5730 | { .mfi | |
5731 | ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi | |
5732 | nop.f 0 | |
5733 | sub GR_N = rExpX, rExpHalf, 1 // unbiased exponent of x | |
5734 | } | |
5735 | ;; | |
5736 | { .mfi | |
5737 | ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo | |
5738 | nop.f 0 | |
5739 | nop.i 0 | |
5740 | } | |
5741 | { .mfi | |
5742 | setf.d FR_MHalf = rDelta | |
5743 | nop.f 0 | |
5744 | nop.i 0 | |
5745 | } | |
5746 | ;; | |
5747 | { .mfi | |
5748 | // Put integer N into rightmost significand | |
5749 | setf.sig fFloatN = GR_N | |
5750 | nop.f 0 | |
5751 | extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1 | |
5752 | } | |
5753 | { .mfi | |
5754 | ldfe FR_Q4 = [GR_ad_q], 16 // Load Q4 | |
5755 | nop.f 0 | |
5756 | nop.i 0 | |
5757 | } | |
5758 | ;; | |
5759 | { .mfi | |
5760 | shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2 | |
5761 | nop.f 0 | |
5762 | shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2// Point to G_2 | |
5763 | } | |
5764 | { .mfi | |
5765 | ldfe FR_Q3 = [GR_ad_q], 16 // Load Q3 | |
5766 | nop.f 0 | |
5767 | nop.i 0 | |
5768 | } | |
5769 | ;; | |
5770 | { .mfi | |
5771 | ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2 | |
5772 | fnma.s1 fInvX = f8, fRcpX, f1 // start of 2nd NR iteration | |
5773 | nop.i 0 | |
5774 | } | |
5775 | ;; | |
5776 | { .mfi | |
5777 | ldfps FR_G2, FR_H2 = [GR_ad_tbl_2], 8 // Load G_2, H_2 | |
5778 | nop.f 0 | |
5779 | nop.i 0 | |
5780 | } | |
5781 | ;; | |
5782 | { .mfi | |
5783 | ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2 | |
5784 | nop.f 0 | |
5785 | nop.i 0 | |
5786 | } | |
5787 | ;; | |
5788 | { .mfi | |
5789 | ldfe FR_Q2 = [GR_ad_q],16 // Load Q2 | |
5790 | nop.f 0 | |
5791 | // Get bits 30-15 of X_1 * Z_2 | |
5792 | pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 | |
5793 | } | |
5794 | ;; | |
5795 | // | |
5796 | // For performance, don't use result of pmpyshr2.u for 4 cycles. | |
5797 | // | |
5798 | { .mfi | |
5799 | ldfe FR_Q1 = [GR_ad_q] // Load Q1 | |
5800 | fcmp.gt.s1 p7,p0 = f8, fA1 // check if x > overflow threshold | |
5801 | nop.i 0 | |
5802 | } | |
5803 | ;; | |
5804 | {.mfi | |
5805 | ldfpd fA0, fA0L = [rPolDataPtr], 16 // Load two parts of C | |
5806 | fma.s1 fRcpX = fInvX, fRcpX, fRcpX // end of 2nd NR iteration | |
5807 | nop.i 0 | |
5808 | } | |
5809 | ;; | |
5810 | { .mfb | |
5811 | ldfpd fB2, fA1 = [rPolDataPtr], 16 | |
5812 | nop.f 0 | |
5813 | (p7) br.cond.spnt lgammal_overflow // branch if x > overflow threshold | |
5814 | } | |
5815 | ;; | |
5816 | {.mfi | |
5817 | ldfe fB4 = [rPolDataPtr], 16 | |
5818 | fcvt.xf fFloatN = fFloatN | |
5819 | extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2 | |
5820 | } | |
5821 | ;; | |
5822 | { .mfi | |
5823 | shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3// Point to G_3 | |
5824 | nop.f 0 | |
5825 | nop.i 0 | |
5826 | } | |
5827 | { .mfi | |
5828 | ldfe fB6 = [rPolDataPtr], 16 | |
5829 | nop.f 0 | |
5830 | nop.i 0 | |
5831 | } | |
5832 | ;; | |
5833 | { .mfi | |
5834 | ldfps FR_G3, FR_H3 = [GR_ad_tbl_3], 8 // Load G_3, H_3 | |
5835 | nop.f 0 | |
5836 | nop.i 0 | |
5837 | } | |
5838 | ;; | |
5839 | { .mfi | |
5840 | ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3 | |
5841 | fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2 | |
5842 | nop.i 0 | |
5843 | } | |
5844 | { .mfi | |
5845 | nop.m 0 | |
5846 | fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2 | |
5847 | nop.i 0 | |
5848 | } | |
5849 | ;; | |
5850 | ||
5851 | { .mfi | |
5852 | ldfe fB8 = [rPolDataPtr], 16 | |
5853 | fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2 | |
5854 | nop.i 0 | |
5855 | } | |
5856 | { .mfi | |
5857 | nop.m 0 | |
5858 | fnma.s1 fInvX = f8, fRcpX, f1 // start of 3rd NR iteration | |
5859 | nop.i 0 | |
5860 | } | |
5861 | ;; | |
5862 | { .mfi | |
5863 | ldfe fB10 = [rPolDataPtr], 16 | |
5864 | nop.f 0 | |
5865 | cmp.eq p6, p7 = 4, rSgnGamSize | |
5866 | } | |
5867 | ;; | |
5868 | { .mfi | |
5869 | ldfe fB12 = [rPolDataPtr], 16 | |
5870 | nop.f 0 | |
5871 | nop.i 0 | |
5872 | } | |
5873 | ;; | |
5874 | { .mfi | |
5875 | ldfe fB14 = [rPolDataPtr], 16 | |
5876 | nop.f 0 | |
5877 | nop.i 0 | |
5878 | } | |
5879 | ;; | |
5880 | ||
5881 | { .mfi | |
5882 | ldfe fB16 = [rPolDataPtr], 16 | |
5883 | // get a double extended coefficient from two doubles: fB2 = fB2 + fA1 | |
5884 | // two doubles are needed in Stirling's formula for negative x | |
5885 | fadd.s1 fB2 = fB2, fA1 | |
5886 | nop.i 0 | |
5887 | } | |
5888 | ;; | |
5889 | { .mfi | |
5890 | ldfe fB18 = [rPolDataPtr], 16 | |
5891 | fma.s1 fInvX = fInvX, fRcpX, fRcpX // end of 3rd NR iteration | |
5892 | nop.i 0 | |
5893 | } | |
5894 | ;; | |
5895 | { .mfi | |
5896 | ldfe fB20 = [rPolDataPtr], 16 | |
5897 | nop.f 0 | |
5898 | nop.i 0 | |
5899 | } | |
5900 | ;; | |
5901 | { .mfi | |
5902 | // store signgam if size of variable is 4 bytes | |
5903 | (p6) st4 [rSgnGamAddr] = rSgnGam | |
5904 | fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3 | |
5905 | nop.i 0 | |
5906 | } | |
5907 | { .mfi | |
5908 | // store signgam if size of variable is 8 bytes | |
5909 | (p7) st8 [rSgnGamAddr] = rSgnGam | |
5910 | fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3 | |
5911 | nop.i 0 | |
5912 | } | |
5913 | ;; | |
5914 | { .mfi | |
5915 | nop.m 0 | |
5916 | fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3 | |
5917 | nop.i 0 | |
5918 | } | |
5919 | ;; | |
5920 | { .mfi | |
5921 | nop.m 0 | |
5922 | fma.s1 fRcpX = fInvX, fInvX, f0 // 1/x^2 | |
5923 | nop.i 0 | |
5924 | } | |
5925 | { .mfi | |
5926 | nop.m 0 | |
5927 | fma.s1 fA0L = fB2, fInvX, fA0L | |
5928 | nop.i 0 | |
5929 | } | |
5930 | ;; | |
5931 | { .mfi | |
5932 | nop.m 0 | |
5933 | fms.s1 FR_r = fSignifX, FR_G, f1 // r = G * S_hi - 1 | |
5934 | nop.i 0 | |
5935 | } | |
5936 | { .mfi | |
5937 | nop.m 0 | |
5938 | // High part of the log(x): Y_hi = N * log2_hi + H (copy in fRes2H, used for the x*(ln(x)-1) term) | |
5939 | fma.s1 fRes2H = fFloatN, FR_log2_hi, FR_H | |
5940 | nop.i 0 | |
5941 | } | |
5942 | ;; | |
5943 | ||
5944 | { .mfi | |
5945 | nop.m 0 | |
5946 | // h = N * log2_lo + h | |
5947 | fma.s1 FR_h = fFloatN, FR_log2_lo, FR_h | |
5948 | nop.i 0 | |
5949 | } | |
5950 | { .mfi | |
5951 | nop.m 0 | |
5952 | // High part of the log(x): Y_hi = N * log2_hi + H (second copy in fRes1H, scaled by -0.5 below) | |
5953 | fma.s1 fRes1H = fFloatN, FR_log2_hi, FR_H | |
5954 | nop.i 0 | |
5955 | } | |
5956 | ;; | |
5957 | {.mfi | |
5958 | nop.m 0 | |
5959 | fma.s1 fPol = fB18, fRcpX, fB16 // v9 | |
5960 | nop.i 0 | |
5961 | } | |
5962 | { .mfi | |
5963 | nop.m 0 | |
5964 | fma.s1 fA2L = fRcpX, fRcpX, f0 // v10 | |
5965 | nop.i 0 | |
5966 | } | |
5967 | ;; | |
5968 | {.mfi | |
5969 | nop.m 0 | |
5970 | fma.s1 fA3 = fB6, fRcpX, fB4 // v3 | |
5971 | nop.i 0 | |
5972 | } | |
5973 | { .mfi | |
5974 | nop.m 0 | |
5975 | fma.s1 fA4 = fB10, fRcpX, fB8 // v4 | |
5976 | nop.i 0 | |
5977 | } | |
5978 | ;; | |
5979 | { .mfi | |
5980 | nop.m 0 | |
5981 | fms.s1 fRes2H =fRes2H, f1, f1 // log_Hi(x) -1 | |
5982 | nop.i 0 | |
5983 | } | |
5984 | { .mfi | |
5985 | nop.m 0 | |
5986 | // poly_lo = r * Q4 + Q3 | |
5987 | fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 | |
5988 | nop.i 0 | |
5989 | } | |
5990 | ;; | |
5991 | { .mfi | |
5992 | nop.m 0 | |
5993 | fma.s1 fRes1H = fRes1H, FR_MHalf, f0 // -0.5*log_Hi(x) | |
5994 | nop.i 0 | |
5995 | } | |
5996 | { .mfi | |
5997 | nop.m 0 | |
5998 | fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r | |
5999 | nop.i 0 | |
6000 | } | |
6001 | ;; | |
6002 | { .mfi | |
6003 | nop.m 0 | |
6004 | fma.s1 fA7 = fB14, fRcpX, fB12 // v7 | |
6005 | nop.i 0 | |
6006 | } | |
6007 | { .mfi | |
6008 | nop.m 0 | |
6009 | fma.s1 fA8 = fA2L, fB20, fPol // v8 | |
6010 | nop.i 0 | |
6011 | } | |
6012 | ;; | |
6013 | { .mfi | |
6014 | nop.m 0 | |
6015 | fma.s1 fA2 = fA4, fA2L, fA3 // v2 | |
6016 | nop.i 0 | |
6017 | } | |
6018 | { .mfi | |
6019 | nop.m 0 | |
6020 | fma.s1 fA4L = fA2L, fA2L, f0 // v5 | |
6021 | nop.i 0 | |
6022 | } | |
6023 | ;; | |
6024 | { .mfi | |
6025 | nop.m 0 | |
6026 | fma.s1 fResH = fRes2H, f8, f0 // (x*(ln(x)-1))hi | |
6027 | nop.i 0 | |
6028 | } | |
6029 | { .mfi | |
6030 | nop.m 0 | |
6031 | // poly_lo = poly_lo * r + Q2 | |
6032 | fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 | |
6033 | nop.i 0 | |
6034 | } | |
6035 | ;; | |
6036 | { .mfi | |
6037 | nop.m 0 | |
6038 | fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3 | |
6039 | nop.i 0 | |
6040 | } | |
6041 | { .mfi | |
6042 | nop.m 0 | |
6043 | // poly_hi = Q1 * rsq + r | |
6044 | fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r | |
6045 | nop.i 0 | |
6046 | } | |
6047 | ;; | |
6048 | { .mfi | |
6049 | nop.m 0 | |
6050 | fma.s1 fA11 = fRcpX, fInvX, f0 // 1/x^3 | |
6051 | nop.i 0 | |
6052 | } | |
6053 | { .mfi | |
6054 | nop.m 0 | |
6055 | fma.s1 fA6 = fA8, fA2L, fA7 // v6 | |
6056 | nop.i 0 | |
6057 | } | |
6058 | ;; | |
6059 | { .mfi | |
6060 | nop.m 0 | |
6061 | fms.s1 fResL = fRes2H, f8, fResH // d(x*(ln(x)-1)) | |
6062 | nop.i 0 | |
6063 | } | |
6064 | { .mfi | |
6065 | nop.m 0 | |
6066 | fadd.s1 fRes3H = fResH, fRes1H // (x*(ln(x)-1) -0.5ln(x))hi | |
6067 | nop.i 0 | |
6068 | } | |
6069 | ;; | |
6070 | { .mfi | |
6071 | nop.m 0 | |
6072 | // poly_lo = poly_lo*r^3 + h | |
6073 | fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h | |
6074 | nop.i 0 | |
6075 | } | |
6076 | ;; | |
6077 | { .mfi | |
6078 | nop.m 0 | |
6079 | fma.s1 fPol = fA4L, fA6, fA2 // v1 | |
6080 | nop.i 0 | |
6081 | } | |
6082 | { .mfi | |
6083 | nop.m 0 | |
6084 | // raise inexact exception | |
6085 | fma.s0 FR_log2_lo = FR_log2_lo, FR_log2_lo, f0 | |
6086 | nop.i 0 | |
6087 | } | |
6088 | ;; | |
6089 | { .mfi | |
6090 | nop.m 0 | |
6091 | fadd.s1 fRes4H = fRes3H, fA0 // (x*(ln(x)-1) -0.5ln(x))hi + Chi | |
6092 | nop.i 0 | |
6093 | } | |
6094 | { .mfi | |
6095 | nop.m 0 | |
6096 | fsub.s1 fRes3L = fResH, fRes3H | |
6097 | nop.i 0 | |
6098 | } | |
6099 | ;; | |
6100 | { .mfi | |
6101 | nop.m 0 | |
6102 | // Y_lo = poly_hi + poly_lo (low part of log(x), kept in fRes2L) | |
6103 | fadd.s1 fRes2L = FR_poly_hi, FR_poly_lo | |
6104 | nop.i 0 | |
6105 | } | |
6106 | ;; | |
6107 | ||
6108 | { .mfi | |
6109 | nop.m 0 | |
6110 | fma.s1 fA0L = fPol, fA11, fA0L // S(1/x) + Clo | |
6111 | nop.i 0 | |
6112 | } | |
6113 | ;; | |
6114 | { .mfi | |
6115 | nop.m 0 | |
6116 | fadd.s1 fRes3L = fRes3L, fRes1H | |
6117 | nop.i 0 | |
6118 | } | |
6119 | { .mfi | |
6120 | nop.m 0 | |
6121 | fsub.s1 fRes4L = fRes3H, fRes4H | |
6122 | nop.i 0 | |
6123 | } | |
6124 | ;; | |
6125 | { .mfi | |
6126 | nop.m 0 | |
6127 | fma.s1 fResL = fRes2L, f8 , fResL // lo part of x*(ln(x)-1) | |
6128 | nop.i 0 | |
6129 | } | |
6130 | ;; | |
6131 | { .mfi | |
6132 | nop.m 0 | |
6133 | // Clo + S(1/x) - 0.5*logLo(x) | |
6134 | fma.s1 fA0L = fRes2L, FR_MHalf, fA0L | |
6135 | nop.i 0 | |
6136 | } | |
6137 | ;; | |
6138 | { .mfi | |
6139 | nop.m 0 | |
6140 | fadd.s1 fRes4L = fRes4L, fA0 | |
6141 | nop.i 0 | |
6142 | } | |
6143 | ;; | |
6144 | { .mfi | |
6145 | nop.m 0 | |
6146 | // Clo + S(1/x) - 0.5*logLo(x) + (x*(ln(x)-1))lo | |
6147 | fadd.s1 fA0L = fA0L, fResL | |
6148 | nop.i 0 | |
6149 | } | |
6150 | ;; | |
6151 | { .mfi | |
6152 | nop.m 0 | |
6153 | fadd.s1 fRes4L = fRes4L, fRes3L | |
6154 | nop.i 0 | |
6155 | } | |
6156 | ;; | |
6157 | { .mfi | |
6158 | nop.m 0 | |
6159 | fadd.s1 fRes4L = fRes4L, fA0L | |
6160 | nop.i 0 | |
6161 | } | |
6162 | ;; | |
6163 | { .mfb | |
6164 | nop.m 0 | |
6165 | fma.s0 f8 = fRes4H, f1, fRes4L | |
6166 | // exit for x > 10.0 | |
6167 | br.ret.sptk b0 | |
6168 | } | |
6169 | ;; | |
6170 | // here if 8.0 <= x <= 10.0; rSgnGam (set before the branch here) is stored to signgam and the result is returned in f8 | |
6171 | // Result = P15(y), where y = x/8.0 - 1.125 (FR_MHalf is set from 0x3FF2000000000000, i.e. 1.125 in DP, just before the branch here) | |
6172 | .align 32 | |
6173 | lgammal_8_10: | |
6174 | { .mfi | |
6175 | addl rPolDataPtr = @ltoff(lgammal_8_10_data), gp | |
6176 | fms.s1 FR_FracX = fSignifX, f1, FR_MHalf // y = x/8.0 - 1.125 | |
6177 | cmp.eq p6, p7 = 4, rSgnGamSize | |
6178 | } | |
6179 | ;; | |
6180 | { .mfi | |
6181 | ld8 rLnSinDataPtr = [rPolDataPtr] | |
6182 | nop.f 0 | |
6183 | nop.i 0 | |
6184 | } | |
6185 | { .mfi | |
6186 | ld8 rPolDataPtr = [rPolDataPtr] | |
6187 | nop.f 0 | |
6188 | nop.i 0 | |
6189 | } | |
6190 | ;; | |
6191 | { .mfi | |
6192 | adds rZ1offsett = 32, rLnSinDataPtr | |
6193 | nop.f 0 | |
6194 | nop.i 0 | |
6195 | } | |
6196 | { .mfi | |
6197 | adds rLnSinDataPtr = 48, rLnSinDataPtr | |
6198 | nop.f 0 | |
6199 | nop.i 0 | |
6200 | } | |
6201 | ;; | |
6202 | { .mfi | |
6203 | ldfpd fA1, fA1L = [rPolDataPtr], 16 // A1 | |
6204 | nop.f 0 | |
6205 | nop.i 0 | |
6206 | } | |
6207 | { .mfi | |
6208 | ldfe fA2 = [rZ1offsett], 32 // A2 presumably (table offset 32, feeds v4 as the constant term) - TODO confirm | |
6209 | nop.f 0 | |
6210 | nop.i 0 | |
6211 | } | |
6212 | ;; | |
6213 | { .mfi | |
6214 | ldfpd fA0, fA0L = [rPolDataPtr], 16 // A0 | |
6215 | fma.s1 FR_rsq = FR_FracX, FR_FracX, f0 // y^2 | |
6216 | nop.i 0 | |
6217 | } | |
6218 | { .mfi | |
6219 | ldfe fA3 = [rLnSinDataPtr],32 // A3 presumably (table offset 48, feeds v4 as the y coefficient) - TODO confirm | |
6220 | nop.f 0 | |
6221 | nop.i 0 | |
6222 | } | |
6223 | ;; | |
6224 | { .mmf | |
6225 | ldfe fA4 = [rZ1offsett], 32 // A4 | |
6226 | ldfe fA5 = [rLnSinDataPtr], 32 // A5 | |
6227 | nop.f 0 | |
6228 | } | |
6229 | ;; | |
6230 | { .mmf | |
6231 | ldfe fA6 = [rZ1offsett], 32 // A6 | |
6232 | ldfe fA7 = [rLnSinDataPtr], 32 // A7 | |
6233 | nop.f 0 | |
6234 | } | |
6235 | ;; | |
6236 | { .mmf | |
6237 | ldfe fA8 = [rZ1offsett], 32 // A8 | |
6238 | ldfe fA9 = [rLnSinDataPtr], 32 // A9 | |
6239 | nop.f 0 | |
6240 | } | |
6241 | ;; | |
6242 | { .mmf | |
6243 | ldfe fA10 = [rZ1offsett], 32 // A10 | |
6244 | ldfe fA11 = [rLnSinDataPtr], 32 // A11 | |
6245 | nop.f 0 | |
6246 | } | |
6247 | ;; | |
6248 | { .mmf | |
6249 | ldfe fA12 = [rZ1offsett], 32 // A12 | |
6250 | ldfe fA13 = [rLnSinDataPtr], 32 // A13 | |
6251 | fma.s1 FR_Q4 = FR_rsq, FR_rsq, f0 // y^4 | |
6252 | } | |
6253 | ;; | |
6254 | { .mmf | |
6255 | ldfe fA14 = [rZ1offsett], 32 // A14 | |
6256 | ldfe fA15 = [rLnSinDataPtr], 32 // A15 | |
6257 | nop.f 0 | |
6258 | } | |
6259 | ;; | |
6260 | { .mfi | |
6261 | nop.m 0 | |
6262 | fma.s1 fRes1H = FR_FracX, fA1, f0 | |
6263 | nop.i 0 | |
6264 | } | |
6265 | ;; | |
6266 | { .mfi | |
6267 | nop.m 0 | |
6268 | fma.s1 fA3 = fA3, FR_FracX, fA2 // v4 | |
6269 | nop.i 0 | |
6270 | } | |
6271 | ;; | |
6272 | { .mfi | |
6273 | nop.m 0 | |
6274 | fma.s1 fA5 = fA5, FR_FracX, fA4 // v5 | |
6275 | nop.i 0 | |
6276 | } | |
6277 | ;; | |
6278 | { .mfi | |
6279 | // store sign of GAMMA(x) if size of variable is 4 bytes | |
6280 | (p6) st4 [rSgnGamAddr] = rSgnGam | |
6281 | fma.s1 fA3L = FR_Q4, FR_Q4, f0 // v9 = y^8 | |
6282 | nop.i 0 | |
6283 | } | |
6284 | { .mfi | |
6285 | // store sign of GAMMA(x) if size of variable is 8 bytes | |
6286 | (p7) st8 [rSgnGamAddr] = rSgnGam | |
6287 | fma.s1 fA7 = fA7, FR_FracX, fA6 // v7 | |
6288 | nop.i 0 | |
6289 | } | |
6290 | ;; | |
6291 | { .mfi | |
6292 | nop.m 0 | |
6293 | fma.s1 fA9 = fA9, FR_FracX, fA8 // v8 | |
6294 | nop.i 0 | |
6295 | } | |
6296 | ;; | |
6297 | { .mfi | |
6298 | nop.m 0 | |
6299 | fms.s1 fRes1L = FR_FracX, fA1, fRes1H | |
6300 | nop.i 0 | |
6301 | } | |
6302 | { .mfi | |
6303 | nop.m 0 | |
6304 | fma.s1 fA11 = fA11, FR_FracX, fA10 // v12 | |
6305 | nop.i 0 | |
6306 | } | |
6307 | ;; | |
6308 | { .mfi | |
6309 | nop.m 0 | |
6310 | fma.s1 fA13 = fA13, FR_FracX, fA12 // v13 | |
6311 | nop.i 0 | |
6312 | } | |
6313 | { .mfi | |
6314 | nop.m 0 | |
6315 | fma.s1 fRes2H = fRes1H, f1, fA0 | |
6316 | nop.i 0 | |
6317 | } | |
6318 | ;; | |
6319 | { .mfi | |
6320 | nop.m 0 | |
6321 | fma.s1 fA15 = fA15, FR_FracX, fA14 // v16 | |
6322 | nop.i 0 | |
6323 | } | |
6324 | { .mfi | |
6325 | nop.m 0 | |
6326 | fma.s1 fA5 = fA5, FR_rsq, fA3 // v3 | |
6327 | nop.i 0 | |
6328 | } | |
6329 | ;; | |
6330 | { .mfi | |
6331 | nop.m 0 | |
6332 | fma.s1 fA9 = fA9, FR_rsq, fA7 // v6 | |
6333 | nop.i 0 | |
6334 | } | |
6335 | ;; | |
6336 | { .mfi | |
6337 | nop.m 0 | |
6338 | fma.s1 fRes1L = FR_FracX, fA1L, fRes1L | |
6339 | nop.i 0 | |
6340 | } | |
6341 | ;; | |
6342 | { .mfi | |
6343 | nop.m 0 | |
6344 | fms.s1 fRes2L = fA0, f1, fRes2H | |
6345 | nop.i 0 | |
6346 | } | |
6347 | { .mfi | |
6348 | nop.m 0 | |
6349 | fma.s1 fA13 = fA13, FR_rsq, fA11 // v11 | |
6350 | nop.i 0 | |
6351 | } | |
6352 | ;; | |
6353 | { .mfi | |
6354 | nop.m 0 | |
6355 | fma.s1 fA9 = fA9, FR_Q4, fA5 // v2 | |
6356 | nop.i 0 | |
6357 | } | |
6358 | ;; | |
6359 | { .mfi | |
6360 | nop.m 0 | |
6361 | fma.s1 fRes1L = fRes1L, f1, fA0L | |
6362 | nop.i 0 | |
6363 | } | |
6364 | ;; | |
6365 | { .mfi | |
6366 | nop.m 0 | |
6367 | fma.s1 fRes2L = fRes2L, f1, fRes1H | |
6368 | nop.i 0 | |
6369 | } | |
6370 | { .mfi | |
6371 | nop.m 0 | |
6372 | fma.s1 fA15 = fA15, FR_Q4, fA13 // v10 | |
6373 | nop.i 0 | |
6374 | } | |
6375 | ;; | |
6376 | { .mfi | |
6377 | nop.m 0 | |
6378 | fma.s1 fRes2L = fRes1L, f1, fRes2L | |
6379 | nop.i 0 | |
6380 | } | |
6381 | { .mfi | |
6382 | nop.m 0 | |
6383 | fma.s1 fPol = fA3L, fA15, fA9 | |
6384 | nop.i 0 | |
6385 | } | |
6386 | ;; | |
6387 | { .mfi | |
6388 | nop.m 0 | |
6389 | fma.s1 f8 = FR_rsq , fPol, fRes2H | |
6390 | nop.i 0 | |
6391 | } | |
6392 | { .mfi | |
6393 | nop.m 0 | |
6394 | fma.s1 fPol = fPol, FR_rsq, f0 | |
6395 | nop.i 0 | |
6396 | } | |
6397 | ;; | |
6398 | { .mfi | |
6399 | nop.m 0 | |
6400 | fms.s1 fRes1L = fRes2H, f1, f8 | |
6401 | nop.i 0 | |
6402 | } | |
6403 | ;; | |
6404 | { .mfi | |
6405 | nop.m 0 | |
6406 | fma.s1 fRes1L = fRes1L, f1, fPol | |
6407 | nop.i 0 | |
6408 | } | |
6409 | ;; | |
6410 | {.mfi | |
6411 | nop.m 0 | |
6412 | fma.s1 fRes1L = fRes1L, f1, fRes2L | |
6413 | nop.i 0 | |
6414 | } | |
6415 | ;; | |
6416 | { .mfb | |
6417 | nop.m 0 | |
6418 | fma.s0 f8 = f8, f1, fRes1L | |
6419 | // exit for 8.0 <= x <= 10.0 | |
6420 | br.ret.sptk b0 | |
6421 | } | |
6422 | ;; | |
6423 | ||
6424 | // here if 4.0 <= x < 8.0: sets rSgnGam = 1, forms FR_FracX = fSignifX - FR_MHalf (FR_MHalf is set before the branch here, not visible in this block) and selects the lgammal_4_8_data table | |
6425 | .align 32 | |
6426 | lgammal_4_8: | |
6427 | { .mfi | |
6428 | addl rPolDataPtr= @ltoff(lgammal_4_8_data),gp | |
6429 | fms.s1 FR_FracX = fSignifX, f1, FR_MHalf | |
6430 | adds rSgnGam = 1, r0 | |
6431 | } | |
6432 | ;; | |
6433 | { .mfi | |
6434 | ld8 rPolDataPtr = [rPolDataPtr] | |
6435 | nop.f 0 | |
6436 | nop.i 0 | |
6437 | } | |
6438 | ;; | |
6439 | ||
6440 | { .mfb | |
6441 | adds rTmpPtr = 160, rPolDataPtr | |
6442 | nop.f 0 | |
6443 | // branch to the shared path which computes a polynomial of 25th degree; rTmpPtr = table address + 160 is its second coefficient pointer | |
6444 | br.sptk lgamma_polynom25 | |
6445 | } | |
6446 | ;; | |
6447 | ||
6448 | // here if 2.25 <= x < 4.0: sets rSgnGam = 1, forms FR_FracX = fSignifX - FR_MHalf (FR_MHalf is set before the branch here, not visible in this block) and selects the lgammal_2Q_4_data table | |
6449 | .align 32 | |
6450 | lgammal_2Q_4: | |
6451 | { .mfi | |
6452 | addl rPolDataPtr= @ltoff(lgammal_2Q_4_data),gp | |
6453 | fms.s1 FR_FracX = fSignifX, f1, FR_MHalf | |
6454 | adds rSgnGam = 1, r0 | |
6455 | } | |
6456 | ;; | |
6457 | { .mfi | |
6458 | ld8 rPolDataPtr = [rPolDataPtr] | |
6459 | nop.f 0 | |
6460 | nop.i 0 | |
6461 | } | |
6462 | ;; | |
6463 | ||
6464 | { .mfb | |
6465 | adds rTmpPtr = 160, rPolDataPtr | |
6466 | nop.f 0 | |
6467 | // branch to the shared path which computes a polynomial of 25th degree; rTmpPtr = table address + 160 is its second coefficient pointer | |
6468 | br.sptk lgamma_polynom25 | |
6469 | } | |
6470 | ;; | |
6471 | ||
6472 | // here if 0.5 <= |x| < 0.75: p14 selects the positive-x variant (rSgnGam = 1), p15 the negative-x variant (rSgnGam = -1); each variant has its own coefficient table | |
6473 | .align 32 | |
6474 | lgammal_half_3Q: | |
6475 | .pred.rel "mutex", p14, p15 | |
6476 | { .mfi | |
6477 | (p14) addl rPolDataPtr= @ltoff(lgammal_half_3Q_data),gp | |
6478 | // FR_FracX = x - 0.625 for positive x (assumes FR_FracX holds 0.625 on entry - set before the branch here, TODO confirm) | |
6479 | (p14) fms.s1 FR_FracX = f8, f1, FR_FracX | |
6480 | (p14) adds rSgnGam = 1, r0 | |
6481 | } | |
6482 | { .mfi | |
6483 | (p15) addl rPolDataPtr= @ltoff(lgammal_half_3Q_neg_data),gp | |
6484 | // FR_FracX = x + 0.625 for negative x | |
6485 | (p15) fma.s1 FR_FracX = f8, f1, FR_FracX | |
6486 | (p15) adds rSgnGam = -1, r0 | |
6487 | } | |
6488 | ;; | |
6489 | { .mfi | |
6490 | ld8 rPolDataPtr = [rPolDataPtr] | |
6491 | nop.f 0 | |
6492 | nop.i 0 | |
6493 | } | |
6494 | ;; | |
6495 | { .mfb | |
6496 | adds rTmpPtr = 160, rPolDataPtr | |
6497 | nop.f 0 | |
6498 | // branch to the shared path which computes a polynomial of 25th degree; rTmpPtr = table address + 160 is its second coefficient pointer | |
6499 | br.sptk lgamma_polynom25 | |
6500 | } | |
6501 | ;; | |
6502 | // here if 1.3125 <= x < 1.5625: rSgnGam = 1 and FR_FracX = x - fA5L (fA5L is presumably the expansion point - TODO confirm); rPolDataPtr must already hold the table address on entry, since only rTmpPtr = rPolDataPtr + 160 is derived here | |
6503 | .align 32 | |
6504 | lgammal_loc_min: | |
6505 | { .mfi | |
6506 | adds rSgnGam = 1, r0 | |
6507 | nop.f 0 | |
6508 | nop.i 0 | |
6509 | } | |
6510 | { .mfb | |
6511 | adds rTmpPtr = 160, rPolDataPtr | |
6512 | fms.s1 FR_FracX = f8, f1, fA5L | |
6513 | br.sptk lgamma_polynom25 | |
6514 | } | |
6515 | ;; | |
6516 | // here if -2.605859375 <= x < -2.5 (rSgnGam is set to -1) | |
6517 | // special polynomial approximation used since neither "near root" | |
6518 | // approximation nor reflection formula give satisfactory accuracy on | |
6519 | // this range | |
6520 | .align 32 | |
6521 | _neg2andHalf: | |
6522 | { .mfi | |
6523 | addl rPolDataPtr= @ltoff(lgammal_neg2andHalf_data),gp | |
6524 | fma.s1 FR_FracX = fB20, f1, f8 // 2.5 + x (fB20 is expected to hold 2.5 on entry) | |
6525 | adds rSgnGam = -1, r0 | |
6526 | } | |
6527 | ;; | |
6528 | {.mfi | |
6529 | ld8 rPolDataPtr = [rPolDataPtr] | |
6530 | nop.f 0 | |
6531 | nop.i 0 | |
6532 | } | |
6533 | ;; | |
6534 | { .mfb | |
6535 | adds rTmpPtr = 160, rPolDataPtr | |
6536 | nop.f 0 | |
6537 | // branch to the shared path which computes a polynomial of 25th degree; rTmpPtr = table address + 160 is its second coefficient pointer | |
6538 | br.sptk lgamma_polynom25 | |
6539 | } | |
6540 | ;; | |
6541 | ||
6542 | // here if -0.5 < x <= -0.40625 | |
// Sets rSgnGam = -1, builds a constant from the exponent held in rExpHalf,
// adds x to it to obtain the polynomial argument FR_FracX, loads the address
// of lgammal_near_neg_half_data through its @ltoff entry and continues in
// lgamma_polynom25 with rTmpPtr = rPolDataPtr + 160.
// NOTE(review): judging by its name rExpHalf presumably encodes 0.5, which
// would make FR_FracX = x + 0.5 - confirm where rExpHalf is defined.
6543 | .align 32 | |
6544 | lgammal_near_neg_half: | |
6545 | { .mmf | |
6546 | addl rPolDataPtr= @ltoff(lgammal_near_neg_half_data),gp | |
6547 | setf.exp FR_FracX = rExpHalf | |
6548 | nop.f 0 | |
6549 | } | |
6550 | ;; | |
6551 | { .mfi | |
6552 | ld8 rPolDataPtr = [rPolDataPtr] | |
6553 | nop.f 0 | |
6554 | adds rSgnGam = -1, r0 | |
6555 | } | |
6556 | ;; | |
6557 | { .mfb | |
6558 | adds rTmpPtr = 160, rPolDataPtr | |
// FR_FracX = (constant built from rExpHalf) + x
6559 | fma.s1 FR_FracX = FR_FracX, f1, f8 | |
6560 | // branch to special path which computes polynomial of 25th degree | |
6561 | br.sptk lgamma_polynom25 | |
6562 | } | |
6563 | ;; | |
6564 | ||
6565 | // here if the answer is P25(x) | |
6566 | // rPolDataPtr, rTmpPtr point to coefficients | |
6567 | // x is in FR_FracX register | |
// Shared tail of every path whose result is a 25th degree polynomial.
// Inputs : FR_FracX (x), rPolDataPtr / rTmpPtr (two coefficient streams,
//          each advanced by 16 bytes per load), rSgnGam, rSgnGamAddr,
//          rSgnGamSize.
// Scheme : P(x)   = (A3*x + A2)*x^2 + A1*x + A0, kept as hi/lo pair
//                   (fB20 = Phi, fB18 = Plo)
//          Pol(x) = (C(x)*x^8 + D(x))*x^8 + E(x)
//                   with C = C16..C21, D = D0..D7, E = E0..E7
//          result = Pol(x)*x^4 + P(x)
// Output : f8 = result; rSgnGam is stored to [rSgnGamAddr] as 4 or 8 bytes
//          depending on rSgnGamSize; returns through b0.
6568 | .align 32 | |
6569 | lgamma_polynom25: | |
6570 | { .mfi | |
6571 | ldfpd fA3, fA0L = [rPolDataPtr], 16 // A3, A0L | |
6572 | nop.f 0 | |
6573 | cmp.eq p6, p7 = 4, rSgnGamSize | |
6574 | } | |
6575 | { .mfi | |
6576 | ldfpd fA18, fA19 = [rTmpPtr], 16 // D6, D7 | |
6577 | nop.f 0 | |
6578 | nop.i 0 | |
6579 | } | |
6580 | ;; | |
6581 | { .mfi | |
6582 | ldfpd fA1, fA1L = [rPolDataPtr], 16 // A1, A1L | |
6583 | nop.f 0 | |
6584 | nop.i 0 | |
6585 | } | |
6586 | { .mfi | |
6587 | ldfpd fA16, fA17 = [rTmpPtr], 16 // D4, D5 | |
6588 | nop.f 0 | |
     | nop.i 0 // I slot made explicit, as in every other bundle | |
6589 | } | |
6590 | ;; | |
6591 | { .mfi | |
6592 | ldfpd fA12, fA13 = [rPolDataPtr], 16 // D0, D1 | |
6593 | nop.f 0 | |
6594 | nop.i 0 | |
6595 | } | |
6596 | { .mfi | |
6597 | ldfpd fA14, fA15 = [rTmpPtr], 16 // D2, D3 | |
6598 | nop.f 0 | |
6599 | nop.i 0 | |
6600 | } | |
6601 | ;; | |
6602 | { .mfi | |
6603 | ldfpd fA24, fA25 = [rPolDataPtr], 16 // C20, C21 | |
6604 | nop.f 0 | |
6605 | nop.i 0 | |
6606 | } | |
6607 | { .mfi | |
6608 | ldfpd fA22, fA23 = [rTmpPtr], 16 // C18, C19 | |
6609 | nop.f 0 | |
6610 | nop.i 0 | |
6611 | } | |
6612 | ;; | |
6613 | { .mfi | |
6614 | ldfpd fA2, fA2L = [rPolDataPtr], 16 // A2, A2L | |
6615 | fma.s1 fA4L = FR_FracX, FR_FracX, f0 // x^2 | |
6616 | nop.i 0 | |
6617 | } | |
6618 | { .mfi | |
6619 | ldfpd fA20, fA21 = [rTmpPtr], 16 // C16, C17 | |
6620 | nop.f 0 | |
6621 | nop.i 0 | |
6622 | } | |
6623 | ;; | |
6624 | { .mfi | |
6625 | ldfe fA11 = [rTmpPtr], 16 // E7 | |
6626 | nop.f 0 | |
6627 | nop.i 0 | |
6628 | } | |
6629 | { .mfi | |
6630 | ldfpd fA0, fA3L = [rPolDataPtr], 16 // A0, A3L | |
6631 | nop.f 0 | |
6632 | nop.i 0 | |
6633 | };; | |
6634 | { .mfi | |
6635 | ldfe fA10 = [rPolDataPtr], 16 // E6 | |
6636 | nop.f 0 | |
6637 | nop.i 0 | |
6638 | } | |
6639 | { .mfi | |
6640 | ldfe fA9 = [rTmpPtr], 16 // E5 | |
6641 | nop.f 0 | |
6642 | nop.i 0 | |
6643 | } | |
6644 | ;; | |
6645 | { .mmf | |
6646 | ldfe fA8 = [rPolDataPtr], 16 // E4 | |
6647 | ldfe fA7 = [rTmpPtr], 16 // E3 | |
6648 | nop.f 0 | |
6649 | } | |
6650 | ;; | |
6651 | { .mmf | |
6652 | ldfe fA6 = [rPolDataPtr], 16 // E2 | |
6653 | ldfe fA5 = [rTmpPtr], 16 // E1 | |
6654 | nop.f 0 | |
6655 | } | |
6656 | ;; | |
6657 | { .mfi | |
6658 | ldfe fA4 = [rPolDataPtr], 16 // E0 | |
6659 | fma.s1 fA5L = fA4L, fA4L, f0 // x^4 | |
6660 | nop.i 0 | |
6661 | } | |
6662 | { .mfi | |
6663 | nop.m 0 | |
6664 | fms.s1 fB2 = FR_FracX, FR_FracX, fA4L // x^2 - <x^2> | |
6665 | nop.i 0 | |
6666 | } | |
6667 | ;; | |
6668 | { .mfi | |
6669 | // store signgam if size of variable is 4 bytes | |
6670 | (p6) st4 [rSgnGamAddr] = rSgnGam | |
6671 | fma.s1 fRes4H = fA3, FR_FracX, f0 // (A3*x)hi | |
6672 | nop.i 0 | |
6673 | } | |
6674 | { .mfi | |
6675 | // store signgam if size of variable is 8 bytes | |
6676 | (p7) st8 [rSgnGamAddr] = rSgnGam | |
6677 | fma.s1 fA19 = fA19, FR_FracX, fA18 // D7*x + D6 | |
6678 | nop.i 0 | |
6679 | } | |
6680 | ;; | |
6681 | { .mfi | |
6682 | nop.m 0 | |
6683 | fma.s1 fResH = fA1, FR_FracX, f0 // (A1*x)hi | |
6684 | nop.i 0 | |
6685 | } | |
6686 | { .mfi | |
6687 | nop.m 0 | |
6688 | fma.s1 fB6 = fA1L, FR_FracX, fA0L // A1L*x + A0L | |
6689 | nop.i 0 | |
6690 | } | |
6691 | ;; | |
6692 | { .mfi | |
6693 | nop.m 0 | |
6694 | fma.s1 fA17 = fA17, FR_FracX, fA16 // D5*x + D4 | |
6695 | nop.i 0 | |
6696 | } | |
6697 | { .mfi | |
6698 | nop.m 0 | |
6699 | fma.s1 fA15 = fA15, FR_FracX, fA14 // D3*x + D2 | |
6700 | nop.i 0 | |
6701 | } | |
6702 | ;; | |
6703 | { .mfi | |
6704 | nop.m 0 | |
6705 | fma.s1 fA25 = fA25, FR_FracX, fA24 // C21*x + C20 | |
6706 | nop.i 0 | |
6707 | } | |
6708 | { .mfi | |
6709 | nop.m 0 | |
6710 | fma.s1 fA13 = fA13, FR_FracX, fA12 // D1*x + D0 | |
6711 | nop.i 0 | |
6712 | } | |
6713 | ;; | |
6714 | { .mfi | |
6715 | nop.m 0 | |
6716 | fma.s1 fA23 = fA23, FR_FracX, fA22 // C19*x + C18 | |
6717 | nop.i 0 | |
6718 | } | |
6719 | { .mfi | |
6720 | nop.m 0 | |
6721 | fma.s1 fA21 = fA21, FR_FracX, fA20 // C17*x + C16 | |
6722 | nop.i 0 | |
6723 | } | |
6724 | ;; | |
6725 | { .mfi | |
6726 | nop.m 0 | |
6727 | fms.s1 fRes4L = fA3, FR_FracX, fRes4H // delta((A3*x)hi) | |
6728 | nop.i 0 | |
6729 | } | |
6730 | { .mfi | |
6731 | nop.m 0 | |
6732 | fadd.s1 fRes2H = fRes4H, fA2 // (A3*x + A2)hi | |
6733 | nop.i 0 | |
6734 | } | |
6735 | ;; | |
6736 | { .mfi | |
6737 | nop.m 0 | |
6738 | fms.s1 fResL = fA1, FR_FracX, fResH // d(A1*x) | |
6739 | nop.i 0 | |
6740 | } | |
6741 | { .mfi | |
6742 | nop.m 0 | |
6743 | fadd.s1 fRes1H = fResH, fA0 // (A1*x + A0)hi | |
6744 | nop.i 0 | |
6745 | } | |
6746 | ;; | |
6747 | { .mfi | |
6748 | nop.m 0 | |
6749 | fma.s1 fA19 = fA19, fA4L, fA17 // Dhi | |
6750 | nop.i 0 | |
6751 | } | |
6752 | { .mfi | |
6753 | nop.m 0 | |
6754 | fma.s1 fA11 = fA11, FR_FracX, fA10 // E7*x + E6 | |
6755 | nop.i 0 | |
6756 | } | |
6757 | ;; | |
6758 | { .mfi | |
6759 | nop.m 0 | |
6760 | // Doing this to raise inexact flag | |
// (fA10 is free here: E6 was consumed by the previous instruction group)
6761 | fma.s0 fA10 = fA0, fA0, f0 | |
6762 | nop.i 0 | |
6763 | } | |
6764 | ;; | |
6765 | { .mfi | |
6766 | nop.m 0 | |
6767 | fma.s1 fA15 = fA15, fA4L, fA13 // Dlo | |
6768 | nop.i 0 | |
6769 | } | |
6770 | { .mfi | |
6771 | nop.m 0 | |
6772 | // (C21*x + C20)*x^2 + C19*x + C18 | |
6773 | fma.s1 fA25 = fA25, fA4L, fA23 | |
6774 | nop.i 0 | |
6775 | } | |
6776 | ;; | |
6777 | { .mfi | |
6778 | nop.m 0 | |
6779 | fma.s1 fA9 = fA9, FR_FracX, fA8 // E5*x + E4 | |
6780 | nop.i 0 | |
6781 | } | |
6782 | { .mfi | |
6783 | nop.m 0 | |
6784 | fma.s1 fA7 = fA7, FR_FracX, fA6 // E3*x + E2 | |
6785 | nop.i 0 | |
6786 | } | |
6787 | ;; | |
6788 | { .mfi | |
6789 | nop.m 0 | |
6790 | fma.s1 fRes4L = fA3L, FR_FracX, fRes4L // (A3*x)lo | |
6791 | nop.i 0 | |
6792 | } | |
6793 | { .mfi | |
6794 | nop.m 0 | |
// A2 - (A3*x + A2)hi; (A3*x)hi is added back two groups below
6795 | fsub.s1 fRes2L = fA2, fRes2H | |
6796 | nop.i 0 | |
6797 | } | |
6798 | ;; | |
6799 | { .mfi | |
6800 | nop.m 0 | |
6801 | fadd.s1 fResL = fResL, fB6 // (A1L*x + A0L) + d(A1*x) | |
6802 | nop.i 0 | |
6803 | } | |
6804 | { .mfi | |
6805 | nop.m 0 | |
// A0 - (A1*x + A0)hi; (A1*x)hi is added back below
6806 | fsub.s1 fRes1L = fA0, fRes1H | |
6807 | nop.i 0 | |
6808 | } | |
6809 | ;; | |
6810 | { .mfi | |
6811 | nop.m 0 | |
6812 | fma.s1 fA5 = fA5, FR_FracX, fA4 // E1*x + E0 | |
6813 | nop.i 0 | |
6814 | } | |
6815 | { .mfi | |
6816 | nop.m 0 | |
6817 | fma.s1 fB8 = fA5L, fA5L, f0 // x^8 | |
6818 | nop.i 0 | |
6819 | } | |
6820 | ;; | |
6821 | { .mfi | |
6822 | nop.m 0 | |
6823 | // ((C21*x + C20)*x^2 + C19*x + C18)*x^2 + C17*x + C16 | |
6824 | fma.s1 fA25 = fA25, fA4L, fA21 | |
6825 | nop.i 0 | |
6826 | } | |
6827 | { .mfi | |
6828 | nop.m 0 | |
6829 | fma.s1 fA19 = fA19, fA5L, fA15 // D | |
6830 | nop.i 0 | |
6831 | } | |
6832 | ;; | |
6833 | { .mfi | |
6834 | nop.m 0 | |
6835 | fma.s1 fA11 = fA11, fA4L, fA9 // Ehi | |
6836 | nop.i 0 | |
6837 | } | |
6838 | ;; | |
6839 | { .mfi | |
6840 | nop.m 0 | |
// rounding error of the addition (A3*x)hi + A2
6841 | fadd.s1 fRes2L = fRes2L, fRes4H | |
6842 | nop.i 0 | |
6843 | } | |
6844 | { .mfi | |
6845 | nop.m 0 | |
6846 | fadd.s1 fRes4L = fRes4L, fA2L // (A3*x)lo + A2L | |
6847 | nop.i 0 | |
6848 | } | |
6849 | ;; | |
6850 | { .mfi | |
6851 | nop.m 0 | |
6852 | fma.s1 fRes3H = fRes2H, fA4L, f0 // ((A3*x + A2)*x^2)hi | |
6853 | nop.i 0 | |
6854 | } | |
6855 | { .mfi | |
6856 | nop.m 0 | |
// rounding error of the addition (A1*x)hi + A0
6857 | fadd.s1 fRes1L = fRes1L, fResH | |
6858 | nop.i 0 | |
6859 | } | |
6860 | ;; | |
6861 | { .mfi | |
6862 | nop.m 0 | |
6863 | fma.s1 fRes3L = fRes2H, fB2, f0 // (A3*x + A2)hi*d(x^2) | |
6864 | nop.i 0 | |
6865 | } | |
6866 | { .mfi | |
6867 | nop.m 0 | |
6868 | fma.s1 fA7 = fA7, fA4L, fA5 // Elo | |
6869 | nop.i 0 | |
6870 | } | |
6871 | ;; | |
6872 | { .mfi | |
6873 | nop.m 0 | |
6874 | fma.s1 fA25 = fA25, fB8, fA19 // C*x^8 + D | |
6875 | nop.i 0 | |
6876 | } | |
6877 | ;; | |
6878 | { .mfi | |
6879 | nop.m 0 | |
6880 | fadd.s1 fRes2L = fRes2L, fRes4L // (A3*x + A2)lo | |
6881 | nop.i 0 | |
6882 | } | |
6883 | ;; | |
6884 | { .mfi | |
6885 | nop.m 0 | |
6886 | fms.s1 fB4 = fRes2H, fA4L, fRes3H // d((A3*x + A2)*x^2)) | |
6887 | nop.i 0 | |
6888 | } | |
6889 | { .mfi | |
6890 | nop.m 0 | |
6891 | fadd.s1 fRes1L = fRes1L, fResL // (A1*x + A0)lo | |
6892 | nop.i 0 | |
6893 | } | |
6894 | ;; | |
6895 | { .mfi | |
6896 | nop.m 0 | |
6897 | fadd.s1 fB20 = fRes3H, fRes1H // Phi | |
6898 | nop.i 0 | |
6899 | } | |
6900 | { .mfi | |
6901 | nop.m 0 | |
6902 | fma.s1 fA11 = fA11, fA5L, fA7 // E | |
6903 | nop.i 0 | |
6904 | } | |
6905 | ;; | |
6906 | { .mfi | |
6907 | nop.m 0 | |
6908 | // ( (A3*x + A2)lo*<x^2> + (A3*x + A2)hi*d(x^2)) | |
6909 | fma.s1 fRes3L = fRes2L, fA4L, fRes3L | |
6910 | nop.i 0 | |
6911 | } | |
6912 | ;; | |
6913 | { .mfi | |
6914 | nop.m 0 | |
6915 | // d((A3*x + A2)*x^2)) + (A1*x + A0)lo | |
6916 | fadd.s1 fRes1L = fRes1L, fB4 | |
6917 | nop.i 0 | |
6918 | } | |
6919 | ;; | |
6920 | { .mfi | |
6921 | nop.m 0 | |
// (A1*x + A0)hi - Phi; ((A3*x + A2)*x^2)hi is added back below
6922 | fsub.s1 fB18 = fRes1H, fB20 | |
6923 | nop.i 0 | |
6924 | } | |
6925 | { .mfi | |
6926 | nop.m 0 | |
// Pol = (C*x^8 + D)*x^8 + E
6927 | fma.s1 fPol = fA25, fB8, fA11 | |
6928 | nop.i 0 | |
6929 | } | |
6930 | ;; | |
6931 | { .mfi | |
6932 | nop.m 0 | |
6933 | fadd.s1 fRes1L = fRes1L, fRes3L | |
6934 | nop.i 0 | |
6935 | } | |
6936 | ;; | |
6937 | { .mfi | |
6938 | nop.m 0 | |
6939 | fadd.s1 fB18 = fB18, fRes3H | |
6940 | nop.i 0 | |
6941 | } | |
6942 | { .mfi | |
6943 | nop.m 0 | |
// high part of the result: Pol*x^4 + Phi
6944 | fma.s1 fRes4H = fPol, fA5L, fB20 | |
6945 | nop.i 0 | |
6946 | } | |
6947 | ;; | |
6948 | { .mfi | |
6949 | nop.m 0 | |
// Pol*x^4 on its own, needed for the low part of the result
6950 | fma.s1 fPolL = fPol, fA5L, f0 | |
6951 | nop.i 0 | |
6952 | } | |
6953 | ;; | |
6954 | { .mfi | |
6955 | nop.m 0 | |
6956 | fadd.s1 fB18 = fB18, fRes1L // Plo | |
6957 | nop.i 0 | |
6958 | } | |
6959 | { .mfi | |
6960 | nop.m 0 | |
6961 | fsub.s1 fRes4L = fB20, fRes4H | |
6962 | nop.i 0 | |
6963 | } | |
6964 | ;; | |
6965 | { .mfi | |
6966 | nop.m 0 | |
6967 | fadd.s1 fB18 = fB18, fPolL | |
6968 | nop.i 0 | |
6969 | } | |
6970 | ;; | |
6971 | { .mfi | |
6972 | nop.m 0 | |
6973 | fadd.s1 fRes4L = fRes4L, fB18 | |
6974 | nop.i 0 | |
6975 | } | |
6976 | ;; | |
6977 | { .mfb | |
6978 | nop.m 0 | |
// final rounding of hi + lo is done in status field 0
6979 | fma.s0 f8 = fRes4H, f1, fRes4L | |
6980 | // P25(x) computed, exit here | |
6981 | br.ret.sptk b0 | |
6982 | } | |
6983 | ;; | |
6984 | ||
6985 | ||
6986 | // here if 0.75 <= x < 1.3125 | |
// Sets rSgnGam = 1, copies the polynomial argument from fA5L into FR_FracX,
// leaves its square in fB4 (lgamma_polynom24x reads fB4 as x^2), loads the
// address of lgammal_03Q_1Q_data through its @ltoff entry and continues in
// lgamma_polynom24x with rTmpPtr = rPolDataPtr + 144.
// NOTE(review): fA5L is not written in this block; presumably it already
// holds the reduced argument for this interval - confirm at the branch site.
6987 | .align 32 | |
6988 | lgammal_03Q_1Q: | |
6989 | { .mfi | |
6990 | addl rPolDataPtr= @ltoff(lgammal_03Q_1Q_data),gp | |
6991 | fma.s1 FR_FracX = fA5L, f1, f0 // x | |
6992 | adds rSgnGam = 1, r0 | |
6993 | } | |
6994 | { .mfi | |
6995 | nop.m 0 | |
6996 | fma.s1 fB4 = fA5L, fA5L, f0 // x^2 | |
6997 | nop.i 0 | |
6998 | } | |
6999 | ;; | |
7000 | { .mfi | |
7001 | ld8 rPolDataPtr = [rPolDataPtr] | |
7002 | nop.f 0 | |
7003 | nop.i 0 | |
7004 | } | |
7005 | ;; | |
7006 | { .mfb | |
7007 | adds rTmpPtr = 144, rPolDataPtr | |
7008 | nop.f 0 | |
7009 | br.sptk lgamma_polynom24x | |
7010 | } | |
7011 | ;; | |
7012 | ||
7013 | // here if 1.5625 <= x < 2.25 | |
// Sets rSgnGam = 1, copies the polynomial argument from fB4 into FR_FracX,
// then overwrites fB4 with its square (lgamma_polynom24x reads fB4 as x^2).
// Both fma instructions sit in the same instruction group, so both read the
// incoming fB4. Loads the address of lgammal_13Q_2Q_data through its @ltoff
// entry and continues in lgamma_polynom24x with rTmpPtr = rPolDataPtr + 144.
// NOTE(review): the incoming fB4 is presumably the reduced argument prepared
// by the code that branches here - confirm.
7014 | .align 32 | |
7015 | lgammal_13Q_2Q: | |
7016 | { .mfi | |
7017 | addl rPolDataPtr= @ltoff(lgammal_13Q_2Q_data),gp | |
7018 | fma.s1 FR_FracX = fB4, f1, f0 // x | |
7019 | adds rSgnGam = 1, r0 | |
7020 | } | |
7021 | { .mfi | |
7022 | nop.m 0 | |
7023 | fma.s1 fB4 = fB4, fB4, f0 // x^2 | |
7024 | nop.i 0 | |
7025 | } | |
7026 | ;; | |
7027 | { .mfi | |
7028 | ld8 rPolDataPtr = [rPolDataPtr] | |
7029 | nop.f 0 | |
7030 | nop.i 0 | |
7031 | } | |
7032 | ;; | |
7033 | { .mfb | |
7034 | adds rTmpPtr = 144, rPolDataPtr | |
7035 | nop.f 0 | |
7036 | br.sptk lgamma_polynom24x | |
7037 | } | |
7038 | ;; | |
7039 | ||
7040 | // here if result is Pol24(x) | |
7041 | // x is in FR_FracX, | |
7042 | // rPolDataPtr, rTmpPtr point to coefficients | |
// fB4 must already hold <x^2> (x*x rounded): it is read below before being
// written in this block.
// Scheme : Phead(x) = (A4*<x^2> + A2)*x^2 + (A3*<x^2> + A1)*x, kept as a
//                     hi/lo pair (hi part in fB10)
//          Pol(x)   = (C(x)*x^8 + D(x))*x^8 + E(x)
//                     with C = C16..C20, D = D0..D7, E = E0..E7
//          result   = Pol(x)*x^5 + Phead(x)
// Output : f8 = result; rSgnGam is stored to [rSgnGamAddr] as 4 or 8 bytes
//          depending on rSgnGamSize; returns through b0.
7043 | .align 32 | |
7044 | lgamma_polynom24x: | |
7045 | { .mfi | |
// A4, A2L
7046 | ldfpd fA4, fA2L = [rPolDataPtr], 16 | |
7047 | nop.f 0 | |
7048 | cmp.eq p6, p7 = 4, rSgnGamSize | |
7049 | } | |
7050 | { .mfi | |
7051 | ldfpd fA23, fA24 = [rTmpPtr], 16 // C18, C19 | |
7052 | nop.f 0 | |
7053 | nop.i 0 | |
7054 | } | |
7055 | ;; | |
7056 | { .mfi | |
// A3, A1L
7057 | ldfpd fA3, fA1L = [rPolDataPtr], 16 | |
7058 | fma.s1 fA5L = fB4, fB4, f0 // x^4 | |
7059 | nop.i 0 | |
7060 | } | |
7061 | { .mfi | |
7062 | ldfpd fA19, fA20 = [rTmpPtr], 16 // D6, D7 | |
7063 | fms.s1 fB2 = FR_FracX, FR_FracX, fB4 // x^2 - <x^2> | |
7064 | nop.i 0 | |
7065 | } | |
7066 | ;; | |
7067 | { .mmf | |
7068 | ldfpd fA15, fA16 = [rPolDataPtr], 16 // D2, D3 | |
7069 | ldfpd fA17, fA18 = [rTmpPtr], 16 // D4, D5 | |
7070 | nop.f 0 | |
7071 | } | |
7072 | ;; | |
7073 | { .mmf | |
7074 | ldfpd fA13, fA14 = [rPolDataPtr], 16 // D0, D1 | |
7075 | ldfpd fA12, fA21 = [rTmpPtr], 16 // E7, C16 | |
7076 | nop.f 0 | |
7077 | } | |
7078 | ;; | |
7079 | { .mfi | |
7080 | ldfe fA11 = [rPolDataPtr], 16 // E6 | |
7081 | nop.f 0 | |
7082 | nop.i 0 | |
7083 | } | |
7084 | { .mfi | |
7085 | ldfe fA10 = [rTmpPtr], 16 // E5 | |
7086 | nop.f 0 | |
7087 | nop.i 0 | |
7088 | } | |
7089 | ;; | |
7090 | { .mfi | |
// A2, A4L
7091 | ldfpd fA2, fA4L = [rPolDataPtr], 16 | |
7092 | nop.f 0 | |
7093 | nop.i 0 | |
7094 | } | |
7095 | { .mfi | |
// A1, A3L
7096 | ldfpd fA1, fA3L = [rTmpPtr], 16 | |
7097 | nop.f 0 | |
7098 | nop.i 0 | |
7099 | } | |
7100 | ;; | |
7101 | { .mfi | |
7102 | ldfpd fA22, fA25 = [rPolDataPtr], 16 // C17, C20 | |
7103 | fma.s1 fA0 = fA5L, fA5L, f0 // x^8 | |
7104 | nop.i 0 | |
7105 | } | |
7106 | { .mfi | |
7107 | nop.m 0 | |
7108 | fma.s1 fA0L = fA5L, FR_FracX, f0 // x^5 | |
7109 | nop.i 0 | |
7110 | } | |
7111 | ;; | |
7112 | { .mmf | |
7113 | ldfe fA9 = [rPolDataPtr], 16 // E4 | |
7114 | ldfe fA8 = [rTmpPtr], 16 // E3 | |
7115 | nop.f 0 | |
7116 | } | |
7117 | ;; | |
7118 | { .mmf | |
7119 | ldfe fA7 = [rPolDataPtr], 16 // E2 | |
7120 | ldfe fA6 = [rTmpPtr], 16 // E1 | |
7121 | nop.f 0 | |
7122 | } | |
7123 | ;; | |
7124 | { .mfi | |
7125 | ldfe fA5 = [rTmpPtr], 16 // E0 (second consecutive load from the rTmpPtr stream) | |
7126 | fma.s1 fRes4H = fA4, fB4, f0 // A4*<x^2> | |
7127 | nop.i 0 | |
7128 | } | |
7129 | { .mfi | |
7130 | nop.m 0 | |
7131 | fma.s1 fPol = fA24, FR_FracX, fA23 // C19*x + C18 | |
7132 | nop.i 0 | |
7133 | } | |
7134 | ;; | |
7135 | { .mfi | |
7136 | // store signgam if size of variable is 4 bytes | |
7137 | (p6) st4 [rSgnGamAddr] = rSgnGam | |
7138 | fma.s1 fRes1H = fA3, fB4, f0 // A3*<x^2> | |
7139 | nop.i 0 | |
7140 | } | |
7141 | { .mfi | |
7142 | // store signgam if size of variable is 8 bytes | |
7143 | (p7) st8 [rSgnGamAddr] = rSgnGam | |
7144 | fma.s1 fA1L = fA3, fB2,fA1L // A3*d(x^2) + A1L | |
7145 | nop.i 0 | |
7146 | } | |
7147 | ;; | |
7148 | { .mfi | |
7149 | nop.m 0 | |
7150 | fma.s1 fA20 = fA20, FR_FracX, fA19 // D7*x + D6 | |
7151 | nop.i 0 | |
7152 | } | |
7153 | { .mfi | |
7154 | nop.m 0 | |
7155 | fma.s1 fA18 = fA18, FR_FracX, fA17 // D5*x + D4 | |
7156 | nop.i 0 | |
7157 | } | |
7158 | ;; | |
7159 | { .mfi | |
7160 | nop.m 0 | |
7161 | fma.s1 fA16 = fA16, FR_FracX, fA15 // D3*x + D2 | |
7162 | nop.i 0 | |
7163 | } | |
7164 | { .mfi | |
7165 | nop.m 0 | |
7166 | fma.s1 fA14 = fA14, FR_FracX, fA13 // D1*x + D0 | |
7167 | nop.i 0 | |
7168 | } | |
7169 | ;; | |
7170 | { .mfi | |
7171 | nop.m 0 | |
7172 | fma.s1 fA2L = fA4, fB2,fA2L // A4*d(x^2) + A2L | |
7173 | nop.i 0 | |
7174 | } | |
7175 | { .mfi | |
7176 | nop.m 0 | |
7177 | fma.s1 fA12 = fA12, FR_FracX, fA11 // E7*x + E6 | |
7178 | nop.i 0 | |
7179 | } | |
7180 | ;; | |
7181 | { .mfi | |
7182 | nop.m 0 | |
7183 | fms.s1 fRes2L = fA4, fB4, fRes4H // delta(A4*<x^2>) | |
7184 | nop.i 0 | |
7185 | } | |
7186 | { .mfi | |
7187 | nop.m 0 | |
7188 | fadd.s1 fRes2H = fRes4H, fA2 // A4*<x^2> + A2 | |
7189 | nop.i 0 | |
7190 | } | |
7191 | ;; | |
7192 | { .mfi | |
7193 | nop.m 0 | |
7194 | fms.s1 fRes3L = fA3, fB4, fRes1H // delta(A3*<x^2>) | |
7195 | nop.i 0 | |
7196 | } | |
7197 | { .mfi | |
7198 | nop.m 0 | |
7199 | fadd.s1 fRes3H = fRes1H, fA1 // A3*<x^2> + A1 | |
7200 | nop.i 0 | |
7201 | } | |
7202 | ;; | |
7203 | { .mfi | |
7204 | nop.m 0 | |
7205 | fma.s1 fA20 = fA20, fB4, fA18 // (D7*x + D6)*x^2 + D5*x + D4 | |
7206 | nop.i 0 | |
7207 | } | |
7208 | { .mfi | |
7209 | nop.m 0 | |
7210 | fma.s1 fA22 = fA22, FR_FracX, fA21 // C17*x + C16 | |
7211 | nop.i 0 | |
7212 | } | |
7213 | ;; | |
7214 | { .mfi | |
7215 | nop.m 0 | |
7216 | fma.s1 fA16 = fA16, fB4, fA14 // (D3*x + D2)*x^2 + D1*x + D0 | |
7217 | nop.i 0 | |
7218 | } | |
7219 | { .mfi | |
7220 | nop.m 0 | |
7221 | fma.s1 fPol = fA25, fB4, fPol // C20*x^2 + C19*x + C18 | |
7222 | nop.i 0 | |
7223 | } | |
7224 | ;; | |
7225 | { .mfi | |
7226 | nop.m 0 | |
7227 | fma.s1 fA2L = fA4L, fB4, fA2L // A4L*<x^2> + A4*d(x^2) + A2L | |
7228 | nop.i 0 | |
7229 | } | |
7230 | { .mfi | |
7231 | nop.m 0 | |
7232 | fma.s1 fA1L = fA3L, fB4, fA1L // A3L*<x^2> + A3*d(x^2) + A1L | |
7233 | nop.i 0 | |
7234 | } | |
7235 | ;; | |
7236 | { .mfi | |
7237 | nop.m 0 | |
7238 | fsub.s1 fRes4L = fA2, fRes2H // d1 = A2 - (A4*<x^2> + A2) | |
7239 | nop.i 0 | |
7240 | } | |
7241 | { .mfi | |
7242 | nop.m 0 | |
7243 | fma.s1 fResH = fRes2H, fB4, f0 // (A4*<x^2> + A2)*x^2 | |
7244 | nop.i 0 | |
7245 | } | |
7246 | ;; | |
7247 | { .mfi | |
7248 | nop.m 0 | |
7249 | fsub.s1 fRes1L = fA1, fRes3H // d1 = A1 - (A3*<x^2> + A1) | |
7250 | nop.i 0 | |
7251 | } | |
7252 | { .mfi | |
7253 | nop.m 0 | |
7254 | fma.s1 fB6 = fRes3H, FR_FracX, f0 // (A3*<x^2> + A1)*x | |
7255 | nop.i 0 | |
7256 | } | |
7257 | ;; | |
7258 | { .mfi | |
7259 | nop.m 0 | |
7260 | fma.s1 fA10 = fA10, FR_FracX, fA9 // E5*x + E4 | |
7261 | nop.i 0 | |
7262 | } | |
7263 | { .mfi | |
7264 | nop.m 0 | |
7265 | fma.s1 fA8 = fA8, FR_FracX, fA7 // E3*x + E2 | |
7266 | nop.i 0 | |
7267 | } | |
7268 | ;; | |
7269 | { .mfi | |
7270 | nop.m 0 | |
7271 | // (C20*x^2 + C19*x + C18)*x^2 + C17*x + C16 | |
7272 | fma.s1 fPol = fPol, fB4, fA22 | |
7273 | nop.i 0 | |
7274 | } | |
7275 | { .mfi | |
7276 | nop.m 0 | |
7277 | fma.s1 fA6 = fA6, FR_FracX, fA5 // E1*x + E0 | |
7278 | nop.i 0 | |
7279 | } | |
7280 | ;; | |
7281 | { .mfi | |
7282 | nop.m 0 | |
7283 | // A4L*<x^2> + A4*d(x^2) + A2L + delta(A4*<x^2>) | |
7284 | fadd.s1 fRes2L = fA2L, fRes2L | |
7285 | nop.i 0 | |
7286 | } | |
7287 | { .mfi | |
7288 | nop.m 0 | |
7289 | // A3L*<x^2> + A3*d(x^2) + A1L + delta(A3*<x^2>) | |
7290 | fadd.s1 fRes3L = fA1L, fRes3L | |
7291 | nop.i 0 | |
7292 | } | |
7293 | ;; | |
7294 | { .mfi | |
7295 | nop.m 0 | |
7296 | fadd.s1 fRes4L = fRes4L, fRes4H // d2 = d1 + A4*<x^2> | |
7297 | nop.i 0 | |
7298 | } | |
7299 | { .mfi | |
7300 | nop.m 0 | |
7301 | fms.s1 fResL = fRes2H, fB4, fResH // d((A4*<x^2> + A2)*x^2) | |
7302 | nop.i 0 | |
7303 | } | |
7304 | ;; | |
7305 | { .mfi | |
7306 | nop.m 0 | |
7307 | fadd.s1 fRes1L = fRes1L, fRes1H // d2 = d1 + A3*<x^2> | |
7308 | nop.i 0 | |
7309 | } | |
7310 | { .mfi | |
7311 | nop.m 0 | |
7312 | fms.s1 fB8 = fRes3H, FR_FracX, fB6 // d((A3*<x^2> + A1)*x) | |
7313 | nop.i 0 | |
7314 | } | |
7315 | ;; | |
7316 | { .mfi | |
7317 | nop.m 0 | |
7318 | fadd.s1 fB10 = fResH, fB6 // (A4*x^4 + .. + A1*x)hi | |
7319 | nop.i 0 | |
7320 | } | |
7321 | { .mfi | |
7322 | nop.m 0 | |
7323 | fma.s1 fA12 = fA12, fB4, fA10 // Ehi | |
7324 | nop.i 0 | |
7325 | } | |
7326 | ;; | |
7327 | { .mfi | |
7328 | nop.m 0 | |
7329 | // ((D7*x + D6)*x^2 + D5*x + D4)*x^4 + (D3*x + D2)*x^2 + D1*x + D0 | |
7330 | fma.s1 fA20 = fA20, fA5L, fA16 | |
7331 | nop.i 0 | |
7332 | } | |
7333 | { .mfi | |
7334 | nop.m 0 | |
7335 | fma.s1 fA8 = fA8, fB4, fA6 // Elo | |
7336 | nop.i 0 | |
7337 | } | |
7338 | ;; | |
7339 | { .mfi | |
7340 | nop.m 0 | |
7341 | fadd.s1 fRes2L = fRes2L, fRes4L // (A4*<x^2> + A2)lo | |
7342 | nop.i 0 | |
7343 | } | |
7344 | { .mfi | |
7345 | nop.m 0 | |
7346 | // d((A4*<x^2> + A2)*x^2) + (A4*<x^2> + A2)*d(x^2) | |
7347 | fma.s1 fResL = fRes2H, fB2, fResL | |
7348 | nop.i 0 | |
7349 | } | |
7350 | ;; | |
7351 | { .mfi | |
7352 | nop.m 0 | |
7353 | fadd.s1 fRes3L = fRes3L, fRes1L // (A3*<x^2> + A1)lo | |
7354 | nop.i 0 | |
7355 | } | |
7356 | ;; | |
7357 | { .mfi | |
7358 | nop.m 0 | |
// (A3*<x^2> + A1)*x - fB10; fResH is added back below
7359 | fsub.s1 fB12 = fB6, fB10 | |
7360 | nop.i 0 | |
7361 | } | |
7362 | ;; | |
7363 | { .mfi | |
7364 | nop.m 0 | |
7365 | fma.s1 fPol = fPol, fA0, fA20 // PolC*x^8 + PolD | |
7366 | nop.i 0 | |
7367 | } | |
7368 | { .mfi | |
7369 | nop.m 0 | |
7370 | fma.s1 fPolL = fA12, fA5L, fA8 // E | |
7371 | nop.i 0 | |
7372 | } | |
7373 | ;; | |
7374 | { .mfi | |
7375 | nop.m 0 | |
7376 | fma.s1 fResL = fB4, fRes2L, fResL // ((A4*<x^2> + A2)*x^2)lo | |
7377 | nop.i 0 | |
7378 | } | |
7379 | ;; | |
7380 | { .mfi | |
7381 | nop.m 0 | |
7382 | fma.s1 fRes3L = fRes3L, FR_FracX, fB8 // ((A3*<x^2> + A1)*x)lo | |
7383 | nop.i 0 | |
7384 | } | |
7385 | ;; | |
7386 | { .mfi | |
7387 | nop.m 0 | |
// rounding error of the addition fResH + fB6 that produced fB10
7388 | fadd.s1 fB12 = fB12, fResH | |
7389 | nop.i 0 | |
7390 | } | |
7391 | ;; | |
7392 | { .mfi | |
7393 | nop.m 0 | |
// Pol = (PolC*x^8 + PolD)*x^8 + E
7394 | fma.s1 fPol = fPol, fA0, fPolL | |
7395 | nop.i 0 | |
7396 | } | |
7397 | ;; | |
7398 | { .mfi | |
7399 | nop.m 0 | |
7400 | fadd.s1 fRes3L = fRes3L, fResL | |
7401 | nop.i 0 | |
7402 | } | |
7403 | ;; | |
7404 | { .mfi | |
7405 | nop.m 0 | |
// high part of the result: Pol*x^5 + fB10
7406 | fma.s1 fRes2H = fPol, fA0L, fB10 | |
7407 | nop.i 0 | |
7408 | } | |
7409 | ;; | |
7410 | { .mfi | |
7411 | nop.m 0 | |
7412 | fadd.s1 fRes3L = fB12, fRes3L | |
7413 | nop.i 0 | |
7414 | } | |
7415 | ;; | |
7416 | { .mfi | |
7417 | nop.m 0 | |
7418 | fsub.s1 fRes4L = fB10, fRes2H | |
7419 | nop.i 0 | |
7420 | } | |
7421 | ;; | |
7422 | { .mfi | |
7423 | nop.m 0 | |
// low part: Pol*x^5 + (fB10 - fRes2H)
7424 | fma.s1 fRes4L = fPol, fA0L, fRes4L | |
7425 | nop.i 0 | |
7426 | } | |
7427 | ;; | |
7428 | { .mfi | |
7429 | nop.m 0 | |
7430 | fadd.s1 fRes4L = fRes4L, fRes3L | |
7431 | nop.i 0 | |
7432 | } | |
7433 | ;; | |
7434 | { .mfb | |
7435 | nop.m 0 | |
7436 | // final result for all paths for which the result is Pol24(x) | |
7437 | fma.s0 f8 = fRes2H, f1, fRes4L | |
7438 | // here is the exit for all paths for which the result is Pol24(x) | |
7439 | br.ret.sptk b0 | |
7440 | } | |
7441 | ;; | |
7442 | ||
7443 | ||
7444 | // here if x is natval, nan, +/-inf, +/-0, or denormal | |
// Classifies f8 with three fclass.m tests and dispatches:
//   p9 (denormal)          -> lgammal_denormal_input
//   p6 (natval, nan, +inf) -> lgammal_nan_pinf
//   p7 (+/-0)              -> lgammal_singularity
// Anything left is treated as -inf: signgam is set to 1 and x*x is returned
// without calling error support.
7445 | .align 32 | |
7446 | lgammal_spec: | |
7447 | { .mfi | |
7448 | nop.m 0 | |
7449 | fclass.m p9, p0 = f8, 0xB // +/-denormals | |
7450 | nop.i 0 | |
7451 | };; | |
7452 | { .mfi | |
7453 | nop.m 0 | |
7454 | fclass.m p6, p0 = f8, 0x1E1 // Test x for natval, nan, +inf | |
7455 | nop.i 0 | |
7456 | };; | |
7457 | { .mfb | |
7458 | nop.m 0 | |
7459 | fclass.m p7, p0 = f8, 0x7 // +/-0 | |
7460 | (p9) br.cond.sptk lgammal_denormal_input | |
7461 | };; | |
7462 | { .mfb | |
7463 | nop.m 0 | |
7464 | nop.f 0 | |
7465 | // branch out if x is natval, nan, +inf | |
7466 | (p6) br.cond.spnt lgammal_nan_pinf | |
7467 | };; | |
7468 | { .mfb | |
7469 | nop.m 0 | |
7470 | nop.f 0 | |
// branch out if x is +/-0
7471 | (p7) br.cond.spnt lgammal_singularity | |
7472 | };; | |
7473 | // if we are still here then x = -inf | |
7474 | { .mfi | |
7475 | cmp.eq p6, p7 = 4, rSgnGamSize | |
7476 | nop.f 0 | |
7477 | adds rSgnGam = 1, r0 | |
7478 | };; | |
7479 | { .mfi | |
7480 | // store signgam if size of variable is 4 bytes | |
7481 | (p6) st4 [rSgnGamAddr] = rSgnGam | |
7482 | nop.f 0 | |
7483 | nop.i 0 | |
7484 | } | |
7485 | { .mfb | |
7486 | // store signgam if size of variable is 8 bytes | |
7487 | (p7) st8 [rSgnGamAddr] = rSgnGam | |
7488 | fma.s0 f8 = f8,f8,f0 // return +inf, no call to error support | |
7489 | br.ret.spnt b0 | |
7490 | };; | |
7491 | ||
7492 | // here if x is NaN, NatVal or +INF | |
// Stores signgam = 1 (4 or 8 bytes wide, selected by rSgnGamSize) and
// returns x + x in f8, computed in status field 0.
7493 | .align 32 | |
7494 | lgammal_nan_pinf: | |
7495 | { .mfi | |
7496 | cmp.eq p6, p7 = 4, rSgnGamSize | |
7497 | nop.f 0 | |
7498 | adds rSgnGam = 1, r0 | |
7499 | } | |
7500 | ;; | |
7501 | { .mfi | |
7502 | // store signgam if size of variable is 4 bytes | |
7503 | (p6) st4 [rSgnGamAddr] = rSgnGam | |
7504 | fma.s0 f8 = f8,f1,f8 // return x+x if x is natval, nan, +inf | |
7505 | nop.i 0 | |
7506 | } | |
7507 | { .mfb | |
7508 | // store signgam if size of variable is 8 bytes | |
7509 | (p7) st8 [rSgnGamAddr] = rSgnGam | |
7510 | nop.f 0 | |
7511 | br.ret.sptk b0 | |
7512 | } | |
7513 | ;; | |
7514 | ||
7515 | // here if x denormal or unnormal | |
// Raises the denormal exception, normalizes x in place (f8) and then
// recomputes from the normalized value everything written below:
// rSignifX, fSignifX, rSignExpX, fXint, p15/p14 (x < 0 / x >= 0), rExpX and
// fAbsX. Control then returns to _deno_back_to_main_path.
7516 | .align 32 | |
7517 | lgammal_denormal_input: | |
7518 | { .mfi | |
7519 | nop.m 0 | |
7520 | fma.s0 fResH = f1, f1, f8 // raise denormal exception | |
7521 | nop.i 0 | |
7522 | } | |
7523 | { .mfi | |
7524 | nop.m 0 | |
7525 | fnorm.s1 f8 = f8 // normalize input value | |
7526 | nop.i 0 | |
7527 | } | |
7528 | ;; | |
7529 | { .mfi | |
7530 | getf.sig rSignifX = f8 // significand of normalized x | |
7531 | fmerge.se fSignifX = f1, f8 // sign and exponent of 1.0, significand of x | |
7532 | nop.i 0 | |
7533 | } | |
7534 | { .mfi | |
7535 | getf.exp rSignExpX = f8 // sign and exponent of normalized x | |
7536 | fcvt.fx.s1 fXint = f8 // Convert arg to int (int repres. in FR) | |
7537 | nop.i 0 | |
7538 | } | |
7539 | ;; | |
7540 | { .mfi | |
7541 | nop.m 0 // a second, identical getf.exp used to sit here; f8 is unchanged since the one above | |
7542 | fcmp.lt.s1 p15, p14 = f8, f0 | |
7543 | nop.i 0 | |
7544 | } | |
7545 | ;; | |
7546 | { .mfb | |
7547 | and rExpX = rSignExpX, r17Ones // mask rSignExpX with r17Ones | |
7548 | fmerge.s fAbsX = f1, f8 // |x| | |
7549 | br.cond.sptk _deno_back_to_main_path | |
7550 | } | |
7551 | ;; | |
7552 | ||
7553 | ||
7554 | // here if overflow (x > overflow_bound) | |
// Stores signgam = 1, keeps a copy of x in FR_X for the error handler,
// squares a value whose exponent field is 0x1FFFE so that the multiplication
// in status field 0 overflows, and enters __libm_error_region with
// GR_Parameter_TAG = 102.
7555 | .align 32 | |
7556 | lgammal_overflow: | |
7557 | { .mfi | |
7558 | addl r8 = 0x1FFFE, r0 | |
7559 | nop.f 0 | |
7560 | cmp.eq p6, p7 = 4, rSgnGamSize | |
7561 | } | |
7562 | { .mfi | |
7563 | adds rSgnGam = 1, r0 | |
7564 | nop.f 0 | |
7565 | nop.i 0 | |
7566 | } | |
7567 | ;; | |
7568 | { .mfi | |
7569 | setf.exp f9 = r8 | |
// FR_X = x (sign, exponent and significand all taken from f8)
7570 | fmerge.s FR_X = f8,f8 | |
7571 | mov GR_Parameter_TAG = 102 // overflow | |
7572 | };; | |
7573 | { .mfi | |
7574 | // store signgam if size of variable is 4 bytes | |
7575 | (p6) st4 [rSgnGamAddr] = rSgnGam | |
7576 | nop.f 0 | |
7577 | nop.i 0 | |
7578 | } | |
7579 | { .mfb | |
7580 | // store signgam if size of variable is 8 bytes | |
7581 | (p7) st8 [rSgnGamAddr] = rSgnGam | |
7582 | fma.s0 FR_RESULT = f9,f9,f0 // Set I,O and +INF result | |
7583 | br.cond.sptk __libm_error_region | |
7584 | };; | |
7585 | ||
7586 | // here if x is negative integer or +/-0 (SINGULARITY) | |
// Stores signgam (+1, or -1 when x is -0), keeps an exact copy of x in FR_X
// for the error handler, produces the result with frcpa(1, 0) in status
// field 0 and enters __libm_error_region with GR_Parameter_TAG = 103.
7587 | .align 32 | |
7588 | lgammal_singularity: | |
7589 | { .mfi | |
7590 | adds rSgnGam = 1, r0 | |
7591 | fclass.m p8,p0 = f8,0x6 // is x -0? | |
7592 | mov GR_Parameter_TAG = 103 // negative | |
7593 | } | |
7594 | { .mfi | |
7595 | cmp.eq p6, p7 = 4, rSgnGamSize | |
7596 | fmerge.s FR_X = f8,f8 // exact copy of x; (+0) + x would turn -0 into +0 | |
7597 | nop.i 0 | |
7598 | };; | |
7599 | { .mfi | |
7600 | (p8) sub rSgnGam = r0, rSgnGam // signgam = -1 for x = -0 | |
7601 | nop.f 0 | |
7602 | nop.i 0 | |
7603 | } | |
7604 | { .mfi | |
7605 | nop.m 0 | |
7606 | nop.f 0 | |
7607 | nop.i 0 | |
7608 | };; | |
7609 | { .mfi | |
7610 | // store signgam if size of variable is 4 bytes | |
7611 | (p6) st4 [rSgnGamAddr] = rSgnGam | |
7612 | nop.f 0 | |
7613 | nop.i 0 | |
7614 | } | |
7615 | { .mfb | |
7616 | // store signgam if size of variable is 8 bytes | |
7617 | (p7) st8 [rSgnGamAddr] = rSgnGam | |
7618 | frcpa.s0 FR_RESULT, p0 = f1, f0 // 1/0: result for the singularity | |
7619 | br.cond.sptk __libm_error_region | |
7620 | };; | |
7621 | ||
7622 | GLOBAL_LIBM_END(__libm_lgammal) | |
7623 | ||
7624 | ||
7625 | ||
// Common error exit. The code branching here provides GR_Parameter_TAG,
// FR_X and FR_RESULT (see lgammal_overflow and lgammal_singularity).
// A 64-byte frame is created; relative to the new sp the values are stored
// at +16 (FR_X), +32 (FR_Y) and +48 (FR_RESULT), and their addresses are
// left in GR_Parameter_X, GR_Parameter_Y and GR_Parameter_RESULT for
// __libm_error_support. After the call the value at sp+48 is reloaded into
// f8, and sp, b0, gp and ar.pfs are restored before returning.
7626 | LOCAL_LIBM_ENTRY(__libm_error_region) | |
7627 | .prologue | |
7628 | { .mfi | |
7629 | add GR_Parameter_Y=-32,sp // Parameter 2 value | |
7630 | nop.f 0 | |
7631 | .save ar.pfs,GR_SAVE_PFS | |
7632 | mov GR_SAVE_PFS=ar.pfs // Save ar.pfs | |
7633 | } | |
7634 | { .mfi | |
7635 | .fframe 64 | |
7636 | add sp=-64,sp // Create new stack | |
7637 | nop.f 0 | |
7638 | mov GR_SAVE_GP=gp // Save gp | |
7639 | };; | |
7640 | { .mmi | |
7641 | stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack | |
7642 | add GR_Parameter_X = 16,sp // Parameter 1 address | |
7643 | .save b0, GR_SAVE_B0 | |
7644 | mov GR_SAVE_B0=b0 // Save b0 | |
7645 | };; | |
7646 | .body | |
7647 | { .mib | |
7648 | stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack | |
// Parameter 3 address (GR_Parameter_Y was post-incremented to sp+48 above)
7649 | add GR_Parameter_RESULT = 0,GR_Parameter_Y | |
7650 | nop.b 0 // empty branch slot | |
7651 | } | |
7652 | { .mib | |
7653 | stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack | |
// point GR_Parameter_Y back at the Parameter 2 slot (sp+32)
7654 | add GR_Parameter_Y = -16,GR_Parameter_Y | |
7655 | br.call.sptk b0=__libm_error_support# // Call error handling function | |
7656 | };; | |
7657 | { .mmi | |
// recompute the address of the result slot after the call
7658 | add GR_Parameter_RESULT = 48,sp | |
7659 | nop.m 999 | |
7660 | nop.i 999 | |
7661 | };; | |
7662 | { .mmi | |
7663 | ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack | |
7664 | .restore sp | |
7665 | add sp = 64,sp // Restore stack pointer | |
7666 | mov b0 = GR_SAVE_B0 // Restore return address | |
7667 | };; | |
7668 | { .mib | |
7669 | mov gp = GR_SAVE_GP // Restore gp | |
7670 | mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs | |
7671 | br.ret.sptk b0 // Return | |
7672 | };; | |
7673 | ||
7674 | LOCAL_LIBM_END(__libm_error_region#) | |
7675 | ||
// external error handler called from __libm_error_region
7676 | .type __libm_error_support#,@function | |
7677 | .global __libm_error_support# |