]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/ia64/fpu/libm_lgammal.S
Remove "Contributed by" lines
[thirdparty/glibc.git] / sysdeps / ia64 / fpu / libm_lgammal.S
1 .file "libm_lgammal.s"
2
3
4 // Copyright (c) 2002 - 2005, Intel Corporation
5 // All rights reserved.
6 //
7 //
8 // Redistribution and use in source and binary forms, with or without
9 // modification, are permitted provided that the following conditions are
10 // met:
11 //
12 // * Redistributions of source code must retain the above copyright
13 // notice, this list of conditions and the following disclaimer.
14 //
15 // * Redistributions in binary form must reproduce the above copyright
16 // notice, this list of conditions and the following disclaimer in the
17 // documentation and/or other materials provided with the distribution.
18 //
19 // * The name of Intel Corporation may not be used to endorse or promote
20 // products derived from this software without specific prior written
21 // permission.
22
23 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT
25 // LIMITED TO,THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,
28 // EXEMPLARY,OR CONSEQUENTIAL DAMAGES (INCLUDING,BUT NOT LIMITED TO,
29 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,DATA,OR
30 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31 // OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY OR TORT (INCLUDING
32 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 // SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 //
35 // Intel Corporation is the author of this code,and requests that all
36 // problem reports or change requests be submitted to it directly at
37 // http://www.intel.com/software/products/opensource/libraries/num.htm.
38 //
39 //*********************************************************************
40 //
41 // History:
42 // 03/28/02 Original version
43 // 05/20/02 Cleaned up namespace and sf0 syntax
44 // 08/21/02 Added support of SIGN(GAMMA(x)) calculation
45 // 09/26/02 Algorithm description improved
46 // 10/21/02 Now it returns SIGN(GAMMA(x))=-1 for negative zero
47 // 02/10/03 Reordered header: .section, .global, .proc, .align
48 // 03/31/05 Reformatted delimiters between data tables
49 //
50 //*********************************************************************
51 //
52 // Function: __libm_lgammal(long double x, int* signgam, int szsigngam)
53 // computes the principal value of the logarithm of the GAMMA function
54 // of x. Signum of GAMMA(x) is stored to memory starting at the address
55 // specified by the signgam.
56 //
57 //*********************************************************************
58 //
59 // Resources Used:
60 //
61 // Floating-Point Registers: f8 (Input and Return Value)
62 // f9-f15
63 // f32-f127
64 //
65 // General Purpose Registers:
66 // r2, r3, r8-r11, r14-r31
67 // r32-r65
68 // r66-r69 (Used to pass arguments to error handling routine)
69 //
70 // Predicate Registers: p6-p15
71 //
72 //*********************************************************************
73 //
74 // IEEE Special Conditions:
75 //
76 // __libm_lgammal(+inf) = +inf
77 // __libm_lgammal(-inf) = QNaN
78 // __libm_lgammal(+/-0) = +inf
79 // __libm_lgammal(x<0, x - integer) = QNaN
80 // __libm_lgammal(SNaN) = QNaN
81 // __libm_lgammal(QNaN) = QNaN
82 //
83 //*********************************************************************
84 //
85 // ALGORITHM DESCRIPTION
86 //
87 // Below we suppose that there is a log(z) function which takes a long
88 // double argument and returns result as a pair of long double numbers
89 // lnHi and lnLo (such that sum lnHi + lnLo provides ~80 correct bits
90 // of significand). Algorithm description for such log(z) function
91 // see below.
92 // Also, in this algorithm description we use the following notational
93 // conventions:
94 // a) pair A = (Ahi, Alo) means number A represented as sum of Ahi and Alo
95 // b) C = A + B = (Ahi, Alo) + (Bhi, Blo) means multi-precision addition.
96 // The result would be C = (Chi, Clo). Notice, that Clo shouldn't be
97 // equal to Alo + Blo
98 // c) D = A*B = (Ahi, Alo)*(Bhi, Blo) = (Dhi, Dlo) multi-precision
99 // multiplication.
100 //
101 // So, lgammal has the following computational paths:
102 // 1) |x| < 0.5
103 // P = A1*|x| + A2*|x|^2 + ... + A22*|x|^22
104 // A1, A2, A3 represented as a sum of two double precision
105 // numbers and multi-precision computations are used for 3 higher
106 // terms of the polynomial. We get polynomial as a sum of two
107 // double extended numbers: P = (Phi, Plo)
108 // 1.1) x > 0
109 // lgammal(x) = P - log(|x|) = (Phi, Plo) - (lnHi(|x|), lnLo(|x|))
110 // 1.2) x < 0
111 // lgammal(x) = -P - log(|x|) - log(sin(Pi*x)/(Pi*x))
112 // P and log(|x|) are computed by the same way as in 1.1;
113 // - log(sin(Pi*x)/(Pi*x)) is approximated by a polynomial Plnsin.
114 // Plnsin:= fLnSin2*|x|^2 + fLnSin4*|x|^4 + ... + fLnSin36*|x|^36
115 // The first coefficient of Plnsin is represented as sum of two
116 // double precision numbers (fLnSin2, fLnSin2L). Multi-precision
117 // computations for higher two terms of Plnsin are used.
118 // So, the final result is reconstructed by the following formula
119 // lgammal(x) = (-(Phi, Plo) - (lnHi(|x|), lnLo(|x|))) -
120 // - (PlnsinHi,PlnsinLo)
121 //
122 // 2) 0.5 <= x < 0.75 -> t = x - 0.625
123 // -0.75 < x <= -0.5 -> t = x + 0.625
124 // 2.25 <= x < 4.0 -> t = x/2 - 1.5
125 // 4.0 <= x < 8.0 -> t = x/4 - 1.5
126 // -0.5 < x <= -0.40625 -> t = x + 0.5
127 // -2.6005859375 < x <= -2.5 -> t = x + 2.5
128 // 1.3125 <= x < 1.5625 -> t = x - LOC_MIN, where LOC_MIN is point in
129 // which lgammal has local minimum. Exact
130 // value can be found in the table below,
131 // approximate value is ~1.46
132 //
133 // lgammal(x) is approximated by the polynomial of 25th degree: P25(t)
134 // P25(t) = A0 + A1*t + ... + A25*t^25 = (Phi, Plo) + t^4*P21(t),
135 // where
136 // (Phi, Plo) is the sum of the four lowest-degree (leading) terms of P25(t):
137 // (Phi, Plo) = ((A0, A0L) + (A1, A1L)*t) + t^2 *((A2, A2L) + (A3, A3L)*t),
138 // (Ai, AiL) - coefficients represented as pairs of DP numbers.
139 //
140 // P21(t) = (PolC(t)*t^8 + PolD(t))*t^8 + PolE(t),
141 // where
142 // PolC(t) = C21*t^5 + C20*t^4 + ... + C16,
143 // C21 = A25, C20 = A24, ..., C16 = A20
144 //
145 // PolD(t) = D7*t^7 + D6*t^6 + ... + D0,
146 // D7 = A19, D6 = A18, ..., D0 = A12
147 //
148 // PolE(t) = E7*t^7 + E6*t^6 + ... + E0,
149 // E7 = A11, E6 = A10, ..., E0 = A4
150 //
151 // Cis and Dis are represented as double precision numbers,
152 // Eis are represented as double extended numbers.
153 //
154 // 3) 0.75 <= x < 1.3125 -> t = x - 1.0
155 // 1.5625 <= x < 2.25 -> t = x - 2.0
156 // lgammal(x) is approximated by the polynomial of 25th degree: P25(t)
157 // P25(t) = A1*t + ... + A25*t^25, and computations are carried out
158 // by similar way as in the previous case
159 //
160 // 4) 10.0 < x <= Overflow Bound ("positive Stirling" range)
161 // lgammal(x) is approximated using Stirling's formula:
162 // lgammal(x) ~ ((x*(lnHi(x) - 1, lnLo(x))) - 0.5*(lnHi(x), lnLo(x))) +
163 // + ((Chi, Clo) + S(1/x))
164 // where
165 // C = (Chi, Clo) - pair of double precision numbers representing constant
166 // 0.5*ln(2*Pi);
167 // S(1/x) = 1/x * (B2 + B4*(1/x)^2 + ... + B20*(1/x)^18), B2, ..., B20 are
168 // Bernoulli numbers. S is computed in native precision and then added to
169 // Clo;
170 // lnHi(x) - 1 is computed in native precision and the multiprecision
171 // multiplication (x, 0) *(lnHi(x) - 1, lnLo(x)) is used.
172 //
173 // 5) -INF < x <= -2^63, any negative integer < 0
174 // All numbers in this range are integers -> error handler is called
175 //
176 // 6) -2^63 < x <= -0.75 ("negative Stirling" range), x is "far" from root,
177 // lgammal(-t) for positive t is approximated using the following formula:
178 // lgammal(-t) = -lgammal(t)-log(t)-log(|dT|)+log(sin(Pi*|dT|)/(Pi*|dT|))
179 // where dT = -t -round_to_nearest_integer(-t)
180 // Last item is approximated by the same polynomial as described in 1.2.
181 // We split the whole range into three subranges due to different ways of
182 // approximation of the first terms.
183 // 6.1) -2^63 < x < -6.0 ("negative Stirling" range)
184 // lgammal(t) is approximated exactly as in #4. The only difference is that
185 // for the -13.0 < x < -6.0 subrange, instead of Bernoulli numbers we use their
186 // minimax approximation on this range.
187 // log(t), log(|dT|) are approximated by the log routine mentioned above.
188 // 6.2) -6.0 < x <= -0.75, |x + 1|> 2^(-7)
189 // log(t), log(|dT|) are approximated by the log routine mentioned above,
190 // lgammal(t) is approximated by polynomials of the 25th degree similar
191 // to ones from #2. Arguments z of the polynomials are as follows
192 // a) 0.75 <= t < 1.0 - 2^(-7), z = 2*t - 1.5
193 // b) 1.0 - 2^(-7) < t < 2.0, z = t - 1.5
194 // c) 2.0 < t < 3.0, z = t/2 - 1.5
195 // d) 3.0 < t < 4.0, z = t/2 - 1.5. Notice, that range reduction is
196 // the same as in case c) but the set of coefficients is different
197 // e) 4.0 < t < 6.0, z = t/4 - 1.5
198 // 6.3) |x + 1| <= 2^(-7)
199 // log(1 + (x-1)) is approximated by Taylor series,
200 // log(sin(Pi*|dT|)/(Pi*|dT|)) is still approximated by polynomial but
201 // it has just 4th degree.
202 // log(|dT|) is approximated by the log routine mentioned above.
203 // lgammal(-x) is approximated by polynomial of 8th degree from (-x + 1).
204 //
205 // 7) -20.0 < x < -2.0, x falls in root "neighbourhood".
206 // "Neighbourhood" means that |lgammal(x)| < epsilon, where epsilon is
207 // different for every root (and it is stored in the table), but typically
208 // it is ~ 0.15. There are 35 roots significant from "double extended"
209 // point of view. We split all the roots into two subsets: "left" and "right"
210 // roots. Considering [-(N+1), -N] range we call root as "left" one if it
211 // lies closer to -(N+1) and "right" otherwise. There is no "left" root in
212 // the [-20, -19] range (it exists, but is insignificant for double extended
213 // precision). To determine if x falls in root "neighbourhood" we store
214 // significands of all the 35 roots as well as epsilon values (expressed
215 // by the left and right bound).
216 // In these ranges we approximate lgammal(x) by polynomial series of 19th
217 // degree:
218 // lgammal(x) = P19(t) = A0 + A1*t + ...+ A19*t^19, where t = x - EDP_Root,
219 // EDP_Root is the exact value of the corresponding root rounded to double
220 // extended precision. So, we have 35 different polynomials which make our
221 // table rather big. We may hope that x falls in root "neighbourhood"
222 // quite rarely -> there might be no need for frequent use of different
223 // polynomials.
224 // A0, A1, A2, A3 are represented as pairs of double precision numbers,
225 // A4, A5 are long doubles, and to decrease the size of the table we
226 // keep the rest of coefficients in just double precision
227 //
228 //*********************************************************************
229 // Algorithm for log(X) = (lnHi(X), lnLo(X))
230 //
231 // ALGORITHM
232 //
233 // Here we use a table lookup method. The basic idea is that in
234 // order to compute logl(Arg) for an argument Arg in [1,2), we
235 // construct a value G such that G*Arg is close to 1 and that
236 // logl(1/G) is obtainable easily from a table of values calculated
237 // beforehand. Thus
238 //
239 // logl(Arg) = logl(1/G) + logl(G*Arg)
240 // = logl(1/G) + logl(1 + (G*Arg - 1))
241 //
242 // Because |G*Arg - 1| is small, the second term on the right hand
243 // side can be approximated by a short polynomial. We elaborate
244 // this method in four steps.
245 //
246 // Step 0: Initialization
247 //
248 // We need to calculate logl( X ). Obtain N, S_hi such that
249 //
250 // X = 2^N * S_hi exactly
251 //
252 // where S_hi in [1,2)
253 //
254 // Step 1: Argument Reduction
255 //
256 // Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
257 //
258 // G := G_1 * G_2 * G_3
259 // r := (G * S_hi - 1)
260 //
261 // These G_j's have the property that the product is exactly
262 // representable and that |r| < 2^(-12) as a result.
263 //
264 // Step 2: Approximation
265 //
266 //
267 // logl(1 + r) is approximated by a short polynomial poly(r).
268 //
269 // Step 3: Reconstruction
270 //
271 //
272 // Finally, logl( X ) is given by
273 //
274 // logl( X ) = logl( 2^N * S_hi )
275 // ~=~ N*logl(2) + logl(1/G) + logl(1 + r)
276 // ~=~ N*logl(2) + logl(1/G) + poly(r).
277 //
278 // IMPLEMENTATION
279 //
280 // Step 0. Initialization
281 // ----------------------
282 //
283 // Z := X
284 // N := unbiased exponent of Z
285 // S_hi := 2^(-N) * Z
286 //
287 // Step 1. Argument Reduction
288 // --------------------------
289 //
290 // Let
291 //
292 // Z = 2^N * S_hi = 2^N * 1.d_1 d_2 d_3 ... d_63
293 //
294 // We obtain G_1, G_2, G_3 by the following steps.
295 //
296 //
297 // Define X_0 := 1.d_1 d_2 ... d_14. This is extracted
298 // from S_hi.
299 //
300 // Define A_1 := 1.d_1 d_2 d_3 d_4. This is X_0 truncated
301 // to lsb = 2^(-4).
302 //
303 // Define index_1 := [ d_1 d_2 d_3 d_4 ].
304 //
305 // Fetch Z_1 := (1/A_1) rounded UP in fixed point with
306 // fixed point lsb = 2^(-15).
307 // Z_1 looks like z_0.z_1 z_2 ... z_15
308 // Note that the fetching is done using index_1.
309 // A_1 is actually not needed in the implementation
310 // and is used here only to explain how the value
311 // Z_1 is defined.
312 //
313 // Fetch G_1 := (1/A_1) truncated to 21 sig. bits.
314 // floating pt. Again, fetching is done using index_1. A_1
315 // explains how G_1 is defined.
316 //
317 // Calculate X_1 := X_0 * Z_1 truncated to lsb = 2^(-14)
318 // = 1.0 0 0 0 d_5 ... d_14
319 // This is accomplished by integer multiplication.
320 // It is proved that X_1 indeed always begins
321 // with 1.0000 in fixed point.
322 //
323 //
324 // Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1
325 // truncated to lsb = 2^(-8). Similar to A_1,
326 // A_2 is not needed in actual implementation. It
327 // helps explain how some of the values are defined.
328 //
329 // Define index_2 := [ d_5 d_6 d_7 d_8 ].
330 //
331 // Fetch Z_2 := (1/A_2) rounded UP in fixed point with
332 // fixed point lsb = 2^(-15). Fetch done using index_2.
333 // Z_2 looks like z_0.z_1 z_2 ... z_15
334 //
335 // Fetch G_2 := (1/A_2) truncated to 21 sig. bits.
336 // floating pt.
337 //
338 // Calculate X_2 := X_1 * Z_2 truncated to lsb = 2^(-14)
339 // = 1.0 0 0 0 0 0 0 0 d_9 d_10 ... d_14
340 // This is accomplished by integer multiplication.
341 // It is proved that X_2 indeed always begins
342 // with 1.00000000 in fixed point.
343 //
344 //
345 // Define A_3 := 1.0 0 0 0 0 0 0 0 d_9 d_10 d_11 d_12 d_13 1.
346 // This is 2^(-14) + X_2 truncated to lsb = 2^(-13).
347 //
348 // Define index_3 := [ d_9 d_10 d_11 d_12 d_13 ].
349 //
350 // Fetch G_3 := (1/A_3) truncated to 21 sig. bits.
351 // floating pt. Fetch is done using index_3.
352 //
353 // Compute G := G_1 * G_2 * G_3.
354 //
355 // This is done exactly since each of G_j only has 21 sig. bits.
356 //
357 // Compute
358 //
359 // r := (G*S_hi - 1)
360 //
361 //
362 // Step 2. Approximation
363 // ---------------------
364 //
365 // This step computes an approximation to logl( 1 + r ) where r is the
366 // reduced argument just obtained. It is proved that |r| <= 1.9*2^(-13);
367 // thus logl(1+r) can be approximated by a short polynomial:
368 //
369 // logl(1+r) ~=~ poly = r + Q1 r^2 + ... + Q4 r^5
370 //
371 //
372 // Step 3. Reconstruction
373 // ----------------------
374 //
375 // This step computes the desired result of logl(X):
376 //
377 // logl(X) = logl( 2^N * S_hi )
378 // = N*logl(2) + logl( S_hi )
379 // = N*logl(2) + logl(1/G) +
380 // logl(1 + G*S_hi - 1 )
381 //
382 // logl(2), logl(1/G_j) are stored as pairs of (single,double) numbers:
383 // log2_hi, log2_lo, log1byGj_hi, log1byGj_lo. The high parts are
384 // single-precision numbers and the low parts are double precision
385 // numbers. These have the property that
386 //
387 // N*log2_hi + SUM ( log1byGj_hi )
388 //
389 // is computable exactly in double-extended precision (64 sig. bits).
390 // Finally
391 //
392 // lnHi(X) := N*log2_hi + SUM ( log1byGj_hi )
393 // lnLo(X) := poly_hi + [ poly_lo +
394 // ( SUM ( log1byGj_lo ) + N*log2_lo ) ]
395 //
396 //
397 //*********************************************************************
398 // General Purpose Registers: symbolic aliases for the general registers
399 // scratch registers (several aliases deliberately share one physical register)
400 rPolDataPtr = r2
401 rLnSinDataPtr = r3
402 rExpX = r8
403 rSignifX = r9
404 rDelta = r10
405 rSignExpX = r11
406 GR_ad_z_1 = r14 // NOTE(review): GR_ad_z_1 is assigned again below (= r30); presumably the later assignment wins -- confirm which binding is intended
407 r17Ones = r15
408 GR_Index1 = r16
409 rSignif1andQ = r17
410 GR_X_0 = r18
411 GR_X_1 = r19
412 GR_X_2 = r20
413 GR_Z_1 = r21
414 GR_Z_2 = r22
415 GR_N = r23
416 rExpHalf = r24
417 rExp8 = r25
418 rX0Dx = r25 // shares r25 with rExp8
419 GR_ad_tbl_1 = r26
420 GR_ad_tbl_2 = r27
421 GR_ad_tbl_3 = r28
422 GR_ad_q = r29
423 GR_ad_z_1 = r30 // second assignment of GR_ad_z_1 (first one above was r14)
424 GR_ad_z_2 = r31
425 // stacked registers (r32 and up)
426 rPFS_SAVED = r32
427 GR_ad_z_3 = r33
428 rSgnGamAddr = r34
429 rSgnGamSize = r35
430 rLogDataPtr = r36
431 rZ1offsett = r37
432 rTmpPtr = r38
433 rTmpPtr2 = r39
434 rTmpPtr3 = r40
435 rExp2 = r41
436 rExp2tom7 = r42
437 rZ625 = r42 // shares r42 with rExp2tom7
438 rExpOne = r43
439 rNegSingularity = r44
440 rXint = r45
441 rTbl1Addr = r46
442 rTbl2Addr = r47
443 rTbl3Addr = r48
444 rZ2Addr = r49
445 rRootsAddr = r50
446 rRootsBndAddr = r51
447 rRoot = r52
448 rRightBound = r53
449 rLeftBound = r54
450 rSignifDx = r55
451 rBernulliPtr = r56
452 rLnSinTmpPtr = r56 // shares r56 with rBernulliPtr
453 rIndex1Dx = r57
454 rIndexPol = r58
455 GR_Index3 = r59
456 GR_Index2 = r60
457 rSgnGam = r61
458 rXRnd = r62
459
460 GR_SAVE_B0 = r63
461 GR_SAVE_GP = r64
462 GR_SAVE_PFS = r65
463 // output parameters when calling error handling routine (r66-r69, as listed in the header)
464 GR_Parameter_X = r66
465 GR_Parameter_Y = r67
466 GR_Parameter_RESULT = r68
467 GR_Parameter_TAG = r69
468
469 //********************************************************************
470 // Floating Point Registers: symbolic aliases for the FP registers
471 // CAUTION: because registers are scarce, the code below sometimes makes
472 // "unconventional" use of the declared registers (many aliases share one register)
473 //
474 fAbsX = f6 // NOTE(review): f6/f7 are not listed under "Resources Used" in the file header -- confirm
475 fDelX4 = f6
476 fSignifX = f7
477 // aliases for the error handling routine
478 FR_X = f10 // first argument
479 FR_Y = f1 // second argument (lgammal has just one)
480 FR_RESULT = f8 // result
481
482 // First 7 Bernoulli numbers (B2 .. B14)
483 fB2 = f9
484 fLnDeltaL = f9
485 fXSqr = f9
486 fB4 = f10 // f10 is also bound to FR_X above
487 fX4 = f10
488 fB6 = f11
489 fX6 = f11
490 fB8 = f12
491 fXSqrL = f12
492 fB10 = f13
493 fRes7H = f13
494 fB12 = f14
495 fRes7L = f14
496 fB14 = f15
497
498 // stack registers
499 // Polynomial coefficients: A0, ..., A25 (an "L" suffix names the low part of a pair)
500 fA0 = f32
501 fA0L = f33
502 fInvXL = f33
503 fA1 = f34
504 fA1L = f35
505 fA2 = f36
506 fA2L = f37
507 fA3 = f38
508 fA3L = f39
509 fA4 = f40
510 fA4L = f41
511 fRes6H = f41
512 fA5 = f42
513 fB2L = f42
514 fA5L = f43
515 fMinNegStir = f43
516 fRes6L = f43
517 fA6 = f44
518 fMaxNegStir = f44
519 fA7 = f45
520 fLnDeltaH = f45
521 fA8 = f46
522 fBrnL = f46
523 fA9 = f47
524 fBrnH = f47
525 fA10 = f48
526 fRes5L = f48
527 fA11 = f49
528 fRes5H = f49
529 fA12 = f50
530 fDx6 = f50
531 fA13 = f51
532 fDx8 = f51
533 fA14 = f52
534 fDx4 = f52
535 fA15 = f53
536 fYL = f53
537 fh3Dx = f53
538 fA16 = f54
539 fYH = f54
540 fH3Dx = f54
541 fA17 = f55
542 fResLnDxL = f55
543 fG3Dx = f55
544 fA18 = f56
545 fResLnDxH = f56
546 fh2Dx = f56
547 fA19 = f57
548 fFloatNDx = f57
549 fA20 = f58
550 fPolyHiDx = f58
551 fhDx = f58
552 fA21 = f59
553 fRDxCub = f59
554 fHDx = f59
555 fA22 = f60
556 fRDxSq = f60
557 fGDx = f60
558 fA23 = f61
559 fPolyLoDx = f61
560 fInvX3 = f61
561 fA24 = f62
562 fRDx = f62
563 fInvX8 = f62
564 fA25 = f63
565 fInvX4 = f63
566 fPol = f64
567 fPolL = f65
568 // Coefficients of ln(sin(Pi*x)/(Pi*x))
569 fLnSin2 = f66
570 fLnSin2L = f67
571 fLnSin4 = f68
572 fLnSin6 = f69
573 fLnSin8 = f70
574 fLnSin10 = f71
575 fLnSin12 = f72
576 fLnSin14 = f73
577 fLnSin16 = f74
578 fLnSin18 = f75
579 fDelX8 = f75
580 fLnSin20 = f76
581 fLnSin22 = f77
582 fDelX6 = f77
583 fLnSin24 = f78
584 fLnSin26 = f79
585 fLnSin28 = f80
586 fLnSin30 = f81
587 fhDelX = f81
588 fLnSin32 = f82
589 fLnSin34 = f83
590 fLnSin36 = f84
591 fXint = f85
592 fDxSqr = f85
593 fRes3L = f86
594 fRes3H = f87
595 fRes4H = f88
596 fRes4L = f89
597 fResH = f90
598 fResL = f91
599 fDx = f92
600 FR_MHalf = f93
601 fRes1H = f94
602 fRes1L = f95
603 fRes2H = f96
604 fRes2L = f97
605 FR_FracX = f98
606 fRcpX = f99
607 fLnSinH = f99
608 fTwo = f100
609 fMOne = f100
610 FR_G = f101
611 FR_H = f102
612 FR_h = f103
613 FR_G2 = f104
614 FR_H2 = f105
615 FR_poly_lo = f106
616 FR_poly_hi = f107
617 FR_h2 = f108
618 FR_rsq = f109
619 FR_r = f110
620 FR_log2_hi = f111
621 FR_log2_lo = f112
622 fFloatN = f113
623 FR_Q4 = f114
624 FR_G3 = f115
625 FR_H3 = f116
626 FR_h3 = f117
627 FR_Q3 = f118
628 FR_Q2 = f119
629 FR_Q1 = f120
630 fThirteen = f121
631 fSix = f121
632 FR_rcub = f121
633 // Last three Bernoulli numbers (B16, B18, B20)
634 fB16 = f122
635 fB18 = f123
636 fB20 = f124
637 fInvX = f125
638 fLnSinL = f125
639 fDxSqrL = f126
640 fFltIntX = f126
641 fRoot = f127
642 fNormDx = f127
643
644 // Data tables
645 //==============================================================
646 RODATA
647 // ************* DO NOT CHANGE THE ORDER OF THESE TABLES *************
648 .align 16
649 LOCAL_OBJECT_START(lgammal_right_roots_data)
650 // List of all right roots themselves
651 data8 0x9D3FE4B007C360AB, 0x0000C000 // Range [-3, -2]
652 data8 0xC9306DE4F2CD7BEE, 0x0000C000 // Range [-4, -3]
653 data8 0x814273C2CCAC0618, 0x0000C001 // Range [-5, -4]
654 data8 0xA04352BF85B6C865, 0x0000C001 // Range [-6, -5]
655 data8 0xC00B592C4BE4676C, 0x0000C001 // Range [-7, -6]
656 data8 0xE0019FEF6FF0F5BF, 0x0000C001 // Range [-8, -7]
657 data8 0x80001A01459FC9F6, 0x0000C002 // Range [-9, -8]
658 data8 0x900002E3BB47D86D, 0x0000C002 // Range [-10, -9]
659 data8 0xA0000049F93BB992, 0x0000C002 // Range [-11, -10]
660 data8 0xB0000006B9915316, 0x0000C002 // Range [-12, -11]
661 data8 0xC00000008F76C773, 0x0000C002 // Range [-13, -12]
662 data8 0xD00000000B09230A, 0x0000C002 // Range [-14, -13]
663 data8 0xE000000000C9CBA5, 0x0000C002 // Range [-15, -14]
664 data8 0xF0000000000D73FA, 0x0000C002 // Range [-16, -15]
665 data8 0x8000000000006BA0, 0x0000C003 // Range [-17, -16]
666 data8 0x8800000000000655, 0x0000C003 // Range [-18, -17]
667 data8 0x900000000000005A, 0x0000C003 // Range [-19, -18]
668 data8 0x9800000000000005, 0x0000C003 // Range [-20, -19]
669 // List of bounds of ranges with special polynomial approximation near root
670 // Only significands of bounds are actually stored
671 data8 0xA000000000000000, 0x9800000000000000 // Bounds for root on [-3, -2]
672 data8 0xCAB88035C5EFBB41, 0xC7E05E31F4B02115 // Bounds for root on [-4, -3]
673 data8 0x817831B899735C72, 0x8114633941B8053A // Bounds for root on [-5, -4]
674 data8 0xA04E8B34C6AA9476, 0xA039B4A42978197B // Bounds for root on [-6, -5]
675 data8 0xC00D3D5E588A78A9, 0xC009BA25F7E858A6 // Bounds for root on [-7, -6]
676 data8 0xE001E54202991EB4, 0xE001648416CE897F // Bounds for root on [-8, -7]
677 data8 0x80001E56D13A6B9F, 0x8000164A3BAD888A // Bounds for root on [-9, -8]
678 data8 0x9000035F0529272A, 0x9000027A0E3D94F0 // Bounds for root on [-10, -9]
679 data8 0xA00000564D705880, 0xA000003F67EA0CC7 // Bounds for root on [-11, -10]
680 data8 0xB0000007D87EE0EF, 0xB0000005C3A122A5 // Bounds for root on [-12, -11]
681 data8 0xC0000000A75FE8B1, 0xC00000007AF818AC // Bounds for root on [-13, -12]
682 data8 0xD00000000CDFFE36, 0xD000000009758BBF // Bounds for root on [-14, -13]
683 data8 0xE000000000EB6D96, 0xE000000000ACF7B2 // Bounds for root on [-15, -14]
684 data8 0xF0000000000FB1F9, 0xF0000000000B87FB // Bounds for root on [-16, -15]
685 data8 0x8000000000007D90, 0x8000000000005C40 // Bounds for root on [-17, -16]
686 data8 0x8800000000000763, 0x880000000000056D // Bounds for root on [-18, -17]
687 data8 0x9000000000000069, 0x900000000000004D // Bounds for root on [-19, -18]
688 data8 0x9800000000000006, 0x9800000000000005 // Bounds for root on [-20, -19]
689 // List of all left roots themselves
690 data8 0xAFDA0850DEC8065E, 0x0000C000 // Range [-3, -2]
691 data8 0xFD238AA3E17F285C, 0x0000C000 // Range [-4, -3]
692 data8 0x9FBABBD37757E6A2, 0x0000C001 // Range [-5, -4]
693 data8 0xBFF497AC8FA06AFC, 0x0000C001 // Range [-6, -5]
694 data8 0xDFFE5FBB5C377FE8, 0x0000C001 // Range [-7, -6]
695 data8 0xFFFFCBFC0ACE7879, 0x0000C001 // Range [-8, -7]
696 data8 0x8FFFFD1C425E8100, 0x0000C002 // Range [-9, -8]
697 data8 0x9FFFFFB606BDFDCD, 0x0000C002 // Range [-10, -9]
698 data8 0xAFFFFFF9466E9F1B, 0x0000C002 // Range [-11, -10]
699 data8 0xBFFFFFFF70893874, 0x0000C002 // Range [-12, -11]
700 data8 0xCFFFFFFFF4F6DCF6, 0x0000C002 // Range [-13, -12]
701 data8 0xDFFFFFFFFF36345B, 0x0000C002 // Range [-14, -13]
702 data8 0xEFFFFFFFFFF28C06, 0x0000C002 // Range [-15, -14]
703 data8 0xFFFFFFFFFFFF28C0, 0x0000C002 // Range [-16, -15]
704 data8 0x87FFFFFFFFFFF9AB, 0x0000C003 // Range [-17, -16]
705 data8 0x8FFFFFFFFFFFFFA6, 0x0000C003 // Range [-18, -17]
706 data8 0x97FFFFFFFFFFFFFB, 0x0000C003 // Range [-19, -18]
707 data8 0x0000000000000000, 0x00000000 // pad to keep logic in the main path
708 // List of bounds of ranges with special polynomial approximation near root
709 // Only significands of bounds are actually stored
710 data8 0xB235880944CC758E, 0xADD2F1A9FBE76C8B // Bounds for root on [-3, -2]
711 data8 0xFD8E7844F307B07C, 0xFCA655C2152BDE4D // Bounds for root on [-4, -3]
712 data8 0x9FC4D876EE546967, 0x9FAEE4AF68BC4292 // Bounds for root on [-5, -4]
713 data8 0xBFF641FFBFCC44F1, 0xBFF2A47919F4BA89 // Bounds for root on [-6, -5]
714 data8 0xDFFE9C803DEFDD59, 0xDFFE18932EB723FE // Bounds for root on [-7, -6]
715 data8 0xFFFFD393FA47AFC3, 0xFFFFC317CF638AE1 // Bounds for root on [-8, -7]
716 data8 0x8FFFFD8840279925, 0x8FFFFC9DCECEEE92 // Bounds for root on [-9, -8]
717 data8 0x9FFFFFC0D34E2AF8, 0x9FFFFFA9619AA3B7 // Bounds for root on [-10, -9]
718 data8 0xAFFFFFFA41C18246, 0xAFFFFFF82025A23C // Bounds for root on [-11, -10]
719 data8 0xBFFFFFFF857ACB4E, 0xBFFFFFFF58032378 // Bounds for root on [-12, -11]
720 data8 0xCFFFFFFFF6934AB8, 0xCFFFFFFFF313EF0A // Bounds for root on [-13, -12]
721 data8 0xDFFFFFFFFF53A9E9, 0xDFFFFFFFFF13B5A5 // Bounds for root on [-14, -13]
722 data8 0xEFFFFFFFFFF482CB, 0xEFFFFFFFFFF03F4F // Bounds for root on [-15, -14]
723 data8 0xFFFFFFFFFFFF482D, 0xFFFFFFFFFFFF03F5 // Bounds for root on [-16, -15]
724 data8 0x87FFFFFFFFFFFA98, 0x87FFFFFFFFFFF896 // Bounds for root on [-17, -16]
725 data8 0x8FFFFFFFFFFFFFB3, 0x8FFFFFFFFFFFFF97 // Bounds for root on [-18, -17]
726 data8 0x97FFFFFFFFFFFFFC, 0x97FFFFFFFFFFFFFB // Bounds for root on [-19, -18]
727 LOCAL_OBJECT_END(lgammal_right_roots_data)
728
729 LOCAL_OBJECT_START(lgammal_0_Half_data)
730 // Polynomial coefficients for the lgammal(x), 0.0 < |x| < 0.5
730 // (A1..A22 of P = A1*|x| + ... + A22*|x|^22; A3, A2, A1 come first, in that
730 // order, each as a pair of double precision numbers, followed by A4..A22
730 // stored as a 64-bit significand plus a sign/exponent word)
731 data8 0xBFD9A4D55BEAB2D6, 0xBC8AA3C097746D1F //A3 (pair of doubles: high part, low part)
732 data8 0x3FEA51A6625307D3, 0x3C7180E7BD2D0DCC //A2 (pair of doubles: high part, low part)
733 data8 0xBFE2788CFC6FB618, 0xBC9E9346C4692BCC //A1 (pair of doubles: high part, low part)
734 data8 0x8A8991563EC1BD13, 0x00003FFD //A4
735 data8 0xD45CE0BD52C27EF2, 0x0000BFFC //A5
736 data8 0xADA06587FA2BBD47, 0x00003FFC //A6
737 data8 0x9381D0ED2194902A, 0x0000BFFC //A7
738 data8 0x80859B3CF92D4192, 0x00003FFC //A8
739 data8 0xE4033517C622A946, 0x0000BFFB //A9
740 data8 0xCD00CE67A51FC82A, 0x00003FFB //A10
741 data8 0xBA44E2A96C3B5700, 0x0000BFFB //A11
742 data8 0xAAAD008FA46DBD99, 0x00003FFB //A12
743 data8 0x9D604AC65A41153D, 0x0000BFFB //A13
744 data8 0x917CECB864B5A861, 0x00003FFB //A14
745 data8 0x85A4810EB730FDE4, 0x0000BFFB //A15
746 data8 0xEF2761C38BD21F77, 0x00003FFA //A16
747 data8 0xC913043A128367DA, 0x0000BFFA //A17
748 data8 0x96A29B71FF7AFFAA, 0x00003FFA //A18
749 data8 0xBB9FFA1A5FE649BB, 0x0000BFF9 //A19
750 data8 0xB17982CD2DAA0EE3, 0x00003FF8 //A20
751 data8 0xDE1DDCBFFB9453F0, 0x0000BFF6 //A21
752 data8 0x87FBF5D7ACD9FA9D, 0x00003FF4 //A22
753 LOCAL_OBJECT_END(lgammal_0_Half_data)
754
755 LOCAL_OBJECT_START(Constants_Q)
756 // log2_hi, log2_lo, Q_4, Q_3, Q_2, and Q_1
756 // Constants for the log routine: the two parts of log(2) and the coefficients
756 // of poly = r + Q1 r^2 + ... + Q4 r^5. Each entry appears to be four 32-bit
756 // words: significand low half, significand high half, sign/exponent, zero pad.
757 data4 0x00000000,0xB1721800,0x00003FFE,0x00000000 // log2_hi
758 data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000 // log2_lo
759 data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000 // Q_4 (~ 1/5, coefficient of r^5)
760 data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000 // Q_3 (~ -1/4, coefficient of r^4)
761 data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000 // Q_2 (~ 1/3, coefficient of r^3)
762 data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000 // Q_1 (~ -1/2, coefficient of r^2)
763 LOCAL_OBJECT_END(Constants_Q)
764
765 LOCAL_OBJECT_START(Constants_Z_1)
766 // Z1 - 16 bit fixed
766 // 16 entries, one per value of index_1 = [d_1 d_2 d_3 d_4]. Each is
766 // Z_1 = 1/A_1 rounded up to fixed point with lsb = 2^(-15), so 0x8000 is 1.0.
767 data4 0x00008000 // index_1 = 0 (A_1 = 1.0)
768 data4 0x00007879
769 data4 0x000071C8
770 data4 0x00006BCB
771 data4 0x00006667
772 data4 0x00006187
773 data4 0x00005D18
774 data4 0x0000590C
775 data4 0x00005556
776 data4 0x000051EC
777 data4 0x00004EC5
778 data4 0x00004BDB
779 data4 0x00004925
780 data4 0x0000469F
781 data4 0x00004445
782 data4 0x00004211 // index_1 = 15
783 LOCAL_OBJECT_END(Constants_Z_1)
784
785 LOCAL_OBJECT_START(Constants_G_H_h1)
786 // G1 and H1 - IEEE single and h1 - IEEE double
787 data4 0x3F800000,0x00000000,0x00000000,0x00000000
788 data4 0x3F70F0F0,0x3D785196,0x617D741C,0x3DA163A6
789 data4 0x3F638E38,0x3DF13843,0xCBD3D5BB,0x3E2C55E6
790 data4 0x3F579430,0x3E2FF9A0,0xD86EA5E7,0xBE3EB0BF
791 data4 0x3F4CCCC8,0x3E647FD6,0x86B12760,0x3E2E6A8C
792 data4 0x3F430C30,0x3E8B3AE7,0x5C0739BA,0x3E47574C
793 data4 0x3F3A2E88,0x3EA30C68,0x13E8AF2F,0x3E20E30F
794 data4 0x3F321640,0x3EB9CEC8,0xF2C630BD,0xBE42885B
795 data4 0x3F2AAAA8,0x3ECF9927,0x97E577C6,0x3E497F34
796 data4 0x3F23D708,0x3EE47FC5,0xA6B0A5AB,0x3E3E6A6E
797 data4 0x3F1D89D8,0x3EF8947D,0xD328D9BE,0xBDF43E3C
798 data4 0x3F17B420,0x3F05F3A1,0x0ADB090A,0x3E4094C3
799 data4 0x3F124920,0x3F0F4303,0xFC1FE510,0xBE28FBB2
800 data4 0x3F0D3DC8,0x3F183EBF,0x10FDE3FA,0x3E3A7895
801 data4 0x3F088888,0x3F20EC80,0x7CC8C98F,0x3E508CE5
802 data4 0x3F042108,0x3F29516A,0xA223106C,0xBE534874
803 LOCAL_OBJECT_END(Constants_G_H_h1)
804
805 LOCAL_OBJECT_START(Constants_Z_2)
806 // Z2 - 16 bit fixed
806 // 16 entries, one per value of index_2 = [d_5 d_6 d_7 d_8]. Each is
806 // Z_2 = 1/A_2 rounded up to fixed point with lsb = 2^(-15), so 0x8000 is 1.0.
807 data4 0x00008000 // index_2 = 0 (A_2 = 1.0)
808 data4 0x00007F81
809 data4 0x00007F02
810 data4 0x00007E85
811 data4 0x00007E08
812 data4 0x00007D8D
813 data4 0x00007D12
814 data4 0x00007C98
815 data4 0x00007C20
816 data4 0x00007BA8
817 data4 0x00007B31
818 data4 0x00007ABB
819 data4 0x00007A45
820 data4 0x000079D1
821 data4 0x0000795D
822 data4 0x000078EB // index_2 = 15
823 LOCAL_OBJECT_END(Constants_Z_2)
824
825 LOCAL_OBJECT_START(Constants_G_H_h2)
826 // G2 and H2 - IEEE single and h2 - IEEE double
827 data4 0x3F800000,0x00000000,0x00000000,0x00000000
828 data4 0x3F7F00F8,0x3B7F875D,0x22C42273,0x3DB5A116
829 data4 0x3F7E03F8,0x3BFF015B,0x21F86ED3,0x3DE620CF
830 data4 0x3F7D08E0,0x3C3EE393,0x484F34ED,0xBDAFA07E
831 data4 0x3F7C0FC0,0x3C7E0586,0x3860BCF6,0xBDFE07F0
832 data4 0x3F7B1880,0x3C9E75D2,0xA78093D6,0x3DEA370F
833 data4 0x3F7A2328,0x3CBDC97A,0x72A753D0,0x3DFF5791
834 data4 0x3F792FB0,0x3CDCFE47,0xA7EF896B,0x3DFEBE6C
835 data4 0x3F783E08,0x3CFC15D0,0x409ECB43,0x3E0CF156
836 data4 0x3F774E38,0x3D0D874D,0xFFEF71DF,0xBE0B6F97
837 data4 0x3F766038,0x3D1CF49B,0x5D59EEE8,0xBE080483
838 data4 0x3F757400,0x3D2C531D,0xA9192A74,0x3E1F91E9
839 data4 0x3F748988,0x3D3BA322,0xBF72A8CD,0xBE139A06
840 data4 0x3F73A0D0,0x3D4AE46F,0xF8FBA6CF,0x3E1D9202
841 data4 0x3F72B9D0,0x3D5A1756,0xBA796223,0xBE1DCCC4
842 data4 0x3F71D488,0x3D693B9D,0xB6B7C239,0xBE049391
843 LOCAL_OBJECT_END(Constants_G_H_h2)
844
845 LOCAL_OBJECT_START(Constants_G_H_h3)
846 // G3 and H3 - IEEE single and h3 - IEEE double; 32 rows of four 32-bit words (two singles, then the two words of one double - presumably G3, H3, h3 in that order)
847 data4 0x3F7FFC00,0x38800100,0x562224CD,0x3D355595
848 data4 0x3F7FF400,0x39400480,0x06136FF6,0x3D8200A2
849 data4 0x3F7FEC00,0x39A00640,0xE8DE9AF0,0x3DA4D68D
850 data4 0x3F7FE400,0x39E00C41,0xB10238DC,0xBD8B4291
851 data4 0x3F7FDC00,0x3A100A21,0x3B1952CA,0xBD89CCB8
852 data4 0x3F7FD400,0x3A300F22,0x1DC46826,0xBDB10707
853 data4 0x3F7FCC08,0x3A4FF51C,0xF43307DB,0x3DB6FCB9
854 data4 0x3F7FC408,0x3A6FFC1D,0x62DC7872,0xBD9B7C47
855 data4 0x3F7FBC10,0x3A87F20B,0x3F89154A,0xBDC3725E
856 data4 0x3F7FB410,0x3A97F68B,0x62B9D392,0xBD93519D
857 data4 0x3F7FAC18,0x3AA7EB86,0x0F21BD9D,0x3DC18441
858 data4 0x3F7FA420,0x3AB7E101,0x2245E0A6,0xBDA64B95
859 data4 0x3F7F9C20,0x3AC7E701,0xAABB34B8,0x3DB4B0EC
860 data4 0x3F7F9428,0x3AD7DD7B,0x6DC40A7E,0x3D992337
861 data4 0x3F7F8C30,0x3AE7D474,0x4F2083D3,0x3DC6E17B
862 data4 0x3F7F8438,0x3AF7CBED,0x811D4394,0x3DAE314B
863 data4 0x3F7F7C40,0x3B03E1F3,0xB08F2DB1,0xBDD46F21
864 data4 0x3F7F7448,0x3B0BDE2F,0x6D34522B,0xBDDC30A4
865 data4 0x3F7F6C50,0x3B13DAAA,0xB1F473DB,0x3DCB0070
866 data4 0x3F7F6458,0x3B1BD766,0x6AD282FD,0xBDD65DDC
867 data4 0x3F7F5C68,0x3B23CC5C,0xF153761A,0xBDCDAB83
868 data4 0x3F7F5470,0x3B2BC997,0x341D0F8F,0xBDDADA40
869 data4 0x3F7F4C78,0x3B33C711,0xEBC394E8,0x3DCD1BD7
870 data4 0x3F7F4488,0x3B3BBCC6,0x52E3E695,0xBDC3532B
871 data4 0x3F7F3C90,0x3B43BAC0,0xE846B3DE,0xBDA3961E
872 data4 0x3F7F34A0,0x3B4BB0F4,0x785778D4,0xBDDADF06
873 data4 0x3F7F2CA8,0x3B53AF6D,0xE55CE212,0x3DCC3ED1
874 data4 0x3F7F24B8,0x3B5BA620,0x9E382C15,0xBDBA3103
875 data4 0x3F7F1CC8,0x3B639D12,0x5C5AF197,0x3D635A0B
876 data4 0x3F7F14D8,0x3B6B9444,0x71D34EFC,0xBDDCCB19
877 data4 0x3F7F0CE0,0x3B7393BC,0x52CD7ADA,0x3DC74502
878 data4 0x3F7F04F0,0x3B7B8B6D,0x7D7F2A42,0xBDB68F17
879 LOCAL_OBJECT_END(Constants_G_H_h3)
880
881 LOCAL_OBJECT_START(lgammal_data)
882 // Positive overflow value, stored as a 64-bit significand followed by a sign/exponent word
883 data8 0xB8D54C8BFFFDEBF4, 0x00007FF1
884 LOCAL_OBJECT_END(lgammal_data)
885
886 LOCAL_OBJECT_START(lgammal_Stirling)
887 // Coefficients needed for Stirling's formula
888 data8 0x3FED67F1C864BEB4 // High part of 0.5*ln(2*Pi)
889 data8 0x3C94D252F2400510 // Low part of 0.5*ln(2*Pi)
890 //
891 // Bernoulli numbers used in Stirling's formula for -2^63 < |x| < -13.0 (NOTE(review): "|x| < -13.0" cannot hold as written; presumably the range is on x - confirm)
892 //(B1H, B1L) = 8.3333333333333333333262747254e-02
893 data8 0x3FB5555555555555, 0x3C55555555555555
894 data8 0xB60B60B60B60B60B, 0x0000BFF6 //B2 = -2.7777777777777777777777777778e-03
895 data8 0xD00D00D00D00D00D, 0x00003FF4 //B3 = 7.9365079365079365079365079365e-04
896 data8 0x9C09C09C09C09C0A, 0x0000BFF4 //B4 = -5.9523809523809523809523809524e-04
897 data8 0xDCA8F158C7F91AB8, 0x00003FF4 //B5 = 8.4175084175084175084175084175e-04
898 data8 0xFB5586CCC9E3E410, 0x0000BFF5 //B6 = -1.9175269175269175269175269175e-03
899 data8 0xD20D20D20D20D20D, 0x00003FF7 //B7 = 6.4102564102564102564102564103e-03
900 data8 0xF21436587A9CBEE1, 0x0000BFF9 //B8 = -2.9550653594771241830065359477e-02
901 data8 0xB7F4B1C0F033FFD1, 0x00003FFC //B9 = 1.7964437236883057316493849002e-01
902 data8 0xB23B3808C0F9CF6E, 0x0000BFFF //B10 = -1.3924322169059011164274322169e+00
903 // Polynomial coefficients for Stirling's formula, -13.0 < x < -6.0 (A0 as two doubles, A1..A9 as 64-bit significand + sign/exponent word)
904 data8 0x3FB5555555555555, 0x3C4D75060289C58B //A0
905 data8 0xB60B60B60B0F0876, 0x0000BFF6 //A1
906 data8 0xD00D00CE54B1256C, 0x00003FF4 //A2
907 data8 0x9C09BF46B58F75E1, 0x0000BFF4 //A3
908 data8 0xDCA8483BC91ACC6D, 0x00003FF4 //A4
909 data8 0xFB3965C939CC9FEE, 0x0000BFF5 //A5
910 data8 0xD0723ADE3F0BC401, 0x00003FF7 //A6
911 data8 0xE1ED7434E81F0B73, 0x0000BFF9 //A7
912 data8 0x8069C6982F993283, 0x00003FFC //A8
913 data8 0xC271F65BFA5BEE3F, 0x0000BFFD //A9
914 LOCAL_OBJECT_END(lgammal_Stirling)
915
916 LOCAL_OBJECT_START(lgammal_lnsin_data)
917 // polynomial approximation of -ln(sin(Pi*x)/(Pi*x)), 0 < x <= 0.5; A2 is two doubles (presumably high and low parts), A4..A16 are 64-bit significand + sign/exponent word, A18..A36 are single doubles
918 data8 0x3FFA51A6625307D3, 0x3C81873332FAF94C //A2
919 data8 0x8A8991563EC241C3, 0x00003FFE //A4
920 data8 0xADA06588061805DF, 0x00003FFD //A6
921 data8 0x80859B57C338D0F7, 0x00003FFD //A8
922 data8 0xCD00F1C2D78754BD, 0x00003FFC //A10
923 data8 0xAAB56B1D3A1F4655, 0x00003FFC //A12
924 data8 0x924B6F2FBBED12B1, 0x00003FFC //A14
925 data8 0x80008E58765F43FC, 0x00003FFC //A16
926 data8 0x3FBC718EC115E429//A18
927 data8 0x3FB99CE544FE183E//A20
928 data8 0x3FB7251C09EAAD89//A22
929 data8 0x3FB64A970733628C//A24
930 data8 0x3FAC92D6802A3498//A26
931 data8 0x3FC47E1165261586//A28
932 data8 0xBFCA1BAA434750D4//A30
933 data8 0x3FE460001C4D5961//A32
934 data8 0xBFE6F06A3E4908AD//A34
935 data8 0x3FE300889EBB203A//A36
936 LOCAL_OBJECT_END(lgammal_lnsin_data)
937
938 LOCAL_OBJECT_START(lgammal_half_3Q_data)
939 // Polynomial coefficients for the lgammal(x), 0.5 <= x < 0.75 (rows with two labels hold two doubles; single-label rows hold a 64-bit significand followed by a sign/exponent word)
940 data8 0xBFF7A648EE90C62E, 0x3C713F326857E066 // A3, A0L
941 data8 0xBFF73E4B8BA780AE, 0xBCA953BC788877EF // A1, A1L
942 data8 0x403774DCD58D0291, 0xC0415254D5AE6623 // D0, D1
943 data8 0x40B07213855CBFB0, 0xC0B8855E25D2D229 // C20, C21
944 data8 0x3FFB359F85FF5000, 0x3C9BAECE6EF9EF3A // A2, A2L
945 data8 0x3FD717D498A3A8CC, 0xBC9088E101CFEDFA // A0, A3L
946 data8 0xAFEF36CC5AEC3FF0, 0x00004002 // E6
947 data8 0xABE2054E1C34E791, 0x00004001 // E4
948 data8 0xB39343637B2900D1, 0x00004000 // E2
949 data8 0xD74FB710D53F58F6, 0x00003FFF // E0
950 data8 0x4070655963BA4256, 0xC078DA9D263C4EA3 // D6, D7
951 data8 0x405CD2B6A9B90978, 0xC065B3B9F4F4F171 // D4, D5
952 data8 0x4049BC2204CF61FF, 0xC05337227E0BA152 // D2, D3
953 data8 0x4095509A50C07A96, 0xC0A0747949D2FB45 // C18, C19
954 data8 0x4082ECCBAD709414, 0xC08CD02FB088A702 // C16, C17
955 data8 0xFFE4B2A61B508DD5, 0x0000C002 // E7
956 data8 0xF461ADB8AE17E0A5, 0x0000C001 // E5
957 data8 0xF5BE8B0B90325F20, 0x0000C000 // E3
958 data8 0x877B275F3FB78DCA, 0x0000C000 // E1
959 LOCAL_OBJECT_END(lgammal_half_3Q_data)
960
961 LOCAL_OBJECT_START(lgammal_half_3Q_neg_data)
962 // Polynomial coefficients for the lgammal(x), -0.75 < x <= -0.5 (rows with two labels hold two doubles; single-label rows hold a 64-bit significand followed by a sign/exponent word)
963 data8 0xC014836EFD94899C, 0x3C9835679663B44F // A3, A0L
964 data8 0xBFF276C7B4FB1875, 0xBC92D3D9FA29A1C0 // A1, A1L
965 data8 0x40C5178F24E1A435, 0xC0D9DE84FBC5D76A // D0, D1
966 data8 0x41D4D1B236BF6E93, 0xC1EBB0445CE58550 // C20, C21
967 data8 0x4015718CD67F63D3, 0x3CC5354B6F04B59C // A2, A2L
968 data8 0x3FF554493087E1ED, 0xBCB72715E37B02B9 // A0, A3L
969 data8 0xE4AC7E915FA72229, 0x00004009 // E6
970 data8 0xA28244206395FCC6, 0x00004007 // E4
971 data8 0xFB045F19C07B2544, 0x00004004 // E2
972 data8 0xE5C8A6E6A9BA7D7B, 0x00004002 // E0
973 data8 0x4143943B55BF5118, 0xC158AC05EA675406 // D6, D7
974 data8 0x4118F6833D19717C, 0xC12F51A6F375CC80 // D4, D5
975 data8 0x40F00C209483481C, 0xC103F1DABF750259 // D2, D3
976 data8 0x4191038F2D8F9E40, 0xC1A413066DA8AE4A // C18, C19
977 data8 0x4170B537EDD833DE, 0xC1857E79424C61CE // C16, C17
978 data8 0x8941D8AB4855DB73, 0x0000C00B // E7
979 data8 0xBB822B131BD2E813, 0x0000C008 // E5
980 data8 0x852B4C03B83D2D4F, 0x0000C006 // E3
981 data8 0xC754CA7E2DDC0F1F, 0x0000C003 // E1
982 LOCAL_OBJECT_END(lgammal_half_3Q_neg_data)
983
984 LOCAL_OBJECT_START(lgammal_2Q_4_data)
985 // Polynomial coefficients for the lgammal(x), 2.25 <= |x| < 4.0 (rows with two labels hold two doubles; single-label rows hold a 64-bit significand followed by a sign/exponent word)
986 data8 0xBFCA4D55BEAB2D6F, 0x3C7ABC9DA14141F5 // A3, A0L
987 data8 0x3FFD8773039049E7, 0x3C66CB7957A95BA4 // A1, A1L
988 data8 0x3F45C3CC79E91E7D, 0xBF3A8E5005937E97 // D0, D1
989 data8 0x3EC951E35E1C9203, 0xBEB030A90026C5DF // C20, C21
990 data8 0x3FE94699894C1F4C, 0x3C91884D21D123F1 // A2, A2L
991 data8 0x3FE62E42FEFA39EF, 0xBC66480CEB70870F // A0, A3L
992 data8 0xF1C2EAFF0B3A7579, 0x00003FF5 // E6
993 data8 0xB36AF863926B55A3, 0x00003FF7 // E4
994 data8 0x9620656185BB44CA, 0x00003FF9 // E2
995 data8 0xA264558FB0906AFF, 0x00003FFB // E0
996 data8 0x3F03D59E9666C961, 0xBEF91115893D84A6 // D6, D7
997 data8 0x3F19333611C46225, 0xBF0F89EB7D029870 // D4, D5
998 data8 0x3F3055A96B347AFE, 0xBF243B5153E178A8 // D2, D3
999 data8 0x3ED9A4AEF30C4BB2, 0xBED388138B1CEFF2 // C18, C19
1000 data8 0x3EEF7945A3C3A254, 0xBEE36F32A938EF11 // C16, C17
1001 data8 0x9028923F47C82118, 0x0000BFF5 // E7
1002 data8 0xCE0DAAFB6DC93B22, 0x0000BFF6 // E5
1003 data8 0xA0D0983B34AC4C8D, 0x0000BFF8 // E3
1004 data8 0x94D6C50FEB8B0CE7, 0x0000BFFA // E1
1005 LOCAL_OBJECT_END(lgammal_2Q_4_data)
1006
1007 LOCAL_OBJECT_START(lgammal_4_8_data)
1008 // Polynomial coefficients for the lgammal(x), 4.0 <= |x| < 8.0 (rows with two labels hold two doubles; single-label rows hold a 64-bit significand followed by a sign/exponent word)
1009 data8 0xBFD6626BC9B31B54, 0x3CAA53C82493A92B // A3, A0L
1010 data8 0x401B4C420A50AD7C, 0x3C8C6E9929F789A3 // A1, A1L
1011 data8 0x3F49410427E928C2, 0xBF3E312678F8C146 // D0, D1
1012 data8 0x3ED51065F7CD5848, 0xBED052782A03312F // C20, C21
1013 data8 0x3FF735973273D5EC, 0x3C831DFC65BF8CCF // A2, A2L
1014 data8 0x401326643C4479C9, 0xBC6FA0498C5548A6 // A0, A3L
1015 data8 0x9382D8B3CD4EB7E3, 0x00003FF6 // E6
1016 data8 0xE9F92CAD8A85CBCD, 0x00003FF7 // E4
1017 data8 0xD58389FE38258CEC, 0x00003FF9 // E2
1018 data8 0x81310136363AE8AA, 0x00003FFC // E0
1019 data8 0x3F04F0AE38E78570, 0xBEF9E2144BB8F03C // D6, D7
1020 data8 0x3F1B5E992A6CBC2A, 0xBF10F3F400113911 // D4, D5
1021 data8 0x3F323EE00AAB7DEE, 0xBF2640FDFA9FB637 // D2, D3
1022 data8 0x3ED2143EBAFF067A, 0xBEBBDEB92D6FF35D // C18, C19
1023 data8 0x3EF173A42B69AAA4, 0xBEE78B9951A2EAA5 // C16, C17
1024 data8 0xAB3CCAC6344E52AA, 0x0000BFF5 // E7
1025 data8 0x81ACCB8915B16508, 0x0000BFF7 // E5
1026 data8 0xDA62C7221102C426, 0x0000BFF8 // E3
1027 data8 0xDF1BD44C4083580A, 0x0000BFFA // E1
1028 LOCAL_OBJECT_END(lgammal_4_8_data)
1029
1030 LOCAL_OBJECT_START(lgammal_loc_min_data)
1031 // Polynomial coefficients for the lgammal(x), 1.3125 <= x < 1.5625 (rows with two labels hold two doubles; single-label rows, including xMin, hold a 64-bit significand followed by a sign/exponent word)
1032 data8 0xBB16C31AB5F1FB71, 0x00003FFF // xMin - point of local minimum
1033 data8 0xBFC2E4278DC6BC23, 0xBC683DA8DDCA9650 // A3, A0L
1034 data8 0x3BD4DB7D0CA61D5F, 0x386E719EDD01D801 // A1, A1L
1035 data8 0x3F4CC72638E1D93F, 0xBF4228EC9953CCB9 // D0, D1
1036 data8 0x3ED222F97A04613E,0xBED3DDD58095CB6C // C20, C21
1037 data8 0x3FDEF72BC8EE38AB, 0x3C863AFF3FC48940 // A2, A2L
1038 data8 0xBFBF19B9BCC38A41, 0xBC7425F1BFFC1442// A0, A3L
1039 data8 0x941890032BEB34C3, 0x00003FF6 // E6
1040 data8 0xC7E701591CE534BC, 0x00003FF7 // E4
1041 data8 0x93373CBD05138DD4, 0x00003FF9 // E2
1042 data8 0x845A14A6A81C05D6, 0x00003FFB // E0
1043 data8 0x3F0F6C4DF6D47A13, 0xBF045DCDB5B49E19 // D6, D7
1044 data8 0x3F22E23345DDE59C, 0xBF1851159AFB1735 // D4, D5
1045 data8 0x3F37101EA4022B78, 0xBF2D721E6323AF13 // D2, D3
1046 data8 0x3EE691EBE82DF09D, 0xBEDD42550961F730 // C18, C19
1047 data8 0x3EFA793EDE99AD85, 0xBEF14000108E70BE // C16, C17
1048 data8 0xB7CBC033ACE0C99C, 0x0000BFF5 // E7
1049 data8 0xF178D1F7B1A45E27, 0x0000BFF6 // E5
1050 data8 0xA8FCFCA8106F471C, 0x0000BFF8 // E3
1051 data8 0x864D46FA898A9AD2, 0x0000BFFA // E1
1052 LOCAL_OBJECT_END(lgammal_loc_min_data)
1053
1054 LOCAL_OBJECT_START(lgammal_03Q_1Q_data)
1055 // Polynomial coefficients for the lgammal(x), 0.75 <= |x| < 1.3125 (rows with two labels hold two doubles; single-label rows hold a 64-bit significand followed by a sign/exponent word)
1056 data8 0x3FD151322AC7D848, 0x3C7184DE0DB7B4EE // A4, A2L
1057 data8 0x3FD9A4D55BEAB2D6, 0x3C9E934AAB10845F // A3, A1L
1058 data8 0x3FB111289C381259, 0x3FAFFFCFB32AE18D // D2, D3
1059 data8 0x3FB3B1D9E0E3E00D, 0x3FB2496F0D3768DF // D0, D1
1060 data8 0xBA461972C057D439, 0x00003FFB // E6
1061 data8 0x3FEA51A6625307D3, 0x3C76ABC886A72DA2 // A2, A4L
1062 data8 0x3FA8EFE46B32A70E, 0x3F8F31B3559576B6 // C17, C20
1063 data8 0xE403383700387D85, 0x00003FFB // E4
1064 data8 0x9381D0EE74BF7251, 0x00003FFC // E2
1065 data8 0x3FAA2177A6D28177, 0x3FA4895E65FBD995 // C18, C19
1066 data8 0x3FAAED2C77DBEE5D, 0x3FA94CA59385512C // D6, D7
1067 data8 0x3FAE1F522E8A5941, 0x3FAC785EF56DD87E // D4, D5
1068 data8 0x3FB556AD5FA56F0A, 0x3FA81F416E87C783 // E7, C16
1069 data8 0xCD00F1C2DC2C9F1E, 0x00003FFB // E5
1070 data8 0x3FE2788CFC6FB618, 0x3C8E52519B5B17CB // A1, A3L
1071 data8 0x80859B57C3E7F241, 0x00003FFC // E3
1072 data8 0xADA065880615F401, 0x00003FFC // E1
1073 data8 0xD45CE0BD530AB50E, 0x00003FFC // E0
1074 LOCAL_OBJECT_END(lgammal_03Q_1Q_data)
1075
1076 LOCAL_OBJECT_START(lgammal_13Q_2Q_data)
1077 // Polynomial coefficients for the lgammal(x), 1.5625 <= |x| < 2.25 (rows with two labels hold two doubles; single-label rows hold a 64-bit significand followed by a sign/exponent word)
1078 data8 0x3F951322AC7D8483, 0x3C71873D88C6539D // A4, A2L
1079 data8 0xBFB13E001A557606, 0x3C56CB907018A101 // A3, A1L
1080 data8 0xBEC11B2EC1E7F6FC, 0x3EB0064ED9824CC7 // D2, D3
1081 data8 0xBEE3CBC963EC103A, 0x3ED2597A330C107D // D0, D1
1082 data8 0xBC6F2DEBDFE66F38, 0x0000BFF0 // E6
1083 data8 0x3FD4A34CC4A60FA6, 0x3C3AFC9BF775E8A0 // A2, A4L
1084 data8 0x3E48B0C542F85B32, 0xBE347F12EAF787AB // C17, C20
1085 data8 0xE9FEA63B6984FA1E, 0x0000BFF2 // E4
1086 data8 0x9C562E15FC703BBF, 0x0000BFF5 // E2
1087 data8 0xBE3C12A50AB0355E, 0xBE1C941626AE4717 // C18, C19
1088 data8 0xBE7AFA8714342BC4,0x3E69A12D2B7761CB // D6, D7
1089 data8 0xBE9E25EF1D526730, 0x3E8C762291889B99 // D4, D5
1090 data8 0x3EF580DCEE754733, 0xBE57C811D070549C // E7, C16
1091 data8 0xD093D878BE209C98, 0x00003FF1 // E5
1092 data8 0x3FDB0EE6072093CE, 0xBC6024B9E81281C4 // A1, A3L
1093 data8 0x859B57C31CB77D96, 0x00003FF4 // E3
1094 data8 0xBD6EB756DB617E8D, 0x00003FF6 // E1
1095 data8 0xF2027E10C7AF8C38, 0x0000BFF7 // E0
1096 LOCAL_OBJECT_END(lgammal_13Q_2Q_data)
1097
1098 LOCAL_OBJECT_START(lgammal_8_10_data)
1099 // Polynomial coefficients for the lgammal(x), 8.0 <= |x| < 10.0
1100 // Multi Precision terms (two doubles each, presumably high and low parts; note A1 is stored before A0)
1101 data8 0x40312008A3A23E5C, 0x3CE020B4F2E4083A //A1
1102 data8 0x4025358E82FCB70C, 0x3CD4A5A74AF7B99C //A0
1103 // Native precision terms (64-bit significand followed by a sign/exponent word)
1104 data8 0xF0AA239FFBC616D2, 0x00004000 //A2
1105 data8 0x96A8EA798FE57D66, 0x0000BFFF //A3
1106 data8 0x8D501B7E3B9B9BDB, 0x00003FFE //A4
1107 data8 0x9EE062401F4B1DC2, 0x0000BFFD //A5
1108 data8 0xC63FD8CD31E93431, 0x00003FFC //A6
1109 data8 0x8461101709C23C30, 0x0000BFFC //A7
1110 data8 0xB96D7EA7EF3648B2, 0x00003FFB //A8
1111 data8 0x86886759D2ACC906, 0x0000BFFB //A9
1112 data8 0xC894B6E28265B183, 0x00003FFA //A10
1113 data8 0x98C4348CAD821662, 0x0000BFFA //A11
1114 data8 0xEC9B092226A94DF2, 0x00003FF9 //A12
1115 data8 0xB9F169FF9B98CDDC, 0x0000BFF9 //A13
1116 data8 0x9A3A32BB040894D3, 0x00003FF9 //A14
1117 data8 0xF9504CCC1003B3C3, 0x0000BFF8 //A15
1118 LOCAL_OBJECT_END(lgammal_8_10_data)
1119
1120 LOCAL_OBJECT_START(lgammal_03Q_6_data)
1121 // Polynomial coefficients for the lgammal(x), 0.75 <= |x| < 1.0 (first of five 26-coefficient sets stored back to back; in each set A0..A3 are two doubles, A4..A13 are 64-bit significand + sign/exponent word, A14..A25 are single doubles)
1122 data8 0xBFBC47DCA479E295, 0xBC607E6C1A379D55 //A3
1123 data8 0x3FCA051C372609ED, 0x3C7B02D73EB7D831 //A0
1124 data8 0xBFE15FAFA86B04DB, 0xBC3F52EE4A8945B5 //A1
1125 data8 0x3FD455C4FF28F0BF, 0x3C75F8C6C99F30BB //A2
1126 data8 0xD2CF04CD934F03E1, 0x00003FFA //A4
1127 data8 0xDB4ED667E29256E1, 0x0000BFF9 //A5
1128 data8 0xF155A33A5B6021BF, 0x00003FF8 //A6
1129 data8 0x895E9B9D386E0338, 0x0000BFF8 //A7
1130 data8 0xA001BE94B937112E, 0x00003FF7 //A8
1131 data8 0xBD82846E490ED048, 0x0000BFF6 //A9
1132 data8 0xE358D24EC30DBB5D, 0x00003FF5 //A10
1133 data8 0x89C4F3652446B78B, 0x0000BFF5 //A11
1134 data8 0xA86043E10280193D, 0x00003FF4 //A12
1135 data8 0xCF3A2FBA61EB7682, 0x0000BFF3 //A13
1136 data8 0x3F300900CC9200EC //A14
1137 data8 0xBF23F42264B94AE8 //A15
1138 data8 0x3F18EEF29895FE73 //A16
1139 data8 0xBF0F3C4563E3EDFB //A17
1140 data8 0x3F0387DBBC385056 //A18
1141 data8 0xBEF81B4004F92900 //A19
1142 data8 0x3EECA6692A9A5B81 //A20
1143 data8 0xBEDF61A0059C15D3 //A21
1144 data8 0x3ECDA9F40DCA0111 //A22
1145 data8 0xBEB60FE788217BAF //A23
1146 data8 0x3E9661D795DFC8C6 //A24
1147 data8 0xBE66C7756A4EDEE5 //A25
1148 // Polynomial coefficients for the lgammal(x), 1.0 <= |x| < 2.0
1149 data8 0xBFC1AE55B180726B, 0xBC7DE1BC478453F5 //A3
1150 data8 0xBFBEEB95B094C191, 0xBC53456FF6F1C9D9 //A0
1151 data8 0x3FA2AED059BD608A, 0x3C0B65CC647D557F //A1
1152 data8 0x3FDDE9E64DF22EF2, 0x3C8993939A8BA8E4 //A2
1153 data8 0xF07C206D6B100CFF, 0x00003FFA //A4
1154 data8 0xED2CEA9BA52FE7FB, 0x0000BFF9 //A5
1155 data8 0xFCE51CED52DF3602, 0x00003FF8 //A6
1156 data8 0x8D45D27872326619, 0x0000BFF8 //A7
1157 data8 0xA2B78D6BCEBE27F7, 0x00003FF7 //A8
1158 data8 0xBF6DC0996A895B6F, 0x0000BFF6 //A9
1159 data8 0xE4B9AD335AF82D79, 0x00003FF5 //A10
1160 data8 0x8A451880195362A1, 0x0000BFF5 //A11
1161 data8 0xA8BE35E63089A7A9, 0x00003FF4 //A12
1162 data8 0xCF7FA175FA11C40C, 0x0000BFF3 //A13
1163 data8 0x3F300C282FAA3B02 //A14
1164 data8 0xBF23F6AEBDA68B80 //A15
1165 data8 0x3F18F6860E2224DD //A16
1166 data8 0xBF0F542B3CE32F28 //A17
1167 data8 0x3F039436218C9BF8 //A18
1168 data8 0xBEF8AE6307677AEC //A19
1169 data8 0x3EF0B55527B3A211 //A20
1170 data8 0xBEE576AC995E7605 //A21
1171 data8 0x3ED102DDC1365D2D //A22
1172 data8 0xBEC442184F97EA54 //A23
1173 data8 0x3ED4D2283DFE5FC6 //A24
1174 data8 0xBECB9219A9B46787 //A25
1175 // Polynomial coefficients for the lgammal(x), 2.0 <= |x| < 3.0
1176 data8 0xBFCA4D55BEAB2D6F, 0xBC66F80E5BFD5AF5 //A3
1177 data8 0x3FE62E42FEFA39EF, 0x3C7ABC9E3B347E3D //A0
1178 data8 0x3FFD8773039049E7, 0x3C66CB9007C426EA //A1
1179 data8 0x3FE94699894C1F4C, 0x3C918726EB111663 //A2
1180 data8 0xA264558FB0906209, 0x00003FFB //A4
1181 data8 0x94D6C50FEB902ADC, 0x0000BFFA //A5
1182 data8 0x9620656184243D17, 0x00003FF9 //A6
1183 data8 0xA0D0983B8BCA910B, 0x0000BFF8 //A7
1184 data8 0xB36AF8559B222BD3, 0x00003FF7 //A8
1185 data8 0xCE0DACB3260AE6E5, 0x0000BFF6 //A9
1186 data8 0xF1C2C0BF0437C7DB, 0x00003FF5 //A10
1187 data8 0x902A2F2F3AB74A92, 0x0000BFF5 //A11
1188 data8 0xAE05009B1B2C6E4C, 0x00003FF4 //A12
1189 data8 0xD5B71F6456D7D4CB, 0x0000BFF3 //A13
1190 data8 0x3F2F0351D71BC9C6 //A14
1191 data8 0xBF2B53BC56A3B793 //A15
1192 data8 0xBF18B12DC6F6B861 //A16
1193 data8 0xBF43EE6EB5215C2F //A17
1194 data8 0xBF5474787CDD455E //A18
1195 data8 0xBF642B503C9C060A //A19
1196 data8 0xBF6E07D1AA254AA3 //A20
1197 data8 0xBF71C785443AAEE8 //A21
1198 data8 0xBF6F67BF81B71052 //A22
1199 data8 0xBF63E4BCCF4FFABF //A23
1200 data8 0xBF50067F8C671D5A //A24
1201 data8 0xBF29C770D680A5AC //A25
1202 // Polynomial coefficients for the lgammal(x), 4.0 <= |x| < 6.0
1203 data8 0xBFD6626BC9B31B54, 0xBC85AABE08680902 //A3
1204 data8 0x401326643C4479C9, 0x3CAA53C26F31E364 //A0
1205 data8 0x401B4C420A50AD7C, 0x3C8C76D55E57DD8D //A1
1206 data8 0x3FF735973273D5EC, 0x3C83A0B78E09188A //A2
1207 data8 0x81310136363AAB6D, 0x00003FFC //A4
1208 data8 0xDF1BD44C4075C0E6, 0x0000BFFA //A5
1209 data8 0xD58389FE38D8D664, 0x00003FF9 //A6
1210 data8 0xDA62C7221D5B5F87, 0x0000BFF8 //A7
1211 data8 0xE9F92CAD0263E157, 0x00003FF7 //A8
1212 data8 0x81ACCB8606C165FE, 0x0000BFF7 //A9
1213 data8 0x9382D8D263D1C2A3, 0x00003FF6 //A10
1214 data8 0xAB3CCBA4C853B12C, 0x0000BFF5 //A11
1215 data8 0xCA0818BBCCC59296, 0x00003FF4 //A12
1216 data8 0xF18912691CBB5BD0, 0x0000BFF3 //A13
1217 data8 0x3F323EF5D8330339 //A14
1218 data8 0xBF2641132EA571F7 //A15
1219 data8 0x3F1B5D9576175CA9 //A16
1220 data8 0xBF10F56A689C623D //A17
1221 data8 0x3F04CACA9141A18D //A18
1222 data8 0xBEFA307AC9B4E85D //A19
1223 data8 0x3EF4B625939FBE32 //A20
1224 data8 0xBECEE6AC1420F86F //A21
1225 data8 0xBE9A95AE2E485964 //A22
1226 data8 0xBF039EF47F8C09BB //A23
1227 data8 0xBF05345957F7B7A9 //A24
1228 data8 0xBEF85AE6385D4CCC //A25
1229 // Polynomial coefficients for the lgammal(x), 3.0 <= |x| < 4.0
1230 data8 0xBFCA4D55BEAB2D6F, 0xBC667B20FF46C6A8 //A3
1231 data8 0x3FE62E42FEFA39EF, 0x3C7ABC9E3B398012 //A0
1232 data8 0x3FFD8773039049E7, 0x3C66CB9070238D77 //A1
1233 data8 0x3FE94699894C1F4C, 0x3C91873D8839B1CD //A2
1234 data8 0xA264558FB0906D7E, 0x00003FFB //A4
1235 data8 0x94D6C50FEB8AFD72, 0x0000BFFA //A5
1236 data8 0x9620656185B68F14, 0x00003FF9 //A6
1237 data8 0xA0D0983B34B7088A, 0x0000BFF8 //A7
1238 data8 0xB36AF863964AA440, 0x00003FF7 //A8
1239 data8 0xCE0DAAFB5497AFB8, 0x0000BFF6 //A9
1240 data8 0xF1C2EAFA79CC2864, 0x00003FF5 //A10
1241 data8 0x9028922A839572B8, 0x0000BFF5 //A11
1242 data8 0xAE1E62F870BA0278, 0x00003FF4 //A12
1243 data8 0xD4726F681E2ABA29, 0x0000BFF3 //A13
1244 data8 0x3F30559B9A02FADF //A14
1245 data8 0xBF243ADEB1266CAE //A15
1246 data8 0x3F19303B6F552603 //A16
1247 data8 0xBF0F768C288EC643 //A17
1248 data8 0x3F039D5356C21DE1 //A18
1249 data8 0xBEF81BCA8168E6BE //A19
1250 data8 0x3EEC74A53A06AD54 //A20
1251 data8 0xBEDED52D1A5DACDF //A21
1252 data8 0x3ECCB4C2C7087342 //A22
1253 data8 0xBEB4F1FAFDFF5C2F //A23
1254 data8 0x3E94C80B52D58904 //A24
1255 data8 0xBE64A328CBE92A27 //A25
1256 LOCAL_OBJECT_END(lgammal_03Q_6_data)
1257
1258 LOCAL_OBJECT_START(lgammal_1pEps_data)
1259 // Polynomial coefficients for the lgammal(x), 1 - 2^(-7) <= |x| < 1 + 2^(-7) (A1..A4 as 64-bit significand + sign/exponent word, A5..A8 as doubles)
1260 data8 0x93C467E37DB0C7A5, 0x00003FFE //A1
1261 data8 0xD28D3312983E9919, 0x00003FFE //A2
1262 data8 0xCD26AADF559A47E3, 0x00003FFD //A3
1263 data8 0x8A8991563EC22E81, 0x00003FFD //A4
1264 data8 0x3FCA8B9C168D52FE //A5
1265 data8 0x3FC5B40CB0696370 //A6
1266 data8 0x3FC270AC2229A65D //A7
1267 data8 0x3FC0110AF10FCBFC //A8
1268 // Polynomial coefficients for the log1p(x), - 2^(-7) <= |x| < 2^(-7) (doubles, stored from P8 down to P1)
1269 data8 0x3FBC71C71C71C71C //P8
1270 data8 0xBFC0000000000000 //P7
1271 data8 0x3FC2492492492492 //P6
1272 data8 0xBFC5555555555555 //P5
1273 data8 0x3FC999999999999A //P4
1274 data8 0xBFD0000000000000 //P3
1275 data8 0x3FD5555555555555 //P2
1276 data8 0xBFE0000000000000 //P1
1277 // short version of "lnsin" polynomial (A2..A8 as 64-bit significand + sign/exponent word)
1278 data8 0xD28D3312983E9918, 0x00003FFF //A2
1279 data8 0x8A8991563EC241B6, 0x00003FFE //A4
1280 data8 0xADA06588061830A5, 0x00003FFD //A6
1281 data8 0x80859B57C31CB746, 0x00003FFD //A8
1282 LOCAL_OBJECT_END(lgammal_1pEps_data)
1283
1284 LOCAL_OBJECT_START(lgammal_neg2andHalf_data)
1285 // Polynomial coefficients for the lgammal(x), -2.005859375 <= x < -2.5 (NOTE(review): bounds look reversed, since -2.5 is below -2.005859375 - confirm intended range). Rows with two labels hold two doubles; single-label rows hold a 64-bit significand followed by a sign/exponent word
1286 data8 0xBF927781D4BB093A, 0xBC511D86D85B7045 // A3, A0L
1287 data8 0x3FF1A68793DEFC15, 0x3C9852AE2DA7DEEF // A1, A1L
1288 data8 0x408555562D45FAFD, 0xBF972CDAFE5FEFAD // D0, D1
1289 data8 0xC18682331EF492A5, 0xC1845E3E0D29606B // C20, C21
1290 data8 0x4013141822E16979, 0x3CCF8718B6E75F6C // A2, A2L
1291 data8 0xBFACCBF9F5ED0F15, 0xBBDD1AEB73297401 // A0, A3L
1292 data8 0xCCCDB17423046445, 0x00004006 // E6
1293 data8 0x800514E230A3A452, 0x00004005 // E4
1294 data8 0xAAE9A48EC162E76F, 0x00004003 // E2
1295 data8 0x81D4F88B3F3EA0FC, 0x00004002 // E0
1296 data8 0x40CF3F3E35238DA0, 0xC0F8B340945F1A7E // D6, D7
1297 data8 0x40BF89EC0BD609C6, 0xC095897242AEFEE2 // D4, D5
1298 data8 0x40A2482FF01DBC5C, 0xC02095E275FDCF62 // D2, D3
1299 data8 0xC1641354F2312A6A, 0xC17B3657F85258E9 // C18, C19
1300 data8 0xC11F964E9ECBE2C9, 0xC146D7A90F70696C // C16, C17
1301 data8 0xE7AECDE6AF8EA816, 0x0000BFEF // E7
1302 data8 0xD711252FEBBE1091, 0x0000BFEB // E5
1303 data8 0xE648BD10F8C43391, 0x0000BFEF // E3
1304 data8 0x948A1E78AA00A98D, 0x0000BFF4 // E1
1305 LOCAL_OBJECT_END(lgammal_neg2andHalf_data)
1306
1307 LOCAL_OBJECT_START(lgammal_near_neg_half_data)
1308 // Polynomial coefficients for the lgammal(x), -0.5 < x < -0.40625 (rows with two labels hold two doubles; single-label rows hold a 64-bit significand followed by a sign/exponent word)
1309 data8 0xBFC1AE55B180726C, 0x3C8053CD734E6A1D // A3, A0L
1310 data8 0x3FA2AED059BD608A, 0x3C0CD3D2CDBA17F4 // A1, A1L
1311 data8 0x40855554DBCD1E1E, 0x3F96C51AC2BEE9E1 // D0, D1
1312 data8 0xC18682331EF4927D, 0x41845E3E0D295DFC // C20, C21
1313 data8 0x4011DE9E64DF22EF, 0x3CA692B70DAD6B7B // A2, A2L
1314 data8 0x3FF43F89A3F0EDD6, 0xBC4955AED0FA087D // A0, A3L
1315 data8 0xCCCD3F1DF4A2C1DD, 0x00004006 // E6
1316 data8 0x80028ADE33C7FCD9, 0x00004005 // E4
1317 data8 0xAACA474E485507EF, 0x00004003 // E2
1318 data8 0x80F07C206D6B0ECD, 0x00004002 // E0
1319 data8 0x40CF3F3E33E83056, 0x40F8B340944633D9 // D6, D7
1320 data8 0x40BF89EC059931F0, 0x409589723307AD20 // D4, D5
1321 data8 0x40A2482FD0054824, 0x402095CE7F19D011 // D2, D3
1322 data8 0xC1641354F2313614, 0x417B3657F8525354 // C18, C19
1323 data8 0xC11F964E9ECFD21C, 0x4146D7A90F701836 // C16, C17
1324 data8 0x86A9C01F0EA11E5A, 0x0000BFF5 // E7
1325 data8 0xBF6D8469142881C0, 0x0000BFF6 // E5
1326 data8 0x8D45D277BA8255F1, 0x0000BFF8 // E3
1327 data8 0xED2CEA9BA528BCC3, 0x0000BFF9 // E1
1328 LOCAL_OBJECT_END(lgammal_near_neg_half_data)
1329
1330 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1331 ////////////// POLYNOMIAL COEFFICIENTS FOR "NEAR ROOTS" RANGES /////////////
1332 ////////////// THIS PART OF THE TABLE SHOULD BE ADDRESSED VERY RARELY /////////
1333 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1334 LOCAL_OBJECT_START(lgammal_right_roots_polynomial_data)
1335 // Polynomial coefficients for right root on [-3, -2]
1336 // Lgammal is approximated by polynomial within [-.056244 ; .158208 ] range
1337 data8 0xBBBD5E9DCD11030B, 0xB867411D9FF87DD4 //A0
1338 data8 0x3FF83FE966AF535E, 0x3CAA21235B8A769A //A1
1339 data8 0x40136EEBB002F55C, 0x3CC3959A6029838E //A2
1340 data8 0xB4A5302C53C2BEDD, 0x00003FFF //A3
1341 data8 0x8B8C6BE504F2DA1C, 0x00004002 //A4
1342 data8 0xB99CFF02593B4D98, 0x00004001 //A5
1343 data8 0x4038D32F682AA1CF //A6
1344 data8 0x403809F04EE6C5B5 //A7
1345 data8 0x40548EAA81634CEE //A8
1346 data8 0x4059297ADB6BC03D //A9
1347 data8 0x407286FB8EC5C9DA //A10
1348 data8 0x407A92E05B744CFB //A11
1349 data8 0x4091A9D4144258CD //A12
1350 data8 0x409C4D01D24F367E //A13
1351 data8 0x40B1871B9A426A83 //A14
1352 data8 0x40BE51C48BD9A583 //A15
1353 data8 0x40D2140D0C6153E7 //A16
1354 data8 0x40E0FB2C989CE4A3 //A17
1355 data8 0x40E52739AB005641 //A18
1356 data8 0x41161E3E6DDF503A //A19
1357 // Polynomial coefficients for right root on [-4, -3]
1358 // Lgammal is approximated by polynomial within [-.172797 ; .171573 ] range
1359 data8 0x3C172712B248E42E, 0x38CB8D17801A5D67 //A0
1360 data8 0x401F20A65F2FAC54, 0x3CCB9EA1817A824E //A1
1361 data8 0x4039D4D2977150EF, 0x3CDA42E149B6276A //A2
1362 data8 0xE089B8926AE2D9CB, 0x00004005 //A3
1363 data8 0x933901EBBB586C37, 0x00004008 //A4
1364 data8 0xCCD319BED1CFA1CD, 0x0000400A //A5
1365 data8 0x40D293C3F78D3C37 //A6
1366 data8 0x40FBB97AA0B6DD02 //A7
1367 data8 0x41251EA3345E5EB9 //A8
1368 data8 0x415057F65C92E7B0 //A9
1369 data8 0x41799C865241B505 //A10
1370 data8 0x41A445209EFE896B //A11
1371 data8 0x41D02D21880C953B //A12
1372 data8 0x41F9FFDE8C63E16D //A13
1373 data8 0x422504DC8302D2BE //A14
1374 data8 0x425111BF18C95414 //A15
1375 data8 0x427BCBE74A2B8EF7 //A16
1376 data8 0x42A7256F59B286F7 //A17
1377 data8 0x42D462D1586DE61F //A18
1378 data8 0x42FBB1228D6C5118 //A19
1379 // Polynomial coefficients for right root on [-5, -4]
1380 // Lgammal is approximated by polynomial within [-.163171 ; .161988 ] range
1381 data8 0x3C5840FBAFDEE5BB, 0x38CAC0336E8C490A //A0
1382 data8 0x403ACA5CF4921642, 0x3CCEDCDDA5491E56 //A1
1383 data8 0x40744415CD813F8E, 0x3CFBFEBC17E39146 //A2
1384 data8 0xAACD88D954E3E1BD, 0x0000400B //A3
1385 data8 0xCB68C710D75ED802, 0x0000400F //A4
1386 data8 0x8130F5AB997277AC, 0x00004014 //A5
1387 data8 0x41855E3DBF99EBA7 //A6
1388 data8 0x41CD14FE49C49FC2 //A7
1389 data8 0x421433DCE281F07D //A8
1390 data8 0x425C8399C7A92B6F //A9
1391 data8 0x42A45FBE67840F1A //A10
1392 data8 0x42ED68D75F9E6C98 //A11
1393 data8 0x433567291C27E5BE //A12
1394 data8 0x437F5ED7A9D9FD28 //A13
1395 data8 0x43C720A65C8AB711 //A14
1396 data8 0x441120A6C1D40B9B //A15
1397 data8 0x44596F561F2D1CBE //A16
1398 data8 0x44A3507DA81D5C01 //A17
1399 data8 0x44EF06A31E39EEDF //A18
1400 data8 0x45333774C99F523F //A19
1401 // Polynomial coefficients for right root on [-6, -5]
1402 // Lgammal is approximated by polynomial within [-.156450 ; .156126 ] range
1403 data8 0x3C71B82D6B2B3304, 0x3917186E3C0DC231 //A0
1404 data8 0x405ED72E0829AE02, 0x3C960C25157980EB //A1
1405 data8 0x40BCECC32EC22F9B, 0x3D5D8335A32F019C //A2
1406 data8 0x929EC2B1FB931F17, 0x00004012 //A3
1407 data8 0xD112EF96D37316DE, 0x00004018 //A4
1408 data8 0x9F00BB9BB13416AB, 0x0000401F //A5
1409 data8 0x425F7D8D5BDCB223 //A6
1410 data8 0x42C9A8D00C776CC6 //A7
1411 data8 0x433557FD8C481424 //A8
1412 data8 0x43A209221A953EF0 //A9
1413 data8 0x440EDC98D5618AB7 //A10
1414 data8 0x447AABD25E367378 //A11
1415 data8 0x44E73DE20CC3B288 //A12
1416 data8 0x455465257B4E0BD8 //A13
1417 data8 0x45C2011532085353 //A14
1418 data8 0x462FEE4CC191945B //A15
1419 data8 0x469C63AEEFEF0A7F //A16
1420 data8 0x4709D045390A3810 //A17
1421 data8 0x4778D360873C9F64 //A18
1422 data8 0x47E26965BE9A682A //A19
1423 // Polynomial coefficients for right root on [-7, -6]
1424 // Lgammal is approximated by polynomial within [-.154582 ; .154521 ] range
1425 data8 0x3C75F103A1B00A48, 0x391C041C190C726D //A0
1426 data8 0x40869DE49E3AF2AA, 0x3D1C17E1F813063B //A1
1427 data8 0x410FCE23484CFD10, 0x3DB6F38C2F11DAB9 //A2
1428 data8 0xEF281D1E1BE2055A, 0x00004019 //A3
1429 data8 0xFCE3DA92AC55DFF8, 0x00004022 //A4
1430 data8 0x8E9EA838A20BD58E, 0x0000402C //A5
1431 data8 0x4354F21E2FB9E0C9 //A6
1432 data8 0x43E9500994CD4F09 //A7
1433 data8 0x447F3A2C23C033DF //A8
1434 data8 0x45139152656606D8 //A9
1435 data8 0x45A8D45F8D3BF2E8 //A10
1436 data8 0x463FD32110E5BFE5 //A11
1437 data8 0x46D490B3BDBAE0BE //A12
1438 data8 0x476AC3CAD905DD23 //A13
1439 data8 0x48018558217AD473 //A14
1440 data8 0x48970AF371D30585 //A15
1441 data8 0x492E6273A8BEFFE3 //A16
1442 data8 0x49C47CC9AE3F1073 //A17
1443 data8 0x4A5D38E8C35EFF45 //A18
1444 data8 0x4AF0123E89694CD8 //A19
1445 // Polynomial coefficients for right root on [-8, -7]
1446 // Lgammal is approximated by polynomial within [-.154217 ; .154208 ] range
1447 data8 0xBCD2507D818DDD68, 0xB97F6940EA2871A0 //A0
1448 data8 0x40B3B407AA387BCB, 0x3D6320238F2C43D1 //A1
1449 data8 0x41683E85DAAFBAC7, 0x3E148D085958EA3A //A2
1450 data8 0x9F2A95AF1E10A548, 0x00004022 //A3
1451 data8 0x92F21522F482300E, 0x0000402E //A4
1452 data8 0x90B51AB03A1F244D, 0x0000403A //A5
1453 data8 0x44628E1C70EF534F //A6
1454 data8 0x452393E2BC32D244 //A7
1455 data8 0x45E5164141F4BA0B //A8
1456 data8 0x46A712B3A8AF5808 //A9
1457 data8 0x47698FD36CEDD0F2 //A10
1458 data8 0x482C9AE6BBAA3637 //A11
1459 data8 0x48F023821857C8E9 //A12
1460 data8 0x49B2569053FC106F //A13
1461 data8 0x4A74F646D5C1604B //A14
1462 data8 0x4B3811CF5ABA4934 //A15
1463 data8 0x4BFBB5DD6C84E233 //A16
1464 data8 0x4CC05021086F637B //A17
1465 data8 0x4D8450A345B0FB49 //A18
1466 data8 0x4E43825848865DB2 //A19
1467 // Polynomial coefficients for right root on [-9, -8]
1468 // Lgammal is approximated by polynomial within [-.154160 ; .154158 ] range
1469 data8 0x3CDF4358564F2B46, 0x397969BEE6042F81 //A0
1470 data8 0x40E3B088FED67721, 0x3D82787BA937EE85 //A1
1471 data8 0x41C83A3893550EF4, 0x3E542ED57E244DA8 //A2
1472 data8 0x9F003C6DC56E0B8E, 0x0000402B //A3
1473 data8 0x92BDF64A3213A699, 0x0000403A //A4
1474 data8 0x9074F503AAD417AF, 0x00004049 //A5
1475 data8 0x4582843E1313C8CD //A6
1476 data8 0x467387BD6A7826C1 //A7
1477 data8 0x4765074E788CF440 //A8
1478 data8 0x4857004DD9D1E09D //A9
1479 data8 0x4949792ED7530EAF //A10
1480 data8 0x4A3C7F089A292ED3 //A11
1481 data8 0x4B30125BF0AABB86 //A12
1482 data8 0x4C224175195E307E //A13
1483 data8 0x4D14DC4C8B32C08D //A14
1484 data8 0x4E07F1DB2786197E //A15
1485 data8 0x4EFB8EA1C336DACB //A16
1486 data8 0x4FF03797EACD0F23 //A17
1487 data8 0x50E4304A8E68A730 //A18
1488 data8 0x51D3618FB2EC9F93 //A19
1489 // Polynomial coefficients for right root on [-10, -9]
1490 // Lgammal is approximated by polynomial within [-.154152 ; .154152 ] range
1491 data8 0x3D42F34DA97ECF0C, 0x39FD1256F345B0D0 //A0
1492 data8 0x4116261203919787, 0x3DC12D44055588EB //A1
1493 data8 0x422EA8F32FB7FE99, 0x3ED849CE4E7B2D77 //A2
1494 data8 0xE25BAF73477A57B5, 0x00004034 //A3
1495 data8 0xEB021FD10060504A, 0x00004046 //A4
1496 data8 0x8220A208EE206C5F, 0x00004059 //A5
1497 data8 0x46B2C3903EC9DA14 //A6
1498 data8 0x47D64393744B9C67 //A7
1499 data8 0x48FAF79CCDC604DD //A8
1500 data8 0x4A20975DB8061EBA //A9
1501 data8 0x4B44AB9CBB38DB21 //A10
1502 data8 0x4C6A032F60094FE9 //A11
1503 data8 0x4D908103927634B4 //A12
1504 data8 0x4EB516CA21D30861 //A13
1505 data8 0x4FDB1BF12C58D318 //A14
1506 data8 0x510180AAE094A553 //A15
1507 data8 0x5226A8F2A2D45D57 //A16
1508 data8 0x534E00B6B0C8B809 //A17
1509 data8 0x5475022FE21215B2 //A18
1510 data8 0x5596B02BF6C5E19B //A19
1511 // Polynomial coefficients for right root on [-11, -10]
1512 // Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
1513 data8 0x3D7AA9C2E2B1029C, 0x3A15FB37578544DB //A0
1514 data8 0x414BAF825A0C91D4, 0x3DFB9DA2CE398747 //A1
1515 data8 0x4297F3EC8AE0AF03, 0x3F34208B55FB8781 //A2
1516 data8 0xDD0C97D3197F56DE, 0x0000403E //A3
1517 data8 0x8F6F3AF7A5499674, 0x00004054 //A4
1518 data8 0xC68DA1AF6D878EEB, 0x00004069 //A5
1519 data8 0x47F1E4E1E2197CE0 //A6
1520 data8 0x494A8A28E597C3EB //A7
1521 data8 0x4AA4175D0D35D705 //A8
1522 data8 0x4BFEE6F0AF69E814 //A9
1523 data8 0x4D580FE7B3DBB3C6 //A10
1524 data8 0x4EB2ECE60E4608AF //A11
1525 data8 0x500E04BE3E2B4F24 //A12
1526 data8 0x5167F9450F0FB8FD //A13
1527 data8 0x52C342BDE747603F //A14
1528 data8 0x541F1699D557268C //A15
1529 data8 0x557927C5F079864E //A16
1530 data8 0x56D4D10FEEDB030C //A17
1531 data8 0x5832385DF86AD28A //A18
1532 data8 0x598898914B4D6523 //A19
1533 // Polynomial coefficients for right root on [-12, -11]
1534 // Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
1535 data8 0xBD96F61647C58B03, 0xBA3ABB0C2A6C755B //A0
1536 data8 0x418308A82714B70D, 0x3E1088FC6A104C39 //A1
1537 data8 0x4306A493DD613C39, 0x3FB2341ECBF85741 //A2
1538 data8 0x8FA8FE98339474AB, 0x00004049 //A3
1539 data8 0x802CCDF570BA7942, 0x00004062 //A4
1540 data8 0xF3F748AF11A32890, 0x0000407A //A5
1541 data8 0x493E3B567EF178CF //A6
1542 data8 0x4ACED38F651BA362 //A7
1543 data8 0x4C600B357337F946 //A8
1544 data8 0x4DF0F71A52B54CCF //A9
1545 data8 0x4F8229F3B9FA2C70 //A10
1546 data8 0x5113A4C4979B770E //A11
1547 data8 0x52A56BC367F298D5 //A12
1548 data8 0x543785CF31842DC0 //A13
1549 data8 0x55C9FC37E3E40896 //A14
1550 data8 0x575CD5D1BA556C82 //A15
1551 data8 0x58F00A7AD99A9E08 //A16
1552 data8 0x5A824088688B008D //A17
1553 data8 0x5C15F75EF7E08EBD //A18
1554 data8 0x5DA462EA902F0C90 //A19
1555 // Polynomial coefficients for right root on [-13, -12]
1556 // Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
1557 data8 0x3DC3191752ACFC9D, 0x3A26CB6629532DBF //A0
1558 data8 0x41BC8CFC051191BD, 0x3E68A84DA4E62AF2 //A1
1559 data8 0x43797926294A0148, 0x400F345FF3723CFF //A2
1560 data8 0xF26D2AF700B82625, 0x00004053 //A3
1561 data8 0xA238B24A4B1F7B15, 0x00004070 //A4
1562 data8 0xE793B5C0A41A264F, 0x0000408C //A5
1563 data8 0x4A9585BDDACE863D //A6
1564 data8 0x4C6075953448088A //A7
1565 data8 0x4E29B2F38D1FC670 //A8
1566 data8 0x4FF4619B079C440F //A9
1567 data8 0x51C05DAE118D8AD9 //A10
1568 data8 0x538A8C7F87326AD4 //A11
1569 data8 0x5555B6937588DAB3 //A12
1570 data8 0x5721E1F8B6E6A7DB //A13
1571 data8 0x58EDA1D7A77DD6E5 //A14
1572 data8 0x5AB8A9616B7DC9ED //A15
1573 data8 0x5C84942AA209ED17 //A16
1574 data8 0x5E518FC34C6F54EF //A17
1575 data8 0x601FB3F17BCCD9A0 //A18
1576 data8 0x61E61128D512FE97 //A19
1577 // Polynomial coefficients for right root on [-14, -13]
1578 // Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
1579 data8 0xBE170D646421B3F5, 0xBAAD95F79FCB5097 //A0
1580 data8 0x41F7328CBFCD9AC7, 0x3E743B8B1E8AEDB1 //A1
1581 data8 0x43F0D0FA2DBDA237, 0x40A0422D6A227B55 //A2
1582 data8 0x82082DF2D32686CC, 0x0000405F //A3
1583 data8 0x8D64EE9B42E68B43, 0x0000407F //A4
1584 data8 0xA3FFD82E08C5F1F1, 0x0000409F //A5
1585 data8 0x4BF8C49D99123454 //A6
1586 data8 0x4DFEC79DDF11342F //A7
1587 data8 0x50038615A892F6BD //A8
1588 data8 0x520929453DB32EF1 //A9
1589 data8 0x54106A7808189A7F //A10
1590 data8 0x5615A302D03C207B //A11
1591 data8 0x581CC175AA736F5E //A12
1592 data8 0x5A233E071147C017 //A13
1593 data8 0x5C29E81917243F22 //A14
1594 data8 0x5E3184B0B5AC4707 //A15
1595 data8 0x6037C11DE62D8388 //A16
1596 data8 0x6240787C4B1C9D6C //A17
1597 data8 0x6448289235E80977 //A18
1598 data8 0x664B5352C6C3449E //A19
1599 // Polynomial coefficients for right root on [-15, -14]
1600 // Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
1601 data8 0x3E562C2E34A9207D, 0x3ADC00DA3DFF7A83 //A0
1602 data8 0x42344C3B2F0D90AB, 0x3EB8A2E979F24536 //A1
1603 data8 0x4469BFFF28B50D07, 0x41181E3D05C1C294 //A2
1604 data8 0xAE38F64DCB24D9F8, 0x0000406A //A3
1605 data8 0xA5C3F52C1B350702, 0x0000408E //A4
1606 data8 0xA83BC857BCD67A1B, 0x000040B2 //A5
1607 data8 0x4D663B4727B4D80A //A6
1608 data8 0x4FA82C965B0F7788 //A7
1609 data8 0x51EAD58C02908D95 //A8
1610 data8 0x542E427970E073D8 //A9
1611 data8 0x56714644C558A818 //A10
1612 data8 0x58B3EC2040C77BAE //A11
1613 data8 0x5AF72AE6A83D45B1 //A12
1614 data8 0x5D3B214F611F5D12 //A13
1615 data8 0x5F7FF5E49C54E92A //A14
1616 data8 0x61C2E917AB765FB2 //A15
1617 data8 0x64066FD70907B4C1 //A16
1618 data8 0x664B3998D60D0F9B //A17
1619 data8 0x689178710782FA8B //A18
1620 data8 0x6AD14A66C1C7BEC3 //A19
1621 // Polynomial coefficients for right root on [-16, -15]
1622 // Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
1623 data8 0xBE6D7E7192615BAE, 0xBB0137677D7CC719 //A0
1624 data8 0x4273077763F6628C, 0x3F09250FB8FC8EC9 //A1
1625 data8 0x44E6A1BF095B1AB3, 0x4178D5A74F6CB3B3 //A2
1626 data8 0x8F8E0D5060FCC76E, 0x00004076 //A3
1627 data8 0x800CC1DCFF092A63, 0x0000409E //A4
1628 data8 0xF3AB0BA9D14CDA72, 0x000040C5 //A5
1629 data8 0x4EDE3000A2F6D54F //A6
1630 data8 0x515EC613B9C8E241 //A7
1631 data8 0x53E003309FEEEA96 //A8
1632 data8 0x5660ED908D7C9A90 //A9
1633 data8 0x58E21E9B517B1A50 //A10
1634 data8 0x5B639745E4374EE2 //A11
1635 data8 0x5DE55BB626B2075D //A12
1636 data8 0x606772B7506BA747 //A13
1637 data8 0x62E9E581AB2E057B //A14
1638 data8 0x656CBAD1CF85D396 //A15
1639 data8 0x67EFF4EBD7989872 //A16
1640 data8 0x6A722D2B19B7E2F9 //A17
1641 data8 0x6CF5DEB3073B0743 //A18
1642 data8 0x6F744AC11550B93A //A19
1643 // Polynomial coefficients for right root on [-17, -16]
1644 // Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
1645 data8 0xBEDCC6291188207E, 0xBB872E3FDD48F5B7 //A0
1646 data8 0x42B3076EE7525EF9, 0x3F6687A5038CA81C //A1
1647 data8 0x4566A1AAD96EBCB5, 0x421F0FEDFBF548D2 //A2
1648 data8 0x8F8D4D3DE9850DBA, 0x00004082 //A3
1649 data8 0x800BDD6DA2CE1859, 0x000040AE //A4
1650 data8 0xF3A8EC4C9CDC1CE5, 0x000040D9 //A5
1651 data8 0x505E2FAFDB812628 //A6
1652 data8 0x531EC5B3A7508719 //A7
1653 data8 0x55E002F77E99B628 //A8
1654 data8 0x58A0ED4C9B4DAE54 //A9
1655 data8 0x5B621E4A8240F90C //A10
1656 data8 0x5E2396E5C8849814 //A11
1657 data8 0x60E55B43D8C5CE71 //A12
1658 data8 0x63A7722F5D45D01D //A13
1659 data8 0x6669E4E010DCE45A //A14
1660 data8 0x692CBA120D5E78F6 //A15
1661 data8 0x6BEFF4045350B22E //A16
1662 data8 0x6EB22C9807C21819 //A17
1663 data8 0x7175DE20D04617C4 //A18
1664 data8 0x74344AB87C6D655F //A19
1665 // Polynomial coefficients for right root on [-18, -17]
1666 // Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
1667 data8 0xBF28AEEE7B61D77C, 0xBBDBBB5FC57ABF79 //A0
1668 data8 0x42F436F56B3B8A0C, 0x3FA43EE3C5C576E9 //A1
1669 data8 0x45E98A22535D115D, 0x42984678BE78CC48 //A2
1670 data8 0xAC176F3775E6FCFC, 0x0000408E //A3
1671 data8 0xA3114F53A9FEB922, 0x000040BE //A4
1672 data8 0xA4D168A8334ABF41, 0x000040EE //A5
1673 data8 0x51E5B0E7EC7182BB //A6
1674 data8 0x54E77D67B876EAB6 //A7
1675 data8 0x57E9F7C30C09C4B6 //A8
1676 data8 0x5AED29B0488614CA //A9
1677 data8 0x5DF09486F87E79F9 //A10
1678 data8 0x60F30B199979654E //A11
1679 data8 0x63F60E02C7DCCC5F //A12
1680 data8 0x66F9B8A00EB01684 //A13
1681 data8 0x69FE2D3ED0700044 //A14
1682 data8 0x6D01C8363C7DCC84 //A15
1683 data8 0x700502B29C2F06E3 //A16
1684 data8 0x730962B4500F4A61 //A17
1685 data8 0x76103C6ED099192A //A18
1686 data8 0x79100C7132CFD6E3 //A19
1687 // Polynomial coefficients for right root on [-19, -18]
1688 // Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
1689 data8 0x3F3C19A53328A0C3, 0x3BE04ADC3FBE1458 //A0
1690 data8 0x4336C16C16C16C19, 0x3FE58CE3AC4A7C28 //A1
1691 data8 0x46702E85C0898B70, 0x432C922E412CEC6E //A2
1692 data8 0xF57B99A1C034335D, 0x0000409A //A3
1693 data8 0x82EC9634223DF909, 0x000040CF //A4
1694 data8 0x94F66D7557E2EA60, 0x00004103 //A5
1695 data8 0x5376118B79AE34D0 //A6
1696 data8 0x56BAE7106D52E548 //A7
1697 data8 0x5A00BD48CC8E25AB //A8
1698 data8 0x5D4529722821B493 //A9
1699 data8 0x608B1654AF31BBC1 //A10
1700 data8 0x63D182CC98AEA859 //A11
1701 data8 0x6716D43D5EEB05E8 //A12
1702 data8 0x6A5DF884FC172E1C //A13
1703 data8 0x6DA3CA7EBB97976B //A14
1704 data8 0x70EA416D0BE6D2EF //A15
1705 data8 0x743176C31EBB65F2 //A16
1706 data8 0x7777C401A8715CF9 //A17
1707 data8 0x7AC1110C6D350440 //A18
1708 data8 0x7E02D0971CF84865 //A19
1709 // Polynomial coefficients for right root on [-20, -19]
1710 // Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
1711 data8 0xBFAB767F9BE21803, 0xBC5ACEF5BB1BD8B5 //A0
1712 data8 0x4379999999999999, 0x4029241C7F5914C8 //A1
1713 data8 0x46F47AE147AE147A, 0x43AC2979B64B9D7E //A2
1714 data8 0xAEC33E1F67152993, 0x000040A7 //A3
1715 data8 0xD1B71758E219616F, 0x000040DF //A4
1716 data8 0x8637BD05AF6CF468, 0x00004118 //A5
1717 data8 0x55065E9F80F293DE //A6
1718 data8 0x588EADA78C44EE66 //A7
1719 data8 0x5C15798EE22DEF09 //A8
1720 data8 0x5F9E8ABFD644FA63 //A9
1721 data8 0x6325FD7FE29BD7CD //A10
1722 data8 0x66AFFC5C57E1F802 //A11
1723 data8 0x6A3774CD7D5C0181 //A12
1724 data8 0x6DC152724DE2A6FE //A13
1725 data8 0x7149BB138EB3D0C2 //A14
1726 data8 0x74D32FF8A70896C2 //A15
1727 data8 0x785D3749F9C72BD7 //A16
1728 data8 0x7BE5CCF65EBC4E40 //A17
1729 data8 0x7F641A891B5FC652 //A18
1730 data8 0x7FEFFFFFFFFFFFFF //A19
1731 LOCAL_OBJECT_END(lgammal_right_roots_polynomial_data)
1732
1733 LOCAL_OBJECT_START(lgammal_left_roots_polynomial_data)
// Polynomial coefficient table for the "left" roots of lgammal.
// It holds 17 coefficient sets, one per unit interval from [-3, -2] down to
// [-19, -18]; every set lists 20 coefficients A0..A19 in this order:
//   A0..A2  - two data8 words per coefficient
//             (NOTE(review): looks like a high/low pair of doubles - confirm
//             against the code that loads this table)
//   A3..A5  - a data8 significand followed by a data8 sign/exponent word
//             (NOTE(review): presumably an 80-bit extended value - confirm)
//   A6..A19 - one data8 word per coefficient
//             (NOTE(review): presumably double precision - confirm)
// The "within [a ; b] range" line of each set gives the argument range in
// which that polynomial is used.
1734 // Polynomial coefficients for left root on [-3, -2]
1735 // Lgammal is approximated by polynomial within [.084641 ; -.059553 ] range
1736 data8 0xBC0844590979B82E, 0xB8BC7CE8CE2ECC3B //A0
1737 data8 0xBFFEA12DA904B18C, 0xBC91A6B2BAD5EF6E //A1
1738 data8 0x4023267F3C265A51, 0x3CD7055481D03AED //A2
1739 data8 0xA0C2D618645F8E00, 0x0000C003 //A3
1740 data8 0xFA8256664F8CD2BE, 0x00004004 //A4
1741 data8 0xC2C422C103F57158, 0x0000C006 //A5
1742 data8 0x4084373F7CC70AF5 //A6
1743 data8 0xC0A12239BDD6BB95 //A7
1744 data8 0x40BDBA65E2709397 //A8
1745 data8 0xC0DA2D2504DFB085 //A9
1746 data8 0x40F758173CA5BF3C //A10
1747 data8 0xC11506C65C267E72 //A11
1748 data8 0x413318EE3A6B05FC //A12
1749 data8 0xC1517767F247DA98 //A13
1750 data8 0x41701237B4754D73 //A14
1751 data8 0xC18DB8A03BC5C3D8 //A15
1752 data8 0x41AB80953AC14A07 //A16
1753 data8 0xC1C9B7B76638D0A4 //A17
1754 data8 0x41EA727E3033E2D9 //A18
1755 data8 0xC20812C297729142 //A19
1756 //
1757 // Polynomial coefficients for left root on [-4, -3]
1758 // Lgammal is approximated by polynomial within [.147147 ; -.145158 ] range
1759 data8 0xBC3130AE5C4F54DB, 0xB8ED23294C13398A //A0
1760 data8 0xC034B99D966C5646, 0xBCE2E5FE3BC3DBB9 //A1
1761 data8 0x406F76DEAE0436BD, 0x3D14974DDEC057BD //A2
1762 data8 0xE929ACEA5979BE96, 0x0000C00A //A3
1763 data8 0xF47C14F8A0D52771, 0x0000400E //A4
1764 data8 0x88B7BC036937481C, 0x0000C013 //A5
1765 data8 0x4173E8F3AB9FC266 //A6
1766 data8 0xC1B7DBBE062FB11B //A7
1767 data8 0x41FD2F76DE7A47A7 //A8
1768 data8 0xC242225FE53B124D //A9
1769 data8 0x4286D12AE2FBFA30 //A10
1770 data8 0xC2CCFFC267A3C4C0 //A11
1771 data8 0x431294E10008E014 //A12
1772 data8 0xC357FAC8C9A2DF6A //A13
1773 data8 0x439F2190AB9FAE01 //A14
1774 data8 0xC3E44C1D8E8C67C3 //A15
1775 data8 0x442A8901105D5A38 //A16
1776 data8 0xC471C4421E908C3A //A17
1777 data8 0x44B92CD4D59D6D17 //A18
1778 data8 0xC4FB3A078B5247FA //A19
1779 // Polynomial coefficients for left root on [-5, -4]
1780 // Lgammal is approximated by polynomial within [.155671 ; -.155300 ] range
1781 data8 0xBC57BF3C6E8A94C1, 0xB902FB666934AC9E //A0
1782 data8 0xC05D224A3EF9E41F, 0xBCF6F5713913E440 //A1
1783 data8 0x40BB533C678A3955, 0x3D688E53E3C72538 //A2
1784 data8 0x869FBFF732E99B84, 0x0000C012 //A3
1785 data8 0xBA9537AD61392DEC, 0x00004018 //A4
1786 data8 0x89EAE8B1DEA06B05, 0x0000C01F //A5
1787 data8 0x425A8C5C53458D3C //A6
1788 data8 0xC2C5068B3ED6509B //A7
1789 data8 0x4330FFA575E99B4E //A8
1790 data8 0xC39BEC12DDDF7669 //A9
1791 data8 0x44073825725F74F9 //A10
1792 data8 0xC47380EBCA299047 //A11
1793 data8 0x44E084DD9B666437 //A12
1794 data8 0xC54C2DA6BF787ACF //A13
1795 data8 0x45B82D65C8D6FA42 //A14
1796 data8 0xC624D62113FE950A //A15
1797 data8 0x469200CC19B45016 //A16
1798 data8 0xC6FFDDC6DD938E2E //A17
1799 data8 0x476DD7C07184B9F9 //A18
1800 data8 0xC7D554A30085C052 //A19
1801 // Polynomial coefficients for left root on [-6, -5]
1802 // Lgammal is approximated by polynomial within [.157425 ; -.157360 ] range
1803 data8 0x3C9E20A87C8B79F1, 0x39488BE34B2427DB //A0
1804 data8 0xC08661F6A43A5E12, 0xBD3D912526D759CC //A1
1805 data8 0x410F79DCB794F270, 0x3DB9BEE7CD3C1BF5 //A2
1806 data8 0xEB7404450D0005DB, 0x0000C019 //A3
1807 data8 0xF7AE9846DFE4D4AB, 0x00004022 //A4
1808 data8 0x8AF535855A95B6DA, 0x0000C02C //A5
1809 data8 0x43544D54E9FE240E //A6
1810 data8 0xC3E8684E40CE6CFC //A7
1811 data8 0x447DF44C1D803454 //A8
1812 data8 0xC512AC305439B2BA //A9
1813 data8 0x45A79226AF79211A //A10
1814 data8 0xC63E0DFF7244893A //A11
1815 data8 0x46D35216C3A83AF3 //A12
1816 data8 0xC76903BE0C390E28 //A13
1817 data8 0x48004A4DECFA4FD5 //A14
1818 data8 0xC8954FBD243DB8BE //A15
1819 data8 0x492BF3A31EB18DDA //A16
1820 data8 0xC9C2C6A864521F3A //A17
1821 data8 0x4A5AB127C62E8DA1 //A18
1822 data8 0xCAECF60EF3183C57 //A19
1823 // Polynomial coefficients for left root on [-7, -6]
1824 // Lgammal is approximated by polynomial within [.157749 ; -.157739 ] range
1825 data8 0x3CC9B9E8B8D551D6, 0x3961813C8E1E10DB //A0
1826 data8 0xC0B3ABF7A5CEA91F, 0xBD55638D4BCB4CC4 //A1
1827 data8 0x4168349A25504236, 0x3E0287ECE50CCF76 //A2
1828 data8 0x9EC8ED6E4C219E67, 0x0000C022 //A3
1829 data8 0x9279EB1B799A3FF3, 0x0000402E //A4
1830 data8 0x90213EF8D9A5DBCF, 0x0000C03A //A5
1831 data8 0x4462775E857FB71C //A6
1832 data8 0xC52377E70B45FDBF //A7
1833 data8 0x45E4F3D28EDA8C28 //A8
1834 data8 0xC6A6E85571BD2D0B //A9
1835 data8 0x47695BB17E74DF74 //A10
1836 data8 0xC82C5AC0ED6A662F //A11
1837 data8 0x48EFF8159441C2E3 //A12
1838 data8 0xC9B22602C1B68AE5 //A13
1839 data8 0x4A74BA8CE7B34100 //A14
1840 data8 0xCB37C7E208482E4B //A15
1841 data8 0x4BFB5A1D57352265 //A16
1842 data8 0xCCC01CB3021212FF //A17
1843 data8 0x4D841613AC3431D1 //A18
1844 data8 0xCE431C9E9EE43AD9 //A19
1845 // Polynomial coefficients for left root on [-8, -7]
1846 // Lgammal is approximated by polynomial within [.157799 ; -.157798 ] range
1847 data8 0xBCF9C7A33AD9478C, 0xB995B0470F11E5ED //A0
1848 data8 0xC0E3AF76FE4C2F8B, 0xBD8DBCD503250511 //A1
1849 data8 0x41C838E76CAAF0D5, 0x3E5D79F5E2E069C3 //A2
1850 data8 0x9EF345992B262CE0, 0x0000C02B //A3
1851 data8 0x92AE0292985FD559, 0x0000403A //A4
1852 data8 0x90615420C08F7D8C, 0x0000C049 //A5
1853 data8 0x45828139342CEEB7 //A6
1854 data8 0xC67384066C31E2D3 //A7
1855 data8 0x476502BC4DAC2C35 //A8
1856 data8 0xC856FAADFF22ADC6 //A9
1857 data8 0x49497243255AB3CE //A10
1858 data8 0xCA3C768489520F6B //A11
1859 data8 0x4B300D1EA47AF838 //A12
1860 data8 0xCC223B0508AC620E //A13
1861 data8 0x4D14D46583338CD8 //A14
1862 data8 0xCE07E7A87AA068E4 //A15
1863 data8 0x4EFB811AD2F8BEAB //A16
1864 data8 0xCFF0351B51508523 //A17
1865 data8 0x50E4364CCBF53100 //A18
1866 data8 0xD1D33CFD0BF96FA6 //A19
1867 // Polynomial coefficients for left root on [-9, -8]
1868 // Lgammal is approximated by polynomial within [.157806 ; -.157806 ] range
1869 data8 0x3D333E4438B1B9D4, 0x39E7B956B83964C1 //A0
1870 data8 0xC11625EDFC63DCD8, 0xBDCF39625709EFAC //A1
1871 data8 0x422EA8C150480F16, 0x3EC16ED908AB7EDD //A2
1872 data8 0xE2598725E2E11646, 0x0000C034 //A3
1873 data8 0xEAFF2346DE3EBC98, 0x00004046 //A4
1874 data8 0x821E90DE12A0F05F, 0x0000C059 //A5
1875 data8 0x46B2C334AE5366FE //A6
1876 data8 0xC7D64314B43191B6 //A7
1877 data8 0x48FAF6ED5899E01B //A8
1878 data8 0xCA2096E4472AF37D //A9
1879 data8 0x4B44AAF49FB7E4C8 //A10
1880 data8 0xCC6A02469F2BD920 //A11
1881 data8 0x4D9080626D2EFC07 //A12
1882 data8 0xCEB515EDCF0695F7 //A13
1883 data8 0x4FDB1AC69BF36960 //A14
1884 data8 0xD1017F8274339270 //A15
1885 data8 0x5226A684961BAE2F //A16
1886 data8 0xD34E085C088404A5 //A17
1887 data8 0x547511892FF8960E //A18
1888 data8 0xD5968FA3B1ED67A9 //A19
1889 // Polynomial coefficients for left root on [-10, -9]
1890 // Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
1891 data8 0xBD355818A2B42BA2, 0xB9B7320B6A0D61EA //A0
1892 data8 0xC14BAF7DA5F3770E, 0xBDE64AF9A868F719 //A1
1893 data8 0x4297F3E8791F9CD3, 0x3F2A553E59B4835E //A2
1894 data8 0xDD0C5F7E551BD13C, 0x0000C03E //A3
1895 data8 0x8F6F0A3B2EB08BBB, 0x00004054 //A4
1896 data8 0xC68D4D5AD230BA08, 0x0000C069 //A5
1897 data8 0x47F1E4D8C35D1A3E //A6
1898 data8 0xC94A8A191DB0A466 //A7
1899 data8 0x4AA4174F65FE6AE8 //A8
1900 data8 0xCBFEE6D90F94E9DD //A9
1901 data8 0x4D580FD3438BE16C //A10
1902 data8 0xCEB2ECD456D50224 //A11
1903 data8 0x500E049F7FE64546 //A12
1904 data8 0xD167F92D9600F378 //A13
1905 data8 0x52C342AE2B43261A //A14
1906 data8 0xD41F15DEEDA4B67E //A15
1907 data8 0x55792638748AFB7D //A16
1908 data8 0xD6D4D760074F6E6B //A17
1909 data8 0x5832469D58ED3FA9 //A18
1910 data8 0xD988769F3DC76642 //A19
1911 // Polynomial coefficients for left root on [-11, -10]
1912 // Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
1913 data8 0xBDA050601F39778A, 0xBA0D4D1CE53E8241 //A0
1914 data8 0xC18308A7D8EA4039, 0xBE370C379D3EAD41 //A1
1915 data8 0x4306A49380644E6C, 0x3FBBB143C0E7B5C8 //A2
1916 data8 0x8FA8FB233E4AA6D2, 0x0000C049 //A3
1917 data8 0x802CC9D8AEAC207D, 0x00004062 //A4
1918 data8 0xF3F73EE651A37A13, 0x0000C07A //A5
1919 data8 0x493E3B550A7B9568 //A6
1920 data8 0xCACED38DAA060929 //A7
1921 data8 0x4C600B346BAB3BC6 //A8
1922 data8 0xCDF0F719193E3D26 //A9
1923 data8 0x4F8229F24528B151 //A10
1924 data8 0xD113A4C2D32FBBE2 //A11
1925 data8 0x52A56BC13DC4474D //A12
1926 data8 0xD43785CFAF5E3CE3 //A13
1927 data8 0x55C9FC3EA5941202 //A14
1928 data8 0xD75CD545A3341AF5 //A15
1929 data8 0x58F009911F77C282 //A16
1930 data8 0xDA8246294D210BEC //A17
1931 data8 0x5C1608AAC32C3A8E //A18
1932 data8 0xDDA446E570A397D5 //A19
1933 // Polynomial coefficients for left root on [-12, -11]
1934 // Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
1935 data8 0x3DEACBB3081C502E, 0x3A8AA6F01DEDF745 //A0
1936 data8 0xC1BC8CFBFB0A9912, 0xBE6556B6504A2AE6 //A1
1937 data8 0x43797926206941D7, 0x40289A9644C2A216 //A2
1938 data8 0xF26D2A78446D0839, 0x0000C053 //A3
1939 data8 0xA238B1D937FFED38, 0x00004070 //A4
1940 data8 0xE793B4F6DE470538, 0x0000C08C //A5
1941 data8 0x4A9585BDC44DC45D //A6
1942 data8 0xCC60759520342C47 //A7
1943 data8 0x4E29B2F3694C0404 //A8
1944 data8 0xCFF4619AE7B6BBAB //A9
1945 data8 0x51C05DADF52B89E8 //A10
1946 data8 0xD38A8C7F48819A4A //A11
1947 data8 0x5555B6932D687860 //A12
1948 data8 0xD721E1FACB6C1B5B //A13
1949 data8 0x58EDA1E2677C8F91 //A14
1950 data8 0xDAB8A8EC523C1F71 //A15
1951 data8 0x5C84930133F30411 //A16
1952 data8 0xDE51952FDFD1EC49 //A17
1953 data8 0x601FCCEC1BBD25F1 //A18
1954 data8 0xE1E5F2D76B610920 //A19
1955 // Polynomial coefficients for left root on [-13, -12]
1956 // Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
1957 data8 0xBE01612F373268ED, 0xBA97B7A18CDF103B //A0
1958 data8 0xC1F7328CBF7A4FAC, 0xBE89A25A6952F481 //A1
1959 data8 0x43F0D0FA2DBDA237, 0x40A0422EC1CE6084 //A2
1960 data8 0x82082DF2D32686C5, 0x0000C05F //A3
1961 data8 0x8D64EE9B42E68B36, 0x0000407F //A4
1962 data8 0xA3FFD82E08C630C9, 0x0000C09F //A5
1963 data8 0x4BF8C49D99123466 //A6
1964 data8 0xCDFEC79DDF1119ED //A7
1965 data8 0x50038615A892D242 //A8
1966 data8 0xD20929453DC8B537 //A9
1967 data8 0x54106A78083BA1EE //A10
1968 data8 0xD615A302C69E27B2 //A11
1969 data8 0x581CC175870FF16F //A12
1970 data8 0xDA233E0979E12B74 //A13
1971 data8 0x5C29E822BC568C80 //A14
1972 data8 0xDE31845DB5340FBC //A15
1973 data8 0x6037BFC6D498D5F9 //A16
1974 data8 0xE2407D92CD613E82 //A17
1975 data8 0x64483B9B62367EB7 //A18
1976 data8 0xE64B2DC830E8A799 //A19
1977 // Polynomial coefficients for left root on [-14, -13]
1978 // Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
1979 data8 0x3E563D0B930B371F, 0x3AE779957E14F012 //A0
1980 data8 0xC2344C3B2F083767, 0xBEC0B7769AA3DD66 //A1
1981 data8 0x4469BFFF28B50D07, 0x41181E3F13ED2401 //A2
1982 data8 0xAE38F64DCB24D9EE, 0x0000C06A //A3
1983 data8 0xA5C3F52C1B3506F2, 0x0000408E //A4
1984 data8 0xA83BC857BCD6BA92, 0x0000C0B2 //A5
1985 data8 0x4D663B4727B4D81A //A6
1986 data8 0xCFA82C965B0F62E9 //A7
1987 data8 0x51EAD58C02905B71 //A8
1988 data8 0xD42E427970FA56AD //A9
1989 data8 0x56714644C57D8476 //A10
1990 data8 0xD8B3EC2037EC95F2 //A11
1991 data8 0x5AF72AE68BBA5B3D //A12
1992 data8 0xDD3B2152C67AA6B7 //A13
1993 data8 0x5F7FF5F082861B8B //A14
1994 data8 0xE1C2E8BE125A5B7A //A15
1995 data8 0x64066E92FE9EBE7D //A16
1996 data8 0xE64B4201CDF9F138 //A17
1997 data8 0x689186351E58AA88 //A18
1998 data8 0xEAD132A585DFC60A //A19
1999 // Polynomial coefficients for left root on [-15, -14]
2000 // Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
2001 data8 0xBE6D7DDE12700AC1, 0xBB1E025BF1667FB5 //A0
2002 data8 0xC273077763F60AD5, 0xBF2A1698184C7A9A //A1
2003 data8 0x44E6A1BF095B1AB3, 0x4178D5AE8A4A2874 //A2
2004 data8 0x8F8E0D5060FCC767, 0x0000C076 //A3
2005 data8 0x800CC1DCFF092A57, 0x0000409E //A4
2006 data8 0xF3AB0BA9D14D37D1, 0x0000C0C5 //A5
2007 data8 0x4EDE3000A2F6D565 //A6
2008 data8 0xD15EC613B9C8C800 //A7
2009 data8 0x53E003309FEECCAA //A8
2010 data8 0xD660ED908D8B15C4 //A9
2011 data8 0x58E21E9B51A1C4AE //A10
2012 data8 0xDB639745DB82210D //A11
2013 data8 0x5DE55BB60C68FCF6 //A12
2014 data8 0xE06772BA3FCA23C6 //A13
2015 data8 0x62E9E58B4F702C31 //A14
2016 data8 0xE56CBA49B071ABE2 //A15
2017 data8 0x67EFF31E4F2BA36A //A16
2018 data8 0xEA7232C8804F32C3 //A17
2019 data8 0x6CF5EFEE929A0928 //A18
2020 data8 0xEF742EE03EC3E8FF //A19
2021 // Polynomial coefficients for left root on [-16, -15]
2022 // Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
2023 data8 0xBEDCC628FEAC7A1B, 0xBB80582C8BEBB198 //A0
2024 data8 0xC2B3076EE752595E, 0xBF5388F55AFAE53E //A1
2025 data8 0x4566A1AAD96EBCB5, 0x421F0FEFE2444293 //A2
2026 data8 0x8F8D4D3DE9850DB2, 0x0000C082 //A3
2027 data8 0x800BDD6DA2CE184C, 0x000040AE //A4
2028 data8 0xF3A8EC4C9CDC7A43, 0x0000C0D9 //A5
2029 data8 0x505E2FAFDB81263F //A6
2030 data8 0xD31EC5B3A7506CD9 //A7
2031 data8 0x55E002F77E999810 //A8
2032 data8 0xD8A0ED4C9B5C2900 //A9
2033 data8 0x5B621E4A8267C401 //A10
2034 data8 0xDE2396E5BFCFDA7A //A11
2035 data8 0x60E55B43BE6F9A79 //A12
2036 data8 0xE3A772324C7405FA //A13
2037 data8 0x6669E4E9B7E57A2D //A14
2038 data8 0xE92CB989F8A8FB37 //A15
2039 data8 0x6BEFF2368849A36E //A16
2040 data8 0xEEB23234FE191D55 //A17
2041 data8 0x7175EF5D1080B105 //A18
2042 data8 0xF4342ED7B1B7BE31 //A19
2043 // Polynomial coefficients for left root on [-17, -16]
2044 // Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
2045 data8 0xBF28AEEE7B58C790, 0xBBC4448DE371FA0A //A0
2046 data8 0xC2F436F56B3B89B1, 0xBF636755245AC63A //A1
2047 data8 0x45E98A22535D115D, 0x4298467DA93DB784 //A2
2048 data8 0xAC176F3775E6FCF2, 0x0000C08E //A3
2049 data8 0xA3114F53A9FEB908, 0x000040BE //A4
2050 data8 0xA4D168A8334AFE5A, 0x0000C0EE //A5
2051 data8 0x51E5B0E7EC7182CF //A6
2052 data8 0xD4E77D67B876D6B4 //A7
2053 data8 0x57E9F7C30C098C83 //A8
2054 data8 0xDAED29B0489EF7A7 //A9
2055 data8 0x5DF09486F8A524B8 //A10
2056 data8 0xE0F30B19910A2393 //A11
2057 data8 0x63F60E02AB3109F4 //A12
2058 data8 0xE6F9B8A3431854D5 //A13
2059 data8 0x69FE2D4A6D94218E //A14
2060 data8 0xED01C7E272A73560 //A15
2061 data8 0x7005017D82B186B6 //A16
2062 data8 0xF3096A81A69BD8AE //A17
2063 data8 0x76104951BAD67D5C //A18
2064 data8 0xF90FECC99786FD5B //A19
2065 // Polynomial coefficients for left root on [-18, -17]
2066 // Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
2067 data8 0x3F3C19A53328E26A, 0x3BE238D7BA036B3B //A0
2068 data8 0xC336C16C16C16C13, 0xBFEACE245DEC56F3 //A1
2069 data8 0x46702E85C0898B70, 0x432C922B64FD1DA4 //A2
2070 data8 0xF57B99A1C0343350, 0x0000C09A //A3
2071 data8 0x82EC9634223DF90D, 0x000040CF //A4
2072 data8 0x94F66D7557E3237D, 0x0000C103 //A5
2073 data8 0x5376118B79AE34D6 //A6
2074 data8 0xD6BAE7106D52CE49 //A7
2075 data8 0x5A00BD48CC8E11AB //A8
2076 data8 0xDD4529722833E2DF //A9
2077 data8 0x608B1654AF5F46AF //A10
2078 data8 0xE3D182CC90D8723F //A11
2079 data8 0x6716D43D46706AA0 //A12
2080 data8 0xEA5DF888C5B428D3 //A13
2081 data8 0x6DA3CA85888931A6 //A14
2082 data8 0xF0EA40EF2AC7E070 //A15
2083 data8 0x743175D1A251AFCD //A16
2084 data8 0xF777CB6E2B550D73 //A17
2085 data8 0x7AC11E468A134A51 //A18
2086 data8 0xFE02B6BDD0FC40AA //A19
2087 // Polynomial coefficients for left root on [-19, -18]
2088 // Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
2089 data8 0xBFAB767F9BE217FC, 0xBC4A5541CE0D8D0D //A0
2090 data8 0xC379999999999999, 0xC01A84981B490BE8 //A1
2091 data8 0x46F47AE147AE147A, 0x43AC2987BBC466EB //A2
2092 data8 0xAEC33E1F67152987, 0x0000C0A7 //A3
2093 data8 0xD1B71758E2196153, 0x000040DF //A4
2094 data8 0x8637BD05AF6D420E, 0x0000C118 //A5
2095 data8 0x55065E9F80F293B2 //A6
2096 data8 0xD88EADA78C44BFA7 //A7
2097 data8 0x5C15798EE22EC6CD //A8
2098 data8 0xDF9E8ABFD67895CF //A9
2099 data8 0x6325FD7FE13B0DE0 //A10
2100 data8 0xE6AFFC5C3DE70858 //A11
2101 data8 0x6A3774CE81C70D43 //A12
2102 data8 0xEDC1527412D8129F //A13
2103 data8 0x7149BABCDA8B7A72 //A14
2104 data8 0xF4D330AD49071BB5 //A15
2105 data8 0x785D4046F4C5F1FD //A16
2106 data8 0xFBE59BFEDBA73FAF //A17
2107 data8 0x7F64BEF2B2EC8DA1 //A18
2108 data8 0xFFEFFFFFFFFFFFFF //A19 (NOTE(review): if read as an IEEE double this is the most negative finite value - presumably a saturated coefficient; confirm)
2109 LOCAL_OBJECT_END(lgammal_left_roots_polynomial_data)
2110
2111
2112 //==============================================================
2113 // Code
2114 //==============================================================
2115
2116 .section .text
2117 GLOBAL_LIBM_ENTRY(__libm_lgammal)
2118 { .mfi
2119 getf.exp rSignExpX = f8
2120 // Test x for NaTVal, NaN, +/-0, +/-INF, denormals
2121 fclass.m p6,p0 = f8,0x1EF
2122 addl r17Ones = 0x1FFFF, r0 // exponent mask
2123 }
2124 { .mfi
2125 addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp
2126 fcvt.fx.s1 fXint = f8 // Convert arg to int (int repres. in FR)
2127 adds rDelta = 0x3FC, r0
2128 }
2129 ;;
2130 { .mfi
2131 getf.sig rSignifX = f8
2132 fcmp.lt.s1 p15, p14 = f8, f0
2133 shl rDelta = rDelta, 20 // single precision 1.5
2134 }
2135 { .mfi
2136 ld8 GR_ad_z_1 = [GR_ad_z_1]// get pointer to Constants_Z_1
2137 fma.s1 fTwo = f1, f1, f1 // 2.0
2138 addl rExp8 = 0x10002, r0 // exponent of 8.0
2139 }
2140 ;;
2141 { .mfi
2142 alloc rPFS_SAVED = ar.pfs, 0, 34, 4, 0 // get some registers
2143 fmerge.s fAbsX = f1, f8 // |x|
2144 and rExpX = rSignExpX, r17Ones // mask sign bit
2145 }
2146 { .mib
2147 addl rExpHalf = 0xFFFE, r0 // exponent of 0.5
2148 addl rExp2 = 0x10000, r0 // exponent of 2.0
2149 // branch out if x is NaTVal, NaN, +/-0, +/-INF, or denormalized number
2150 (p6) br.cond.spnt lgammal_spec
2151 }
2152 ;;
2153 _deno_back_to_main_path:
2154 { .mfi
2155 // Point to Constants_G_H_h1
2156 add rTbl1Addr = 0x040, GR_ad_z_1
2157 frcpa.s1 fRcpX, p0 = f1, f8 // initial approximation of 1/x
2158 extr.u GR_Index1 = rSignifX, 59, 4
2159 }
2160 { .mib
2161 (p14) cmp.ge.unc p8, p0 = rExpX, rExp8 // p8 = 1 if x >= 8.0
2162 adds rZ625 = 0x3F2, r0
2163 (p8) br.cond.spnt lgammal_big_positive // branch out if x >= 8.0
2164 }
2165 ;;
2166 { .mfi
2167 shladd rZ1offsett = GR_Index1, 2, GR_ad_z_1 // Point to Z_1
2168 fmerge.se fSignifX = f1, f8 // significand of x
2169 // Get high 15 bits of significand
2170 extr.u GR_X_0 = rSignifX, 49, 15
2171 }
2172 { .mib
2173 cmp.lt.unc p9, p0 = rExpX, rExpHalf // p9 = 1 if |x| < 0.5
2174 // set p11 if 2 <= x < 4
2175 (p14) cmp.eq.unc p11, p0 = rExpX, rExp2
2176 (p9) br.cond.spnt lgammal_0_half // branch out if |x| < 0.5
2177 }
2178 ;;
2179 { .mfi
2180 ld4 GR_Z_1 = [rZ1offsett] // Load Z_1
2181 fms.s1 fA5L = f1, f1, f8 // for 0.75 <= x < 1.3125 path
2182 shl rZ625 = rZ625, 20 // single precision 0.625
2183 }
2184 { .mib
2185 setf.s FR_MHalf = rDelta
2186 // set p10 if x >= 4.0
2187 (p14) cmp.gt.unc p10, p0 = rExpX, rExp2
2188 // branch to special path for 4.0 <= x < 8
2189 (p10) br.cond.spnt lgammal_4_8
2190 }
2191 ;;
2192 { .mfi
2193 // for 1.3125 <= x < 1.5625 path
2194 addl rPolDataPtr= @ltoff(lgammal_loc_min_data),gp
2195 // argument of polynomial approximation for 1.5625 <= x < 2.25
2196 fms.s1 fB4 = f8, f1, fTwo
2197 cmp.eq p12, p0 = rExpX, rExpHalf
2198 }
2199 { .mib
2200 addl rExpOne = 0xFFFF, r0 // exponent of 1.0
2201 // set p11 if significand of x >= 1.125 (i.e. x >= 2.25)
2202 (p11) cmp.le p11, p0 = 2, GR_Index1
2203 (p11) br.cond.spnt lgammal_2Q_4
2204 }
2205 ;;
2206 { .mfi
2207 // point to xMin for 1.3125 <= x < 1.5625 path
2208 ld8 rPolDataPtr = [rPolDataPtr]
2209 fcvt.xf fFltIntX = fXint // RTN(x)
2210 (p14) cmp.eq.unc p13, p7 = rExpX, rExpOne // p13 set if 1.0 <= x < 2.0
2211 }
2212 { .mib
2213 setf.s FR_FracX = rZ625
2214 // set p12 if |x| < 0.75
2215 (p12) cmp.gt.unc p12, p0 = 8, GR_Index1
2216 // branch out to special path for |x| < 0.75
2217 (p12) br.cond.spnt lgammal_half_3Q
2218 }
2219 ;;
2220 .pred.rel "mutex", p7, p13
2221 { .mfi
2222 getf.sig rXRnd = fXint // integer part of the input value
2223 fnma.s1 fInvX = f8, fRcpX, f1 // start of 1st NR iteration
2224 // Get bits 30-15 of X_0 * Z_1
2225 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15
2226 }
2227 { .mib
2228 (p7) cmp.eq p6, p0 = rExpX, rExp2 // p6 set if 2.0 <= x < 2.25
2229 (p13) cmp.le p6, p0 = 9, GR_Index1
2230 // branch to special path 1.5625 <= x < 2.25
2231 (p6) br.cond.spnt lgammal_13Q_2Q
2232 }
2233 ;;
2234 //
2235 // For performance, don't use result of pmpyshr2.u for 4 cycles.
2236 //
2237 { .mfi
2238 shladd GR_ad_tbl_1 = GR_Index1, 4, rTbl1Addr // Point to G_1
2239 fma.s1 fSix = fTwo, fTwo, fTwo // 6.0
2240 add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_Q
2241 }
2242 { .mib
2243 add rTmpPtr3 = -0x50, GR_ad_z_1
2244 (p13) cmp.gt p7, p0 = 5, GR_Index1
2245 // branch to special path 0.75 <= x < 1.3125
2246 (p7) br.cond.spnt lgammal_03Q_1Q
2247 }
2248 ;;
2249 { .mfi
2250 add rTmpPtr = 8, GR_ad_tbl_1
2251 fma.s1 fRoot = f8, f1, f1 // x + 1
2252 // Absolute value of int arg. Will be used as index in table with roots
2253 sub rXRnd = r0, rXRnd
2254 }
2255 { .mib
2256 ldfe fA5L = [rPolDataPtr], 16 // xMin
2257 addl rNegSingularity = 0x3003E, r0
2258 (p14) br.cond.spnt lgammal_loc_min
2259 }
2260 ;;
2261 { .mfi
2262 ldfps FR_G, FR_H = [GR_ad_tbl_1], 8 // Load G_1, H_1
2263 nop.f 0
2264 add rZ2Addr = 0x140, GR_ad_z_1 // Point to Constants_Z_2
2265 }
2266 { .mib
2267 ldfd FR_h = [rTmpPtr] // Load h_1
2268 // If arg is less or equal to -2^63
2269 cmp.geu.unc p8,p0 = rSignExpX, rNegSingularity
2270 // Singularity for x < -2^63 since all such arguments are integers
2271 // branch to special code which deals with singularity
2272 (p8) br.cond.spnt lgammal_singularity
2273 }
2274 ;;
2275 { .mfi
2276 ldfe FR_log2_hi = [GR_ad_q], 32 // Load log2_hi
2277 nop.f 0
2278 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
2279 }
2280 { .mfi
2281 ldfe FR_log2_lo = [rTmpPtr3], 32 // Load log2_lo
2282 fms.s1 fDx = f8, f1, fFltIntX // x - RTN(x)
2283 // index in table with roots and bounds
2284 adds rXint = -2, rXRnd
2285 }
2286 ;;
2287 { .mfi
2288 ldfe FR_Q4 = [GR_ad_q], 32 // Load Q4
2289 nop.f 0
2290 // set p12 if x may be close to negative root: -19.5 < x < -2.0
2291 cmp.gtu p12, p0 = 18, rXint
2292 }
2293 { .mfi
2294 shladd GR_ad_z_2 = GR_Index2, 2, rZ2Addr // Point to Z_2
2295 fma.s1 fRcpX = fInvX, fRcpX, fRcpX // end of 1st NR iteration
2296 // Point to Constants_G_H_h2
2297 add rTbl2Addr = 0x180, GR_ad_z_1
2298 }
2299 ;;
2300 { .mfi
2301 shladd GR_ad_tbl_2 = GR_Index2, 4, rTbl2Addr // Point to G_2
2302 // set p9 if x is integer and negative
2303 fcmp.eq.s1 p9, p0 = f8,fFltIntX
2304 // Point to Constants_G_H_h3
2305 add rTbl3Addr = 0x280, GR_ad_z_1
2306 }
2307 { .mfi
2308 ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
2309 nop.f 0
2310 sub GR_N = rExpX, rExpHalf, 1
2311 }
2312 ;;
2313 { .mfi
2314 ldfe FR_Q3 = [rTmpPtr3], 32 // Load Q3
2315 nop.f 0
2316 // Point to lnsin polynomial coefficients
2317 adds rLnSinDataPtr = 864, rTbl3Addr
2318 }
2319 { .mfi
2320 ldfe FR_Q2 = [GR_ad_q],32 // Load Q2
2321 nop.f 0
2322 add rTmpPtr = 8, GR_ad_tbl_2
2323 }
2324 ;;
2325 { .mfi
2326 ldfe FR_Q1 = [rTmpPtr3] // Load Q1
2327 fcmp.lt.s1 p0, p15 = fAbsX, fSix // p15 is set when |x| >= 6.0
2328 // point to table with roots and bounds
2329 adds rRootsBndAddr = -1296, GR_ad_z_1
2330 }
2331 { .mfb
2332 // Put integer N into rightmost significand
2333 setf.sig fFloatN = GR_N
2334 fma.s1 fThirteen = fSix, fTwo, f1 // 13.0
2335 // Singularity if -2^63 < x < 0 and x is integer
2336 // branch to special code which deals with singularity
2337 (p9) br.cond.spnt lgammal_singularity
2338 }
2339 ;;
2340 { .mfi
2341 ldfps FR_G2, FR_H2 = [GR_ad_tbl_2] // Load G_2, H_2
2342 // y = |x|/2^(exponent(x)) - 1.5
2343 fms.s1 FR_FracX = fSignifX, f1, FR_MHalf
2344 // Get bits 30-15 of X_1 * Z_2
2345 pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15
2346 }
2347 { .mfi
2348 ldfd FR_h2 = [rTmpPtr] // Load h_2
2349 fma.s1 fDxSqr = fDx, fDx, f0 // deltaX^2
2350 adds rTmpPtr3 = 128, rLnSinDataPtr
2351 }
2352 ;;
2353 //
2354 // For performance, don't use result of pmpyshr2.u for 4 cycles.
2355 //
2356 { .mfi
2357 getf.exp rRoot = fRoot // sign and biased exponent of (x + 1)
2358 nop.f 0
2359 // set p6 if -4 < x <= -2
2360 cmp.eq p6, p0 = rExpX, rExp2
2361 }
2362 { .mfi
2363 ldfpd fLnSin2, fLnSin2L = [rLnSinDataPtr], 16
2364 fnma.s1 fInvX = f8, fRcpX, f1 // start of 2nd NR iteration
2365 sub rIndexPol = rExpX, rExpHalf // index of polynom
2366 }
2367 ;;
2368 { .mfi
2369 ldfe fLnSin4 = [rLnSinDataPtr], 96
2370 // p10 is set if x is potential "right" root
2371 // p11 set for possible "left" root
2372 fcmp.lt.s1 p10, p11 = fDx, f0
2373 shl rIndexPol = rIndexPol, 6 // (i*16)*4
2374 }
2375 { .mfi
2376 ldfpd fLnSin18, fLnSin20 = [rTmpPtr3], 16
2377 nop.f 0
2378 mov rExp2tom7 = 0x0fff8 // Exponent of 2^-7
2379 }
2380 ;;
2381 { .mfi
2382 getf.sig rSignifDx = fDx // Get significand of RTN(x)
2383 nop.f 0
2384 // set p6 if -4 < x <= -3.0
2385 (p6) cmp.le.unc p6, p0 = 0x8, GR_Index1
2386 }
2387 { .mfi
2388 ldfpd fLnSin22, fLnSin24 = [rTmpPtr3], 16
2389 nop.f 0
2390 // mask sign bit in the exponent of (x + 1)
2391 and rRoot = rRoot, r17Ones
2392 }
2393 ;;
2394 { .mfi
2395 ldfe fLnSin16 = [rLnSinDataPtr], -80
2396 nop.f 0
2397 extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
2398 }
2399 { .mfi
2400 ldfpd fLnSin26, fLnSin28 = [rTmpPtr3], 16
2401 nop.f 0
2402 and rXRnd = 1, rXRnd
2403 }
2404 ;;
2405 { .mfi
2406 shladd GR_ad_tbl_3 = GR_Index3, 4, rTbl3Addr // Point to G_3
2407 fms.s1 fDxSqrL = fDx, fDx, fDxSqr // low part of deltaX^2
2408 // potential "left" root
2409 (p11) adds rRootsBndAddr = 560, rRootsBndAddr
2410 }
2411 { .mib
2412 ldfpd fLnSin30, fLnSin32 = [rTmpPtr3], 16
2413 // set p7 if |x+1| < 2^-7
2414 cmp.lt p7, p0 = rRoot, rExp2tom7
2415 // branch to special path for |x+1| < 2^-7
2416 (p7) br.cond.spnt _closeToNegOne
2417 }
2418 ;;
2419 { .mfi
2420 ldfps FR_G3, FR_H3 = [GR_ad_tbl_3], 8 // Load G_3, H_3
2421 fcmp.lt.s1 p14, p0 = fAbsX, fThirteen // set p14 if x > -13.0
2422 // base address of polynomial on range [-6.0, -0.75]
2423 adds rPolDataPtr = 3440, rTbl3Addr
2424 }
2425 { .mfi
2426 // (i*16)*4 + (i*16)*8 - offset of polynomial on range [-6.0, -0.75]
2427 shladd rTmpPtr = rIndexPol, 2, rIndexPol
2428 fma.s1 fXSqr = FR_FracX, FR_FracX, f0 // y^2
2429 // point to left "near root" bound
2430 (p12) shladd rRootsBndAddr = rXint, 4, rRootsBndAddr
2431 }
2432 ;;
2433 { .mfi
2434 ldfpd fLnSin34, fLnSin36 = [rTmpPtr3], 16
2435 fma.s1 fRcpX = fInvX, fRcpX, fRcpX // end of 2nd NR iteration
2436 // add special offset if -4 < x <= -3.0
2437 (p6) adds rPolDataPtr = 640, rPolDataPtr
2438 }
2439 { .mfi
2440 // point to right "near root" bound
2441 adds rTmpPtr2 = 8, rRootsBndAddr
2442 fnma.s1 fMOne = f1, f1, f0 // -1.0
2443 // Point to Bernulli numbers
2444 adds rBernulliPtr = 544, rTbl3Addr
2445 }
2446 ;;
2447 { .mfi
2448 // left bound of "near root" range
2449 (p12) ld8 rLeftBound = [rRootsBndAddr]
2450 fmerge.se fNormDx = f1, fDx // significand of DeltaX
2451 // base address + offset for polynomial coeff. on range [-6.0, -0.75]
2452 add rPolDataPtr = rPolDataPtr, rTmpPtr
2453 }
2454 { .mfi
2455 // right bound of "near root" range
2456 (p12) ld8 rRightBound = [rTmpPtr2]
2457 fcvt.xf fFloatN = fFloatN
2458 // special "Bernulli" numbers for Stirling's formula for -13 < x < -6
2459 (p14) adds rBernulliPtr = 160, rBernulliPtr
2460 }
2461 ;;
2462 { .mfi
2463 ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
2464 fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
2465 adds rTmpPtr3 = -160, rTmpPtr3
2466 }
2467 { .mfb
2468 adds rTmpPtr = 80, rPolDataPtr
2469 fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
2470 // p15 is set if -2^63 < x < -6.0 and x is not an integer
2471 // branch to path with implementation using Stirling's formula for neg. x
2472 (p15) br.cond.spnt _negStirling
2473 }
2474 ;;
2475 { .mfi
2476 ldfpd fA3, fA3L = [rPolDataPtr], 16 // A3
2477 fma.s1 fDelX4 = fDxSqr, fDxSqr, f0 // deltaX^4
2478 // Get high 4 bits of signif
2479 extr.u rIndex1Dx = rSignifDx, 59, 4
2480 }
2481 { .mfi
2482 ldfe fA5 = [rTmpPtr], -16 // A5
2483 fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
2484 adds rLnSinTmpPtr = 16, rLnSinDataPtr
2485 }
2486 ;;
2487 { .mfi
2488 ldfpd fA0, fA0L = [rPolDataPtr], 16 // A0
2489 fma.s1 fLnSin20 = fLnSin20, fDxSqr, fLnSin18
2490 // Get high 15 bits of significand
2491 extr.u rX0Dx = rSignifDx, 49, 15
2492 }
2493 { .mfi
2494 ldfe fA4 = [rTmpPtr], 192 // A4
2495 fms.s1 fXSqrL = FR_FracX, FR_FracX, fXSqr // low part of y^2
2496 shladd GR_ad_z_1 = rIndex1Dx, 2, GR_ad_z_1 // Point to Z_1
2497 }
2498 ;;
2499 { .mfi
2500 ldfpd fA1, fA1L = [rPolDataPtr], 16 // A1
2501 fma.s1 fX4 = fXSqr, fXSqr, f0 // y^4
2502 adds rTmpPtr2 = 32, rTmpPtr
2503 }
2504 { .mfi
2505 ldfpd fA18, fA19 = [rTmpPtr], 16 // A18, A19
2506 fma.s1 fLnSin24 = fLnSin24, fDxSqr, fLnSin22
2507 nop.i 0
2508 }
2509 ;;
2510 { .mfi
2511 ldfe fLnSin6 = [rLnSinDataPtr], 32
2512 fma.s1 fLnSin28 = fLnSin28, fDxSqr, fLnSin26
2513 nop.i 0
2514 }
2515 { .mfi
2516 ldfe fLnSin8 = [rLnSinTmpPtr], 32
2517 nop.f 0
2518 nop.i 0
2519 }
2520 ;;
2521 { .mfi
2522 ldfpd fA20, fA21 = [rTmpPtr], 16 // A20, A21
2523 fma.s1 fLnSin32 = fLnSin32, fDxSqr, fLnSin30
2524 nop.i 0
2525 }
2526 { .mfi
2527 ldfpd fA22, fA23 = [rTmpPtr2], 16 // A22, A23
2528 fma.s1 fB20 = f1, f1, FR_MHalf // 2.5
2529 (p12) cmp.ltu.unc p6, p0 = rSignifX, rLeftBound
2530 }
2531 ;;
2532 { .mfi
2533 ldfpd fA2, fA2L = [rPolDataPtr], 16 // A2
2534 fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
2535 // set p6 if x falls in "near root" range
2536 (p6) cmp.geu.unc p6, p0 = rSignifX, rRightBound
2537 }
2538 { .mfb
2539 adds rTmpPtr3 = -64, rTmpPtr
2540 fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
2541 // branch to special path if x falls in "near root" range
2542 (p6) br.cond.spnt _negRoots
2543 }
2544 ;;
2545 { .mfi
2546 ldfpd fA24, fA25 = [rTmpPtr2], 16 // A24, A25
2547 fma.s1 fLnSin36 = fLnSin36, fDxSqr, fLnSin34
2548 (p11) cmp.eq.unc p7, p0 = 1,rXint // p7 set if -3.0 < x < -2.5
2549 }
2550 { .mfi
2551 adds rTmpPtr = -48, rTmpPtr
2552 fma.s1 fLnSin20 = fLnSin20, fDxSqr, fLnSin16
2553 addl rDelta = 0x5338, r0 // significand of -2.605859375
2554 }
2555 ;;
2556 { .mfi
2557 getf.exp GR_N = fDx // Get N = exponent of DeltaX
2558 fma.s1 fX6 = fX4, fXSqr, f0 // y^6
2559 // p7 set if -2.605859375 <= x < -2.5
2560 (p7) cmp.gt.unc p7, p0 = rDelta, GR_X_0
2561 }
2562 { .mfb
2563 ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
2564 fma.s1 fDelX8 = fDelX4, fDelX4, f0 // deltaX^8
2565 // branch to special path for -2.605859375 <= x < -2.5
2566 (p7) br.cond.spnt _neg2andHalf
2567 }
2568 ;;
2569 { .mfi
2570 ldfpd fA14, fA15 = [rTmpPtr3], 16 // A14, A15
2571 fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
2572 adds rTmpPtr2 = 128 , rPolDataPtr
2573 }
2574 { .mfi
2575 ldfpd fA16, fA17 = [rTmpPtr], 16 // A16, A17
2576 fma.s1 fLnSin28 = fLnSin28, fDelX4, fLnSin24
2577 adds rPolDataPtr = 144 , rPolDataPtr
2578 }
2579 ;;
2580 { .mfi
2581 ldfe fLnSin10 = [rLnSinDataPtr], 32
2582 fma.s1 fRes1H = fA3, FR_FracX, f0 // (A3*y)hi
2583 and GR_N = GR_N, r17Ones // mask sign bit
2584 }
2585 { .mfi
2586 ldfe fLnSin12 = [rLnSinTmpPtr]
2587 fma.s1 fDelX6 = fDxSqr, fDelX4, f0 // DeltaX^6
2588 shladd GR_ad_tbl_1 = rIndex1Dx, 4, rTbl1Addr // Point to G_1
2589 }
2590 ;;
2591 { .mfi
2592 ldfe fA13 = [rPolDataPtr], -32 // A13
2593 fma.s1 fA4 = fA5, FR_FracX, fA4 // A5*y + A4
2594 // Get bits 30-15 of X_0 * Z_1
2595 pmpyshr2.u GR_X_1 = rX0Dx, GR_Z_1, 15
2596 }
2597 { .mfi
2598 ldfe fA12 = [rTmpPtr2], -32 // A12
2599 fms.s1 FR_r = FR_G, fSignifX, f1 // r = G * S_hi - 1
2600 sub GR_N = GR_N, rExpHalf, 1 // unbisaed exponent of DeltaX
2601 }
2602 ;;
2603 //
2604 // For performance, don't use result of pmpyshr2.u for 4 cycles.
2605 //
2606 .pred.rel "mutex",p10,p11
2607 { .mfi
2608 ldfe fA11 = [rPolDataPtr], -32 // A11
2609 // High part of log(|x|) = Y_hi = N * log2_hi + H
2610 fma.s1 fResH = fFloatN, FR_log2_hi, FR_H
2611 (p10) cmp.eq p8, p9 = rXRnd, r0
2612 }
2613 { .mfi
2614 ldfe fA10 = [rTmpPtr2], -32 // A10
2615 fma.s1 fRes6H = fA1, FR_FracX, f0 // (A1*y)hi
2616 (p11) cmp.eq p9, p8 = rXRnd, r0
2617 }
2618 ;;
2619 { .mfi
2620 ldfe fA9 = [rPolDataPtr], -32 // A9
2621 fma.s1 fB14 = fLnSin6, fDxSqr, f0 // (LnSin6*deltaX^2)hi
2622 cmp.eq p6, p7 = 4, rSgnGamSize
2623 }
2624 { .mfi
2625 ldfe fA8 = [rTmpPtr2], -32 // A8
2626 fma.s1 fA18 = fA19, FR_FracX, fA18
2627 nop.i 0
2628 }
2629 ;;
2630 { .mfi
2631 ldfe fA7 = [rPolDataPtr] // A7
2632 fma.s1 fA23 = fA23, FR_FracX, fA22
2633 nop.i 0
2634 }
2635 { .mfi
2636 ldfe fA6 = [rTmpPtr2] // A6
2637 fma.s1 fA21 = fA21, FR_FracX, fA20
2638 nop.i 0
2639 }
2640 ;;
2641 { .mfi
2642 ldfe fLnSin14 = [rLnSinDataPtr]
2643 fms.s1 fRes1L = fA3, FR_FracX, fRes1H // delta((A3*y)hi)
2644 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
2645 }
2646 { .mfi
2647 setf.sig fFloatNDx = GR_N
2648 fadd.s1 fPol = fRes1H, fA2 // (A3*y + A2)hi
2649 nop.i 0
2650 }
2651 ;;
2652 { .mfi
2653 ldfps FR_G, FR_H = [GR_ad_tbl_1], 8 // Load G_1, H_1
2654 fma.s1 fRes2H = fA4, fXSqr, f0 // ((A5 + A4*y)*y^2)hi
2655 nop.i 0
2656 }
2657 { .mfi
2658 shladd GR_ad_z_2 = GR_Index2, 2, rZ2Addr // Point to Z_2
2659 fma.s1 fA25 = fA25, FR_FracX, fA24
2660 shladd GR_ad_tbl_2 = GR_Index2, 4, rTbl2Addr // Point to G_2
2661 }
2662 ;;
2663 .pred.rel "mutex",p8,p9
2664 { .mfi
2665 ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
2666 fms.s1 fRes6L = fA1, FR_FracX, fRes6H // delta((A1*y)hi)
2667 // sign of GAMMA(x) is negative
2668 (p8) adds rSgnGam = -1, r0
2669 }
2670 { .mfi
2671 adds rTmpPtr = 8, GR_ad_tbl_2
2672 fadd.s1 fRes3H = fRes6H, fA0 // (A1*y + A0)hi
2673 // sign of GAMMA(x) is positive
2674 (p9) adds rSgnGam = 1, r0
2675 }
2676 ;;
2677 { .mfi
2678 ldfps FR_G2, FR_H2 = [GR_ad_tbl_2] // Load G_2, H_2
2679 // (LnSin6*deltaX^2 + LnSin4)hi
2680 fadd.s1 fLnSinH = fB14, fLnSin4
2681 nop.i 0
2682 }
2683 { .mfi
2684 ldfd FR_h2 = [rTmpPtr] // Load h_2
2685 fms.s1 fB16 = fLnSin6, fDxSqr, fB14 // delta(LnSin6*deltaX^2)
2686 nop.i 0
2687 }
2688 ;;
2689 { .mfi
2690 ldfd fhDelX = [GR_ad_tbl_1] // Load h_1
2691 fma.s1 fA21 = fA21, fXSqr, fA18
2692 nop.i 0
2693 }
2694 { .mfi
2695 nop.m 0
2696 fma.s1 fLnSin36 = fLnSin36, fDelX4, fLnSin32
2697 nop.i 0
2698 }
2699 ;;
2700 { .mfi
2701 nop.m 0
2702 fma.s1 fRes1L = fA3L, FR_FracX, fRes1L // (A3*y)lo
2703 // Get bits 30-15 of X_1 * Z_2
2704 pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15
2705 }
2706 { .mfi
2707 nop.m 0
2708 fsub.s1 fPolL = fA2, fPol
2709 nop.i 0
2710 }
2711 ;;
2712 //
2713 // For performance, don't use result of pmpyshr2.u for 4 cycles.
2714 //
2715 { .mfi
2716 nop.m 0
2717 // delta(((A5 + A4*y)*y^2)hi)
2718 fms.s1 fRes2L = fA4, fXSqr, fRes2H
2719 nop.i 0
2720 }
2721 { .mfi
2722 nop.m 0
2723 // (((A5 + A4*y)*y^2) + A3*y + A2)hi
2724 fadd.s1 fRes4H = fRes2H, fPol
2725 nop.i 0
2726 }
2727 ;;
2728 { .mfi
2729 // store signgam if size of variable is 4 bytes
2730 (p6) st4 [rSgnGamAddr] = rSgnGam
2731 fma.s1 fRes6L = fA1L, FR_FracX, fRes6L // (A1*y)lo
2732 nop.i 0
2733 }
2734 { .mfi
2735 // store signgam if size of variable is 8 bytes
2736 (p7) st8 [rSgnGamAddr] = rSgnGam
2737 fsub.s1 fRes3L = fA0, fRes3H
2738 nop.i 0
2739 }
2740 ;;
2741 { .mfi
2742 nop.m 0
2743 fsub.s1 fLnSinL = fLnSin4, fLnSinH
2744 nop.i 0
2745 }
2746 { .mfi
2747 nop.m 0
2748 // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2)hi
2749 fma.s1 fB18 = fLnSinH, fDxSqr, f0
2750 nop.i 0
2751 }
2752 ;;
2753 { .mfi
2754 adds rTmpPtr = 8, rTbl3Addr
2755 fma.s1 fB16 = fLnSin6, fDxSqrL, fB16 // (LnSin6*deltaX^2)lo
2756 extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
2757 }
2758 { .mfi
2759 nop.m 0
2760 fma.s1 fA25 = fA25, fXSqr, fA23
2761 nop.i 0
2762 }
2763 ;;
2764 { .mfi
2765 shladd GR_ad_tbl_3 = GR_Index3, 4, rTbl3Addr // Point to G_3
2766 fadd.s1 fPolL = fPolL, fRes1H
2767 nop.i 0
2768 }
2769 { .mfi
2770 shladd rTmpPtr = GR_Index3, 4, rTmpPtr // Point to G_3
2771 fadd.s1 fRes1L = fRes1L, fA2L // (A3*y)lo + A2lo
2772 nop.i 0
2773 }
2774 ;;
2775 { .mfi
2776 ldfps FR_G3, FR_H3 = [GR_ad_tbl_3] // Load G_3, H_3
2777 fma.s1 fRes2L = fA4, fXSqrL, fRes2L // ((A5 + A4*y)*y^2)lo
2778 nop.i 0
2779 }
2780 { .mfi
2781 ldfd FR_h3 = [rTmpPtr] // Load h_3
2782 fsub.s1 fRes4L = fPol, fRes4H
2783 nop.i 0
2784 }
2785 ;;
2786 { .mfi
2787 nop.m 0
2788 // ((((A5 + A4*y)*y^2) + A3*y + A2)*y^2)hi
2789 fma.s1 fRes7H = fRes4H, fXSqr, f0
2790 nop.i 0
2791 }
2792 { .mfi
2793 nop.m 0
2794 fma.s1 fA15 = fA15, FR_FracX, fA14
2795 nop.i 0
2796 }
2797 ;;
2798 { .mfi
2799 nop.m 0
2800 fadd.s1 fRes3L = fRes3L, fRes6H
2801 nop.i 0
2802 }
2803 { .mfi
2804 nop.m 0
2805 fadd.s1 fRes6L = fRes6L, fA0L // (A1*y)lo + A0lo
2806 nop.i 0
2807 }
2808 ;;
2809 { .mfi
2810 nop.m 0
2811 fadd.s1 fLnSinL = fLnSinL, fB14
2812
2813 nop.i 0
2814 }
2815 { .mfi
2816 nop.m 0
2817 // delta((LnSin6*deltaX^2 + LnSin4)*deltaX^2)
2818 fms.s1 fB20 = fLnSinH, fDxSqr, fB18
2819 nop.i 0
2820 }
2821 ;;
2822 { .mfi
2823 nop.m 0
2824 fadd.s1 fPolL = fPolL, fRes1L // (A3*y + A2)lo
2825
2826 nop.i 0
2827 }
2828 { .mfi
2829 nop.m 0
2830 // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2 + LnSin2)hi
2831 fadd.s1 fLnSin6 = fB18, fLnSin2
2832 nop.i 0
2833 }
2834 ;;
2835 { .mfi
2836 nop.m 0
2837 fadd.s1 fRes4L = fRes4L, fRes2H
2838 nop.i 0
2839 }
2840 { .mfi
2841 nop.m 0
2842 fma.s1 fA17 = fA17, FR_FracX, fA16
2843 nop.i 0
2844 }
2845 ;;
2846 { .mfi
2847 nop.m 0
2848 // delta(((((A5 + A4*y)*y^2) + A3*y + A2)*y^2)
2849 fms.s1 fRes7L = fRes4H, fXSqr, fRes7H
2850 nop.i 0
2851 }
2852 { .mfi
2853 nop.m 0
2854 fadd.s1 fPol = fRes7H, fRes3H
2855 nop.i 0
2856 }
2857 ;;
2858 { .mfi
2859 nop.m 0
2860 fadd.s1 fRes3L = fRes3L, fRes6L // (A1*y + A0)lo
2861 nop.i 0
2862 }
2863 { .mfi
2864 nop.m 0
2865 fma.s1 fA25 = fA25, fX4, fA21
2866 nop.i 0
2867 }
2868 ;;
2869 { .mfi
2870 nop.m 0
2871 // (LnSin6*deltaX^2 + LnSin4)lo
2872 fadd.s1 fLnSinL = fLnSinL, fB16
2873 nop.i 0
2874 }
2875 { .mfi
2876 nop.m 0
2877 fma.s1 fB20 = fLnSinH, fDxSqrL, fB20
2878 nop.i 0
2879 }
2880 ;;
2881 { .mfi
2882 nop.m 0
2883 fsub.s1 fLnSin4 = fLnSin2, fLnSin6
2884 nop.i 0
2885 }
2886 { .mfi
2887 nop.m 0
2888 // (((LnSin6*deltaX^2 + LnSin4)*deltaX^2 + LnSin2)*DeltaX^2)hi
2889 fma.s1 fLnSinH = fLnSin6, fDxSqr, f0
2890 nop.i 0
2891 }
2892 ;;
2893 { .mfi
2894 nop.m 0
2895 // ((A5 + A4*y)*y^2)lo + (A3*y + A2)lo
2896 fadd.s1 fRes2L = fRes2L, fPolL
2897 nop.i 0
2898 }
2899 { .mfi
2900 nop.m 0
2901 fma.s1 fA17 = fA17, fXSqr, fA15
2902 nop.i 0
2903 }
2904 ;;
2905 { .mfi
2906 nop.m 0
2907 // ((((A5 + A4*y)*y^2) + A3*y + A2)*y^2)lo
2908 fma.s1 fRes7L = fRes4H, fXSqrL, fRes7L
2909 nop.i 0
2910 }
2911 { .mfi
2912 nop.m 0
2913 fsub.s1 fPolL = fRes3H, fPol
2914 nop.i 0
2915 }
2916 ;;
2917 { .mfi
2918 nop.m 0
2919 fma.s1 fA13 = fA13, FR_FracX, fA12
2920 nop.i 0
2921 }
2922 { .mfi
2923 nop.m 0
2924 fma.s1 fA11 = fA11, FR_FracX, fA10
2925 nop.i 0
2926 }
2927 ;;
2928 { .mfi
2929 nop.m 0
2930 // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2)lo
2931 fma.s1 fB20 = fLnSinL, fDxSqr, fB20
2932 nop.i 0
2933 }
2934 { .mfi
2935 nop.m 0
2936 fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
2937 nop.i 0
2938 }
2939 ;;
2940 { .mfi
2941 nop.m 0
2942 fadd.s1 fLnSin4 = fLnSin4, fB18
2943 nop.i 0
2944 }
2945 { .mfi
2946 nop.m 0
2947 fms.s1 fLnSinL = fLnSin6, fDxSqr, fLnSinH
2948 nop.i 0
2949 }
2950 ;;
2951 { .mfi
2952 nop.m 0
2953 // (((A5 + A4*y)*y^2) + A3*y + A2)lo
2954 fadd.s1 fRes4L = fRes4L, fRes2L
2955 nop.i 0
2956 }
2957 { .mfi
2958 nop.m 0
2959 fadd.s1 fhDelX = fhDelX, FR_h2 // h = h_1 + h_2
2960 nop.i 0
2961 }
2962 ;;
2963 { .mfi
2964 nop.m 0
2965 fadd.s1 fRes7L = fRes7L, fRes3L
2966 nop.i 0
2967 }
2968 { .mfi
2969 nop.m 0
2970 fadd.s1 fPolL = fPolL, fRes7H
2971 nop.i 0
2972 }
2973 ;;
2974 { .mfi
2975 nop.m 0
2976 fcvt.xf fFloatNDx = fFloatNDx
2977 nop.i 0
2978 }
2979 { .mfi
2980 nop.m 0
2981 fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
2982 nop.i 0
2983 }
2984 ;;
2985 { .mfi
2986 nop.m 0
2987 fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
2988 nop.i 0
2989 }
2990 { .mfi
2991 nop.m 0
2992 // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2)lo + (LnSin2)lo
2993 fadd.s1 fLnSin2L = fLnSin2L, fB20
2994 nop.i 0
2995 }
2996 ;;
2997 { .mfi
2998 nop.m 0
2999 fma.s1 fA25 = fA25, fX4, fA17
3000 nop.i 0
3001 }
3002 { .mfi
3003 nop.m 0
3004 fma.s1 fA13 = fA13, fXSqr, fA11
3005 nop.i 0
3006 }
3007 ;;
3008 { .mfi
3009 nop.m 0
3010 fma.s1 fA9 = fA9, FR_FracX, fA8
3011 nop.i 0
3012 }
3013 { .mfi
3014 nop.m 0
3015 fma.s1 fA7 = fA7, FR_FracX, fA6
3016 nop.i 0
3017 }
3018 ;;
3019 { .mfi
3020 nop.m 0
3021 fma.s1 fLnSin36 = fLnSin36, fDelX8, fLnSin28
3022 nop.i 0
3023 }
3024 { .mfi
3025 nop.m 0
3026 fma.s1 fLnSin14 = fLnSin14, fDxSqr, fLnSin12
3027 nop.i 0
3028 }
3029 ;;
3030 { .mfi
3031 nop.m 0
3032 fma.s1 fLnSin10 = fLnSin10, fDxSqr, fLnSin8
3033 nop.i 0
3034 }
3035 { .mfi
3036 nop.m 0
3037 fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
3038 nop.i 0
3039 }
3040 ;;
3041 { .mfi
3042 nop.m 0
3043 fms.s1 fRDx = FR_G, fNormDx, f1 // r = G * S_hi - 1
3044 nop.i 0
3045 }
3046 { .mfi
3047 nop.m 0
3048 // poly_lo = r * Q4 + Q3
3049 fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3
3050 nop.i 0
3051 }
3052 ;;
3053 { .mfi
3054 nop.m 0
3055 fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
3056 nop.i 0
3057 }
3058 { .mfi
3059 nop.m 0
3060 // ((((A5 + A4*y)*y^2) + A3*y + A2)*y^2)lo + (A1*y + A0)lo
3061 fma.s1 fRes7L = fRes4L, fXSqr, fRes7L
3062 nop.i 0
3063 }
3064 ;;
3065 { .mfi
3066 nop.m 0
3067 fma.s1 fA25 = fA25, fX4, fA13
3068 nop.i 0
3069 }
3070 { .mfi
3071 nop.m 0
3072 fma.s1 fA9 = fA9, fXSqr, fA7
3073 nop.i 0
3074 }
3075 ;;
3076 { .mfi
3077 nop.m 0
3078 // h = N * log2_lo + h
3079 fma.s1 FR_h = fFloatN, FR_log2_lo, FR_h
3080 nop.i 0
3081 }
3082 { .mfi
3083 nop.m 0
3084 fadd.s1 fhDelX = fhDelX, FR_h3 // h = (h_1 + h_2) + h_3
3085 nop.i 0
3086 }
3087 ;;
3088 { .mfi
3089 nop.m 0
3090 fma.s1 fLnSin36 = fLnSin36, fDelX6, fLnSin20
3091 nop.i 0
3092 }
3093 { .mfi
3094 nop.m 0
3095 fma.s1 fLnSin14 = fLnSin14, fDelX4, fLnSin10
3096 nop.i 0
3097 }
3098 ;;
3099 { .mfi
3100 nop.m 0
3101 // poly_lo = r * Q4 + Q3
3102 fma.s1 fPolyLoDx = fRDx, FR_Q4, FR_Q3
3103 nop.i 0
3104 }
3105 { .mfi
3106 nop.m 0
3107 fmpy.s1 fRDxSq = fRDx, fRDx // rsq = r * r
3108 nop.i 0
3109 }
3110 ;;
3111 { .mfi
3112 nop.m 0
3113 // Y_hi = N * log2_hi + H
3114 fma.s1 fResLnDxH = fFloatNDx, FR_log2_hi, FR_H
3115 nop.i 0
3116 }
3117 { .mfi
3118 nop.m 0
3119 fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
3120 nop.i 0
3121 }
3122 ;;
3123 { .mfi
3124 nop.m 0
3125 fma.s1 fA9 = fA25, fX4, fA9
3126 nop.i 0
3127 }
3128 { .mfi
3129 nop.m 0
3130 fadd.s1 fPolL = fPolL, fRes7L
3131 nop.i 0
3132 }
3133 ;;
3134 { .mfi
3135 nop.m 0
3136 fadd.s1 fLnSin4 = fLnSin4, fLnSin2L
3137 nop.i 0
3138 }
3139 { .mfi
3140 nop.m 0
3141 // h = N * log2_lo + h
3142 fma.s1 fhDelX = fFloatNDx, FR_log2_lo, fhDelX
3143 nop.i 0
3144 }
3145 ;;
3146 { .mfi
3147 nop.m 0
3148 fma.s1 fLnSin36 = fLnSin36, fDelX8, fLnSin14
3149 nop.i 0
3150 }
3151 { .mfi
3152 nop.m 0
3153 // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2 + LnSin2)lo
3154 fma.s1 fLnSinL = fLnSin6, fDxSqrL, fLnSinL
3155 nop.i 0
3156 }
3157 ;;
3158 { .mfi
3159 nop.m 0
3160 // poly_lo = poly_lo * r + Q2
3161 fma.s1 fPolyLoDx = fPolyLoDx, fRDx, FR_Q2
3162 nop.i 0
3163 }
3164 { .mfi
3165 nop.m 0
3166 fma.s1 fRDxCub = fRDxSq, fRDx, f0 // rcub = r^3
3167 nop.i 0
3168 }
3169 ;;
3170 { .mfi
3171 nop.m 0
3172 famax.s0 fRes5H = fPol, fResH
3173 nop.i 0
3174 }
3175 { .mfi
3176 nop.m 0
3177 // High part of (lgammal(|x|) + log(|x|))
3178 fadd.s1 fRes1H = fPol, fResH
3179 nop.i 0
3180 }
3181 ;;
3182 { .mfi
3183 nop.m 0
3184 // poly_lo = poly_lo * r + Q2
3185 fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
3186 nop.i 0
3187 }
3188 { .mfi
3189 nop.m 0
3190 fma.s1 fPolL = fA9, fX6, fPolL // P25lo
3191 nop.i 0
3192 }
3193 ;;
3194
3195 { .mfi
3196 nop.m 0
3197 famin.s0 fRes5L = fPol, fResH
3198 nop.i 0
3199 }
3200 { .mfi
3201 nop.m 0
3202 // High part of -(LnSin + log(|DeltaX|))
3203 fnma.s1 fRes2H = fResLnDxH, f1, fLnSinH
3204 nop.i 0
3205 }
3206 ;;
3207
3208 { .mfi
3209 nop.m 0
3210 // (((LnSin6*deltaX^2 + LnSin4)*deltaX^2 + LnSin2)*DeltaX^2)lo
3211 fma.s1 fLnSinL = fLnSin4, fDxSqr, fLnSinL
3212 nop.i 0
3213 }
3214 { .mfi
3215 nop.m 0
3216 fma.s1 fLnSin36 = fLnSin36, fDelX6, f0
3217 nop.i 0
3218 }
3219 ;;
3220 { .mfi
3221 nop.m 0
3222 // poly_hi = Q1 * rsq + r
3223 fma.s1 fPolyHiDx = FR_Q1, fRDxSq, fRDx
3224 nop.i 0
3225 }
3226 { .mfi
3227 nop.m 0
3228 // poly_lo = poly_lo*r^3 + h
3229 fma.s1 fPolyLoDx = fPolyLoDx, fRDxCub, fhDelX
3230 nop.i 0
3231 }
3232 ;;
3233 { .mfi
3234 nop.m 0
3235 fsub.s1 fRes1L = fRes5H, fRes1H
3236 nop.i 0
3237 }
3238 { .mfi
3239 nop.m 0
3240 // -(lgammal(|x|) + log(|x|))hi
3241 fnma.s1 fRes1H = fRes1H, f1, f0
3242
3243 nop.i 0
3244 }
3245 ;;
3246 { .mfi
3247 nop.m 0
3248 // poly_hi = Q1 * rsq + r
3249 fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
3250 nop.i 0
3251 }
3252 { .mfi
3253 nop.m 0
3254 // poly_lo = poly_lo*r^3 + h
3255 fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h
3256 nop.i 0
3257 }
3258 ;;
3259 { .mfi
3260 nop.m 0
3261 fms.s1 fRes2L = fResLnDxH, fMOne, fRes2H
3262 nop.i 0
3263 }
3264 ;;
3265 { .mfi
3266 nop.m 0
3267 fma.s1 fLnSinL = fLnSin36, fDxSqr, fLnSinL
3268 nop.i 0
3269 }
3270 { .mfi
3271 nop.m 0
3272 // Y_lo = poly_hi + poly_lo
3273 fadd.s1 fResLnDxL = fPolyHiDx, fPolyLoDx
3274 nop.i 0
3275 }
3276 ;;
3277 { .mfi
3278 nop.m 0
3279 fadd.s1 fRes1L = fRes1L, fRes5L
3280 nop.i 0
3281 }
3282 { .mfi
3283 nop.m 0
3284 // high part of the final result
3285 fadd.s1 fYH = fRes2H, fRes1H
3286 nop.i 0
3287 }
3288 ;;
3289 { .mfi
3290 nop.m 0
3291 // Y_lo = poly_hi + poly_lo
3292 fadd.s1 fResL = FR_poly_hi, FR_poly_lo
3293 nop.i 0
3294 }
3295 ;;
3296 { .mfi
3297 nop.m 0
3298 famax.s0 fRes4H = fRes2H, fRes1H
3299 nop.i 0
3300 }
3301 ;;
3302 { .mfi
3303 nop.m 0
3304 famin.s0 fRes4L = fRes2H, fRes1H
3305 nop.i 0
3306 }
3307 ;;
3308 { .mfi
3309 nop.m 0
3310 // (LnSin)lo + (log(|DeltaX|))lo
3311 fsub.s1 fLnSinL = fLnSinL, fResLnDxL
3312 nop.i 0
3313 }
3314 { .mfi
3315 nop.m 0
3316 fadd.s1 fRes2L = fRes2L, fLnSinH
3317 nop.i 0
3318 }
3319 ;;
3320 { .mfi
3321 nop.m 0
3322 //(lgammal(|x|))lo + (log(|x|))lo
3323 fadd.s1 fPolL = fResL, fPolL
3324 nop.i 0
3325 }
3326 ;;
3327 { .mfi
3328 nop.m 0
3329 fsub.s1 fYL = fRes4H, fYH
3330 nop.i 0
3331 }
3332 ;;
3333 { .mfi
3334 nop.m 0
3335 // Low part of -(LnSin + log(|DeltaX|))
3336 fadd.s1 fRes2L = fRes2L, fLnSinL
3337 nop.i 0
3338 }
3339 { .mfi
3340 nop.m 0
3341 // Low part of (lgammal(|x|) + log(|x|))
3342 fadd.s1 fRes1L = fRes1L, fPolL
3343 nop.i 0
3344 }
3345 ;;
3346 { .mfi
3347 nop.m 0
3348 fadd.s1 fYL = fYL, fRes4L
3349 nop.i 0
3350 }
3351 { .mfi
3352 nop.m 0
3353 fsub.s1 fRes2L = fRes2L, fRes1L
3354 nop.i 0
3355 }
3356 ;;
3357 { .mfi
3358 nop.m 0
3359 // low part of the final result
3360 fadd.s1 fYL = fYL, fRes2L
3361 nop.i 0
3362 }
3363 ;;
3364 { .mfb
3365 nop.m 0
3366 // final result for -6.0 < x <= -0.75, non-integer, "far" from roots
3367 fma.s0 f8 = fYH, f1, fYL
3368 // exit here for -6.0 < x <= -0.75, non-integer, "far" from roots
3369 br.ret.sptk b0
3370 }
3371 ;;
3372
3373 // here if |x+1| < 2^(-7)
//
// Special path for arguments very close to -1. The small quantity
// deltaX = x - RTN(x) (held in fDx on entry) is used as the polynomial
// argument. As the per-step comments below describe, the result is
// assembled as
//     lnsin(deltaX) - (lgammal(|x|) + log(|x|)) - log(|deltaX|)
// where the first two terms come from short polynomials in deltaX and
// log(|deltaX|) is computed with the Z_1/Z_2 and G/H/h table lookup plus
// the Q1..Q4 polynomial in r.
//
// Values read here but produced before the branch to this label:
//   fDx, fDxSqr, rSignifDx, p10/p11 (p10: deltaX < 0, p11: otherwise),
//   GR_ad_z_1, rZ2Addr, rTbl1Addr, rTbl2Addr, rTbl3Addr,
//   rExpHalf, r17Ones, rSgnGamSize, rSgnGamAddr,
//   FR_log2_hi, FR_log2_lo, FR_Q1..FR_Q4.
// Stores signgam (4 or 8 bytes wide, selected by rSgnGamSize) and returns
// through b0 with the result in f8.
3374 .align 32
3375 _closeToNegOne:
3376 { .mfi
3377 getf.exp GR_N = fDx // Get N = sign and biased exponent of deltaX
3378 fmerge.se fAbsX = f1, fDx // significand of deltaX with sign/exponent of 1.0
3379 // Get high 4 bits of significand of deltaX
3380 extr.u rIndex1Dx = rSignifDx, 59, 4
3381 }
3382 { .mfi
// linkage-table slot for lgammal_1pEps_data; dereferenced by the ld8 below
3383 addl rPolDataPtr= @ltoff(lgammal_1pEps_data),gp
3384 fma.s1 fA0L = fDxSqr, fDxSqr, f0 // deltaX^4 (fA0L is used as scratch here)
3385 // sign of GAMMA is positive if p10 is set to 1
3386 (p10) adds rSgnGam = 1, r0
3387 }
3388 ;;
3389 { .mfi
3390 shladd GR_ad_z_1 = rIndex1Dx, 2, GR_ad_z_1 // Point to Z_1
3391 fnma.s1 fResL = fDx, f1, f0 // -(x+1)
3392 // Get high 15 bits of significand
3393 extr.u GR_X_0 = rSignifDx, 49, 15
3394 }
3395 { .mfi
3396 ld8 rPolDataPtr = [rPolDataPtr] // address of lgammal_1pEps_data
3397 nop.f 0
3398 shladd GR_ad_tbl_1 = rIndex1Dx, 4, rTbl1Addr // Point to G_1
3399 }
3400 ;;
3401 { .mfi
3402 ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
3403 nop.f 0
3404 and GR_N = GR_N, r17Ones // mask sign bit
3405 }
3406 { .mfi
3407 adds rTmpPtr = 8, GR_ad_tbl_1 // Point to h_1
// p6: signgam is stored as 4 bytes, p7: as 8 bytes
3408 nop.f 0
3409 cmp.eq p6, p7 = 4, rSgnGamSize
3410 }
3411 ;;
3412 { .mfi
3413 ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
3414 nop.f 0
// rTmpPtr2 walks the P8..P1 pairs and then LnSin2, LnSin4, LnSin8
3415 adds rTmpPtr2 = 96, rPolDataPtr
3416 }
3417 { .mfi
3418 ldfd FR_h = [rTmpPtr] // Load h_1
3419 nop.f 0
3420 // unbiased exponent of deltaX
3421 sub GR_N = GR_N, rExpHalf, 1
3422 }
3423 ;;
3424 { .mfi
3425 adds rTmpPtr3 = 192, rPolDataPtr // Point to LnSin6
3426 nop.f 0
3427 // sign of GAMMA is negative if p11 is set to 1
3428 (p11) adds rSgnGam = -1, r0
3429 }
3430 { .mfi
3431 ldfe fA1 = [rPolDataPtr], 16 // A1
3432 nop.f 0
3433 nop.i 0
3434 }
3435 ;;
3436 {.mfi
3437 ldfe fA2 = [rPolDataPtr], 16 // A2
3438 nop.f 0
3439 // Get bits 30-15 of X_0 * Z_1
3440 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15
3441 }
3442 { .mfi
3443 ldfpd fA20, fA19 = [rTmpPtr2], 16 // P8, P7
3444 nop.f 0
3445 nop.i 0
3446 }
3447 ;;
3448 //
3449 // For performance, don't use result of pmpyshr2.u for 4 cycles.
3450 //
3451 { .mfi
3452 ldfe fA3 = [rPolDataPtr], 16 // A3
3453 nop.f 0
3454 nop.i 0
3455 }
3456 { .mfi
3457 ldfpd fA18, fA17 = [rTmpPtr2], 16 // P6, P5
3458 nop.f 0
3459 nop.i 0
3460 }
3461 ;;
3462 { .mfi
3463 ldfe fA4 = [rPolDataPtr], 16 // A4
3464 nop.f 0
3465 nop.i 0
3466 }
3467 { .mfi
3468 ldfpd fA16, fA15 = [rTmpPtr2], 16 // P4, P3
3469 nop.f 0
3470 nop.i 0
3471 }
3472 ;;
3473 { .mfi
3474 ldfpd fA5L, fA6 = [rPolDataPtr], 16 // A5, A6
3475 nop.f 0
3476 nop.i 0
3477 }
3478 { .mfi
3479 ldfpd fA14, fA13 = [rTmpPtr2], 16 // P2, P1
3480 nop.f 0
3481 nop.i 0
3482 }
3483 ;;
3484 { .mfi
3485 ldfpd fA7, fA8 = [rPolDataPtr], 16 // A7, A8
3486 nop.f 0
3487 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
3488 }
3489 { .mfi
3490 ldfe fLnSin2 = [rTmpPtr2], 16 // LnSin2
3491 nop.f 0
3492 nop.i 0
3493 }
3494 ;;
3495 { .mfi
3496 shladd GR_ad_z_2 = GR_Index2, 2, rZ2Addr // Point to Z_2
3497 nop.f 0
3498 shladd GR_ad_tbl_2 = GR_Index2, 4, rTbl2Addr // Point to G_2
3499 }
3500 { .mfi
// post-increment of 32 skips the LnSin6 slot, which is loaded via rTmpPtr3
3501 ldfe fLnSin4 = [rTmpPtr2], 32
3502 nop.f 0
3503 nop.i 0
3504 }
3505 ;;
3506 { .mfi
3507 ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
3508 nop.f 0
3509 adds rTmpPtr = 8, GR_ad_tbl_2 // Point to h_2
3510 }
3511 { .mfi
3512 // Put integer N into rightmost significand
3513 setf.sig fFloatN = GR_N
3514 nop.f 0
3515 nop.i 0
3516 }
3517 ;;
3518 { .mfi
3519 ldfe fLnSin6 = [rTmpPtr3] // LnSin6
3520 nop.f 0
3521 nop.i 0
3522 }
3523 { .mfi
3524 ldfe fLnSin8 = [rTmpPtr2] // LnSin8
3525 nop.f 0
3526 nop.i 0
3527 }
3528 ;;
3529 { .mfi
3530 ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
3531 nop.f 0
3532 nop.i 0
3533 }
3534 { .mfi
3535 ldfd FR_h2 = [rTmpPtr] // Load h_2
3536 nop.f 0
3537 nop.i 0
3538 }
3539 ;;
3540 { .mfi
3541 // store signgam if size of variable is 4 bytes
3542 (p6) st4 [rSgnGamAddr] = rSgnGam
3543 fma.s1 fResH = fA20, fResL, fA19 //polynomial for log(|x|): P8*(-deltaX) + P7
3544 // Get bits 30-15 of X_1 * Z_2
3545 pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15
3546 }
3547 { .mfi
3548 // store signgam if size of variable is 8 bytes
3549 (p7) st8 [rSgnGamAddr] = rSgnGam
3550 fma.s1 fA2 = fA2, fDx, fA1 // polynomial for lgammal(|x|)
3551 nop.i 0
3552 }
3553 ;;
3554 //
3555 // For performance, don't use result of pmpyshr2.u for 4 cycles.
3556 //
3557 { .mfi
3558 nop.m 0
3559 fma.s1 fA18 = fA18, fResL, fA17 //polynomial for log(|x|)
3560 nop.i 0
3561 }
3562 ;;
3563 { .mfi
3564 nop.m 0
3565 fma.s1 fA16 = fA16, fResL, fA15 //polynomial for log(|x|)
3566 nop.i 0
3567 }
3568 { .mfi
3569 nop.m 0
3570 fma.s1 fA4 = fA4, fDx, fA3 // polynomial for lgammal(|x|)
3571 nop.i 0
3572 }
3573 ;;
3574 { .mfi
3575 nop.m 0
3576 fma.s1 fA14 = fA14, fResL, fA13 //polynomial for log(|x|)
3577 nop.i 0
3578 }
3579 { .mfi
3580 nop.m 0
3581 fma.s1 fA6 = fA6, fDx, fA5L // polynomial for lgammal(|x|)
3582 nop.i 0
3583 }
3584 ;;
3585 { .mfi
3586 nop.m 0
3587 fma.s1 fPol = fA8, fDx, fA7 // polynomial for lgammal(|x|)
3588 extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
3589 }
3590 ;;
3591 { .mfi
3592 shladd GR_ad_tbl_3 = GR_Index3, 4, rTbl3Addr // Point to G_3
3593 // low part of lnsin polynomial
3594 fma.s1 fRes3L = fLnSin4, fDxSqr, fLnSin2
3595 nop.i 0
3596 }
3597 ;;
3598 { .mfi
3599 ldfps FR_G3, FR_H3 = [GR_ad_tbl_3], 8 // Load G_3, H_3
3600 fcvt.xf fFloatN = fFloatN // N as FP number
3601 nop.i 0
3602 }
3603 { .mfi
3604 nop.m 0
3605 fma.s1 fResH = fResH, fDxSqr, fA18 // High part of log(|x|)
3606 nop.i 0
3607 }
3608 ;;
3609 { .mfi
3610 ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
3611 fma.s1 fA4 = fA4, fDxSqr, fA2 // Low part of lgammal(|x|)
3612 nop.i 0
3613 }
3614 { .mfi
3615 nop.m 0
3616 // high part of lnsin polynomial
3617 fma.s1 fRes3H = fLnSin8, fDxSqr, fLnSin6
3618 nop.i 0
3619 }
3620 ;;
3621 { .mfi
3622 nop.m 0
3623 fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
3624 nop.i 0
3625 }
3626 { .mfi
3627 nop.m 0
3628 fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
3629 nop.i 0
3630 }
3631 ;;
3632 { .mfi
3633 nop.m 0
3634 fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
3635 nop.i 0
3636 }
3637 { .mfi
3638 nop.m 0
3639 fma.s1 fA16 = fA16, fDxSqr, fA14 // Low part of log(|x|)
3640 nop.i 0
3641 }
3642 ;;
3643 { .mfi
3644 nop.m 0
3645 fma.s1 fPol = fPol, fDxSqr, fA6 // High part of lgammal(|x|)
3646 nop.i 0
3647 }
3648 ;;
3649 { .mfi
3650 nop.m 0
3651 fma.s1 fResH = fResH, fA0L, fA16 // log(|x|)/deltaX^2 - deltaX
3652 nop.i 0
3653 }
3654 ;;
3655 { .mfi
3656 nop.m 0
3657 fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
3658 nop.i 0
3659 }
3660 { .mfi
3661 nop.m 0
3662 fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
3663 nop.i 0
3664 }
3665 ;;
3666 { .mfi
3667 nop.m 0
3668 fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
3669 nop.i 0
3670 }
3671 ;;
3672 { .mfi
3673 nop.m 0
3674 fma.s1 fResH = fResH, fDxSqr, fResL // log(|x|)
3675 nop.i 0
3676 }
3677 { .mfi
3678 nop.m 0
3679 fma.s1 fPol = fPol, fA0L, fA4 // lgammal(|x|)/|x|
3680 nop.i 0
3681 }
3682 ;;
3683 { .mfi
3684 nop.m 0
3685 fms.s1 FR_r = FR_G, fAbsX, f1 // r = G * S_hi - 1
3686 nop.i 0
3687 }
3688 { .mfi
3689 nop.m 0
3690 // high part of log(deltaX)= Y_hi = N * log2_hi + H
3691 fma.s1 fRes4H = fFloatN, FR_log2_hi, FR_H
3692 nop.i 0
3693 }
3694 ;;
3695 { .mfi
3696 nop.m 0
3697 // h = N * log2_lo + h
3698 fma.s1 FR_h = fFloatN, FR_log2_lo, FR_h
3699 nop.i 0
3700 }
3701 ;;
3702 { .mfi
3703 nop.m 0
3704 fma.s1 fResH = fPol, fDx, fResH // lgammal(|x|) + log(|x|)
3705 nop.i 0
3706 }
3707 { .mfi
3708 nop.m 0
3709 // lnsin/deltaX^2
3710 fma.s1 fRes3H = fRes3H, fA0L, fRes3L
3711 nop.i 0
3712 }
3713 ;;
3714 { .mfi
3715 nop.m 0
3716 // poly_lo = r * Q4 + Q3
3717 fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3
3718 nop.i 0
3719 }
3720 { .mfi
3721 nop.m 0
3722 fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
3723 nop.i 0
3724 }
3725 ;;
3726 { .mfi
3727 nop.m 0
3728 // lnSin - log(|x|) - lgammal(|x|)
3729 fms.s1 fResH = fRes3H, fDxSqr, fResH
3730 nop.i 0
3731 }
3732 ;;
3733
3734 { .mfi
3735 nop.m 0
3736 // poly_lo = poly_lo * r + Q2
3737 fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
3738 nop.i 0
3739 }
3740 { .mfi
3741 nop.m 0
3742 fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
3743 nop.i 0
3744 }
3745 ;;
3746
3747 { .mfi
3748 nop.m 0
3749 // poly_hi = Q1 * rsq + r
3750 fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
3751 nop.i 0
3752 }
3753 ;;
3754
3755 { .mfi
3756 nop.m 0
3757 // poly_lo = poly_lo*r^3 + h
3758 fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h
3759 nop.i 0
3760 }
3761 ;;
3762
3763 { .mfi
3764 nop.m 0
3765 // low part of log(|deltaX|) = Y_lo = poly_hi + poly_lo
3766 fadd.s1 fRes4L = FR_poly_hi, FR_poly_lo
3767 nop.i 0
3768 }
3769 ;;
3770 { .mfi
3771 nop.m 0
// subtract the low part of log(|deltaX|) first ...
3772 fsub.s1 fResH = fResH, fRes4L
3773 nop.i 0
3774 }
3775 ;;
3776 { .mfb
3777 nop.m 0
3778 // final result for |x+1|< 2^(-7) path
// ... then the high part of log(|deltaX|), rounding into f8 with .s0
3779 fsub.s0 f8 = fResH, fRes4H
3780 // exit for |x+1|< 2^(-7) path
3781 br.ret.sptk b0
3782 }
3783 ;;
3784
3785
3786 // here if -2^63 < x < -6.0 and x is not an integer
3787 // Also we are going to filter out cases when x falls in
3788 // range which is "close enough" to negative root. This case
3789 // may occur only for -19.5 < x since other roots of lgamma are
3790 // insignificant from double extended point of view (they are closer
3791 // to RTN(x) than one ulp(x)).
3792 .align 32
3793 _negStirling:
3794 { .mfi
3795 ldfe fLnSin6 = [rLnSinDataPtr], 32
3796 fnma.s1 fInvX = f8, fRcpX, f1 // start of 3rd NR iteration
3797 // Get high 4 bits of significand of deltaX
3798 extr.u rIndex1Dx = rSignifDx, 59, 4
3799 }
3800 { .mfi
3801 ldfe fLnSin8 = [rTmpPtr3], 32
3802 fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
3803 (p12) cmp.ltu.unc p6, p0 = rSignifX, rLeftBound
3804 }
3805 ;;
3806 { .mfi
3807 ldfe fLnSin10 = [rLnSinDataPtr], 32
3808 fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
3809 // Get high 15 bits of significand
3810 extr.u GR_X_0 = rSignifDx, 49, 15
3811 }
3812 { .mfi
3813 shladd GR_ad_z_1 = rIndex1Dx, 2, GR_ad_z_1 // Point to Z_1
3814 fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
3815 // set p6 if x falls in "near root" range
3816 (p6) cmp.geu.unc p6, p0 = rSignifX, rRightBound
3817 }
3818 ;;
3819 { .mfi
3820 getf.exp GR_N = fDx // Get N = exponent of x
3821 fma.s1 fDx4 = fDxSqr, fDxSqr, f0 // deltaX^4
3822 adds rTmpPtr = 96, rBernulliPtr
3823 }
3824 { .mfb
3825 ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
3826 fma.s1 fLnSin34 = fLnSin34, fDxSqr, fLnSin32
3827 // branch to special path if x falls in "near root" range
3828 (p6) br.cond.spnt _negRoots
3829 }
3830 ;;
3831 .pred.rel "mutex",p10,p11
3832 { .mfi
3833 ldfe fLnSin12 = [rTmpPtr3]
3834 fma.s1 fLnSin26 = fLnSin26, fDxSqr, fLnSin24
3835 (p10) cmp.eq p8, p9 = rXRnd, r0
3836 }
3837 { .mfi
3838 ldfe fLnSin14 = [rLnSinDataPtr]
3839 fma.s1 fLnSin30 = fLnSin30, fDxSqr, fLnSin28
3840 (p11) cmp.eq p9, p8 = rXRnd, r0
3841 }
3842 ;;
3843 { .mfi
3844 ldfpd fB2, fB2L = [rBernulliPtr], 16
3845 fma.s1 fLnSin18 = fLnSin18, fDxSqr, fLnSin16
3846 shladd GR_ad_tbl_1 = rIndex1Dx, 4, rTbl1Addr // Point to G_1
3847
3848 }
3849 { .mfi
3850 ldfe fB14 = [rTmpPtr], 16
3851 fma.s1 fLnSin22 = fLnSin22, fDxSqr, fLnSin20
3852 and GR_N = GR_N, r17Ones // mask sign bit
3853 }
3854 ;;
3855 { .mfi
3856 ldfe fB4 = [rBernulliPtr], 16
3857 fma.s1 fInvX = fInvX, fRcpX, fRcpX // end of 3rd NR iteration
3858 // Get bits 30-15 of X_0 * Z_1
3859 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15
3860 }
3861 { .mfi
3862 ldfe fB16 = [rTmpPtr], 16
3863 fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
3864 adds rTmpPtr2 = 8, GR_ad_tbl_1
3865 }
3866 ;;
3867 //
3868 // For performance, don't use result of pmpyshr2.u for 4 cycles.
3869 //
3870 { .mfi
3871 ldfe fB6 = [rBernulliPtr], 16
3872 fms.s1 FR_r = FR_G, fSignifX, f1 // r = G * S_hi - 1
3873 adds rTmpPtr3 = -48, rTmpPtr
3874 }
3875 { .mfi
3876 ldfe fB18 = [rTmpPtr], 16
3877 // High part of the log(|x|) = Y_hi = N * log2_hi + H
3878 fma.s1 fResH = fFloatN, FR_log2_hi, FR_H
3879 sub GR_N = GR_N, rExpHalf, 1 // unbiased exponent of deltaX
3880 }
3881 ;;
3882 .pred.rel "mutex",p8,p9
3883 { .mfi
3884 ldfe fB8 = [rBernulliPtr], 16
3885 fma.s1 fLnSin36 = fLnSin36, fDx4, fLnSin34
3886 // sign of GAMMA(x) is negative
3887 (p8) adds rSgnGam = -1, r0
3888 }
3889 { .mfi
3890 ldfe fB20 = [rTmpPtr], -160
3891 fma.s1 fRes5H = fLnSin4, fDxSqr, f0
3892 // sign of GAMMA(x) is positive
3893 (p9) adds rSgnGam = 1, r0
3894
3895 }
3896 ;;
3897 { .mfi
3898 ldfe fB10 = [rBernulliPtr], 16
3899 fma.s1 fLnSin30 = fLnSin30, fDx4, fLnSin26
3900 (p14) adds rTmpPtr = -160, rTmpPtr
3901 }
3902 { .mfi
3903 ldfe fB12 = [rTmpPtr3], 16
3904 fma.s1 fDx8 = fDx4, fDx4, f0 // deltaX^8
3905 cmp.eq p6, p7 = 4, rSgnGamSize
3906 }
3907 ;;
3908 { .mfi
3909 ldfps fGDx, fHDx = [GR_ad_tbl_1], 8 // Load G_1, H_1
3910 fma.s1 fDx6 = fDx4, fDxSqr, f0 // deltaX^6
3911 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
3912 }
3913 { .mfi
3914 ldfd fhDx = [rTmpPtr2] // Load h_1
3915 fma.s1 fLnSin22 = fLnSin22, fDx4, fLnSin18
3916 nop.i 0
3917 }
3918 ;;
3919 { .mfi
3920 // Load two parts of C
3921 ldfpd fRes1H, fRes1L = [rTmpPtr], 16
3922 fma.s1 fRcpX = fInvX, fInvX, f0 // (1/x)^2
3923 shladd GR_ad_tbl_2 = GR_Index2, 4, rTbl2Addr // Point to G_2
3924 }
3925 { .mfi
3926 shladd GR_ad_z_2 = GR_Index2, 2, rZ2Addr // Point to Z_2
3927 fma.s1 FR_h = fFloatN, FR_log2_lo, FR_h// h = N * log2_lo + h
3928 nop.i 0
3929 }
3930 ;;
3931 { .mfi
3932 ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
3933 fnma.s1 fInvXL = f8, fInvX, f1 // relative error of 1/x
3934 nop.i 0
3935 }
3936 { .mfi
3937 adds rTmpPtr2 = 8, GR_ad_tbl_2
3938 fma.s1 fLnSin8 = fLnSin8, fDxSqr, fLnSin6
3939 nop.i 0
3940 }
3941 ;;
3942 { .mfi
3943 ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
3944 // poly_lo = r * Q4 + Q3
3945 fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3
3946 nop.i 0
3947 }
3948 { .mfi
3949 ldfd fh2Dx = [rTmpPtr2] // Load h_2
3950 fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
3951 nop.i 0
3952 }
3953 ;;
3954 { .mfi
3955 nop.m 0
3956 fma.s1 fA1L = fB2, fInvX, f0 // (B2*(1/x))hi
3957 nop.i 0
3958 }
3959 { .mfi
3960 // Put integer N into rightmost significand
3961 setf.sig fFloatNDx = GR_N
3962 fms.s1 fRes4H = fResH, f1, f1 // ln(|x|)hi - 1
3963 nop.i 0
3964 }
3965 ;;
3966 { .mfi
3967 nop.m 0
3968 fadd.s1 fRes2H = fRes5H, fLnSin2//(lnSin4*DeltaX^2 + lnSin2)hi
3969 // Get bits 30-15 of X_1 * Z_2
3970 pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15
3971 }
3972 { .mfi
3973 nop.m 0
3974 fms.s1 fRes5L = fLnSin4, fDxSqr, fRes5H
3975 nop.i 0
3976 }
3977 ;;
3978 //
3979 // For performance, don't use result of pmpyshr2.u for 4 cycles.
3980 //
3981 { .mfi
3982 nop.m 0
3983 fma.s1 fInvX4 = fRcpX, fRcpX, f0 // (1/x)^4
3984 nop.i 0
3985 }
3986 { .mfi
3987 nop.m 0
3988 fma.s1 fB6 = fB6, fRcpX, fB4
3989 nop.i 0
3990 }
3991 ;;
3992 { .mfi
3993 // store signgam if size of variable is 4 bytes
3994 (p6) st4 [rSgnGamAddr] = rSgnGam
3995 fma.s1 fB18 = fB18, fRcpX, fB16
3996 nop.i 0
3997 }
3998 { .mfi
3999 // store signgam if size of variable is 8 bytes
4000 (p7) st8 [rSgnGamAddr] = rSgnGam
4001 fma.s1 fInvXL = fInvXL, fInvX, f0 // low part of 1/x
4002 nop.i 0
4003 }
4004 ;;
4005 { .mfi
4006 nop.m 0
4007 // poly_lo = poly_lo * r + Q2
4008 fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
4009 nop.i 0
4010 }
4011 { .mfi
4012 nop.m 0
4013 fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
4014 nop.i 0
4015 }
4016 ;;
4017 { .mfi
4018 nop.m 0
4019 fma.s1 fRes3H = fRes4H, f8, f0 // (-|x|*(ln(|x|)-1))hi
4020 extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
4021 }
4022 { .mfi
4023 nop.m 0
4024 // poly_hi = Q1 * rsq + r
4025 fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
4026 nop.i 0
4027 }
4028 ;;
4029 { .mfi
4030 shladd GR_ad_tbl_3 = GR_Index3, 4, rTbl3Addr // Point to G_3
4031 fms.s1 fA2L = fB2, fInvX, fA1L // delta(B2*(1/x))
4032 nop.i 0
4033 }
4034 { .mfi
4035 nop.m 0
4036 fnma.s1 fBrnH = fRes1H, f1, fA1L // (-C - S(1/x))hi
4037 nop.i 0
4038 }
4039 ;;
4040 { .mfi
4041 ldfps fG3Dx, fH3Dx = [GR_ad_tbl_3],8 // Load G_3, H_3
4042 fma.s1 fInvX8 = fInvX4, fInvX4, f0 // (1/x)^8
4043 nop.i 0
4044 }
4045 { .mfi
4046 nop.m 0
4047 fma.s1 fB10 = fB10, fRcpX, fB8
4048 nop.i 0
4049 }
4050 ;;
4051
4052 { .mfi
4053 ldfd fh3Dx = [GR_ad_tbl_3] // Load h_3
4054 fma.s1 fB20 = fB20, fInvX4, fB18
4055 nop.i 0
4056 }
4057 { .mfi
4058 nop.m 0
4059 fma.s1 fB14 = fB14, fRcpX, fB12
4060 nop.i 0
4061 }
4062 ;;
4063 { .mfi
4064 nop.m 0
4065 fma.s1 fLnSin36 = fLnSin36, fDx8, fLnSin30
4066 nop.i 0
4067 }
4068 { .mfi
4069 nop.m 0
4070 fma.s1 fLnSin12 = fLnSin12, fDxSqr, fLnSin10
4071 nop.i 0
4072 }
4073 ;;
4074 { .mfi
4075 nop.m 0
4076 fsub.s1 fRes2L = fLnSin2, fRes2H
4077 nop.i 0
4078 }
4079 { .mfi
4080 nop.m 0
4081 fma.s1 fPol = fRes2H, fDxSqr, f0 // high part of LnSin
4082 nop.i 0
4083 }
4084 ;;
4085 { .mfi
4086 nop.m 0
4087 fnma.s1 fResH = fResH, FR_MHalf, fResH // -0.5*ln(|x|)hi
4088 nop.i 0
4089 }
4090 { .mfi
4091 nop.m 0
4092 fmpy.s1 fGDx = fGDx, FR_G2 // G = G_1 * G_2
4093 nop.i 0
4094 }
4095 ;;
4096 { .mfi
4097 nop.m 0
4098 // poly_lo = poly_lo*r^3 + h
4099 fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h
4100 nop.i 0
4101 }
4102 { .mfi
4103 nop.m 0
4104 // B2lo*(1/x)hi+ delta(B2*(1/x))
4105 fma.s1 fA2L = fB2L, fInvX, fA2L
4106 nop.i 0
4107 }
4108 ;;
4109 { .mfi
4110 nop.m 0
4111 fma.s1 fB20 = fB20, fInvX4, fB14
4112 nop.i 0
4113 }
4114 { .mfi
4115 nop.m 0
4116 fma.s1 fB10 = fB10, fInvX4, fB6
4117 nop.i 0
4118 }
4119 ;;
4120 { .mfi
4121 nop.m 0
4122 fcvt.xf fFloatNDx = fFloatNDx
4123 nop.i 0
4124 }
4125 { .mfi
4126 nop.m 0
4127 fma.s1 fLnSin14 = fLnSin14, fDx4, fLnSin12
4128 nop.i 0
4129 }
4130 ;;
4131 { .mfi
4132 nop.m 0
4133 fma.s1 fLnSin36 = fLnSin36, fDx8, fLnSin22
4134 nop.i 0
4135 }
4136 { .mfi
4137 nop.m 0
4138 fms.s1 fRes3L = fRes4H, f8, fRes3H // delta(-|x|*(ln(|x|)-1))
4139 nop.i 0
4140 }
4141 ;;
4142 { .mfi
4143 nop.m 0
4144 fmpy.s1 fGDx = fGDx, fG3Dx // G = (G_1 * G_2) * G_3
4145 nop.i 0
4146 }
4147 { .mfi
4148 nop.m 0
4149 // (-|x|*(ln(|x|)-1) - 0.5ln(|x|))hi
4150 fadd.s1 fRes4H = fRes3H, fResH
4151 nop.i 0
4152 }
4153 ;;
4154 { .mfi
4155 nop.m 0
4156 fma.s1 fA2L = fInvXL, fB2, fA2L //(B2*(1/x))lo
4157 nop.i 0
4158 }
4159 { .mfi
4160 nop.m 0
4161 // low part of log(|x|) = Y_lo = poly_hi + poly_lo
4162 fadd.s1 fResL = FR_poly_hi, FR_poly_lo
4163 nop.i 0
4164 }
4165 ;;
4166 { .mfi
4167 nop.m 0
4168 fma.s1 fB20 = fB20, fInvX8, fB10
4169 nop.i 0
4170 }
4171 { .mfi
4172 nop.m 0
4173 fma.s1 fInvX3 = fInvX, fRcpX, f0 // (1/x)^3
4174 nop.i 0
4175 }
4176 ;;
4177 { .mfi
4178 nop.m 0
4179 fadd.s1 fHDx = fHDx, FR_H2 // H = H_1 + H_2
4180 nop.i 0
4181 }
4182 { .mfi
4183 nop.m 0
4184 fadd.s1 fRes5L = fRes5L, fLnSin2L
4185 nop.i 0
4186 }
4187 ;;
4188 { .mfi
4189 nop.m 0
4190 fadd.s1 fRes2L = fRes2L, fRes5H
4191 nop.i 0
4192 }
4193 { .mfi
4194 nop.m 0
4195 fadd.s1 fhDx = fhDx, fh2Dx // h = h_1 + h_2
4196 nop.i 0
4197 }
4198 ;;
4199 { .mfi
4200 nop.m 0
4201 fms.s1 fBrnL = fRes1H, fMOne, fBrnH
4202 nop.i 0
4203 }
4204 { .mfi
4205 nop.m 0
4206 fms.s1 FR_r = fGDx, fNormDx, f1 // r = G * S_hi - 1
4207 nop.i 0
4208 }
4209 ;;
4210 { .mfi
4211 nop.m 0
4212 fma.s1 fRes3L = fResL, f8 , fRes3L // (-|x|*(ln(|x|)-1))lo
4213 nop.i 0
4214 }
4215 { .mfi
4216 nop.m 0
4217 fsub.s1 fRes4L = fRes3H, fRes4H
4218 nop.i 0
4219 }
4220 ;;
4221 { .mfi
4222 nop.m 0
4223 // low part of "Bernulli" polynomial
4224 fma.s1 fB20 = fB20, fInvX3, fA2L
4225 nop.i 0
4226 }
4227 { .mfi
4228 nop.m 0
4229 fnma.s1 fResL = fResL, FR_MHalf, fResL // -0.5*ln(|x|)lo
4230 nop.i 0
4231 }
4232 ;;
4233 { .mfi
4234 nop.m 0
4235 fadd.s1 fHDx = fHDx, fH3Dx // H = (H_1 + H_2) + H_3
4236 nop.i 0
4237 }
4238 { .mfi
4239 nop.m 0
4240 fms.s1 fPolL = fRes2H, fDxSqr, fPol
4241 nop.i 0
4242 }
4243 ;;
4244 { .mfi
4245 nop.m 0
4246 fadd.s1 fhDx = fhDx, fh3Dx // h = (h_1 + h_2) + h_3
4247 nop.i 0
4248 }
4249 { .mfi
4250 nop.m 0
4251 // (-|x|*(ln(|x|)-1) - 0.5ln(|x|) - C - S(1/x))hi
4252 fadd.s1 fB14 = fRes4H, fBrnH
4253 nop.i 0
4254 }
4255 ;;
4256 { .mfi
4257 nop.m 0
4258 // poly_lo = r * Q4 + Q3
4259 fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3
4260 nop.i 0
4261 }
4262 { .mfi
4263 nop.m 0
4264 fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
4265 nop.i 0
4266 }
4267 ;;
4268 { .mfi
4269 nop.m 0
4270 fadd.s1 fRes4L = fRes4L, fResH
4271 nop.i 0
4272 }
4273 { .mfi
4274 nop.m 0
4275 fadd.s1 fBrnL = fBrnL, fA1L
4276 nop.i 0
4277 }
4278 ;;
4279 { .mfi
4280 nop.m 0
4281 // (-|x|*(ln(|x|)-1))lo + (-0.5ln(|x|))lo
4282 fadd.s1 fRes3L = fRes3L, fResL
4283 nop.i 0
4284 }
4285 { .mfi
4286 nop.m 0
4287 fnma.s1 fB20 = fRes1L, f1, fB20 // -Clo - S(1/x)lo
4288 nop.i 0
4289 }
4290 ;;
4291 { .mfi
4292 nop.m 0
4293 fadd.s1 fRes2L = fRes2L, fRes5L // (lnSin4*DeltaX^2 + lnSin2)lo
4294 nop.i 0
4295 }
4296 { .mfi
4297 nop.m 0
4298 fma.s1 fPolL = fDxSqrL, fRes2H, fPolL
4299 nop.i 0
4300 }
4301 ;;
4302 { .mfi
4303 nop.m 0
4304 fma.s1 fLnSin14 = fLnSin14, fDx4, fLnSin8
4305 nop.i 0
4306 }
4307 { .mfi
4308 nop.m 0
4309 fma.s1 fLnSin36 = fLnSin36, fDx8, f0
4310 nop.i 0
4311 }
4312 ;;
4313 { .mfi
4314 nop.m 0
4315 // poly_lo = poly_lo * r + Q2
4316 fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
4317 nop.i 0
4318 }
4319 { .mfi
4320 nop.m 0
4321 fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
4322 nop.i 0
4323 }
4324 ;;
4325 { .mfi
4326 nop.m 0
4327 // poly_hi = Q1 * rsq + r
4328 fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
4329 nop.i 0
4330 }
4331 { .mfi
4332 nop.m 0
4333 fsub.s1 fB12 = fRes4H, fB14
4334 nop.i 0
4335 }
4336 ;;
4337 { .mfi
4338 nop.m 0
4339 // (-|x|*(ln(|x|)-1) - 0.5ln(|x|))lo
4340 fadd.s1 fRes4L = fRes4L, fRes3L
4341 nop.i 0
4342 }
4343 { .mfi
4344 nop.m 0
4345 fadd.s1 fBrnL = fBrnL, fB20 // (-C - S(1/x))lo
4346 nop.i 0
4347 }
4348 ;;
4349 { .mfi
4350 nop.m 0
4351 // high part of log(|DeltaX|) = Y_hi = N * log2_hi + H
4352 fma.s1 fLnDeltaH = fFloatNDx, FR_log2_hi, fHDx
4353 nop.i 0
4354 }
4355 { .mfi
4356 nop.m 0
4357 // h = N * log2_lo + h
4358 fma.s1 fhDx = fFloatNDx, FR_log2_lo, fhDx
4359 nop.i 0
4360 }
4361 ;;
4362 { .mfi
4363 nop.m 0
4364 fma.s1 fPolL = fRes2L, fDxSqr, fPolL
4365 nop.i 0
4366 }
4367 { .mfi
4368 nop.m 0
4369 fma.s1 fLnSin14 = fLnSin36, fDxSqr, fLnSin14
4370 nop.i 0
4371 }
4372 ;;
4373 { .mfi
4374 nop.m 0
4375 // (-|x|*(ln(|x|)-1) - 0.5ln(|x|))lo + (- C - S(1/x))lo
4376 fadd.s1 fBrnL = fBrnL, fRes4L
4377 nop.i 0
4378 }
4379 { .mfi
4380 nop.m 0
4381 fadd.s1 fB12 = fB12, fBrnH
4382 nop.i 0
4383 }
4384 ;;
4385 { .mfi
4386 nop.m 0
4387 // poly_lo = poly_lo*r^3 + h
4388 fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, fhDx
4389 nop.i 0
4390 }
4391 { .mfi
4392 nop.m 0
4393 fnma.s1 fRes1H = fLnDeltaH, f1, fPol//(-ln(|DeltaX|) + LnSin)hi
4394 nop.i 0
4395 }
4396 ;;
4397 { .mfi
4398 nop.m 0
4399 fma.s1 fPolL = fDxSqrL, fRes2L, fPolL
4400 nop.i 0
4401 }
4402 { .mfi
4403 nop.m 0
4404 fma.s1 fLnSin36 = fLnSin14, fDx6, f0
4405 nop.i 0
4406 }
4407 ;;
4408 { .mfi
4409 nop.m 0
4410 // (-|x|*(ln(|x|)-1) - 0.5ln(|x|) - C - S(1/x))lo
4411 fadd.s1 fB12 = fB12, fBrnL
4412 nop.i 0
4413 }
4414 ;;
4415 { .mfi
4416 nop.m 0
4417 // low part of log(|DeltaX|) = Y_lo = poly_hi + poly_lo
4418 fadd.s1 fLnDeltaL= FR_poly_hi, FR_poly_lo
4419 nop.i 0
4420 }
4421 { .mfi
4422 nop.m 0
4423 fms.s1 fRes1L = fLnDeltaH, fMOne, fRes1H
4424 nop.i 0
4425 }
4426 ;;
4427 { .mfi
4428 nop.m 0
4429 fadd.s1 fPolL = fPolL, fLnSin36
4430 nop.i 0
4431 }
4432 { .mfi
4433 nop.m 0
4434 //(-|x|*(ln(|x|)-1)-0.5ln(|x|) - C - S(1/x))hi + (-ln(|DeltaX|) + LnSin)hi
4435 fadd.s1 f8 = fRes1H, fB14
4436 nop.i 0
4437 }
4438 ;;
4439 { .mfi
4440 nop.m 0
4441 //max((-|x|*(ln(|x|)-1)-0.5ln(|x|) - C - S(1/x))hi,
4442 // (-ln(|DeltaX|) + LnSin)hi)
4443 famax.s1 fMaxNegStir = fRes1H, fB14
4444 nop.i 0
4445 }
4446 { .mfi
4447 nop.m 0
4448 //min((-|x|*(ln(|x|)-1)-0.5ln(|x|) - C - S(1/x))hi,
4449 // (-ln(|DeltaX|) + LnSin)hi)
4450 famin.s1 fMinNegStir = fRes1H, fB14
4451 nop.i 0
4452 }
4453 ;;
4454 { .mfi
4455 nop.m 0
4456 fadd.s1 fRes1L = fRes1L, fPol
4457 nop.i 0
4458 }
4459 { .mfi
4460 nop.m 0
4461 // (-ln(|DeltaX|))lo + (LnSin)lo
4462 fnma.s1 fPolL = fLnDeltaL, f1, fPolL
4463 nop.i 0
4464 }
4465 ;;
4466 { .mfi
4467 nop.m 0
4468 fsub.s1 f9 = fMaxNegStir, f8 // delta1
4469 nop.i 0
4470 }
4471 ;;
4472 { .mfi
4473 nop.m 0
4474 fadd.s1 fRes1L = fRes1L, fPolL // (-ln(|DeltaX|) + LnSin)lo
4475 nop.i 0
4476 }
4477 ;;
4478 { .mfi
4479 nop.m 0
4480 fadd.s1 f9 = f9, fMinNegStir
4481 nop.i 0
4482 }
4483 ;;
4484 { .mfi
4485 nop.m 0
4486 fadd.s1 fRes1L = fRes1L, fB12
4487 nop.i 0
4488 }
4489 ;;
4490 { .mfi
4491 // low part of the result
4492 fadd.s1 f9 = f9, fRes1L
4493 nop.i 0
4494 }
4495 ;;
4496 { .mfb
4497 nop.m 0
4498 // final result for -2^63 < x < -6.0 path
4499 fma.s0 f8 = f8, f1, f9
4500 // exit here for -2^63 < x < -6.0 path
4501 br.ret.sptk b0
4502 }
4503 ;;
4504
4505 // here if x falls in neighbourhood of any negative root
4506 // "neighbourhood" typically means that |lgammal(x)| < 0.17
4507 // on the [-3.0,-2.0] range |lgammal(x)| has even less
4508 // magnitude
4509 // rXint contains index of the root
4510 // p10 is set if root belongs to "right" ones
4511 // p11 is set if root belongs to "left" ones
4512 // lgammal(x) is approximated by polynomial of
4513 // 19th degree from (x - root) argument
//
// Table addressing used below: the entry for root i starts at
//   lgammal_right_roots_polynomial_data + i*208   (+ 3536 when p11 is set)
// where 208 = 16*(1 + 4 + 8) is built from shifts of rXint, and
// 3536 = 17*208. Offsets read inside one entry:
//   +0 A0,A0L   +16 A1,A1L   +32 A2,A2L   +48 A3   +64 A4   +80 A5
//   +96 A6,A7   +112 A8,A9   +128 A10,A11 +144 A12,A13
//   +160 A14,A15  +176 A16,A17  +192 A18,A19
// The root itself (fRoot) is loaded from rRootsBndAddr - 288.
//
// Evaluation, with t = x - root (FR_FracX):
//   A0 + A1*t + A2*t^2 is carried as a hi/lo pair (fRes3H/fRes3L),
//   A3 + A4*t + ... + A19*t^16 is formed in working precision in fPol
//   and folded in as fPol * t^3; the sum is returned in f8.
// signgam is stored the same way as on the other paths (p12/p13 choose
// the 4- or 8-byte store from rSgnGamSize).
4514 .align 32
4515 _negRoots:
4516 { .mfi
4517 addl rPolDataPtr= @ltoff(lgammal_right_roots_polynomial_data),gp
4518 nop.f 0
4519 shl rTmpPtr2 = rXint, 7 // (i*16)*8
4520 }
4521 { .mfi
4522 adds rRootsAddr = -288, rRootsBndAddr
4523 nop.f 0
4524 nop.i 0
4525 }
4526 ;;
4527 { .mfi
4528 ldfe fRoot = [rRootsAddr] // FP representation of root
4529 nop.f 0
4530 shl rTmpPtr = rXint, 6 // (i*16)*4
4531 }
4532 { .mfi
// extra displacement applied only for "left" roots
4533 (p11) adds rTmpPtr2 = 3536, rTmpPtr2
4534 nop.f 0
4535 nop.i 0
4536 }
4537 ;;
4538 { .mfi
4539 ld8 rPolDataPtr = [rPolDataPtr]
4540 nop.f 0
4541 shladd rTmpPtr = rXint, 4, rTmpPtr // (i*16) + (i*16)*4
4542 }
4543 { .mfi
4544 adds rTmpPtr3 = 32, rTmpPtr2
4545 nop.f 0
4546 nop.i 0
4547 }
4548 ;;
4549 .pred.rel "mutex",p10,p11
4550 { .mfi
// entry offset + 32: becomes the pointer to A2 once the table base is added
4551 add rTmpPtr3 = rTmpPtr, rTmpPtr3
4552 nop.f 0
4553 (p10) cmp.eq p8, p9 = rXRnd, r0
4554 }
4555 { .mfi
4556 // (i*16) + (i*16)*4 + (i*16)*8
4557 add rTmpPtr = rTmpPtr, rTmpPtr2
4558 nop.f 0
4559 (p11) cmp.eq p9, p8 = rXRnd, r0
4560 }
4561 ;;
4562 { .mfi
4563 add rTmpPtr2 = rPolDataPtr, rTmpPtr3
4564 nop.f 0
4565 nop.i 0
4566 }
4567 { .mfi
4568 add rPolDataPtr = rPolDataPtr, rTmpPtr // begin + offset
4569 nop.f 0
4570 nop.i 0
4571 }
4572 ;;
4573 { .mfi
4574 ldfpd fA0, fA0L = [rPolDataPtr], 16 // A0
4575 nop.f 0
// rTmpPtr2 is at entry+32 here, so rTmpPtr = entry+144 (A12, A13)
4576 adds rTmpPtr = 112, rTmpPtr2
4577 }
4578 { .mfi
4579 ldfpd fA2, fA2L = [rTmpPtr2], 16 // A2
4580 nop.f 0
4581 cmp.eq p12, p13 = 4, rSgnGamSize
4582 }
4583 ;;
4584 { .mfi
4585 ldfpd fA1, fA1L = [rPolDataPtr], 16 // A1
4586 nop.f 0
4587 nop.i 0
4588 }
4589 { .mfi
// post-increment of 128 moves rTmpPtr2 from entry+48 to entry+176
4590 ldfe fA3 = [rTmpPtr2], 128 // A3
4591 nop.f 0
4592 nop.i 0
4593 }
4594 ;;
4595 { .mfi
4596 ldfpd fA12, fA13 = [rTmpPtr], 16 // A12, A13
4597 nop.f 0
// rPolDataPtr is at entry+32 here, so rTmpPtr3 = entry+96 (A6, A7)
4598 adds rTmpPtr3 = 64, rPolDataPtr
4599 }
4600 { .mfi
4601 ldfpd fA16, fA17 = [rTmpPtr2], 16 // A16, A17
4602 nop.f 0
// rPolDataPtr moves on to entry+64 (A4)
4603 adds rPolDataPtr = 32, rPolDataPtr
4604 }
4605 ;;
4606 .pred.rel "mutex",p8,p9
4607 { .mfi
4608 ldfpd fA14, fA15 = [rTmpPtr], 16 // A14, A15
4609 nop.f 0
4610 // sign of GAMMA(x) is negative
4611 (p8) adds rSgnGam = -1, r0
4612 }
4613 { .mfi
4614 ldfpd fA18, fA19 = [rTmpPtr2], 16 // A18, A19
4615 nop.f 0
4616 // sign of GAMMA(x) is positive
4617 (p9) adds rSgnGam = 1, r0
4618 }
4619 ;;
4620 { .mfi
4621 ldfe fA4 = [rPolDataPtr], 16 // A4
4622 nop.f 0
4623 nop.i 0
4624 }
4625 { .mfi
4626 ldfpd fA6, fA7 = [rTmpPtr3], 16 // A6, A7
4627 nop.f 0
4628 nop.i 0
4629 }
4630 ;;
4631 { .mfi
4632 ldfe fA5 = [rPolDataPtr], 16 // A5
4633 // if x equals to (rounded) root exactly
4634 fcmp.eq.s1 p6, p0 = f8, fRoot
4635 nop.i 0
4636 }
4637 { .mfi
4638 ldfpd fA8, fA9 = [rTmpPtr3], 16 // A8, A9
// t = x - root; called "x" in the comments of the polynomial below
4639 fms.s1 FR_FracX = f8, f1, fRoot
4640 nop.i 0
4641 }
4642 ;;
4643 { .mfi
4644 // store signgam if size of variable is 4 bytes
4645 (p12) st4 [rSgnGamAddr] = rSgnGam
4646 nop.f 0
4647 nop.i 0
4648 }
4649 { .mfb
4650 // store signgam if size of variable is 8 bytes
4651 (p13) st8 [rSgnGamAddr] = rSgnGam
4652 // answer if x equals to (rounded) root exactly
4653 (p6) fadd.s0 f8 = fA0, fA0L
4654 // exit if x equals to (rounded) root exactly
4655 (p6) br.ret.spnt b0
4656 }
4657 ;;
4658 { .mmf
4659 ldfpd fA10, fA11 = [rTmpPtr3], 16 // A10, A11
4660 nop.m 0
4661 nop.f 0
4662 }
4663 ;;
4664 { .mfi
4665 nop.m 0
4666 fma.s1 fResH = fA2, FR_FracX, f0 // (A2*x)hi
4667 nop.i 0
4668 }
4669 { .mfi
4670 nop.m 0
4671 fma.s1 fA4L = FR_FracX, FR_FracX, f0 // x^2
4672 nop.i 0
4673 }
4674 ;;
// Coefficient pairs of the tail polynomial are first combined linearly
// (A17*x + A16, A13*x + A12, ...), then merged with x^2 and x^4.
4675 { .mfi
4676 nop.m 0
4677 fma.s1 fA17 = fA17, FR_FracX, fA16
4678 nop.i 0
4679 }
4680 {.mfi
4681 nop.m 0
4682 fma.s1 fA13 = fA13, FR_FracX, fA12
4683 nop.i 0
4684 }
4685 ;;
4686 { .mfi
4687 nop.m 0
4688 fma.s1 fA19 = fA19, FR_FracX, fA18
4689 nop.i 0
4690 }
4691 {.mfi
4692 nop.m 0
4693 fma.s1 fA15 = fA15, FR_FracX, fA14
4694 nop.i 0
4695 }
4696 ;;
4697 {.mfi
4698 nop.m 0
// fPol collects A3..A7 by Horner steps, starting from the A7 end
4699 fma.s1 fPol = fA7, FR_FracX, fA6
4700 nop.i 0
4701 }
4702 ;;
4703 {.mfi
4704 nop.m 0
4705 fma.s1 fA9 = fA9, FR_FracX, fA8
4706 nop.i 0
4707 }
4708 ;;
4709 { .mfi
4710 nop.m 0
4711 fms.s1 fResL = fA2, FR_FracX, fResH // delta(A2*x)
4712 nop.i 0
4713 }
4714 {.mfi
4715 nop.m 0
4716 fadd.s1 fRes1H = fResH, fA1 // (A2*x + A1)hi
4717 nop.i 0
4718 }
4719 ;;
4720 { .mfi
4721 nop.m 0
4722 fma.s1 fA11 = fA11, FR_FracX, fA10
4723 nop.i 0
4724 }
4725 {.mfi
4726 nop.m 0
4727 fma.s1 fA5L = fA4L, fA4L, f0 // x^4
4728 nop.i 0
4729 }
4730 ;;
4731 { .mfi
4732 nop.m 0
4733 fma.s1 fA19 = fA19, fA4L, fA17
4734 nop.i 0
4735 }
4736 {.mfi
4737 nop.m 0
4738 fma.s1 fA15 = fA15, fA4L, fA13
4739 nop.i 0
4740 }
4741 ;;
4742 { .mfi
4743 nop.m 0
4744 fma.s1 fPol = fPol, FR_FracX, fA5
4745 nop.i 0
4746 }
4747 {.mfi
4748 nop.m 0
4749 fma.s1 fA3L = fA4L, FR_FracX, f0 // x^3
4750 nop.i 0
4751 }
4752 ;;
4753 { .mfi
4754 nop.m 0
4755 // delta(A2*x) + A2L*x = (A2*x)lo
4756 fma.s1 fResL = fA2L, FR_FracX, fResL
4757 nop.i 0
4758 }
4759 {.mfi
4760 nop.m 0
4761 fsub.s1 fRes1L = fA1, fRes1H
4762 nop.i 0
4763 }
4764 ;;
4765 { .mfi
4766 nop.m 0
4767 fma.s1 fA11 = fA11, fA4L, fA9
4768 nop.i 0
4769 }
4770 {.mfi
4771 nop.m 0
4772 fma.s1 fA19 = fA19, fA5L, fA15
4773 nop.i 0
4774 }
4775 ;;
4776 {.mfi
4777 nop.m 0
4778 fma.s1 fPol = fPol, FR_FracX, fA4
4779 nop.i 0
4780 }
4781 ;;
4782 { .mfi
4783 nop.m 0
4784 fadd.s1 fResL = fResL, fA1L // (A2*x)lo + A1L
4785 nop.i 0
4786 }
4787 {.mfi
4788 nop.m 0
4789 fadd.s1 fRes1L = fRes1L, fResH
4790 nop.i 0
4791 }
4792 ;;
4793 { .mfi
4794 nop.m 0
4795 fma.s1 fRes2H = fRes1H, FR_FracX, f0 // ((A2*x + A1)*x)hi
4796 nop.i 0
4797 }
4798 ;;
4799 {.mfi
4800 nop.m 0
4801 fma.s1 fA19 = fA19, fA5L, fA11
4802 nop.i 0
4803 }
4804 ;;
4805 {.mfi
4806 nop.m 0
4807 fma.s1 fPol = fPol, FR_FracX, fA3
4808 nop.i 0
4809 }
4810 ;;
4811 { .mfi
4812 nop.m 0
4813 fadd.s1 fRes1L = fRes1L, fResL // (A2*x + A1)lo
4814 nop.i 0
4815 }
4816 ;;
4817 { .mfi
4818 nop.m 0
4819 // delta((A2*x + A1)*x)
4820 fms.s1 fRes2L = fRes1H, FR_FracX, fRes2H
4821 nop.i 0
4822 }
4823 {.mfi
4824 nop.m 0
4825 fadd.s1 fRes3H = fRes2H, fA0 // ((A2*x + A1)*x + A0)hi
4826 nop.i 0
4827 }
4828 ;;
4829 { .mfi
4830 nop.m 0
// (A8 + ... + A19*x^11) * x^4; the following multiply by x makes it x^5
4831 fma.s1 fA19 = fA19, fA5L, f0
4832 nop.i 0
4833 }
4834
4835 ;;
4836 { .mfi
4837 nop.m 0
4838 fma.s1 fRes2L = fRes1L, FR_FracX, fRes2L // ((A2*x + A1)*x)lo
4839 nop.i 0
4840 }
4841 {.mfi
4842 nop.m 0
4843 fsub.s1 fRes3L = fRes2H, fRes3H
4844 nop.i 0
4845 }
4846 ;;
4847 {.mfi
4848 nop.m 0
// fPol = A3 + A4*x + ... + A19*x^16
4849 fma.s1 fPol = fA19, FR_FracX, fPol
4850 nop.i 0
4851 }
4852 ;;
4853 { .mfi
4854 nop.m 0
4855 fadd.s1 fRes3L = fRes3L, fA0
4856 nop.i 0
4857 }
4858 {.mfi
4859 nop.m 0
4860 fadd.s1 fRes2L = fRes2L, fA0L // ((A2*x + A1)*x)lo + A0L
4861 nop.i 0
4862 }
4863 ;;
4864 { .mfi
4865 nop.m 0
4866 fadd.s1 fRes3L = fRes3L, fRes2L // (((A2*x + A1)*x) + A0)lo
4867 nop.i 0
4868 }
4869 ;;
4870 {.mfi
4871 nop.m 0
// add the tail: fPol * x^3 goes into the low part
4872 fma.s1 fRes3L = fPol, fA3L, fRes3L
4873 nop.i 0
4874 }
4875 ;;
4876 { .mfb
4877 nop.m 0
4878 // final result for arguments which are close to negative roots
4879 fma.s0 f8 = fRes3H, f1, fRes3L
4880 // exit here for arguments which are close to negative roots
4881 br.ret.sptk b0
4882 }
4883 ;;
4884
4885 // here if |x| < 0.5
4886 .align 32
4887 lgammal_0_half:
4888 { .mfi
4889 ld4 GR_Z_1 = [rZ1offsett] // Load Z_1
4890 fma.s1 fA4L = f8, f8, f0 // x^2
4891 addl rPolDataPtr = @ltoff(lgammal_0_Half_data), gp
4892 }
4893 { .mfi
4894 shladd GR_ad_tbl_1 = GR_Index1, 4, rTbl1Addr// Point to G_1
4895 nop.f 0
4896 addl rLnSinDataPtr = @ltoff(lgammal_lnsin_data), gp
4897 }
4898 ;;
4899 { .mfi
4900 ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
4901 nop.f 0
4902 // Point to Constants_Z_2
4903 add GR_ad_z_2 = 0x140, GR_ad_z_1
4904 }
4905 { .mfi
4906 add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_Q
4907 nop.f 0
4908 // Point to Constants_G_H_h2
4909 add GR_ad_tbl_2 = 0x180, GR_ad_z_1
4910 }
4911 ;;
4912 { .mfi
4913 ld8 rPolDataPtr = [rPolDataPtr]
4914 nop.f 0
4915 // Point to Constants_G_H_h3
4916 add GR_ad_tbl_3 = 0x280, GR_ad_z_1
4917 }
4918 { .mfi
4919 ldfd FR_h = [GR_ad_tbl_1] // Load h_1
4920 nop.f 0
4921 sub GR_N = rExpX, rExpHalf, 1
4922 }
4923 ;;
4924 { .mfi
4925 ld8 rLnSinDataPtr = [rLnSinDataPtr]
4926 nop.f 0
4927 // Get bits 30-15 of X_0 * Z_1
4928 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15
4929 }
4930 { .mfi
4931 ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
4932 nop.f 0
4933 sub GR_N = r0, GR_N
4934 }
4935 ;;
4936 //
4937 // For performance, don't use result of pmpyshr2.u for 4 cycles.
4938 //
4939 { .mfi
4940 ldfe FR_log2_lo = [GR_ad_q], 16 // Load log2_lo
4941 nop.f 0
4942 add rTmpPtr2 = 320, rPolDataPtr
4943 }
4944 { .mfi
4945 add rTmpPtr = 32, rPolDataPtr
4946 nop.f 0
4947 // exponent of 0.25
4948 adds rExp2 = -1, rExpHalf
4949 }
4950 ;;
4951 { .mfi
4952 ldfpd fA3, fA3L = [rPolDataPtr], 16 // A3
4953 fma.s1 fA5L = fA4L, fA4L, f0 // x^4
4954 nop.i 0
4955 }
4956 { .mfi
4957 ldfpd fA1, fA1L = [rTmpPtr], 16 // A1
4958 fms.s1 fB8 = f8, f8, fA4L // x^2 - <x^2>
4959 // set p6 if -0.5 < x <= -0.25
4960 (p15) cmp.eq.unc p6, p0 = rExpX, rExp2
4961 }
4962 ;;
4963 { .mfi
4964 ldfpd fA2, fA2L = [rPolDataPtr], 16 // A2
4965 nop.f 0
4966 // set p6 if -0.5 < x <= -0.40625
4967 (p6) cmp.le.unc p6, p0 = 10, GR_Index1
4968 }
4969 { .mfi
4970 ldfe fA21 = [rTmpPtr2], -16 // A21
4971 // Put integer N into rightmost significand
4972 nop.f 0
4973 adds rTmpPtr = 240, rTmpPtr
4974 }
4975 ;;
4976 { .mfi
4977 setf.sig fFloatN = GR_N
4978 nop.f 0
4979 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
4980 }
4981 { .mfi
4982 ldfe FR_Q4 = [GR_ad_q], 16 // Load Q4
4983 nop.f 0
4984 adds rPolDataPtr = 304, rPolDataPtr
4985 }
4986 ;;
4987 { .mfi
4988 ldfe fA20 = [rTmpPtr2], -32 // A20
4989 nop.f 0
4990 shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2
4991 }
4992 { .mfi
4993 ldfe fA19 = [rTmpPtr], -32 // A19
4994 nop.f 0
4995 shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2// Point to G_2
4996 }
4997 ;;
4998 { .mfi
4999 ldfe fA17 = [rTmpPtr], -32 // A17
5000 nop.f 0
5001 adds rTmpPtr3 = 8, GR_ad_tbl_2
5002 }
5003 { .mfb
5004 ldfe fA18 = [rTmpPtr2], -32 // A18
5005 nop.f 0
5006 // branch to special path for -0.5 < x <= -0.40625
5007 (p6) br.cond.spnt lgammal_near_neg_half
5008 }
5009 ;;
5010 { .mmf
5011 ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
5012 ldfe fA15 = [rTmpPtr], -32 // A15
5013 fma.s1 fB20 = fA5L, fA5L, f0 // x^8
5014 }
5015 ;;
5016 { .mmf
5017 ldfe fA16 = [rTmpPtr2], -32 // A16
5018 ldfe fA13 = [rTmpPtr], -32 // A13
5019 fms.s1 fB16 = fA4L, fA4L, fA5L
5020 }
5021 ;;
5022 { .mmf
5023 ldfps FR_G2, FR_H2 = [GR_ad_tbl_2], 8 // Load G_2, H_2
5024 ldfd FR_h2 = [rTmpPtr3] // Load h_2
5025 fmerge.s fB10 = f8, fA5L // sign(x) * x^4
5026 }
5027 ;;
5028 { .mmi
5029 ldfe fA14 = [rTmpPtr2], -32 // A14
5030 ldfe fA11 = [rTmpPtr], -32 // A11
5031 // Get bits 30-15 of X_1 * Z_2
5032 pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15
5033 }
5034 ;;
5035 //
5036 // For performance, don't use result of pmpyshr2.u for 4 cycles.
5037 //
5038 { .mfi
5039 ldfe fA12 = [rTmpPtr2], -32 // A12
5040 fma.s1 fRes4H = fA3, fAbsX, f0
5041 adds rTmpPtr3 = 16, GR_ad_q
5042 }
5043 { .mfi
5044 ldfe fA9 = [rTmpPtr], -32 // A9
5045 nop.f 0
5046 nop.i 0
5047 }
5048 ;;
5049 { .mmf
5050 ldfe fA10 = [rTmpPtr2], -32 // A10
5051 ldfe fA7 = [rTmpPtr], -32 // A7
5052 fma.s1 fB18 = fB20, fB20, f0 // x^16
5053 }
5054 ;;
5055 { .mmf
5056 ldfe fA8 = [rTmpPtr2], -32 // A8
5057 ldfe fA22 = [rPolDataPtr], 16 // A22
5058 fcvt.xf fFloatN = fFloatN
5059 }
5060 ;;
5061 { .mfi
5062 ldfe fA5 = [rTmpPtr], -32 // A5
5063 fma.s1 fA21 = fA21, fAbsX, fA20 // v16
5064 extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
5065 }
5066 { .mfi
5067 ldfe fA6 = [rTmpPtr2], -32 // A6
5068 nop.f 0
5069 nop.i 0
5070 }
5071 ;;
5072 { .mmf
5073 // Point to G_3
5074 shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3
5075 ldfe fA4 = [rTmpPtr2], -32 // A4
5076 fma.s1 fA19 = fA19, fAbsX, fA18 // v13
5077 }
5078 ;;
5079 .pred.rel "mutex",p14,p15
5080 { .mfi
5081 ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3
5082 fms.s1 fRes4L = fA3, fAbsX, fRes4H
5083 (p14) adds rSgnGam = 1, r0
5084 }
5085 { .mfi
5086 cmp.eq p6, p7 = 4, rSgnGamSize
5087 fadd.s1 fRes2H = fRes4H, fA2
5088 (p15) adds rSgnGam = -1, r0
5089 }
5090 ;;
5091
5092 { .mfi
5093 ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
5094 fma.s1 fA17 = fA17, fAbsX, fA16 // v12
5095 nop.i 0
5096 }
5097 ;;
5098 { .mfi
5099 ldfe FR_Q3 = [GR_ad_q], 32 // Load Q3
5100 fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
5101 nop.i 0
5102 }
5103 { .mfi
5104 ldfe FR_Q2 = [rTmpPtr3], 16 // Load Q2
5105 fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
5106 nop.i 0
5107 }
5108 ;;
5109 { .mfi
5110 ldfe FR_Q1 = [GR_ad_q] // Load Q1
5111 fma.s1 fA15 = fA15, fAbsX, fA14 // v8
5112 nop.i 0
5113 }
5114 { .mfi
5115 adds rTmpPtr3 = 32, rLnSinDataPtr
5116 fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
5117 nop.i 0
5118 }
5119 ;;
5120 { .mmf
5121 ldfpd fLnSin2, fLnSin2L = [rLnSinDataPtr], 16
5122 ldfe fLnSin6 = [rTmpPtr3], 32
5123 fma.s1 fA13 = fA13, fAbsX, fA12 // v7
5124
5125 }
5126 ;;
5127 { .mfi
5128 ldfe fLnSin4 = [rLnSinDataPtr], 32
5129 fma.s1 fRes4L = fA3L, fAbsX, fRes4L
5130 nop.i 0
5131 }
5132 { .mfi
5133 ldfe fLnSin10 = [rTmpPtr3], 32
5134 fsub.s1 fRes2L = fA2, fRes2H
5135 nop.i 0
5136 }
5137 ;;
5138 { .mfi
5139 ldfe fLnSin8 = [rLnSinDataPtr], 32
5140 fma.s1 fResH = fRes2H, fAbsX, f0
5141 nop.i 0
5142 }
5143 { .mfi
5144 ldfe fLnSin14 = [rTmpPtr3], 32
5145 fma.s1 fA22 = fA22, fA4L, fA21 // v15
5146 nop.i 0
5147 }
5148 ;;
5149 { .mfi
5150 ldfe fLnSin12 = [rLnSinDataPtr], 32
5151 fma.s1 fA9 = fA9, fAbsX, fA8 // v4
5152 nop.i 0
5153 }
5154 { .mfi
5155 ldfd fLnSin18 = [rTmpPtr3], 16
5156 fma.s1 fA11 = fA11, fAbsX, fA10 // v5
5157 nop.i 0
5158 }
5159 ;;
5160 { .mfi
5161 ldfe fLnSin16 = [rLnSinDataPtr], 24
5162 fma.s1 fA19 = fA19, fA4L, fA17 // v11
5163 nop.i 0
5164 }
5165 { .mfi
5166 ldfd fLnSin22 = [rTmpPtr3], 16
5167 fma.s1 fPolL = fA7, fAbsX, fA6
5168 nop.i 0
5169 }
5170 ;;
5171 { .mfi
5172 ldfd fLnSin20 = [rLnSinDataPtr], 16
5173 fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
5174 nop.i 0
5175 }
5176 { .mfi
5177 ldfd fLnSin26 = [rTmpPtr3], 16
5178 fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
5179 nop.i 0
5180 }
5181 ;;
5182 { .mfi
5183 ldfd fLnSin24 = [rLnSinDataPtr], 16
5184 fadd.s1 fRes2L = fRes2L, fRes4H
5185 nop.i 0
5186 }
5187 { .mfi
5188 ldfd fLnSin30 = [rTmpPtr3], 16
5189 fadd.s1 fA2L = fA2L, fRes4L
5190 nop.i 0
5191 }
5192 ;;
5193 { .mfi
5194 ldfd fLnSin28 = [rLnSinDataPtr], 16
5195 fms.s1 fResL = fRes2H, fAbsX, fResH
5196 nop.i 0
5197 }
5198 { .mfi
5199 ldfd fLnSin34 = [rTmpPtr3], 8
5200 fadd.s1 fRes2H = fResH, fA1
5201 nop.i 0
5202 }
5203 ;;
5204 { .mfi
5205 ldfd fLnSin32 = [rLnSinDataPtr]
5206 fma.s1 fA11 = fA11, fA4L, fA9 // v3
5207 nop.i 0
5208 }
5209 { .mfi
5210 ldfd fLnSin36 = [rTmpPtr3]
5211 fma.s1 fA15 = fA15, fA4L, fA13 // v6
5212 nop.i 0
5213 }
5214 ;;
5215
5216 { .mfi
5217 // store signgam if size of variable is 4 bytes
5218 (p6) st4 [rSgnGamAddr] = rSgnGam
5219 fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
5220 nop.i 0
5221 }
5222 { .mfi
5223 // store signgam if size of variable is 8 bytes
5224 (p7) st8 [rSgnGamAddr] = rSgnGam
5225 fma.s1 fA5 = fA5, fAbsX, fA4
5226 nop.i 0
5227 }
5228 ;;
5229 { .mfi
5230 nop.m 0
5231 fms.s1 FR_r = FR_G, fSignifX, f1 // r = G * S_hi - 1
5232 nop.i 0
5233 }
5234 { .mfi
5235 nop.m 0
5236 // High part of the log(|x|): Y_hi = N * log2_hi + H
5237 fms.s1 FR_log2_hi = fFloatN, FR_log2_hi, FR_H
5238 nop.i 0
5239 }
5240 ;;
5241 { .mfi
5242 nop.m 0
5243 fadd.s1 fA3L = fRes2L, fA2L
5244 nop.i 0
5245 }
5246 { .mfi
5247 nop.m 0
5248 fma.s1 fA22 = fA22, fA5L, fA19
5249 nop.i 0
5250 }
5251 ;;
5252 { .mfi
5253 nop.m 0
5254 fsub.s1 fRes2L = fA1, fRes2H
5255 nop.i 0
5256 }
5257 { .mfi
5258 nop.m 0
5259 fma.s1 fRes3H = fRes2H, f8, f0
5260 nop.i 0
5261 }
5262 ;;
5263 { .mfi
5264 nop.m 0
5265 fma.s1 fA15 = fA15, fA5L, fA11 // v2
5266 nop.i 0
5267 }
5268 { .mfi
5269 nop.m 0
5270 fma.s1 fLnSin18 = fLnSin18, fA4L, fLnSin16
5271 nop.i 0
5272 }
5273 ;;
5274 { .mfi
5275 nop.m 0
5276 // h = N * log2_lo + h
5277 fms.s1 FR_h = fFloatN, FR_log2_lo, FR_h
5278 nop.i 0
5279 }
5280 { .mfi
5281 nop.m 0
5282 fma.s1 fPolL = fPolL, fA4L, fA5
5283 nop.i 0
5284 }
5285 ;;
5286 { .mfi
5287 nop.m 0
5288 // poly_lo = r * Q4 + Q3
5289 fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3
5290 nop.i 0
5291 }
5292 { .mfi
5293 nop.m 0
5294 fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
5295 nop.i 0
5296 }
5297 ;;
5298 { .mfi
5299 nop.m 0
5300 fma.s1 fResL = fA3L, fAbsX, fResL
5301 nop.i 0
5302 }
5303 { .mfi
5304 nop.m 0
5305 fma.s1 fLnSin30 = fLnSin30, fA4L, fLnSin28
5306 nop.i 0
5307 }
5308 ;;
5309 { .mfi
5310 nop.m 0
5311 fadd.s1 fRes2L = fRes2L, fResH
5312 nop.i 0
5313 }
5314 { .mfi
5315 nop.m 0
5316 fms.s1 fRes3L = fRes2H, f8, fRes3H
5317 nop.i 0
5318 }
5319 ;;
5320 { .mfi
5321 nop.m 0
5322 fadd.s1 fRes1H = fRes3H, FR_log2_hi
5323 nop.i 0
5324 }
5325 { .mfi
5326 nop.m 0
5327 fma.s1 fPol = fB20, fA22, fA15
5328 nop.i 0
5329 }
5330 ;;
5331 { .mfi
5332 nop.m 0
5333 fma.s1 fLnSin34 = fLnSin34, fA4L, fLnSin32
5334 nop.i 0
5335 }
5336 { .mfi
5337 nop.m 0
5338 fma.s1 fLnSin14 = fLnSin14, fA4L, fLnSin12
5339 nop.i 0
5340 }
5341 ;;
5342
5343 { .mfi
5344 nop.m 0
5345 // poly_lo = poly_lo * r + Q2
5346 fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
5347 nop.i 0
5348 }
5349 { .mfi
5350 nop.m 0
5351 fnma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
5352 nop.i 0
5353 }
5354 ;;
5355 { .mfi
5356 nop.m 0
5357 // poly_hi = Q1 * rsq + r
5358 fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
5359 nop.i 0
5360 }
5361 { .mfi
5362 nop.m 0
5363 fadd.s1 fA1L = fA1L, fResL
5364 nop.i 0
5365 }
5366 ;;
5367
5368 { .mfi
5369 nop.m 0
5370 fma.s1 fLnSin22 = fLnSin22, fA4L, fLnSin20
5371 nop.i 0
5372 }
5373 { .mfi
5374 nop.m 0
5375 fma.s1 fLnSin26 = fLnSin26, fA4L, fLnSin24
5376 nop.i 0
5377 }
5378 ;;
5379
5380 { .mfi
5381 nop.m 0
5382 fsub.s1 fRes1L = FR_log2_hi, fRes1H
5383 nop.i 0
5384 }
5385 { .mfi
5386 nop.m 0
5387 fma.s1 fPol = fPol, fA5L, fPolL
5388 nop.i 0
5389 }
5390 ;;
5391 { .mfi
5392 nop.m 0
5393 fma.s1 fLnSin34 = fLnSin36, fA5L, fLnSin34
5394 nop.i 0
5395 }
5396 { .mfi
5397 nop.m 0
5398 fma.s1 fLnSin18 = fLnSin18, fA5L, fLnSin14
5399 nop.i 0
5400 }
5401 ;;
5402 { .mfi
5403 nop.m 0
5404 fma.s1 fLnSin6 = fLnSin6, fA4L, fLnSin4
5405 nop.i 0
5406 }
5407 { .mfi
5408 nop.m 0
5409 fma.s1 fLnSin10 = fLnSin10, fA4L, fLnSin8
5410 nop.i 0
5411 }
5412 ;;
5413 { .mfi
5414 nop.m 0
5415 // poly_hi = Q1 * rsq + r
5416 fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
5417 nop.i 0
5418 }
5419 { .mfi
5420 nop.m 0
5421 fadd.s1 fRes2L = fRes2L, fA1L
5422 nop.i 0
5423 }
5424 ;;
5425 { .mfi
5426 nop.m 0
5427 // poly_lo = poly_lo*r^3 + h
5428 fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h
5429 nop.i 0
5430 }
5431 { .mfi
5432 nop.m 0
5433 fma.s1 fB2 = fLnSin2, fA4L, f0
5434 nop.i 0
5435 }
5436 ;;
5437 { .mfi
5438 nop.m 0
5439 fadd.s1 fRes1L = fRes1L, fRes3H
5440 nop.i 0
5441 }
5442 { .mfi
5443 nop.m 0
5444 fma.s1 fPol = fPol, fB10, f0
5445 nop.i 0
5446 }
5447 ;;
5448 { .mfi
5449 nop.m 0
5450 fma.s1 fLnSin26 = fLnSin26, fA5L, fLnSin22
5451 nop.i 0
5452 }
5453 { .mfi
5454 nop.m 0
5455 fma.s1 fLnSin34 = fLnSin34, fA5L, fLnSin30
5456 nop.i 0
5457 }
5458 ;;
5459 { .mfi
5460 nop.m 0
5461 fma.s1 fLnSin10 = fLnSin10, fA5L, fLnSin6
5462 nop.i 0
5463 }
5464 { .mfi
5465 nop.m 0
5466 fma.s1 fLnSin2L = fLnSin2L, fA4L, f0
5467 nop.i 0
5468 }
5469 ;;
5470
5471 { .mfi
5472 nop.m 0
5473 fma.s1 fRes3L = fRes2L, f8, fRes3L
5474 nop.i 0
5475 }
5476 ;;
5477 { .mfi
5478 nop.m 0
5479 // Y_lo = poly_hi + poly_lo
5480 fsub.s1 FR_log2_lo = FR_poly_lo, FR_poly_hi
5481 nop.i 0
5482 }
5483 { .mfi
5484 nop.m 0
5485 fms.s1 fB4 = fLnSin2, fA4L, fB2
5486 nop.i 0
5487 }
5488 ;;
5489 { .mfi
5490 nop.m 0
5491 fadd.s1 fRes2H = fRes1H, fPol
5492 nop.i 0
5493 }
5494 ;;
5495 { .mfi
5496 nop.m 0
5497 fma.s1 fLnSin34 = fLnSin34, fB20, fLnSin26
5498 nop.i 0
5499 }
5500 ;;
5501 { .mfi
5502 nop.m 0
5503 fma.s1 fLnSin18 = fLnSin18, fB20, fLnSin10
5504 nop.i 0
5505 }
5506 { .mfi
5507 nop.m 0
5508 fma.s1 fLnSin2L = fB8, fLnSin2, fLnSin2L
5509 nop.i 0
5510 }
5511 ;;
5512
5513 { .mfi
5514 nop.m 0
5515 fadd.s1 FR_log2_lo = FR_log2_lo, fRes3L
5516 nop.i 0
5517 }
5518 ;;
5519 { .mfi
5520 nop.m 0
5521 fsub.s1 fRes2L = fRes1H, fRes2H
5522 nop.i 0
5523 }
5524 ;;
5525 { .mfi
5526 nop.m 0
5527 fma.s1 fB6 = fLnSin34, fB18, fLnSin18
5528 nop.i 0
5529 }
5530 { .mfi
5531 nop.m 0
5532 fadd.s1 fB4 = fLnSin2L, fB4
5533 nop.i 0
5534 }
5535 ;;
5536
5537 { .mfi
5538 nop.m 0
5539 fadd.s1 fRes1L = fRes1L, FR_log2_lo
5540 nop.i 0
5541 }
5542 ;;
5543 { .mfi
5544 nop.m 0
5545 fadd.s1 fRes2L = fRes2L, fPol
5546 nop.i 0
5547 }
5548 ;;
5549 { .mfi
5550 nop.m 0
5551 fma.s1 fB12 = fB6, fA5L, f0
5552 nop.i 0
5553 }
5554 ;;
5555 { .mfi
5556 nop.m 0
5557 fadd.s1 fRes2L = fRes2L, fRes1L
5558 nop.i 0
5559 }
5560 ;;
5561
5562 { .mfi
5563 nop.m 0
5564 fms.s1 fB14 = fB6, fA5L, fB12
5565 nop.i 0
5566 }
5567 { .mfb
5568 nop.m 0
5569 fadd.s1 fLnSin30 = fB2, fB12
5570 // branch out if x is negative
5571 (p15) br.cond.spnt _O_Half_neg
5572 }
5573 ;;
5574 { .mfb
5575 nop.m 0
5576 // sign(x)*Pol(|x|) - log(|x|)
5577 fma.s0 f8 = fRes2H, f1, fRes2L
5578 // it's an answer already for positive x
5579 // exit if 0 < x < 0.5
5580 br.ret.sptk b0
5581 }
5582 ;;
5583
5584 // here if x is negative and |x| < 0.5
// Reached through the (p15) branch at the end of the |x| < 0.5 code.
// Values already prepared by that code and consumed here:
//   fRes2H, fRes2L - high/low pair that the positive path returns as a sum
//   fLnSin30       - fB2 + fB12 (rounded)
//   fB2, fB4, fB6, fB12, fB14, fB16 - partial terms of an additional sum
// This tail adds fLnSin30 to fRes2H as the high part and collects the
// rounding errors and the remaining small terms into a low part, then
// returns f8 = fResH + fResL.
// NOTE(review): the fLnSin*/fB* names suggest these terms belong to the
// correction applied for negative arguments - confirm against the
// algorithm description at the top of the file.
5585 .align 32
5586 _O_Half_neg:
5587 { .mfi
5588 nop.m 0
5589 fma.s1 fB14 = fB16, fB6, fB14 // fB14 += fB16*fB6
5590 nop.i 0
5591 }
5592 { .mfi
5593 nop.m 0
5594 fsub.s1 fLnSin16 = fB2, fLnSin30 // start of error term of fB2 + fB12
5595 nop.i 0
5596 }
5597 ;;
5598 { .mfi
5599 nop.m 0
5600 fadd.s1 fResH = fLnSin30, fRes2H // high part of the result
5601 nop.i 0
5602 }
5603 ;;
5604 { .mfi
5605 nop.m 0
5606 fadd.s1 fLnSin16 = fLnSin16, fB12 // (fB2 - fLnSin30) + fB12
5607 nop.i 0
5608 }
5609 { .mfi
5610 nop.m 0
5611 fadd.s1 fB4 = fB14, fB4 // merge the two small terms
5612 nop.i 0
5613 }
5614 ;;
5615 { .mfi
5616 nop.m 0
5617 fadd.s1 fLnSin16 = fB4, fLnSin16 // low-order accumulator
5618 nop.i 0
5619 }
5620 { .mfi
5621 nop.m 0
5622 fsub.s1 fResL = fRes2H, fResH // start of error term of fResH
5623 nop.i 0
5624 }
5625 ;;
5626 { .mfi
5627 nop.m 0
5628 fadd.s1 fResL = fResL, fLnSin30 // (fRes2H - fResH) + fLnSin30
5629 nop.i 0
5630 }
5631 { .mfi
5632 nop.m 0
5633 fadd.s1 fLnSin16 = fLnSin16, fRes2L // add low part of the incoming pair
5634 nop.i 0
5635 }
5636 ;;
5637 { .mfi
5638 nop.m 0
5639 fadd.s1 fResL = fResL, fLnSin16 // complete low part
5640 nop.i 0
5641 }
5642 ;;
5643 { .mfb
5644 nop.m 0
5645 // final result for -0.5 < x < 0
// (only this last operation uses status field s0; all of the above use s1)
5646 fma.s0 f8 = fResH, f1, fResL
5647 // exit for -0.5 < x < 0
5648 br.ret.sptk b0
5649 }
5650 ;;
5651
5652 // here if x >= 8.0
5653 // there are two computational paths:
5654 // 1) For x >10.0 Stirling's formula is used
//      result = x*(ln(x) - 1) - 0.5*ln(x) + C + S(1/x)
//      S(1/x) = B2/x + (1/x^3)*(B4 + B6*w + B8*w^2 + ... + B20*w^8), w = 1/x^2
5655 // 2) Polynomial approximation for 8.0 <= x <= 10.0
//      (handled at lgammal_8_10, branched to from here)
//
// Three independent computations are interleaved below:
//   - refinement of 1/x by three Newton-Raphson steps, starting from the
//     approximation that is already in fRcpX on entry
//   - table-driven ln(x): x = 2^N * S, r = G*S - 1 with G = G_1*G_2*G_3,
//     ln(x) = N*log2_hi + H  (high part)
//           + poly_hi + poly_lo (low part, poly_lo includes N*log2_lo + h)
//   - loading of the Stirling constants C, B2..B20 through rPolDataPtr
// Used but not set in this block (expected from the code that branches here):
// f8 = x, fRcpX, rSignifX, rExpX, rExp8, rExpHalf, GR_Index1, GR_ad_z_1,
// rTbl1Addr, rSgnGamSize, rSgnGamAddr.
5656 .align 32
5657 lgammal_big_positive:
5658 { .mfi
5659 addl rPolDataPtr = @ltoff(lgammal_data), gp
// fSignifX = significand of x combined with the sign and exponent of 1.0
5660 fmerge.se fSignifX = f1, f8
5661 // Get high 15 bits of significand
5662 extr.u GR_X_0 = rSignifX, 49, 15
5663 }
5664 {.mfi
5665 shladd rZ1offsett = GR_Index1, 2, GR_ad_z_1 // Point to Z_1
5666 fnma.s1 fInvX = f8, fRcpX, f1 // start of 1st NR iteration
5667 adds rSignif1andQ = 0x5, r0
5668 }
5669 ;;
5670 {.mfi
5671 ld4 GR_Z_1 = [rZ1offsett] // Load Z_1
5672 nop.f 0
5673 shl rSignif1andQ = rSignif1andQ, 61 // significand of 1.25
5674 }
5675 { .mfi
5676 cmp.eq p8, p0 = rExpX, rExp8 // p8 = 1 if 8.0 <= x < 16
5677 nop.f 0
5678 adds rSgnGam = 1, r0 // gamma is positive at this range
5679 }
5680 ;;
5681 { .mfi
5682 shladd GR_ad_tbl_1 = GR_Index1, 4, rTbl1Addr// Point to G_1
5683 nop.f 0
5684 add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_Q
5685 }
5686 { .mlx
5687 ld8 rPolDataPtr = [rPolDataPtr]
// 0x3FF2000000000000 is 1.125 as a double; used only by lgammal_8_10
5688 movl rDelta = 0x3FF2000000000000
5689 }
5690 ;;
5691 { .mfi
5692 ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
5693 nop.f 0
5694 add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2
5695 }
5696 { .mfi
5697 // Point to Constants_G_H_h2
5698 add GR_ad_tbl_2 = 0x180, GR_ad_z_1
5699 nop.f 0
5700 // p8 = 1 if 8.0 <= x <= 10.0
// (exponent already matched 8.0; now compare the significand with 1.25)
5701 (p8) cmp.leu.unc p8, p0 = rSignifX, rSignif1andQ
5702 }
5703 ;;
5704 { .mfi
5705 ldfd FR_h = [GR_ad_tbl_1] // Load h_1
5706 nop.f 0
5707 // Get bits 30-15 of X_0 * Z_1
5708 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15
5709 }
5710 { .mfb
5711 (p8) setf.d FR_MHalf = rDelta // FR_MHalf = 1.125 for lgammal_8_10
5712 nop.f 0
5713 (p8) br.cond.spnt lgammal_8_10 // branch out if 8.0 <= x <= 10.0
5714 }
5715 ;;
5716 //
5717 // For performance, don't use result of pmpyshr2.u for 4 cycles.
5718 //
// From here on: x > 10.0, Stirling's formula
5719 { .mfi
5720 ldfe fA1 = [rPolDataPtr], 16 // Load overflow threshold
5721 fma.s1 fRcpX = fInvX, fRcpX, fRcpX // end of 1st NR iteration
5722 // Point to Constants_G_H_h3
5723 add GR_ad_tbl_3 = 0x280, GR_ad_z_1
5724 }
5725 { .mlx
5726 nop.m 0
5727 movl rDelta = 0xBFE0000000000000 // -0.5 in DP
5728 }
5729 ;;
5730 { .mfi
5731 ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
5732 nop.f 0
5733 sub GR_N = rExpX, rExpHalf, 1 // unbiased exponent of x
5734 }
5735 ;;
5736 { .mfi
5737 ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
5738 nop.f 0
5739 nop.i 0
5740 }
5741 { .mfi
5742 setf.d FR_MHalf = rDelta // FR_MHalf = -0.5
5743 nop.f 0
5744 nop.i 0
5745 }
5746 ;;
5747 { .mfi
5748 // Put integer N into rightmost significand
5749 setf.sig fFloatN = GR_N
5750 nop.f 0
5751 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
5752 }
5753 { .mfi
5754 ldfe FR_Q4 = [GR_ad_q], 16 // Load Q4
5755 nop.f 0
5756 nop.i 0
5757 }
5758 ;;
5759 { .mfi
5760 shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2
5761 nop.f 0
5762 shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2// Point to G_2
5763 }
5764 { .mfi
5765 ldfe FR_Q3 = [GR_ad_q], 16 // Load Q3
5766 nop.f 0
5767 nop.i 0
5768 }
5769 ;;
5770 { .mfi
5771 ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
5772 fnma.s1 fInvX = f8, fRcpX, f1 // start of 2nd NR iteration
5773 nop.i 0
5774 }
5775 ;;
5776 { .mfi
5777 ldfps FR_G2, FR_H2 = [GR_ad_tbl_2], 8 // Load G_2, H_2
5778 nop.f 0
5779 nop.i 0
5780 }
5781 ;;
5782 { .mfi
5783 ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2
5784 nop.f 0
5785 nop.i 0
5786 }
5787 ;;
5788 { .mfi
5789 ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
5790 nop.f 0
5791 // Get bits 30-15 of X_1 * Z_2
5792 pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15
5793 }
5794 ;;
5795 //
5796 // For performance, don't use result of pmpyshr2.u for 4 cycles.
5797 //
5798 { .mfi
5799 ldfe FR_Q1 = [GR_ad_q] // Load Q1
5800 fcmp.gt.s1 p7,p0 = f8, fA1 // check if x > overflow threshold
5801 nop.i 0
5802 }
5803 ;;
5804 {.mfi
5805 ldfpd fA0, fA0L = [rPolDataPtr], 16 // Load two parts of C
5806 fma.s1 fRcpX = fInvX, fRcpX, fRcpX // end of 2nd NR iteration
5807 nop.i 0
5808 }
5809 ;;
5810 { .mfb
// B2 is stored as two doubles; fA1 is reused for its second part
5811 ldfpd fB2, fA1 = [rPolDataPtr], 16
5812 nop.f 0
5813 (p7) br.cond.spnt lgammal_overflow // branch if x > overflow threshold
5814 }
5815 ;;
5816 {.mfi
5817 ldfe fB4 = [rPolDataPtr], 16
5818 fcvt.xf fFloatN = fFloatN // integer N -> floating-point N
5819 extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
5820 }
5821 ;;
5822 { .mfi
5823 shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3// Point to G_3
5824 nop.f 0
5825 nop.i 0
5826 }
5827 { .mfi
5828 ldfe fB6 = [rPolDataPtr], 16
5829 nop.f 0
5830 nop.i 0
5831 }
5832 ;;
5833 { .mfi
5834 ldfps FR_G3, FR_H3 = [GR_ad_tbl_3], 8 // Load G_3, H_3
5835 nop.f 0
5836 nop.i 0
5837 }
5838 ;;
5839 { .mfi
5840 ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
5841 fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
5842 nop.i 0
5843 }
5844 { .mfi
5845 nop.m 0
5846 fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
5847 nop.i 0
5848 }
5849 ;;
5850
5851 { .mfi
5852 ldfe fB8 = [rPolDataPtr], 16
5853 fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
5854 nop.i 0
5855 }
5856 { .mfi
5857 nop.m 0
5858 fnma.s1 fInvX = f8, fRcpX, f1 // start of 3rd NR iteration
5859 nop.i 0
5860 }
5861 ;;
5862 { .mfi
5863 ldfe fB10 = [rPolDataPtr], 16
5864 nop.f 0
// p7 is free again after the overflow branch above;
// p6 = signgam variable is 4 bytes wide, p7 = it is not (8-byte store)
5865 cmp.eq p6, p7 = 4, rSgnGamSize
5866 }
5867 ;;
5868 { .mfi
5869 ldfe fB12 = [rPolDataPtr], 16
5870 nop.f 0
5871 nop.i 0
5872 }
5873 ;;
5874 { .mfi
5875 ldfe fB14 = [rPolDataPtr], 16
5876 nop.f 0
5877 nop.i 0
5878 }
5879 ;;
5880
5881 { .mfi
5882 ldfe fB16 = [rPolDataPtr], 16
5883 // get double extended coefficients from two doubles
5884 // two doubles are needed in Stirling's formula for negative x
5885 fadd.s1 fB2 = fB2, fA1
5886 nop.i 0
5887 }
5888 ;;
5889 { .mfi
5890 ldfe fB18 = [rPolDataPtr], 16
// the refined reciprocal goes to fInvX: from here on fInvX = 1/x
5891 fma.s1 fInvX = fInvX, fRcpX, fRcpX // end of 3rd NR iteration
5892 nop.i 0
5893 }
5894 ;;
5895 { .mfi
5896 ldfe fB20 = [rPolDataPtr], 16
5897 nop.f 0
5898 nop.i 0
5899 }
5900 ;;
5901 { .mfi
5902 // store signgam if size of variable is 4 bytes
5903 (p6) st4 [rSgnGamAddr] = rSgnGam
5904 fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
5905 nop.i 0
5906 }
5907 { .mfi
5908 // store signgam if size of variable is 8 bytes
5909 (p7) st8 [rSgnGamAddr] = rSgnGam
5910 fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
5911 nop.i 0
5912 }
5913 ;;
5914 { .mfi
5915 nop.m 0
5916 fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
5917 nop.i 0
5918 }
5919 ;;
5920 { .mfi
5921 nop.m 0
// fRcpX is redefined here: from now on it holds w = 1/x^2
5922 fma.s1 fRcpX = fInvX, fInvX, f0 // 1/x^2
5923 nop.i 0
5924 }
5925 { .mfi
5926 nop.m 0
5927 fma.s1 fA0L = fB2, fInvX, fA0L // B2/x + Clo
5928 nop.i 0
5929 }
5930 ;;
5931 { .mfi
5932 nop.m 0
5933 fms.s1 FR_r = fSignifX, FR_G, f1 // r = G * S_hi - 1
5934 nop.i 0
5935 }
5936 { .mfi
5937 nop.m 0
5938 // High part of the log(x): Y_hi = N * log2_hi + H
// (this copy in fRes2H becomes ln(x) - 1 below)
5939 fma.s1 fRes2H = fFloatN, FR_log2_hi, FR_H
5940 nop.i 0
5941 }
5942 ;;
5943
5944 { .mfi
5945 nop.m 0
5946 // h = N * log2_lo + h
5947 fma.s1 FR_h = fFloatN, FR_log2_lo, FR_h
5948 nop.i 0
5949 }
5950 { .mfi
5951 nop.m 0
5952 // High part of the log(x): Y_hi = N * log2_hi + H
// (second copy in fRes1H becomes -0.5*ln(x) below)
5953 fma.s1 fRes1H = fFloatN, FR_log2_hi, FR_H
5954 nop.i 0
5955 }
5956 ;;
// Polynomial part of S(1/x) in w = fRcpX = 1/x^2, evaluated pairwise
5957 {.mfi
5958 nop.m 0
5959 fma.s1 fPol = fB18, fRcpX, fB16 // v9
5960 nop.i 0
5961 }
5962 { .mfi
5963 nop.m 0
5964 fma.s1 fA2L = fRcpX, fRcpX, f0 // v10
5965 nop.i 0
5966 }
5967 ;;
5968 {.mfi
5969 nop.m 0
5970 fma.s1 fA3 = fB6, fRcpX, fB4 // v3
5971 nop.i 0
5972 }
5973 { .mfi
5974 nop.m 0
5975 fma.s1 fA4 = fB10, fRcpX, fB8 // v4
5976 nop.i 0
5977 }
5978 ;;
5979 { .mfi
5980 nop.m 0
5981 fms.s1 fRes2H =fRes2H, f1, f1 // log_Hi(x) -1
5982 nop.i 0
5983 }
5984 { .mfi
5985 nop.m 0
5986 // poly_lo = r * Q4 + Q3
5987 fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3
5988 nop.i 0
5989 }
5990 ;;
5991 { .mfi
5992 nop.m 0
5993 fma.s1 fRes1H = fRes1H, FR_MHalf, f0 // -0.5*log_Hi(x)
5994 nop.i 0
5995 }
5996 { .mfi
5997 nop.m 0
5998 fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
5999 nop.i 0
6000 }
6001 ;;
6002 { .mfi
6003 nop.m 0
6004 fma.s1 fA7 = fB14, fRcpX, fB12 // v7
6005 nop.i 0
6006 }
6007 { .mfi
6008 nop.m 0
6009 fma.s1 fA8 = fA2L, fB20, fPol // v8
6010 nop.i 0
6011 }
6012 ;;
6013 { .mfi
6014 nop.m 0
6015 fma.s1 fA2 = fA4, fA2L, fA3 // v2
6016 nop.i 0
6017 }
6018 { .mfi
6019 nop.m 0
6020 fma.s1 fA4L = fA2L, fA2L, f0 // v5
6021 nop.i 0
6022 }
6023 ;;
6024 { .mfi
6025 nop.m 0
6026 fma.s1 fResH = fRes2H, f8, f0 // (x*(ln(x)-1))hi
6027 nop.i 0
6028 }
6029 { .mfi
6030 nop.m 0
6031 // poly_lo = poly_lo * r + Q2
6032 fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
6033 nop.i 0
6034 }
6035 ;;
6036 { .mfi
6037 nop.m 0
6038 fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
6039 nop.i 0
6040 }
6041 { .mfi
6042 nop.m 0
6043 // poly_hi = Q1 * rsq + r
6044 fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
6045 nop.i 0
6046 }
6047 ;;
6048 { .mfi
6049 nop.m 0
6050 fma.s1 fA11 = fRcpX, fInvX, f0 // 1/x^3
6051 nop.i 0
6052 }
6053 { .mfi
6054 nop.m 0
6055 fma.s1 fA6 = fA8, fA2L, fA7 // v6
6056 nop.i 0
6057 }
6058 ;;
6059 { .mfi
6060 nop.m 0
6061 fms.s1 fResL = fRes2H, f8, fResH // d(x*(ln(x)-1))
6062 nop.i 0
6063 }
6064 { .mfi
6065 nop.m 0
6066 fadd.s1 fRes3H = fResH, fRes1H // (x*(ln(x)-1) -0.5ln(x))hi
6067 nop.i 0
6068 }
6069 ;;
6070 { .mfi
6071 nop.m 0
6072 // poly_lo = poly_lo*r^3 + h
6073 fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h
6074 nop.i 0
6075 }
6076 ;;
6077 { .mfi
6078 nop.m 0
6079 fma.s1 fPol = fA4L, fA6, fA2 // v1
6080 nop.i 0
6081 }
6082 { .mfi
6083 nop.m 0
6084 // raise inexact exception
// (a squaring in status field s0; FR_log2_lo is not read again in this block)
6085 fma.s0 FR_log2_lo = FR_log2_lo, FR_log2_lo, f0
6086 nop.i 0
6087 }
6088 ;;
6089 { .mfi
6090 nop.m 0
6091 fadd.s1 fRes4H = fRes3H, fA0 // (x*(ln(x)-1) -0.5ln(x))hi + Chi
6092 nop.i 0
6093 }
6094 { .mfi
6095 nop.m 0
6096 fsub.s1 fRes3L = fResH, fRes3H // start of error term of fRes3H
6097 nop.i 0
6098 }
6099 ;;
6100 { .mfi
6101 nop.m 0
6102 // Y_lo = poly_hi + poly_lo
6103 fadd.s1 fRes2L = FR_poly_hi, FR_poly_lo
6104 nop.i 0
6105 }
6106 ;;
6107
6108 { .mfi
6109 nop.m 0
6110 fma.s1 fA0L = fPol, fA11, fA0L // S(1/x) + Clo
6111 nop.i 0
6112 }
6113 ;;
6114 { .mfi
6115 nop.m 0
6116 fadd.s1 fRes3L = fRes3L, fRes1H // (fResH - fRes3H) + fRes1H
6117 nop.i 0
6118 }
6119 { .mfi
6120 nop.m 0
6121 fsub.s1 fRes4L = fRes3H, fRes4H // start of error term of fRes4H
6122 nop.i 0
6123 }
6124 ;;
6125 { .mfi
6126 nop.m 0
6127 fma.s1 fResL = fRes2L, f8 , fResL // lo part of x*(ln(x)-1)
6128 nop.i 0
6129 }
6130 ;;
6131 { .mfi
6132 nop.m 0
6133 // Clo + S(1/x) - 0.5*logLo(x)
6134 fma.s1 fA0L = fRes2L, FR_MHalf, fA0L
6135 nop.i 0
6136 }
6137 ;;
6138 { .mfi
6139 nop.m 0
6140 fadd.s1 fRes4L = fRes4L, fA0 // (fRes3H - fRes4H) + Chi
6141 nop.i 0
6142 }
6143 ;;
6144 { .mfi
6145 nop.m 0
6146 // Clo + S(1/x) - 0.5*logLo(x) + (x*(ln(x)-1))lo
6147 fadd.s1 fA0L = fA0L, fResL
6148 nop.i 0
6149 }
6150 ;;
6151 { .mfi
6152 nop.m 0
6153 fadd.s1 fRes4L = fRes4L, fRes3L // combine both error terms
6154 nop.i 0
6155 }
6156 ;;
6157 { .mfi
6158 nop.m 0
6159 fadd.s1 fRes4L = fRes4L, fA0L // complete low part
6160 nop.i 0
6161 }
6162 ;;
6163 { .mfb
6164 nop.m 0
// final result = high part + low part, computed in status field s0
6165 fma.s0 f8 = fRes4H, f1, fRes4L
6166 // exit for x > 10.0
6167 br.ret.sptk b0
6168 }
6169 ;;
6170 // here if 8.0 <= x <= 10.0
6171 // Result = P15(y), where y = x/8.0 - 1.125
// (on the branch from lgammal_big_positive FR_MHalf holds the double
//  0x3FF2000000000000 = 1.125 and fSignifX is the significand of x with
//  the exponent of 1.0, i.e. x/8.0 on this range)
// P15(y) = A0 + A1*y + y^2*(A2 + A3*y + ... + A15*y^13)
// A0 and A1 are stored as pairs of doubles (high, low); A2..A15 are loaded
// with ldfe, 16 bytes apart, through two pointers that each step by 32.
// rSgnGam, rSgnGamSize and rSgnGamAddr are expected to be set on entry.
6172 .align 32
6173 lgammal_8_10:
6174 { .mfi
6175 addl rPolDataPtr = @ltoff(lgammal_8_10_data), gp
6176 fms.s1 FR_FracX = fSignifX, f1, FR_MHalf // y = x/8.0 - 1.125
6177 cmp.eq p6, p7 = 4, rSgnGamSize
6178 }
6179 ;;
// both loads read the same table address (old value of rPolDataPtr)
6180 { .mfi
6181 ld8 rLnSinDataPtr = [rPolDataPtr]
6182 nop.f 0
6183 nop.i 0
6184 }
6185 { .mfi
6186 ld8 rPolDataPtr = [rPolDataPtr]
6187 nop.f 0
6188 nop.i 0
6189 }
6190 ;;
6191 { .mfi
6192 adds rZ1offsett = 32, rLnSinDataPtr // -> A2, then A4, A6, ... A14
6193 nop.f 0
6194 nop.i 0
6195 }
6196 { .mfi
6197 adds rLnSinDataPtr = 48, rLnSinDataPtr // -> A3, then A5, A7, ... A15
6198 nop.f 0
6199 nop.i 0
6200 }
6201 ;;
6202 { .mfi
6203 ldfpd fA1, fA1L = [rPolDataPtr], 16 // A1
6204 nop.f 0
6205 nop.i 0
6206 }
6207 { .mfi
6208 ldfe fA2 = [rZ1offsett], 32 // A2
6209 nop.f 0
6210 nop.i 0
6211 }
6212 ;;
6213 { .mfi
6214 ldfpd fA0, fA0L = [rPolDataPtr], 16 // A0
6215 fma.s1 FR_rsq = FR_FracX, FR_FracX, f0 // y^2
6216 nop.i 0
6217 }
6218 { .mfi
6219 ldfe fA3 = [rLnSinDataPtr],32 // A3
6220 nop.f 0
6221 nop.i 0
6222 }
6223 ;;
6224 { .mmf
6225 ldfe fA4 = [rZ1offsett], 32 // A4
6226 ldfe fA5 = [rLnSinDataPtr], 32 // A5
6227 nop.f 0
6228 }
6229 ;;
6230 { .mmf
6231 ldfe fA6 = [rZ1offsett], 32 // A6
6232 ldfe fA7 = [rLnSinDataPtr], 32 // A7
6233 nop.f 0
6234 }
6235 ;;
6236 { .mmf
6237 ldfe fA8 = [rZ1offsett], 32 // A8
6238 ldfe fA9 = [rLnSinDataPtr], 32 // A9
6239 nop.f 0
6240 }
6241 ;;
6242 { .mmf
6243 ldfe fA10 = [rZ1offsett], 32 // A10
6244 ldfe fA11 = [rLnSinDataPtr], 32 // A11
6245 nop.f 0
6246 }
6247 ;;
6248 { .mmf
6249 ldfe fA12 = [rZ1offsett], 32 // A12
6250 ldfe fA13 = [rLnSinDataPtr], 32 // A13
// FR_Q4 is used here as a scratch register for y^4
6251 fma.s1 FR_Q4 = FR_rsq, FR_rsq, f0 // y^4
6252 }
6253 ;;
6254 { .mmf
6255 ldfe fA14 = [rZ1offsett], 32 // A14
6256 ldfe fA15 = [rLnSinDataPtr], 32 // A15
6257 nop.f 0
6258 }
6259 ;;
6260 { .mfi
6261 nop.m 0
6262 fma.s1 fRes1H = FR_FracX, fA1, f0 // (A1*y)hi
6263 nop.i 0
6264 }
6265 ;;
6266 { .mfi
6267 nop.m 0
6268 fma.s1 fA3 = fA3, FR_FracX, fA2 // v4
6269 nop.i 0
6270 }
6271 ;;
6272 { .mfi
6273 nop.m 0
6274 fma.s1 fA5 = fA5, FR_FracX, fA4 // v5
6275 nop.i 0
6276 }
6277 ;;
6278 { .mfi
6279 // store sign of GAMMA(x) if size of variable is 4 bytes
6280 (p6) st4 [rSgnGamAddr] = rSgnGam
6281 fma.s1 fA3L = FR_Q4, FR_Q4, f0 // v9 = y^8
6282 nop.i 0
6283 }
6284 { .mfi
6285 // store sign of GAMMA(x) if size of variable is 8 bytes
6286 (p7) st8 [rSgnGamAddr] = rSgnGam
6287 fma.s1 fA7 = fA7, FR_FracX, fA6 // v7
6288 nop.i 0
6289 }
6290 ;;
6291 { .mfi
6292 nop.m 0
6293 fma.s1 fA9 = fA9, FR_FracX, fA8 // v8
6294 nop.i 0
6295 }
6296 ;;
6297 { .mfi
6298 nop.m 0
6299 fms.s1 fRes1L = FR_FracX, fA1, fRes1H // d(A1*y)
6300 nop.i 0
6301 }
6302 { .mfi
6303 nop.m 0
6304 fma.s1 fA11 = fA11, FR_FracX, fA10 // v12
6305 nop.i 0
6306 }
6307 ;;
6308 { .mfi
6309 nop.m 0
6310 fma.s1 fA13 = fA13, FR_FracX, fA12 // v13
6311 nop.i 0
6312 }
6313 { .mfi
6314 nop.m 0
6315 fma.s1 fRes2H = fRes1H, f1, fA0 // (A1*y + A0)hi
6316 nop.i 0
6317 }
6318 ;;
6319 { .mfi
6320 nop.m 0
6321 fma.s1 fA15 = fA15, FR_FracX, fA14 // v16
6322 nop.i 0
6323 }
6324 { .mfi
6325 nop.m 0
6326 fma.s1 fA5 = fA5, FR_rsq, fA3 // v3
6327 nop.i 0
6328 }
6329 ;;
6330 { .mfi
6331 nop.m 0
6332 fma.s1 fA9 = fA9, FR_rsq, fA7 // v6
6333 nop.i 0
6334 }
6335 ;;
6336 { .mfi
6337 nop.m 0
6338 fma.s1 fRes1L = FR_FracX, fA1L, fRes1L // A1L*y + d(A1*y)
6339 nop.i 0
6340 }
6341 ;;
6342 { .mfi
6343 nop.m 0
6344 fms.s1 fRes2L = fA0, f1, fRes2H // start of error term of fRes2H
6345 nop.i 0
6346 }
6347 { .mfi
6348 nop.m 0
6349 fma.s1 fA13 = fA13, FR_rsq, fA11 // v11
6350 nop.i 0
6351 }
6352 ;;
6353 { .mfi
6354 nop.m 0
6355 fma.s1 fA9 = fA9, FR_Q4, fA5 // v2
6356 nop.i 0
6357 }
6358 ;;
6359 { .mfi
6360 nop.m 0
6361 fma.s1 fRes1L = fRes1L, f1, fA0L // add low part of A0
6362 nop.i 0
6363 }
6364 ;;
6365 { .mfi
6366 nop.m 0
6367 fma.s1 fRes2L = fRes2L, f1, fRes1H // (A0 - fRes2H) + (A1*y)hi
6368 nop.i 0
6369 }
6370 { .mfi
6371 nop.m 0
6372 fma.s1 fA15 = fA15, FR_Q4, fA13 // v10
6373 nop.i 0
6374 }
6375 ;;
6376 { .mfi
6377 nop.m 0
6378 fma.s1 fRes2L = fRes1L, f1, fRes2L // (A1*y + A0)lo
6379 nop.i 0
6380 }
6381 { .mfi
6382 nop.m 0
6383 fma.s1 fPol = fA3L, fA15, fA9 // A2 + A3*y + ... + A15*y^13
6384 nop.i 0
6385 }
6386 ;;
// x is no longer needed: f8 is reused for the high part of the result
6387 { .mfi
6388 nop.m 0
6389 fma.s1 f8 = FR_rsq , fPol, fRes2H
6390 nop.i 0
6391 }
6392 { .mfi
6393 nop.m 0
6394 fma.s1 fPol = fPol, FR_rsq, f0 // y^2 * fPol, rounded separately
6395 nop.i 0
6396 }
6397 ;;
6398 { .mfi
6399 nop.m 0
6400 fms.s1 fRes1L = fRes2H, f1, f8 // start of error term of f8
6401 nop.i 0
6402 }
6403 ;;
6404 { .mfi
6405 nop.m 0
6406 fma.s1 fRes1L = fRes1L, f1, fPol // (fRes2H - f8) + y^2*fPol
6407 nop.i 0
6408 }
6409 ;;
6410 {.mfi
6411 nop.m 0
6412 fma.s1 fRes1L = fRes1L, f1, fRes2L // complete low part
6413 nop.i 0
6414 }
6415 ;;
6416 { .mfb
6417 nop.m 0
// final result = high part + low part, computed in status field s0
6418 fma.s0 f8 = f8, f1, fRes1L
6419 // exit for 8.0 <= x <= 10.0
6420 br.ret.sptk b0
6421 }
6422 ;;
6423
6424 // here if 4.0 <=x < 8.0
// Set-up stub for the shared 25th degree polynomial code:
//   FR_FracX    = fSignifX - FR_MHalf (polynomial argument)
//   rSgnGam     = 1
//   rPolDataPtr = address of lgammal_4_8_data, rTmpPtr = rPolDataPtr + 160
// fSignifX and FR_MHalf are expected to be set before the branch to here.
// NOTE(review): the value held in FR_MHalf on this path is not visible in
// this part of the file - confirm at the branch site.
6425 .align 32
6426 lgammal_4_8:
6427 { .mfi
6428 addl rPolDataPtr= @ltoff(lgammal_4_8_data),gp
6429 fms.s1 FR_FracX = fSignifX, f1, FR_MHalf
6430 adds rSgnGam = 1, r0 // sign of gamma(x) to be stored in signgam
6431 }
6432 ;;
6433 { .mfi
6434 ld8 rPolDataPtr = [rPolDataPtr] // fetch the table address
6435 nop.f 0
6436 nop.i 0
6437 }
6438 ;;
6439
6440 { .mfb
6441 adds rTmpPtr = 160, rPolDataPtr // second load pointer into the same table
6442 nop.f 0
6443 // branch to special path which computes polynomial of 25th degree
6444 br.sptk lgamma_polynom25
6445 }
6446 ;;
6447
6448 // here if 2.25 <=x < 4.0
// Set-up stub for the shared 25th degree polynomial code:
//   FR_FracX    = fSignifX - FR_MHalf (polynomial argument)
//   rSgnGam     = 1
//   rPolDataPtr = address of lgammal_2Q_4_data, rTmpPtr = rPolDataPtr + 160
// fSignifX and FR_MHalf are expected to be set before the branch to here.
// NOTE(review): the value held in FR_MHalf on this path is not visible in
// this part of the file - confirm at the branch site.
6449 .align 32
6450 lgammal_2Q_4:
6451 { .mfi
6452 addl rPolDataPtr= @ltoff(lgammal_2Q_4_data),gp
6453 fms.s1 FR_FracX = fSignifX, f1, FR_MHalf
6454 adds rSgnGam = 1, r0 // sign of gamma(x) to be stored in signgam
6455 }
6456 ;;
6457 { .mfi
6458 ld8 rPolDataPtr = [rPolDataPtr] // fetch the table address
6459 nop.f 0
6460 nop.i 0
6461 }
6462 ;;
6463
6464 { .mfb
6465 adds rTmpPtr = 160, rPolDataPtr // second load pointer into the same table
6466 nop.f 0
6467 // branch to special path which computes polynomial of 25th degree
6468 br.sptk lgamma_polynom25
6469 }
6470 ;;
6471
6472 // here if 0.5 <= |x| < 0.75
// Set-up stub for the shared 25th degree polynomial code, with separate
// coefficient tables and signs for the two signs of x.
// p14 and p15 are declared mutually exclusive below; the first bundle is
// executed for p14 (positive x), the second one for p15 (negative x).
// On entry FR_FracX is expected to hold the shift constant (0.625 according
// to the comments below); it is replaced by the polynomial argument.
6473 .align 32
6474 lgammal_half_3Q:
6475 .pred.rel "mutex", p14, p15
6476 { .mfi
6477 (p14) addl rPolDataPtr= @ltoff(lgammal_half_3Q_data),gp
6478 // FR_FracX = x - 0.625 for positive x
6479 (p14) fms.s1 FR_FracX = f8, f1, FR_FracX
6480 (p14) adds rSgnGam = 1, r0 // signgam = 1 for positive x
6481 }
6482 { .mfi
6483 (p15) addl rPolDataPtr= @ltoff(lgammal_half_3Q_neg_data),gp
6484 // FR_FracX = x + 0.625 for negative x
6485 (p15) fma.s1 FR_FracX = f8, f1, FR_FracX
6486 (p15) adds rSgnGam = -1, r0 // signgam = -1 for negative x
6487 }
6488 ;;
6489 { .mfi
6490 ld8 rPolDataPtr = [rPolDataPtr] // fetch the selected table address
6491 nop.f 0
6492 nop.i 0
6493 }
6494 ;;
6495 { .mfb
6496 adds rTmpPtr = 160, rPolDataPtr // second load pointer into the same table
6497 nop.f 0
6498 // branch to special path which computes polynomial of 25th degree
6499 br.sptk lgamma_polynom25
6500 }
6501 ;;
6502 // here if 1.3125 <= x < 1.5625
// Set-up stub for the shared 25th degree polynomial code.
// Unlike the neighbouring stubs, no table address is loaded here:
// rPolDataPtr must already point to the coefficient table on entry.
// Polynomial argument: FR_FracX = x - fA5L, with fA5L set before entry.
// NOTE(review): fA5L presumably holds the expansion point for this range
// (the label suggests the local minimum of lgamma) - confirm at the
// branch site.
6503 .align 32
6504 lgammal_loc_min:
6505 { .mfi
6506 adds rSgnGam = 1, r0 // sign of gamma(x) to be stored in signgam
6507 nop.f 0
6508 nop.i 0
6509 }
6510 { .mfb
6511 adds rTmpPtr = 160, rPolDataPtr // second load pointer into the same table
6512 fms.s1 FR_FracX = f8, f1, fA5L // x - fA5L
6513 br.sptk lgamma_polynom25
6514 }
6515 ;;
6516 // here if -2.605859375 <= x < -2.5
6517 // special polynomial approximation used since neither "near root"
6518 // approximation nor reflection formula give satisfactory accuracy on
6519 // this range
// Set-up stub for the shared 25th degree polynomial code:
//   FR_FracX    = fB20 + x (fB20 is expected to hold 2.5 on entry, as the
//                 comment on the fma states)
//   rSgnGam     = -1
//   rPolDataPtr = address of lgammal_neg2andHalf_data,
//   rTmpPtr     = rPolDataPtr + 160
6520 .align 32
6521 _neg2andHalf:
6522 { .mfi
6523 addl rPolDataPtr= @ltoff(lgammal_neg2andHalf_data),gp
6524 fma.s1 FR_FracX = fB20, f1, f8 // 2.5 + x
6525 adds rSgnGam = -1, r0 // gamma(x) is negative on this range
6526 }
6527 ;;
6528 {.mfi
6529 ld8 rPolDataPtr = [rPolDataPtr] // fetch the table address
6530 nop.f 0
6531 nop.i 0
6532 }
6533 ;;
6534 { .mfb
6535 adds rTmpPtr = 160, rPolDataPtr // second load pointer into the same table
6536 nop.f 0
6537 // branch to special path which computes polynomial of 25th degree
6538 br.sptk lgamma_polynom25
6539 }
6540 ;;
6541
6542 // here if -0.5 < x <= -0.40625
// Set-up stub for the shared 25th degree polynomial code:
//   FR_FracX    = 2^(rExpHalf - bias) + x
//   rSgnGam     = -1
//   rPolDataPtr = address of lgammal_near_neg_half_data,
//   rTmpPtr     = rPolDataPtr + 160
// NOTE(review): rExpHalf is presumably the biased exponent of 0.5, which
// makes the argument x + 0.5 - confirm where rExpHalf is initialised.
6543 .align 32
6544 lgammal_near_neg_half:
6545 { .mmf
6546 addl rPolDataPtr= @ltoff(lgammal_near_neg_half_data),gp
6547 setf.exp FR_FracX = rExpHalf // power of two with exponent rExpHalf
6548 nop.f 0
6549 }
6550 ;;
6551 { .mfi
6552 ld8 rPolDataPtr = [rPolDataPtr] // fetch the table address
6553 nop.f 0
6554 adds rSgnGam = -1, r0 // gamma(x) is negative on this range
6555 }
6556 ;;
6557 { .mfb
6558 adds rTmpPtr = 160, rPolDataPtr // second load pointer into the same table
6559 fma.s1 FR_FracX = FR_FracX, f1, f8 // shift constant + x
6560 // branch to special path which computes polynomial of 25th degree
6561 br.sptk lgamma_polynom25
6562 }
6563 ;;
6564
6565 // here if the answer is P25(x)
6566 // rPolDataPtr, rTmpPtr point to coefficients
6567 // x is in FR_FracX register
//
// Shared tail of the set-up stubs above. Evaluates
//   P25(x) = A0 + A1*x + (A2 + A3*x)*x^2 + x^4*(E + x^8*(D + x^8*C))
// where E = E0 + E1*x + ... + E7*x^7
//       D = D0 + D1*x + ... + D7*x^7
//       C = C16 + C17*x + ... + C21*x^5
// A0..A3 are each loaded as a pair of doubles and the A-part is carried as
// high/low pairs; E0..E7 are loaded with ldfe; D and C are doubles.
// Note the pairing in the table: the A3 entry carries A0L as its second
// double and the A0 entry carries A3L.
// Also stores rSgnGam to [rSgnGamAddr] using the width given by rSgnGamSize.
// rPolDataPtr reads the first 160 bytes of the table, rTmpPtr the rest
// (the stubs set rTmpPtr = rPolDataPtr + 160).
6568 .align 32
6569 lgamma_polynom25:
6570 { .mfi
6571 ldfpd fA3, fA0L = [rPolDataPtr], 16 // A3, A0L
6572 nop.f 0
// p6 = signgam variable is 4 bytes wide, p7 = it is not (8-byte store)
6573 cmp.eq p6, p7 = 4, rSgnGamSize
6574 }
6575 { .mfi
6576 ldfpd fA18, fA19 = [rTmpPtr], 16 // D6, D7
6577 nop.f 0
6578 nop.i 0
6579 }
6580 ;;
6581 { .mfi
6582 ldfpd fA1, fA1L = [rPolDataPtr], 16 // A1
6583 nop.f 0
6584 nop.i 0
6585 }
// NOTE(review): this .mfi bundle lists only two instructions (no explicit
// instruction for the i slot), unlike every other bundle here; presumably
// the assembler fills the empty slot - confirm.
6586 { .mfi
6587 ldfpd fA16, fA17 = [rTmpPtr], 16 // D4, D5
6588 nop.f 0
6589 }
6590 ;;
6591 { .mfi
6592 ldfpd fA12, fA13 = [rPolDataPtr], 16 // D0, D1
6593 nop.f 0
6594 nop.i 0
6595 }
6596 { .mfi
6597 ldfpd fA14, fA15 = [rTmpPtr], 16 // D2, D3
6598 nop.f 0
6599 nop.i 0
6600 }
6601 ;;
6602 { .mfi
6603 ldfpd fA24, fA25 = [rPolDataPtr], 16 // C20, C21
6604 nop.f 0
6605 nop.i 0
6606 }
6607 { .mfi
6608 ldfpd fA22, fA23 = [rTmpPtr], 16 // C18, C19
6609 nop.f 0
6610 nop.i 0
6611 }
6612 ;;
6613 { .mfi
6614 ldfpd fA2, fA2L = [rPolDataPtr], 16 // A2
6615 fma.s1 fA4L = FR_FracX, FR_FracX, f0 // x^2
6616 nop.i 0
6617 }
6618 { .mfi
6619 ldfpd fA20, fA21 = [rTmpPtr], 16 // C16, C17
6620 nop.f 0
6621 nop.i 0
6622 }
6623 ;;
6624 { .mfi
6625 ldfe fA11 = [rTmpPtr], 16 // E7
6626 nop.f 0
6627 nop.i 0
6628 }
6629 { .mfi
6630 ldfpd fA0, fA3L = [rPolDataPtr], 16 // A0, A3L
6631 nop.f 0
6632 nop.i 0
6633 };;
6634 { .mfi
6635 ldfe fA10 = [rPolDataPtr], 16 // E6
6636 nop.f 0
6637 nop.i 0
6638 }
6639 { .mfi
6640 ldfe fA9 = [rTmpPtr], 16 // E5
6641 nop.f 0
6642 nop.i 0
6643 }
6644 ;;
6645 { .mmf
6646 ldfe fA8 = [rPolDataPtr], 16 // E4
6647 ldfe fA7 = [rTmpPtr], 16 // E3
6648 nop.f 0
6649 }
6650 ;;
6651 { .mmf
6652 ldfe fA6 = [rPolDataPtr], 16 // E2
6653 ldfe fA5 = [rTmpPtr], 16 // E1
6654 nop.f 0
6655 }
6656 ;;
6657 { .mfi
6658 ldfe fA4 = [rPolDataPtr], 16 // E0
6659 fma.s1 fA5L = fA4L, fA4L, f0 // x^4
6660 nop.i 0
6661 }
6662 { .mfi
6663 nop.m 0
6664 fms.s1 fB2 = FR_FracX, FR_FracX, fA4L // x^2 - <x^2>
6665 nop.i 0
6666 }
6667 ;;
6668 { .mfi
6669 // store signgam if size of variable is 4 bytes
6670 (p6) st4 [rSgnGamAddr] = rSgnGam
6671 fma.s1 fRes4H = fA3, FR_FracX, f0 // (A3*x)hi
6672 nop.i 0
6673 }
6674 { .mfi
6675 // store signgam if size of variable is 8 bytes
6676 (p7) st8 [rSgnGamAddr] = rSgnGam
6677 fma.s1 fA19 = fA19, FR_FracX, fA18 // D7*x + D6
6678 nop.i 0
6679 }
6680 ;;
6681 { .mfi
6682 nop.m 0
6683 fma.s1 fResH = fA1, FR_FracX, f0 // (A1*x)hi
6684 nop.i 0
6685 }
6686 { .mfi
6687 nop.m 0
6688 fma.s1 fB6 = fA1L, FR_FracX, fA0L // A1L*x + A0L
6689 nop.i 0
6690 }
6691 ;;
6692 { .mfi
6693 nop.m 0
6694 fma.s1 fA17 = fA17, FR_FracX, fA16 // D5*x + D4
6695 nop.i 0
6696 }
6697 { .mfi
6698 nop.m 0
6699 fma.s1 fA15 = fA15, FR_FracX, fA14 // D3*x + D2
6700 nop.i 0
6701 }
6702 ;;
6703 { .mfi
6704 nop.m 0
6705 fma.s1 fA25 = fA25, FR_FracX, fA24 // C21*x + C20
6706 nop.i 0
6707 }
6708 { .mfi
6709 nop.m 0
6710 fma.s1 fA13 = fA13, FR_FracX, fA12 // D1*x + D0
6711 nop.i 0
6712 }
6713 ;;
6714 { .mfi
6715 nop.m 0
6716 fma.s1 fA23 = fA23, FR_FracX, fA22 // C19*x + C18
6717 nop.i 0
6718 }
6719 { .mfi
6720 nop.m 0
6721 fma.s1 fA21 = fA21, FR_FracX, fA20 // C17*x + C16
6722 nop.i 0
6723 }
6724 ;;
6725 { .mfi
6726 nop.m 0
6727 fms.s1 fRes4L = fA3, FR_FracX, fRes4H // delta((A3*x)hi)
6728 nop.i 0
6729 }
6730 { .mfi
6731 nop.m 0
6732 fadd.s1 fRes2H = fRes4H, fA2 // (A3*x + A2)hi
6733 nop.i 0
6734 }
6735 ;;
6736 { .mfi
6737 nop.m 0
6738 fms.s1 fResL = fA1, FR_FracX, fResH // d(A1*x)
6739 nop.i 0
6740 }
6741 { .mfi
6742 nop.m 0
6743 fadd.s1 fRes1H = fResH, fA0 // (A1*x + A0)hi
6744 nop.i 0
6745 }
6746 ;;
6747 { .mfi
6748 nop.m 0
6749 fma.s1 fA19 = fA19, fA4L, fA17 // Dhi
6750 nop.i 0
6751 }
6752 { .mfi
6753 nop.m 0
6754 fma.s1 fA11 = fA11, FR_FracX, fA10 // E7*x + E6
6755 nop.i 0
6756 }
6757 ;;
6758 { .mfi
6759 nop.m 0
6760 // Doing this to raise inexact flag
// (E6 in fA10 was consumed in the previous group, so fA10 is free to be
//  overwritten by this s0 operation)
6761 fma.s0 fA10 = fA0, fA0, f0
6762 nop.i 0
6763 }
6764 ;;
6765 { .mfi
6766 nop.m 0
6767 fma.s1 fA15 = fA15, fA4L, fA13 // Dlo
6768 nop.i 0
6769 }
6770 { .mfi
6771 nop.m 0
6772 // (C21*x + C20)*x^2 + C19*x + C18
6773 fma.s1 fA25 = fA25, fA4L, fA23
6774 nop.i 0
6775 }
6776 ;;
6777 { .mfi
6778 nop.m 0
6779 fma.s1 fA9 = fA9, FR_FracX, fA8 // E5*x + E4
6780 nop.i 0
6781 }
6782 { .mfi
6783 nop.m 0
6784 fma.s1 fA7 = fA7, FR_FracX, fA6 // E3*x + E2
6785 nop.i 0
6786 }
6787 ;;
6788 { .mfi
6789 nop.m 0
6790 fma.s1 fRes4L = fA3L, FR_FracX, fRes4L // (A3*x)lo
6791 nop.i 0
6792 }
6793 { .mfi
6794 nop.m 0
6795 fsub.s1 fRes2L = fA2, fRes2H // start of error term of (A3*x + A2)hi
6796 nop.i 0
6797 }
6798 ;;
6799 { .mfi
6800 nop.m 0
6801 fadd.s1 fResL = fResL, fB6 // (A1L*x + A0L) + d(A1*x)
6802 nop.i 0
6803 }
6804 { .mfi
6805 nop.m 0
6806 fsub.s1 fRes1L = fA0, fRes1H // start of error term of (A1*x + A0)hi
6807 nop.i 0
6808 }
6809 ;;
6810 { .mfi
6811 nop.m 0
6812 fma.s1 fA5 = fA5, FR_FracX, fA4 // E1*x + E0
6813 nop.i 0
6814 }
6815 { .mfi
6816 nop.m 0
6817 fma.s1 fB8 = fA5L, fA5L, f0 // x^8
6818 nop.i 0
6819 }
6820 ;;
6821 { .mfi
6822 nop.m 0
6823 // ((C21*x + C20)*x^2 + C19*x + C18)*x^2 + C17*x + C16
6824 fma.s1 fA25 = fA25, fA4L, fA21
6825 nop.i 0
6826 }
6827 { .mfi
6828 nop.m 0
6829 fma.s1 fA19 = fA19, fA5L, fA15 // D
6830 nop.i 0
6831 }
6832 ;;
6833 { .mfi
6834 nop.m 0
6835 fma.s1 fA11 = fA11, fA4L, fA9 // Ehi
6836 nop.i 0
6837 }
6838 ;;
6839 { .mfi
6840 nop.m 0
6841 fadd.s1 fRes2L = fRes2L, fRes4H // (A2 - fRes2H) + (A3*x)hi
6842 nop.i 0
6843 }
6844 { .mfi
6845 nop.m 0
6846 fadd.s1 fRes4L = fRes4L, fA2L // (A3*x)lo + A2L
6847 nop.i 0
6848 }
6849 ;;
6850 { .mfi
6851 nop.m 0
6852 fma.s1 fRes3H = fRes2H, fA4L, f0 // ((A3*x + A2)*x^2)hi
6853 nop.i 0
6854 }
6855 { .mfi
6856 nop.m 0
6857 fadd.s1 fRes1L = fRes1L, fResH // (A0 - fRes1H) + (A1*x)hi
6858 nop.i 0
6859 }
6860 ;;
6861 { .mfi
6862 nop.m 0
6863 fma.s1 fRes3L = fRes2H, fB2, f0 // (A3*x + A2)hi*d(x^2)
6864 nop.i 0
6865 }
6866 { .mfi
6867 nop.m 0
6868 fma.s1 fA7 = fA7, fA4L, fA5 // Elo
6869 nop.i 0
6870 }
6871 ;;
6872 { .mfi
6873 nop.m 0
6874 fma.s1 fA25 = fA25, fB8, fA19 // C*x^8 + D
6875 nop.i 0
6876 }
6877 ;;
6878 { .mfi
6879 nop.m 0
6880 fadd.s1 fRes2L = fRes2L, fRes4L // (A3*x + A2)lo
6881 nop.i 0
6882 }
6883 ;;
6884 { .mfi
6885 nop.m 0
6886 fms.s1 fB4 = fRes2H, fA4L, fRes3H // d((A3*x + A2)*x^2))
6887 nop.i 0
6888 }
6889 { .mfi
6890 nop.m 0
6891 fadd.s1 fRes1L = fRes1L, fResL // (A1*x + A0)lo
6892 nop.i 0
6893 }
6894 ;;
6895 { .mfi
6896 nop.m 0
6897 fadd.s1 fB20 = fRes3H, fRes1H // Phi
6898 nop.i 0
6899 }
6900 { .mfi
6901 nop.m 0
6902 fma.s1 fA11 = fA11, fA5L, fA7 // E
6903 nop.i 0
6904 }
6905 ;;
6906 { .mfi
6907 nop.m 0
6908 // ( (A3*x + A2)lo*<x^2> + (A3*x + A2)hi*d(x^2))
6909 fma.s1 fRes3L = fRes2L, fA4L, fRes3L
6910 nop.i 0
6911 }
6912 ;;
6913 { .mfi
6914 nop.m 0
6915 // d((A3*x + A2)*x^2)) + (A1*x + A0)lo
6916 fadd.s1 fRes1L = fRes1L, fB4
6917 nop.i 0
6918 }
6919 ;;
6920 { .mfi
6921 nop.m 0
6922 fsub.s1 fB18 = fRes1H, fB20 // start of error term of Phi
6923 nop.i 0
6924 }
6925 { .mfi
6926 nop.m 0
6927 fma.s1 fPol = fA25, fB8, fA11 // (C*x^8 + D)*x^8 + E
6928 nop.i 0
6929 }
6930 ;;
6931 { .mfi
6932 nop.m 0
6933 fadd.s1 fRes1L = fRes1L, fRes3L // all low-order terms of the A-part
6934 nop.i 0
6935 }
6936 ;;
6937 { .mfi
6938 nop.m 0
6939 fadd.s1 fB18 = fB18, fRes3H // (fRes1H - Phi) + fRes3H
6940 nop.i 0
6941 }
6942 { .mfi
6943 nop.m 0
6944 fma.s1 fRes4H = fPol, fA5L, fB20 // high part: fPol*x^4 + Phi
6945 nop.i 0
6946 }
6947 ;;
6948 { .mfi
6949 nop.m 0
6950 fma.s1 fPolL = fPol, fA5L, f0 // fPol*x^4, rounded separately
6951 nop.i 0
6952 }
6953 ;;
6954 { .mfi
6955 nop.m 0
6956 fadd.s1 fB18 = fB18, fRes1L // Plo
6957 nop.i 0
6958 }
6959 { .mfi
6960 nop.m 0
6961 fsub.s1 fRes4L = fB20, fRes4H // start of error term of fRes4H
6962 nop.i 0
6963 }
6964 ;;
6965 { .mfi
6966 nop.m 0
6967 fadd.s1 fB18 = fB18, fPolL // Plo + fPol*x^4
6968 nop.i 0
6969 }
6970 ;;
6971 { .mfi
6972 nop.m 0
6973 fadd.s1 fRes4L = fRes4L, fB18 // complete low part
6974 nop.i 0
6975 }
6976 ;;
6977 { .mfb
6978 nop.m 0
// final result = high part + low part, computed in status field s0
6979 fma.s0 f8 = fRes4H, f1, fRes4L
6980 // P25(x) computed, exit here
6981 br.ret.sptk b0
6982 }
6983 ;;
6984
6985
6986 // here if 0.75 <= x < 1.3125: pick the lgammal_03Q_1Q_data table, then use the shared Pol24 code
6987 .align 32
6988 lgammal_03Q_1Q:
6989 { .mfi
6990 addl rPolDataPtr= @ltoff(lgammal_03Q_1Q_data),gp // gp-relative slot holding the table address
6991 fma.s1 FR_FracX = fA5L, f1, f0 // x (polynomial argument, copied from fA5L)
6992 adds rSgnGam = 1, r0 // signgam value to store: +1
6993 }
6994 { .mfi
6995 nop.m 0
6996 fma.s1 fB4 = fA5L, fA5L, f0 // x^2 (rounded; used as <x^2> by lgamma_polynom24x)
6997 nop.i 0
6998 }
6999 ;;
7000 { .mfi
7001 ld8 rPolDataPtr = [rPolDataPtr] // fetch the actual table address
7002 nop.f 0
7003 nop.i 0
7004 }
7005 ;;
7006 { .mfb
7007 adds rTmpPtr = 144, rPolDataPtr // second load pointer, 144 bytes into the table
7008 nop.f 0
7009 br.sptk lgamma_polynom24x // evaluate the polynomial and return from there
7010 }
7011 ;;
7012
7013 // here if 1.5625 <= x < 2.25: pick the lgammal_13Q_2Q_data table, then use the shared Pol24 code
7014 .align 32
7015 lgammal_13Q_2Q:
7016 { .mfi
7017 addl rPolDataPtr= @ltoff(lgammal_13Q_2Q_data),gp // gp-relative slot holding the table address
7018 fma.s1 FR_FracX = fB4, f1, f0 // x (polynomial argument, copied from fB4)
7019 adds rSgnGam = 1, r0 // signgam value to store: +1
7020 }
7021 { .mfi
7022 nop.m 0
7023 fma.s1 fB4 = fB4, fB4, f0 // x^2 (overwrites fB4; same instruction group as the copy above, which reads the old fB4)
7024 nop.i 0
7025 }
7026 ;;
7027 { .mfi
7028 ld8 rPolDataPtr = [rPolDataPtr] // fetch the actual table address
7029 nop.f 0
7030 nop.i 0
7031 }
7032 ;;
7033 { .mfb
7034 adds rTmpPtr = 144, rPolDataPtr // second load pointer, 144 bytes into the table
7035 nop.f 0
7036 br.sptk lgamma_polynom24x // evaluate the polynomial and return from there
7037 }
7038 ;;
7039
7040 // here if result is Pol24(x): shared polynomial evaluation, stores signgam and returns the result in f8
7041 // x is in FR_FracX, <x^2> (rounded x^2) is in fB4, signgam value is in rSgnGam,
7042 // rPolDataPtr, rTmpPtr point to coefficients (two independent load streams)
7043 .align 32
7044 lgamma_polynom24x: // f8 = A1*x + A2*x^2 + A3*x^3 + A4*x^4 + x^5*(E + x^8*(D + x^8*C)), low-order terms kept as hi/lo pairs
7045 { .mfi
7046 ldfpd fA4, fA2L = [rPolDataPtr], 16 // A4, A2L
7047 nop.f 0
7048 cmp.eq p6, p7 = 4, rSgnGamSize // p6: signgam is 4 bytes wide, p7: otherwise
7049 }
7050 { .mfi
7051 ldfpd fA23, fA24 = [rTmpPtr], 16 // C18, C19
7052 nop.f 0
7053 nop.i 0
7054 }
7055 ;;
7056 { .mfi
7057 ldfpd fA3, fA1L = [rPolDataPtr], 16 // A3, A1L
7058 fma.s1 fA5L = fB4, fB4, f0 // x^4
7059 nop.i 0
7060 }
7061 { .mfi
7062 ldfpd fA19, fA20 = [rTmpPtr], 16 // D6, D7
7063 fms.s1 fB2 = FR_FracX, FR_FracX, fB4 // x^2 - <x^2>
7064 nop.i 0
7065 }
7066 ;;
7067 { .mmf
7068 ldfpd fA15, fA16 = [rPolDataPtr], 16 // D2, D3
7069 ldfpd fA17, fA18 = [rTmpPtr], 16 // D4, D5
7070 nop.f 0
7071 }
7072 ;;
7073 { .mmf
7074 ldfpd fA13, fA14 = [rPolDataPtr], 16 // D0, D1
7075 ldfpd fA12, fA21 = [rTmpPtr], 16 // E7, C16
7076 nop.f 0
7077 }
7078 ;;
7079 { .mfi
7080 ldfe fA11 = [rPolDataPtr], 16 // E6
7081 nop.f 0
7082 nop.i 0
7083 }
7084 { .mfi
7085 ldfe fA10 = [rTmpPtr], 16 // E5
7086 nop.f 0
7087 nop.i 0
7088 }
7089 ;;
7090 { .mfi
7091 ldfpd fA2, fA4L = [rPolDataPtr], 16 // A2, A4L
7092 nop.f 0
7093 nop.i 0
7094 }
7095 { .mfi
7096 ldfpd fA1, fA3L = [rTmpPtr], 16 // A1, A3L
7097 nop.f 0
7098 nop.i 0
7099 }
7100 ;;
7101 { .mfi
7102 ldfpd fA22, fA25 = [rPolDataPtr], 16 // C17, C20
7103 fma.s1 fA0 = fA5L, fA5L, f0 // x^8
7104 nop.i 0
7105 }
7106 { .mfi
7107 nop.m 0
7108 fma.s1 fA0L = fA5L, FR_FracX, f0 // x^5
7109 nop.i 0
7110 }
7111 ;;
7112 { .mmf
7113 ldfe fA9 = [rPolDataPtr], 16 // E4
7114 ldfe fA8 = [rTmpPtr], 16 // E3
7115 nop.f 0
7116 }
7117 ;;
7118 { .mmf
7119 ldfe fA7 = [rPolDataPtr], 16 // E2
7120 ldfe fA6 = [rTmpPtr], 16 // E1
7121 nop.f 0
7122 }
7123 ;;
7124 { .mfi
7125 ldfe fA5 = [rTmpPtr], 16 // E0
7126 fma.s1 fRes4H = fA4, fB4, f0 // A4*<x^2>
7127 nop.i 0
7128 }
7129 { .mfi
7130 nop.m 0
7131 fma.s1 fPol = fA24, FR_FracX, fA23 // C19*x + C18
7132 nop.i 0
7133 }
7134 ;;
7135 { .mfi
7136 // store signgam if size of variable is 4 bytes
7137 (p6) st4 [rSgnGamAddr] = rSgnGam
7138 fma.s1 fRes1H = fA3, fB4, f0 // A3*<x^2>
7139 nop.i 0
7140 }
7141 { .mfi
7142 // store signgam if size of variable is 8 bytes
7143 (p7) st8 [rSgnGamAddr] = rSgnGam
7144 fma.s1 fA1L = fA3, fB2,fA1L // A3*d(x^2) + A1L
7145 nop.i 0
7146 }
7147 ;;
7148 { .mfi
7149 nop.m 0
7150 fma.s1 fA20 = fA20, FR_FracX, fA19 // D7*x + D6
7151 nop.i 0
7152 }
7153 { .mfi
7154 nop.m 0
7155 fma.s1 fA18 = fA18, FR_FracX, fA17 // D5*x + D4
7156 nop.i 0
7157 }
7158 ;;
7159 { .mfi
7160 nop.m 0
7161 fma.s1 fA16 = fA16, FR_FracX, fA15 // D3*x + D2
7162 nop.i 0
7163 }
7164 { .mfi
7165 nop.m 0
7166 fma.s1 fA14 = fA14, FR_FracX, fA13 // D1*x + D0
7167 nop.i 0
7168 }
7169 ;;
7170 { .mfi
7171 nop.m 0
7172 fma.s1 fA2L = fA4, fB2,fA2L // A4*d(x^2) + A2L
7173 nop.i 0
7174 }
7175 { .mfi
7176 nop.m 0
7177 fma.s1 fA12 = fA12, FR_FracX, fA11 // E7*x + E6
7178 nop.i 0
7179 }
7180 ;;
7181 { .mfi
7182 nop.m 0
7183 fms.s1 fRes2L = fA4, fB4, fRes4H // delta(A4*<x^2>)
7184 nop.i 0
7185 }
7186 { .mfi
7187 nop.m 0
7188 fadd.s1 fRes2H = fRes4H, fA2 // A4*<x^2> + A2
7189 nop.i 0
7190 }
7191 ;;
7192 { .mfi
7193 nop.m 0
7194 fms.s1 fRes3L = fA3, fB4, fRes1H // delta(A3*<x^2>)
7195 nop.i 0
7196 }
7197 { .mfi
7198 nop.m 0
7199 fadd.s1 fRes3H = fRes1H, fA1 // A3*<x^2> + A1
7200 nop.i 0
7201 }
7202 ;;
7203 { .mfi
7204 nop.m 0
7205 fma.s1 fA20 = fA20, fB4, fA18 // (D7*x + D6)*x^2 + D5*x + D4
7206 nop.i 0
7207 }
7208 { .mfi
7209 nop.m 0
7210 fma.s1 fA22 = fA22, FR_FracX, fA21 // C17*x + C16
7211 nop.i 0
7212 }
7213 ;;
7214 { .mfi
7215 nop.m 0
7216 fma.s1 fA16 = fA16, fB4, fA14 // (D3*x + D2)*x^2 + D1*x + D0
7217 nop.i 0
7218 }
7219 { .mfi
7220 nop.m 0
7221 fma.s1 fPol = fA25, fB4, fPol // C20*x^2 + C19*x + C18
7222 nop.i 0
7223 }
7224 ;;
7225 { .mfi
7226 nop.m 0
7227 fma.s1 fA2L = fA4L, fB4, fA2L // A4L*<x^2> + A4*d(x^2) + A2L
7228 nop.i 0
7229 }
7230 { .mfi
7231 nop.m 0
7232 fma.s1 fA1L = fA3L, fB4, fA1L // A3L*<x^2> + A3*d(x^2) + A1L
7233 nop.i 0
7234 }
7235 ;;
7236 { .mfi
7237 nop.m 0
7238 fsub.s1 fRes4L = fA2, fRes2H // d1 = A2 - (A4*<x^2> + A2)
7239 nop.i 0
7240 }
7241 { .mfi
7242 nop.m 0
7243 fma.s1 fResH = fRes2H, fB4, f0 // (A4*<x^2> + A2)*x^2
7244 nop.i 0
7245 }
7246 ;;
7247 { .mfi
7248 nop.m 0
7249 fsub.s1 fRes1L = fA1, fRes3H // d1 = A1 - (A3*<x^2> + A1)
7250 nop.i 0
7251 }
7252 { .mfi
7253 nop.m 0
7254 fma.s1 fB6 = fRes3H, FR_FracX, f0 // (A3*<x^2> + A1)*x
7255 nop.i 0
7256 }
7257 ;;
7258 { .mfi
7259 nop.m 0
7260 fma.s1 fA10 = fA10, FR_FracX, fA9 // E5*x + E4
7261 nop.i 0
7262 }
7263 { .mfi
7264 nop.m 0
7265 fma.s1 fA8 = fA8, FR_FracX, fA7 // E3*x + E2
7266 nop.i 0
7267 }
7268 ;;
7269 { .mfi
7270 nop.m 0
7271 // (C20*x^2 + C19*x + C18)*x^2 + C17*x + C16
7272 fma.s1 fPol = fPol, fB4, fA22
7273 nop.i 0
7274 }
7275 { .mfi
7276 nop.m 0
7277 fma.s1 fA6 = fA6, FR_FracX, fA5 // E1*x + E0
7278 nop.i 0
7279 }
7280 ;;
7281 { .mfi
7282 nop.m 0
7283 // A4L*<x^2> + A4*d(x^2) + A2L + delta(A4*<x^2>)
7284 fadd.s1 fRes2L = fA2L, fRes2L
7285 nop.i 0
7286 }
7287 { .mfi
7288 nop.m 0
7289 // A3L*<x^2> + A3*d(x^2) + A1L + delta(A3*<x^2>)
7290 fadd.s1 fRes3L = fA1L, fRes3L
7291 nop.i 0
7292 }
7293 ;;
7294 { .mfi
7295 nop.m 0
7296 fadd.s1 fRes4L = fRes4L, fRes4H // d2 = d1 + A4*<x^2>
7297 nop.i 0
7298 }
7299 { .mfi
7300 nop.m 0
7301 fms.s1 fResL = fRes2H, fB4, fResH // d((A4*<x^2> + A2)*x^2)
7302 nop.i 0
7303 }
7304 ;;
7305 { .mfi
7306 nop.m 0
7307 fadd.s1 fRes1L = fRes1L, fRes1H // d2 = d1 + A3*<x^2>
7308 nop.i 0
7309 }
7310 { .mfi
7311 nop.m 0
7312 fms.s1 fB8 = fRes3H, FR_FracX, fB6 // d((A3*<x^2> + A1)*x)
7313 nop.i 0
7314 }
7315 ;;
7316 { .mfi
7317 nop.m 0
7318 fadd.s1 fB10 = fResH, fB6 // (A4*x^4 + .. + A1*x)hi
7319 nop.i 0
7320 }
7321 { .mfi
7322 nop.m 0
7323 fma.s1 fA12 = fA12, fB4, fA10 // Ehi
7324 nop.i 0
7325 }
7326 ;;
7327 { .mfi
7328 nop.m 0
7329 // ((D7*x + D6)*x^2 + D5*x + D4)*x^4 + (D3*x + D2)*x^2 + D1*x + D0
7330 fma.s1 fA20 = fA20, fA5L, fA16
7331 nop.i 0
7332 }
7333 { .mfi
7334 nop.m 0
7335 fma.s1 fA8 = fA8, fB4, fA6 // Elo
7336 nop.i 0
7337 }
7338 ;;
7339 { .mfi
7340 nop.m 0
7341 fadd.s1 fRes2L = fRes2L, fRes4L // (A4*<x^2> + A2)lo
7342 nop.i 0
7343 }
7344 { .mfi
7345 nop.m 0
7346 // d((A4*<x^2> + A2)*x^2) + (A4*<x^2> + A2)*d(x^2)
7347 fma.s1 fResL = fRes2H, fB2, fResL
7348 nop.i 0
7349 }
7350 ;;
7351 { .mfi
7352 nop.m 0
7353 fadd.s1 fRes3L = fRes3L, fRes1L // (A3*<x^2> + A1)lo
7354 nop.i 0
7355 }
7356 ;;
7357 { .mfi
7358 nop.m 0
7359 fsub.s1 fB12 = fB6, fB10 // fB6 - (A4*x^4 + .. + A1*x)hi
7360 nop.i 0
7361 }
7362 ;;
7363 { .mfi
7364 nop.m 0
7365 fma.s1 fPol = fPol, fA0, fA20 // PolC*x^8 + PolD
7366 nop.i 0
7367 }
7368 { .mfi
7369 nop.m 0
7370 fma.s1 fPolL = fA12, fA5L, fA8 // E
7371 nop.i 0
7372 }
7373 ;;
7374 { .mfi
7375 nop.m 0
7376 fma.s1 fResL = fB4, fRes2L, fResL // ((A4*<x^2> + A2)*x^2)lo
7377 nop.i 0
7378 }
7379 ;;
7380 { .mfi
7381 nop.m 0
7382 fma.s1 fRes3L = fRes3L, FR_FracX, fB8 // ((A3*<x^2> + A1)*x)lo
7383 nop.i 0
7384 }
7385 ;;
7386 { .mfi
7387 nop.m 0
7388 fadd.s1 fB12 = fB12, fResH // (fB6 - hi) + fResH: correction term for the sum held in fB10
7389 nop.i 0
7390 }
7391 ;;
7392 { .mfi
7393 nop.m 0
7394 fma.s1 fPol = fPol, fA0, fPolL // (PolC*x^8 + PolD)*x^8 + E
7395 nop.i 0
7396 }
7397 ;;
7398 { .mfi
7399 nop.m 0
7400 fadd.s1 fRes3L = fRes3L, fResL // sum of the two product low parts
7401 nop.i 0
7402 }
7403 ;;
7404 { .mfi
7405 nop.m 0
7406 fma.s1 fRes2H = fPol, fA0L, fB10 // result hi = Pol*x^5 + (A4*x^4 + .. + A1*x)hi
7407 nop.i 0
7408 }
7409 ;;
7410 { .mfi
7411 nop.m 0
7412 fadd.s1 fRes3L = fB12, fRes3L // (A4*x^4 + .. + A1*x)lo
7413 nop.i 0
7414 }
7415 ;;
7416 { .mfi
7417 nop.m 0
7418 fsub.s1 fRes4L = fB10, fRes2H // (A4*x^4 + .. + A1*x)hi - result hi
7419 nop.i 0
7420 }
7421 ;;
7422 { .mfi
7423 nop.m 0
7424 fma.s1 fRes4L = fPol, fA0L, fRes4L // Pol*x^5 + (hi - result hi): correction term for result hi
7425 nop.i 0
7426 }
7427 ;;
7428 { .mfi
7429 nop.m 0
7430 fadd.s1 fRes4L = fRes4L, fRes3L // result lo
7431 nop.i 0
7432 }
7433 ;;
7434 { .mfb
7435 nop.m 0
7436 // final result for all paths for which the result is Pol24(x)
7437 fma.s0 f8 = fRes2H, f1, fRes4L // result hi + result lo (the only .s0 operation in this block)
7438 // here is the exit for all paths for which the result is Pol24(x)
7439 br.ret.sptk b0
7440 }
7441 ;;
7442
7443
7444 // here if x is natval, nan, +/-inf, +/-0, or denormal: classify x and dispatch; -inf is handled in place
7445 .align 32
7446 lgammal_spec:
7447 { .mfi
7448 nop.m 0
7449 fclass.m p9, p0 = f8, 0xB // +/-denormals
7450 nop.i 0
7451 };;
7452 { .mfi
7453 nop.m 0
7454 fclass.m p6, p0 = f8, 0x1E1 // Test x for natval, nan, +inf
7455 nop.i 0
7456 };;
7457 { .mfb
7458 nop.m 0
7459 fclass.m p7, p0 = f8, 0x7 // +/-0
7460 (p9) br.cond.sptk lgammal_denormal_input // denormal: normalize x there
7461 };;
7462 { .mfb
7463 nop.m 0
7464 nop.f 0
7465 // branch out if x is natval, nan, +inf
7466 (p6) br.cond.spnt lgammal_nan_pinf
7467 };;
7468 { .mfb
7469 nop.m 0
7470 nop.f 0
7471 (p7) br.cond.spnt lgammal_singularity // +/-0 is reported as a singularity
7472 };;
7473 // if we are still here then x = -inf
7474 { .mfi
7475 cmp.eq p6, p7 = 4, rSgnGamSize // p6: signgam is 4 bytes wide, p7: otherwise
7476 nop.f 0
7477 adds rSgnGam = 1, r0 // signgam value to store: +1
7478 };;
7479 { .mfi
7480 // store signgam if size of variable is 4 bytes
7481 (p6) st4 [rSgnGamAddr] = rSgnGam
7482 nop.f 0
7483 nop.i 0
7484 }
7485 { .mfb
7486 // store signgam if size of variable is 8 bytes
7487 (p7) st8 [rSgnGamAddr] = rSgnGam
7488 fma.s0 f8 = f8,f8,f0 // return +inf, no call to error support
7489 br.ret.spnt b0
7490 };;
7491
7492 // here if x is NaN, NatVal or +INF: store signgam = +1 and return x+x
7493 .align 32
7494 lgammal_nan_pinf:
7495 { .mfi
7496 cmp.eq p6, p7 = 4, rSgnGamSize // p6: signgam is 4 bytes wide, p7: otherwise
7497 nop.f 0
7498 adds rSgnGam = 1, r0 // signgam value to store: +1
7499 }
7500 ;;
7501 { .mfi
7502 // store signgam if size of variable is 4 bytes
7503 (p6) st4 [rSgnGamAddr] = rSgnGam
7504 fma.s0 f8 = f8,f1,f8 // return x+x if x is natval, nan, +inf
7505 nop.i 0
7506 }
7507 { .mfb
7508 // store signgam if size of variable is 8 bytes
7509 (p7) st8 [rSgnGamAddr] = rSgnGam
7510 nop.f 0
7511 br.ret.sptk b0
7512 }
7513 ;;
7514
7515 // here if x denormal or unnormal: normalize x in f8, derive sign/exponent/significand values from it, rejoin at _deno_back_to_main_path
7516 .align 32
7517 lgammal_denormal_input:
7518 { .mfi
7519 nop.m 0
7520 fma.s0 fResH = f1, f1, f8 // raise denormal exception
7521 nop.i 0
7522 }
7523 { .mfi
7524 nop.m 0
7525 fnorm.s1 f8 = f8 // normalize input value
7526 nop.i 0
7527 }
7528 ;;
7529 { .mfi
7530 getf.sig rSignifX = f8 // significand bits of normalized x
7531 fmerge.se fSignifX = f1, f8 // significand of x combined with sign and exponent of 1.0
7532 nop.i 0
7533 }
7534 { .mfi
7535 getf.exp rSignExpX = f8 // sign and exponent field of normalized x
7536 fcvt.fx.s1 fXint = f8 // Convert arg to int (int repres. in FR)
7537 nop.i 0
7538 }
7539 ;;
7540 { .mfi
7541 getf.exp rSignExpX = f8 // NOTE(review): same getf.exp as in the previous instruction group, looks redundant; confirm before removing
7542 fcmp.lt.s1 p15, p14 = f8, f0 // p15: x < 0, p14: otherwise
7543 nop.i 0
7544 }
7545 ;;
7546 { .mfb
7547 and rExpX = rSignExpX, r17Ones // presumably strips the sign bit, leaving the exponent; r17Ones is set outside this chunk
7548 fmerge.s fAbsX = f1, f8 // |x|
7549 br.cond.sptk _deno_back_to_main_path
7550 }
7551 ;;
7552
7553
7554 // here if overflow (x > overflow_bound): store signgam = +1, build an overflowed result, go to __libm_error_region with tag 102
7555 .align 32
7556 lgammal_overflow:
7557 { .mfi
7558 addl r8 = 0x1FFFE, r0 // exponent field for f9, chosen so that f9*f9 below overflows
7559 nop.f 0
7560 cmp.eq p6, p7 = 4, rSgnGamSize // p6: signgam is 4 bytes wide, p7: otherwise
7561 }
7562 { .mfi
7563 adds rSgnGam = 1, r0 // signgam value to store: +1
7564 nop.f 0
7565 nop.i 0
7566 }
7567 ;;
7568 { .mfi
7569 setf.exp f9 = r8 // f9 = value with the exponent field from r8
7570 fmerge.s FR_X = f8,f8 // copy of x, stored as Parameter 1 by __libm_error_region
7571 mov GR_Parameter_TAG = 102 // overflow
7572 };;
7573 { .mfi
7574 // store signgam if size of variable is 4 bytes
7575 (p6) st4 [rSgnGamAddr] = rSgnGam
7576 nop.f 0
7577 nop.i 0
7578 }
7579 { .mfb
7580 // store signgam if size of variable is 8 bytes
7581 (p7) st8 [rSgnGamAddr] = rSgnGam
7582 fma.s0 FR_RESULT = f9,f9,f0 // Set I,O and +INF result
7583 br.cond.sptk __libm_error_region
7584 };;
7585
7586 // here if x is negative integer or +/-0 (SINGULARITY): signgam = -1 for -0, +1 otherwise; go to __libm_error_region with tag 103
7587 .align 32
7588 lgammal_singularity:
7589 { .mfi
7590 adds rSgnGam = 1, r0 // signgam value to store: +1 unless x is -0 (see below)
7591 fclass.m p8,p0 = f8,0x6 // is x -0?
7592 mov GR_Parameter_TAG = 103 // negative
7593 }
7594 { .mfi
7595 cmp.eq p6, p7 = 4, rSgnGamSize // p6: signgam is 4 bytes wide, p7: otherwise
7596 fma.s1 FR_X = f0,f0,f8 // copy of x (0*0 + x), stored as Parameter 1 by __libm_error_region
7597 nop.i 0
7598 };;
7599 { .mfi
7600 (p8) sub rSgnGam = r0, rSgnGam // x is -0: signgam value becomes -1
7601 nop.f 0
7602 nop.i 0
7603 }
7604 { .mfi
7605 nop.m 0
7606 nop.f 0
7607 nop.i 0
7608 };;
7609 { .mfi
7610 // store signgam if size of variable is 4 bytes
7611 (p6) st4 [rSgnGamAddr] = rSgnGam
7612 nop.f 0
7613 nop.i 0
7614 }
7615 { .mfb
7616 // store signgam if size of variable is 8 bytes
7617 (p7) st8 [rSgnGamAddr] = rSgnGam
7618 frcpa.s0 FR_RESULT, p0 = f1, f0 // reciprocal approximation of 1/0; presumably yields +inf and raises divide-by-zero, confirm against the ISA manual
7619 br.cond.sptk __libm_error_region
7620 };;
7621
7622 GLOBAL_LIBM_END(__libm_lgammal)
7623
7624
7625
7626 LOCAL_LIBM_ENTRY(__libm_error_region)
7627 .prologue // spills FR_X, FR_Y, FR_RESULT to a 64-byte frame, calls __libm_error_support, returns the reloaded result in f8
7628 { .mfi
7629 add GR_Parameter_Y=-32,sp // Parameter 2 value
7630 nop.f 0
7631 .save ar.pfs,GR_SAVE_PFS
7632 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
7633 }
7634 { .mfi
7635 .fframe 64
7636 add sp=-64,sp // Create new stack
7637 nop.f 0
7638 mov GR_SAVE_GP=gp // Save gp
7639 };;
7640 { .mmi
7641 stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack (at new sp+32; pointer advances to sp+48)
7642 add GR_Parameter_X = 16,sp // Parameter 1 address
7643 .save b0, GR_SAVE_B0
7644 mov GR_SAVE_B0=b0 // Save b0
7645 };;
7646 .body
7647 { .mib
7648 stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
7649 add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address (sp+48)
7650 nop.b 0
7651 }
7652 { .mib
7653 stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
7654 add GR_Parameter_Y = -16,GR_Parameter_Y // back to Parameter 2 address (sp+32)
7655 br.call.sptk b0=__libm_error_support# // Call error handling function
7656 };;
7657 { .mmi
7658 add GR_Parameter_RESULT = 48,sp // recompute the address of the result slot after the call
7659 nop.m 999
7660 nop.i 999
7661 };;
7662 { .mmi
7663 ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
7664 .restore sp
7665 add sp = 64,sp // Restore stack pointer
7666 mov b0 = GR_SAVE_B0 // Restore return address
7667 };;
7668 { .mib
7669 mov gp = GR_SAVE_GP // Restore gp
7670 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
7671 br.ret.sptk b0 // Return
7672 };;
7673
7674 LOCAL_LIBM_END(__libm_error_region#)
7675
7676 .type __libm_error_support#,@function
7677 .global __libm_error_support#