]> git.ipfire.org Git - thirdparty/glibc.git/blob - ports/sysdeps/ia64/fpu/w_tgammal.S
Move all files into ports/ subdirectory in preparation for merge with glibc
[thirdparty/glibc.git] / ports / sysdeps / ia64 / fpu / w_tgammal.S
1 .file "tgammal.s"
2
3
4 // Copyright (c) 2002 - 2005, Intel Corporation
5 // All rights reserved.
6 //
7 // Contributed 2002 by the Intel Numerics Group, Intel Corporation
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
11 // met:
12 //
13 // * Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
15 //
16 // * Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
19 //
20 // * The name of Intel Corporation may not be used to endorse or promote
21 // products derived from this software without specific prior written
22 // permission.
23
24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 //
36 // Intel Corporation is the author of this code, and requests that all
37 // problem reports or change requests be submitted to it directly at
38 // http://www.intel.com/software/products/opensource/libraries/num.htm.
39 //
40 // History
41 //==============================================================
42 // 01/16/02 Initial version
43 // 05/20/02 Cleaned up namespace and sf0 syntax
44 // 02/10/03 Reordered header: .section, .global, .proc, .align;
45 // used data8 for long double table values
46 // 03/17/03 Moved tgammal_libm_err label into .proc region
47 // 04/10/03 Changed error codes for overflow and negative integers
48 // 03/31/05 Reformatted delimiters between data tables
49 //
50 // API
51 //==============================================================
52 // long double tgammal(long double)
53 //
54 // Resources Used:
55 //
56 // Floating-Point Registers: f8-f15
57 // f32-f127
58 //
59 // General Purpose Registers: r32-r67
60 //
61 // Predicate Registers: p6-p15
62 //
63 //*********************************************************************
64 //
65 // IEEE Special Conditions:
66 //
67 // tgammal(+inf) = +inf
68 // tgammal(-inf) = QNaN
69 // tgammal(+/-0) = +/-inf
70 // tgammal(x<0, x - integer) = QNaN
71 // tgammal(SNaN) = QNaN
72 // tgammal(QNaN) = QNaN
73 //
74 //*********************************************************************
75 // Overview of operation
76 //==============================================================
77 //
78 // Algorithm description
79 // ---------------------
80 //
81 // There are 3 main paths in the implementation
82 // (and additional special values branches)
83 //
84 // 1) |X| >= 13 - Stirling formula computation
85 // a) Positive arguments:
86 // TGAMMAL(X) = exp((X-0.5)*ln(X) - X + C + S(Z)),
87 // where C = 0.5*ln(2*Pi), Z = 1/X, S(Z) - Bernoulli polynomial
88 // (up to 'B18' term).
89 // Some of these calculations are done in multiprecision.
90 // Ln returns multiprecision result too
91 // and exp also accepts and returns pair of values.
92 //
93 // b) Negative arguments
94 // TGAMMAL(-X) = PI/(X*TGAMMAL(X)*sin(PI*X)).
95 // (X*sin(PI*X))/PI calculated in parallel with TGAMMAL.
96 // Here we use polynomial of 9th degree with 2 multiprecision steps.
97 // Argument range reduction is:
98 // N = [x] with round to nearest, r = x - N, -0.5 <= r < 0.5
99 // After ((X-0.5)*ln(X) - X + C + S(Z)) completed we just invert
100 // its result and compute exp with negative argument (1/exp(x)=exp(-x))
101 // Then we multiply the exp result by PI/(X*sin(PI*X)).
102 //
103 // 2) 1 <= |X| < 13 - Polynomial part
104 // a) Positive arguments:
105 // All values are split into intervals such as:
106 // #0->[2;3], #1->[3;4], #2->[4;5]...
107 // For even intervals we just use polynomial computation with degree 20
108 // and first 6 multiprecision computations.
109 // Range reduction looks like
110 // N = [x] with truncate, r = x - N - 0.5, -0.5 <= r < 0.5
111 // For odd intervals we use the recurrence formula:
112 // TGAMMAL(X) = TGAMMAL(X-1)*(X-1)
113 // The [1;2] interval is split into 3 subranges:
114 // [1;1.25], [1.25;1.75], [1.75;2] with the same polynomial forms
115 //
116 // b) Negative arguments
117 // TGAMMAL(-X) = PI/(X*TGAMMAL(X)*sin(PI*X)).
118 // (X*sin(PI*X))/PI calculated in parallel with TGAMMAL.
119 // After multiplication by TGAMMAL(X) result we calculate reciprocal
120 // and get final result.
121 //
122 // 3) 0 < |X| < 1 - Near 0 part
123 // a) Here we use the recurrence formula TGAMMAL(X) = TGAMMAL(X+1)/X
124 // TGAMMAL(X+1) calculated as shown above,
125 // 1/X result obtained in parallel. Then we just multiply these values.
126 // There is only one additional separate subrange: [0;0.125], with its own
127 // set of polynomial constants.
128 //
129 // b) Negative arguments
130 // TGAMMAL(-X) = PI/(TGAMMAL(X+1)*sin(PI*X)).
131 // There is no need to compute 1/X.
132
133
134
135 RODATA
136
137 .align 16
138 LOCAL_OBJECT_START(Constants_Tgammal_log_80_Q)
// Constants for the logarithm step: ln(2) split into a high and a low part,
// followed by the polynomial coefficients Q_6 .. Q_1 (highest index first).
// Each entry is four data4 words: low half of the 64-bit significand, high
// half of the significand, sign/exponent word, and a zero pad word.
// The decoded values noted per line are approximations read off the bit
// patterns.  They look like the ln(1+r) series terms after r (-1/2, 1/3, ...)
// -- NOTE(review): confirm against the consuming code, which is not part of
// this table.
139 // log2_hi, log2_lo, Q_6, Q_5, Q_4, Q_3, Q_2, Q_1
140 data4 0x00000000,0xB1721800,0x00003FFE,0x00000000 // log2_hi ~  6.9315e-01
141 data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000 // log2_lo ~ -1.9e-09
142 data4 0xA51BE0AF,0x92492453,0x00003FFC,0x00000000 // Q_6 ~  1/7
143 data4 0xA0CFD29F,0xAAAAAB73,0x0000BFFC,0x00000000 // Q_5 ~ -1/6
144 data4 0xCCCE3872,0xCCCCCCCC,0x00003FFC,0x00000000 // Q_4 ~  1/5
145 data4 0xFFFFB4FB,0xFFFFFFFF,0x0000BFFC,0x00000000 // Q_3 ~ -1/4
146 data4 0xAAAAAAAB,0xAAAAAAAA,0x00003FFD,0x00000000 // Q_2 ~  1/3
147 data4 0x00000000,0x80000000,0x0000BFFE,0x00000000 // Q_1 = -1/2
148 LOCAL_OBJECT_END(Constants_Tgammal_log_80_Q)
149
150 .align 64
151 LOCAL_OBJECT_START(Constants_Tgammal_log_80_Z_G_H_h1)
// First-stage lookup table for the logarithm: 16 entries of 32 bytes each.
// Every entry occupies two data4 rows:
//   row 1: Z1 (16-bit fixed point), G1 (IEEE single), H1 (IEEE single), pad
//   row 2: h1 as low significand word, high significand word,
//          sign/exponent word, pad
// Decoding entry 1 gives G1 = 0x3F70F0F0 ~ 16/17 and H1 = 0x3D785196 ~ 0.0606
// ~ -ln(G1); h1 is much smaller and is presumably the low-order part of the
// same logarithm -- NOTE(review): confirm against the consuming code.
152 // Z1 - 16 bit fixed, G1 and H1 IEEE single, h1 stored in the three-word sign/exponent + 64-bit significand layout (not a 64-bit IEEE double)
153 data4 0x00008000,0x3F800000,0x00000000,0x00000000
154 data4 0x00000000,0x00000000,0x00000000,0x00000000
155 data4 0x00007879,0x3F70F0F0,0x3D785196,0x00000000
156 data4 0xEBA0E0D1,0x8B1D330B,0x00003FDA,0x00000000
157 data4 0x000071C8,0x3F638E38,0x3DF13843,0x00000000
158 data4 0x9EADD553,0xE2AF365E,0x00003FE2,0x00000000
159 data4 0x00006BCB,0x3F579430,0x3E2FF9A0,0x00000000
160 data4 0x752F34A2,0xF585FEC3,0x0000BFE3,0x00000000
161 data4 0x00006667,0x3F4CCCC8,0x3E647FD6,0x00000000
162 data4 0x893B03F3,0xF3546435,0x00003FE2,0x00000000
163 data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000
164 data4 0x39CDD2AC,0xBABA62E0,0x00003FE4,0x00000000
165 data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000
166 data4 0x457978A1,0x8718789F,0x00003FE2,0x00000000
167 data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000
168 data4 0x3185E56A,0x9442DF96,0x0000BFE4,0x00000000
169 data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000
170 data4 0x2BBE2CBD,0xCBF9A4BF,0x00003FE4,0x00000000
171 data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000
172 data4 0x852D5935,0xF3537535,0x00003FE3,0x00000000
173 data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000
174 data4 0x46CDF32F,0xA1F1E699,0x0000BFDF,0x00000000
175 data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000
176 data4 0xD8484CE3,0x84A61856,0x00003FE4,0x00000000
177 data4 0x00004925,0x3F124920,0x3F0F4303,0x00000000
178 data4 0xFF28821B,0xC7DD97E0,0x0000BFE2,0x00000000
179 data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000
180 data4 0xEF1FD32F,0xD3C4A887,0x00003FE3,0x00000000
181 data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000
182 data4 0x464C76DA,0x84672BE6,0x00003FE5,0x00000000
183 data4 0x00004211,0x3F042108,0x3F29516A,0x00000000
184 data4 0x18835FB9,0x9A43A511,0x0000BFE5,0x00000000
185 LOCAL_OBJECT_END(Constants_Tgammal_log_80_Z_G_H_h1)
186
187 .align 64
188 LOCAL_OBJECT_START(Constants_Tgammal_log_80_Z_G_H_h2)
// Second-stage lookup table for the logarithm: 16 entries of 32 bytes each,
// with G2 values much closer to 1.0 than a coarser first-stage table would
// need (here G2 runs from 1.0 down to 0x3F71D488 ~ 0.945).
// Every entry occupies two data4 rows:
//   row 1: Z2 (16-bit fixed point), G2 (IEEE single), H2 (IEEE single), pad
//   row 2: h2 as low significand word, high significand word,
//          sign/exponent word, pad
// H2 decodes to roughly -ln(G2) (entry 1: G2 = 0x3F7F00F8 ~ 0.99611,
// H2 = 0x3B7F875D ~ 0.0039) -- NOTE(review): confirm against the consuming
// code.
189 // Z2 - 16 bit fixed, G2 and H2 IEEE single, h2 stored in the three-word sign/exponent + 64-bit significand layout (not a 64-bit IEEE double)
190 data4 0x00008000,0x3F800000,0x00000000,0x00000000
191 data4 0x00000000,0x00000000,0x00000000,0x00000000
192 data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000
193 data4 0x211398BF,0xAD08B116,0x00003FDB,0x00000000
194 data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000
195 data4 0xC376958E,0xB106790F,0x00003FDE,0x00000000
196 data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000
197 data4 0x79A7679A,0xFD03F242,0x0000BFDA,0x00000000
198 data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000
199 data4 0x05E7AE08,0xF03F81C3,0x0000BFDF,0x00000000
200 data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000
201 data4 0x049EB22F,0xD1B87D3C,0x00003FDE,0x00000000
202 data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000
203 data4 0x3A9E81E0,0xFABC8B95,0x00003FDF,0x00000000
204 data4 0x00007C98,0x3F792FB0,0x3CDCFE47,0x00000000
205 data4 0x7C4B5443,0xF5F3653F,0x00003FDF,0x00000000
206 data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000
207 data4 0xF65A1773,0xE78AB204,0x00003FE0,0x00000000
208 data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000
209 data4 0x7B8EF695,0xDB7CBFFF,0x0000BFE0,0x00000000
210 data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000
211 data4 0xCF773FB3,0xC0241AEA,0x0000BFE0,0x00000000
212 data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000
213 data4 0xC9539FDF,0xFC8F4D48,0x00003FE1,0x00000000
214 data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000
215 data4 0x954665C2,0x9CD035FB,0x0000BFE1,0x00000000
216 data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000
217 data4 0xDD367A30,0xEC9017C7,0x00003FE1,0x00000000
218 data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000
219 data4 0xCB11189C,0xEE6625D3,0x0000BFE1,0x00000000
220 data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000
221 data4 0xBE11C424,0xA49C8DB5,0x0000BFE0,0x00000000
222 LOCAL_OBJECT_END(Constants_Tgammal_log_80_Z_G_H_h2)
223
224 .align 64
225 LOCAL_OBJECT_START(Constants_Tgammal_log_80_h3_G_H)
// Third-stage lookup table for the logarithm, stored as two sub-tables:
//   1) 32 rows of 16 bytes: h3 as low significand word, high significand
//      word and sign/exponent word, with G3 (IEEE single) packed into the
//      fourth word where the other log tables keep a zero pad.
//   2) 32 IEEE singles H3, four per row, starting at the marked row.
// G3 starts at 0x3F7FFC00 (~0.99994) and decreases; H3 starts at 0x38800100
// (~6.1e-05) and increases, consistent with H3 ~ -ln(G3) --
// NOTE(review): confirm against the consuming code.
226 // h3 IEEE double extended, H3 and G3 IEEE single
227 data4 0x112666B0,0xAAACAAB1,0x00003FD3,0x3F7FFC00
228 data4 0x9B7FAD21,0x90051030,0x00003FD8,0x3F7FF400
229 data4 0xF4D783C4,0xA6B46F46,0x00003FDA,0x3F7FEC00
230 data4 0x11C6DDCA,0xDA148D88,0x0000BFD8,0x3F7FE400
231 data4 0xCA964D95,0xCE65C1D8,0x0000BFD8,0x3F7FDC00
232 data4 0x23412D13,0x883838EE,0x0000BFDB,0x3F7FD400
233 data4 0x983ED687,0xB7E5CFA1,0x00003FDB,0x3F7FCC08
234 data4 0xE3C3930B,0xDBE23B16,0x0000BFD9,0x3F7FC408
235 data4 0x48AA4DFC,0x9B92F1FC,0x0000BFDC,0x3F7FBC10
236 data4 0xCE9C8F7E,0x9A8CEB15,0x0000BFD9,0x3F7FB410
237 data4 0x0DECE74A,0x8C220879,0x00003FDC,0x3F7FAC18
238 data4 0x2F053150,0xB25CA912,0x0000BFDA,0x3F7FA420
239 data4 0xD9A5BE20,0xA5876555,0x00003FDB,0x3F7F9C20
240 data4 0x2053F087,0xC919BB6E,0x00003FD9,0x3F7F9428
241 data4 0x041E9A77,0xB70BDA79,0x00003FDC,0x3F7F8C30
242 data4 0xEA1C9C30,0xF18A5C08,0x00003FDA,0x3F7F8438
243 data4 0x796D89E5,0xA3790D84,0x0000BFDD,0x3F7F7C40
244 data4 0xA2915A3A,0xE1852369,0x0000BFDD,0x3F7F7448
245 data4 0xA39ED868,0xD803858F,0x00003FDC,0x3F7F6C50
246 data4 0x9417EBB7,0xB2EEE356,0x0000BFDD,0x3F7F6458
247 data4 0x9BB0D07F,0xED5C1F8A,0x0000BFDC,0x3F7F5C68
248 data4 0xE87C740A,0xD6D201A0,0x0000BFDD,0x3F7F5470
249 data4 0x1CA74025,0xE8DEBF5E,0x00003FDC,0x3F7F4C78
250 data4 0x1F34A7EB,0x9A995A97,0x0000BFDC,0x3F7F4488
251 data4 0x359EED97,0x9CB0F742,0x0000BFDA,0x3F7F3C90
252 data4 0xBBC6A1C8,0xD6F833C2,0x0000BFDD,0x3F7F34A0
253 data4 0xE71090EC,0xE1F68F2A,0x00003FDC,0x3F7F2CA8
254 data4 0xC160A74F,0xD1881CF1,0x0000BFDB,0x3F7F24B8
255 data4 0xD78CB5A4,0x9AD05AE2,0x00003FD6,0x3F7F1CC8
256 data4 0x9A77DC4B,0xE658CB8E,0x0000BFDD,0x3F7F14D8
257 data4 0x6BD6D312,0xBA281296,0x00003FDC,0x3F7F0CE0
258 data4 0xF95210D0,0xB478BBEB,0x0000BFDB,0x3F7F04F0
259 data4 0x38800100,0x39400480,0x39A00640,0x39E00C41 // H's start here (32 singles, 4 per row)
260 data4 0x3A100A21,0x3A300F22,0x3A4FF51C,0x3A6FFC1D
261 data4 0x3A87F20B,0x3A97F68B,0x3AA7EB86,0x3AB7E101
262 data4 0x3AC7E701,0x3AD7DD7B,0x3AE7D474,0x3AF7CBED
263 data4 0x3B03E1F3,0x3B0BDE2F,0x3B13DAAA,0x3B1BD766
264 data4 0x3B23CC5C,0x3B2BC997,0x3B33C711,0x3B3BBCC6
265 data4 0x3B43BAC0,0x3B4BB0F4,0x3B53AF6D,0x3B5BA620
266 data4 0x3B639D12,0x3B6B9444,0x3B7393BC,0x3B7B8B6D
267 LOCAL_OBJECT_END(Constants_Tgammal_log_80_h3_G_H)
268
269 .align 64
270 LOCAL_OBJECT_START(Constants_Tgammal_stirling)
// Constants for the Stirling-formula path (|X| >= 13 in the file header).
// Layout, in order:
//   - 0.5*ln(2*Pi) as a high/low pair of IEEE doubles
//   - the first series coefficient as significand (data8) + sign/exponent
//     (data8), i.e. the extended-precision layout
//   - eight further series coefficients as IEEE doubles
//   - the constant 0.5 as an IEEE double
// The entries labelled B2..B18 are not the raw Bernoulli numbers: the decimal
// values given are the Stirling-series coefficients B_2k/(2k*(2k-1)),
// e.g. 1/12, -1/360, 1/1260.
271 //0.5*ln(2*Pi)=9.1893853320467266954096885e-01 + 7.2239360881843238220057778e-17
272 data8 0x3FED67F1C864BEB4, 0x3C94D252F2400510
273 // Bernoulli numbers, pre-divided as B_2k/(2k*(2k-1)) for the Stirling series
274 data8 0xAAAAAAAAAAAAAAAB, 0x00003FFB //B2 = 8.3333333333333333333333333333e-02
275 data8 0xBF66C16C16C16C17 //B4 = -2.7777777777777777777777777778e-03
276 data8 0x3F4A01A01A01A01A //B6 = 7.9365079365079365079365079365e-04
277 data8 0xBF43813813813814 //B8 = -5.9523809523809523809523809524e-04
278 data8 0x3F4B951E2B18FF23 //B10 = 8.4175084175084175084175084175e-04
279 data8 0xBF5F6AB0D9993C7D //B12 = -1.9175269175269175269175269175e-03
280 data8 0x3F7A41A41A41A41A //B14 = 6.4102564102564102564102564103e-03
281 data8 0xBF9E4286CB0F5398 //B16 = -2.9550653594771241830065359477e-02
282 data8 0x3FC6FE96381E0680 //B18 = 1.7964437236883057316493849002e-01
283 data8 0x3FE0000000000000 // 0.5
284 LOCAL_OBJECT_END(Constants_Tgammal_stirling)
285
286 .align 64
287 LOCAL_OBJECT_START(Constants_Tgammal_sin)
// Coefficients of the sin(Pi*x)/Pi polynomial used for negative arguments.
// Layout, in order:
//   - A2 and A1 as high/low pairs of IEEE doubles (the two multiprecision
//     terms)
//   - A9 down to A3 as significand (data8) + sign/exponent (data8)
// Note the storage order is A2, A1, then A9..A3 descending, not A1..A9.
// The leading terms match the Taylor coefficients of sin(Pi*x)/(Pi*x) to the
// digits shown: A1 ~ -Pi^2/6, A2 ~ Pi^4/120.
288 // Polynomial coefficients for the sin(Pi*x)/Pi, 0 <= |x| < 0.5
289 //A2 = 8.1174242528335360802316245099e-01 + 5.1302254650266899774269946201e-18
290 data8 0x3FE9F9CB402BC46C, 0x3C57A8B3819B7CEC
291 //A1 = -1.6449340668482264060656916627e+00 + -3.0210280454695477893051351574e-17
292 data8 0xBFFA51A6625307D3, 0xBC816A402079D0EF
293 data8 0xF3AEF1FFCCE6C813, 0x0000BFE3 //A9 = -7.0921197799923779127089910470e-09
294 data8 0x87D54408E6D4BB9D, 0x00003FE9 //A8 = 2.5300880778252693946712766029e-07
295 data8 0xEA12033DCE7B8ED9, 0x0000BFED //A7 = -6.9758403885461690048189307819e-06
296 data8 0x9BA38C952A59D1A8, 0x00003FF2 //A6 = 1.4842878710882320255092707181e-04
297 data8 0x99C0B55178FF0E38, 0x0000BFF6 //A5 = -2.3460810348048124421268761990e-03
298 data8 0xD63402E798FEC896, 0x00003FF9 //A4 = 2.6147847817611456327417812320e-02
299 data8 0xC354723906D95E92, 0x0000BFFC //A3 = -1.9075182412208257558294507774e-01
300 LOCAL_OBJECT_END(Constants_Tgammal_sin)
301
302 .align 64
303 LOCAL_OBJECT_START(Constants_Tgammal_exp_64_Arg)
// Argument-reduction constants for the exponential step: log(2)/2^12 split
// into a high and a low part.  Each row is low significand word, high
// significand word, sign/exponent word, zero pad.
// The high part keeps only the upper 32 significand bits (low word is zero).
304 data4 0x00000000,0xB17217F4,0x00003FF2,0x00000000 // L_hi = hi part log(2)/2^12
305 data4 0xF278ECE6,0xF473DE6A,0x00003FD4,0x00000000 // L_lo = lo part log(2)/2^12
306 LOCAL_OBJECT_END(Constants_Tgammal_exp_64_Arg)
307
308 LOCAL_OBJECT_START(Constants_Tgammal_exp_64_A)
// Polynomial coefficients for the exponential step, stored highest index
// first.  Each row is low significand word, high significand word,
// sign/exponent word, zero pad.  The approximate values below are decoded
// from the bit patterns; they are close to, but not exactly, 1/24, 1/6, 1/2.
309 data4 0xB1B736A0,0xAAAAAAAB,0x00003FFA,0x00000000 // A3 ~ 1/24
310 data4 0x90CD6327,0xAAAAAAAB,0x00003FFC,0x00000000 // A2 ~ 1/6
311 data4 0xFFFFFFFF,0xFFFFFFFF,0x00003FFD,0x00000000 // A1 ~ 1/2
312 LOCAL_OBJECT_END(Constants_Tgammal_exp_64_A)
313
314 LOCAL_OBJECT_START(Constants_Tgammal_exp_64_T1)
// 64 IEEE single-precision values, four per row, increasing from 1.0.
// The entries decode to approximately 2^(j/64) for j = 0..63
// (entry 1 = 0x3F8164D2 ~ 1.01089, entry 63 = 0x3FFD3E0C ~ 1.9785).
// NOTE(review): presumably the coarse table for the exponential step;
// confirm the indexing against the consuming code.
315 data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29
316 data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5
317 data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC
318 data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D
319 data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA
320 data4 0x3F9EF532,0x3FA0B051,0x3FA27043,0x3FA43516
321 data4 0x3FA5FED7,0x3FA7CD94,0x3FA9A15B,0x3FAB7A3A
322 data4 0x3FAD583F,0x3FAF3B79,0x3FB123F6,0x3FB311C4
323 data4 0x3FB504F3,0x3FB6FD92,0x3FB8FBAF,0x3FBAFF5B
324 data4 0x3FBD08A4,0x3FBF179A,0x3FC12C4D,0x3FC346CD
325 data4 0x3FC5672A,0x3FC78D75,0x3FC9B9BE,0x3FCBEC15
326 data4 0x3FCE248C,0x3FD06334,0x3FD2A81E,0x3FD4F35B
327 data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5
328 data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A
329 data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177
330 data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C
331 LOCAL_OBJECT_END(Constants_Tgammal_exp_64_T1)
332
333 LOCAL_OBJECT_START(Constants_Tgammal_exp_64_T2)
// 64 IEEE single-precision values, four per row, increasing from 1.0 in much
// finer steps than a 1/64 table.
// The entries decode to approximately 2^(j/4096) for j = 0..63
// (entry 1 = 0x3F80058C ~ 1.000169, entry 63 = 0x3F815F37 ~ 1.010718).
// NOTE(review): presumably the fine table for the exponential step;
// confirm the indexing against the consuming code.
334 data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4
335 data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7
336 data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E
337 data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349
338 data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987
339 data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA
340 data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610
341 data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A
342 data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8
343 data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA
344 data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50
345 data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA
346 data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07
347 data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269
348 data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE
349 data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37
350 LOCAL_OBJECT_END(Constants_Tgammal_exp_64_T2)
351
352 LOCAL_OBJECT_START(Constants_Tgammal_exp_64_W1)
// 64 IEEE doubles, two per row.  Entry 0 is exactly zero and all others are
// tiny in magnitude (exponent fields 0x3DF..0x3E6 / 0xBDF..0xBE6, i.e. below
// about 6e-08).
// NOTE(review): presumably the low-order corrections paired index-for-index
// with the single-precision T1 table (entry 1 ~ -5.6e-09 is about the
// rounding error of 2^(1/64) in single precision); confirm against the
// consuming code.
353 data8 0x0000000000000000, 0xBE384454171EC4B4
354 data8 0xBE6947414AA72766, 0xBE5D32B6D42518F8
355 data8 0x3E68D96D3A319149, 0xBE68F4DA62415F36
356 data8 0xBE6DDA2FC9C86A3B, 0x3E6B2E50F49228FE
357 data8 0xBE49C0C21188B886, 0x3E64BFC21A4C2F1F
358 data8 0xBE6A2FBB2CB98B54, 0x3E5DC5DE9A55D329
359 data8 0x3E69649039A7AACE, 0x3E54728B5C66DBA5
360 data8 0xBE62B0DBBA1C7D7D, 0x3E576E0409F1AF5F
361 data8 0x3E6125001A0DD6A1, 0xBE66A419795FBDEF
362 data8 0xBE5CDE8CE1BD41FC, 0xBE621376EA54964F
363 data8 0x3E6370BE476E76EE, 0x3E390D1A3427EB92
364 data8 0x3E1336DE2BF82BF8, 0xBE5FF1CBD0F7BD9E
365 data8 0xBE60A3550CEB09DD, 0xBE5CA37E0980F30D
366 data8 0xBE5C541B4C082D25, 0xBE5BBECA3B467D29
367 data8 0xBE400D8AB9D946C5, 0xBE5E2A0807ED374A
368 data8 0xBE66CB28365C8B0A, 0x3E3AAD5BD3403BCA
369 data8 0x3E526055C7EA21E0, 0xBE442C75E72880D6
370 data8 0x3E58B2BB85222A43, 0xBE5AAB79522C42BF
371 data8 0xBE605CB4469DC2BC, 0xBE589FA7A48C40DC
372 data8 0xBE51C2141AA42614, 0xBE48D087C37293F4
373 data8 0x3E367A1CA2D673E0, 0xBE51BEBB114F7A38
374 data8 0xBE6348E5661A4B48, 0xBDF526431D3B9962
375 data8 0x3E3A3B5E35A78A53, 0xBE46C46C1CECD788
376 data8 0xBE60B7EC7857D689, 0xBE594D3DD14F1AD7
377 data8 0xBE4F9C304C9A8F60, 0xBE52187302DFF9D2
378 data8 0xBE5E4C8855E6D68F, 0xBE62140F667F3DC4
379 data8 0xBE36961B3BF88747, 0x3E602861C96EC6AA
380 data8 0xBE3B5151D57FD718, 0x3E561CD0FC4A627B
381 data8 0xBE3A5217CA913FEA, 0x3E40A3CC9A5D193A
382 data8 0xBE5AB71310A9C312, 0x3E4FDADBC5F57719
383 data8 0x3E361428DBDF59D5, 0x3E5DB5DB61B4180D
384 data8 0xBE42AD5F7408D856, 0x3E2A314831B2B707
385 LOCAL_OBJECT_END(Constants_Tgammal_exp_64_W1)
386
387 LOCAL_OBJECT_START(Constants_Tgammal_exp_64_W2)
// 64 IEEE doubles, two per row.  Entry 0 is exactly zero and all others are
// tiny in magnitude (exponent fields 0x3DF..0x3E6 / 0xBDF..0xBE6, i.e. below
// about 6e-08).
// NOTE(review): presumably the low-order corrections paired index-for-index
// with the single-precision T2 table; confirm against the consuming code.
388 data8 0x0000000000000000, 0xBE641F2537A3D7A2
389 data8 0xBE68DD57AD028C40, 0xBE5C77D8F212B1B6
390 data8 0x3E57878F1BA5B070, 0xBE55A36A2ECAE6FE
391 data8 0xBE620608569DFA3B, 0xBE53B50EA6D300A3
392 data8 0x3E5B5EF2223F8F2C, 0xBE56A0D9D6DE0DF4
393 data8 0xBE64EEF3EAE28F51, 0xBE5E5AE2367EA80B
394 data8 0x3E47CB1A5FCBC02D, 0xBE656BA09BDAFEB7
395 data8 0x3E6E70C6805AFEE7, 0xBE6E0509A3415EBA
396 data8 0xBE56856B49BFF529, 0x3E66DD3300508651
397 data8 0x3E51165FC114BC13, 0x3E53333DC453290F
398 data8 0x3E6A072B05539FDA, 0xBE47CD877C0A7696
399 data8 0xBE668BF4EB05C6D9, 0xBE67C3E36AE86C93
400 data8 0xBE533904D0B3E84B, 0x3E63E8D9556B53CE
401 data8 0x3E212C8963A98DC8, 0xBE33138F032A7A22
402 data8 0x3E530FA9BC584008, 0xBE6ADF82CCB93C97
403 data8 0x3E5F91138370EA39, 0x3E5443A4FB6A05D8
404 data8 0x3E63DACD181FEE7A, 0xBE62B29DF0F67DEC
405 data8 0x3E65C4833DDE6307, 0x3E5BF030D40A24C1
406 data8 0x3E658B8F14E437BE, 0xBE631C29ED98B6C7
407 data8 0x3E6335D204CF7C71, 0x3E529EEDE954A79D
408 data8 0x3E5D9257F64A2FB8, 0xBE6BED1B854ED06C
409 data8 0x3E5096F6D71405CB, 0xBE3D4893ACB9FDF5
410 data8 0xBDFEB15801B68349, 0x3E628D35C6A463B9
411 data8 0xBE559725ADE45917, 0xBE68C29C042FC476
412 data8 0xBE67593B01E511FA, 0xBE4A4313398801ED
413 data8 0x3E699571DA7C3300, 0x3E5349BE08062A9E
414 data8 0x3E5229C4755BB28E, 0x3E67E42677A1F80D
415 data8 0xBE52B33F6B69C352, 0xBE6B3550084DA57F
416 data8 0xBE6DB03FD1D09A20, 0xBE60CBC42161B2C1
417 data8 0x3E56ED9C78A2B771, 0xBE508E319D0FA795
418 data8 0xBE59482AFD1A54E9, 0xBE2A17CEB07FD23E
419 data8 0x3E68BF5C17365712, 0x3E3956F9B3785569
420 LOCAL_OBJECT_END(Constants_Tgammal_exp_64_W2)
421
422
423
424 LOCAL_OBJECT_START(Constants_Tgammal_poly)
// Degree-20 polynomial coefficients for tgammal on the six intervals
// [2;3), [4;5), [6;7), [8;9), [10;11) and [12;13).
// Every interval uses the same 336-byte record layout, in this order:
//   A5, A4, A3    - high/low pairs of IEEE doubles (multiprecision terms)
//   A20 .. A13    - significand (data8) + sign/exponent (data8)
//   A2, A1, A0    - high/low pairs of IEEE doubles (multiprecision terms)
//   A12 .. A6     - significand (data8) + sign/exponent (data8)
// A0 of each record matches Gamma(N + 0.5) for the interval's left endpoint N
// to the printed digits (e.g. 1.32934038817913680 ~ Gamma(2.5)), consistent
// with the reduction r = x - N - 0.5 described in the file header.
425
426 // Polynomial coefficients for the tgammal(x), 2 <= |x| < 3
427 //A5 = 2.8360780594841213109180699803e-02 + 2.2504152891014320704380000000e-19
428 data8 0x3F9D0A9BC49353D2, 0x3C109AEA0F23CE2D
429 //A4 = 1.0967323400216015538699565468e-01 + 9.9225166000430644587276000000e-18
430 data8 0x3FBC138B89492C5B, 0x3C66E138506D5652
431 //A3 = 2.5387124684114281691904579930e-01 + 2.2667777637607113205546600000e-17
432 data8 0x3FD03F6D2FA4F4F8, 0x3C7A2258DA8CD8B1
433 data8 0xC5866457328BC39B, 0x00003FE3 //A20 = 5.7487331964156762795056629138e-09
434 data8 0xE93D9F1ACD59C929, 0x0000BFE4 //A19 = -1.3576396100397317396956445658e-08
435 data8 0xE33389C8F6CBA813, 0x00003FE5 //A18 = 2.6449714924964597501721434271e-08
436 data8 0x8FE7B25B9CD26D2A, 0x0000BFE7 //A17 = -6.7011017946055513660266853311e-08
437 data8 0xB89F4721BFBC15B0, 0x00003FE8 //A16 = 1.7194280320370423615174419192e-07
438 data8 0xE49CBDC1874EBABA, 0x0000BFE9 //A15 = -4.2582353660153782928729466776e-07
439 data8 0x913AF50A336129CA, 0x00003FEB //A14 = 1.0820500665257088283172211622e-06
440 data8 0xABCF0F7313B3B332, 0x0000BFEC //A13 = -2.5601510627710417669568115706e-06
441 //A2 = 6.5455857798133676439533701341e-01 + 1.3292075193155190798867000000e-18
442 data8 0x3FE4F224D4B7E01C, 0x3C3885014A2B8319
443 //A1 = 9.3473452162608550164435428087e-01 + 3.2785154201417136611642400000e-17
444 data8 0x3FEDE9585F1A7093, 0x3C82E63C1B5028BF
445 //A0 = 1.3293403881791368004172682049e+00 + 2.2005689328949279282607500000e-16
446 data8 0x3FF544FA6D47B38F, 0x3CAFB6AA9829E81F
447 data8 0xF3668F799997C76D, 0x00003FED //A12 = 7.2539039479124273660331538367e-06
448 data8 0xD6C6BBD54CDEAEB1, 0x0000BFEE //A11 = -1.2801665282681088568639378920e-05
449 data8 0x809E4763B06F6883, 0x00003FF1 //A10 = 6.1329973609906572700697893187e-05
450 data8 0x8443B000F8F9A71A, 0x00003FED //A9 = 3.9417864189995544394564413428e-06
451 data8 0xC5C7E6D62A6991D8, 0x00003FF4 //A8 = 7.5447412886334708803357581519e-04
452 data8 0xD2AF690725C62D88, 0x00003FF5 //A7 = 1.6074004848394703022110823298e-03
453 data8 0xAA44E635D4B7B682, 0x00003FF8 //A6 = 1.0392403425906843901680697839e-02
454 //
455 // Polynomial coefficients for the tgammal(x), 4 <= |x| < 5
456 //A5 = 1.1600674810589555185913468449e+00 + 3.0229979112715124660731000000e-17
457 data8 0x3FF28FA2EB44D22E, 0x3C816D285234C815
458 //A4 = 3.1374268565470946334983182169e+00 + 1.3694868953995008497659600000e-16
459 data8 0x400919734073B1E1, 0x3CA3BC83CD7E9565
460 //A3 = 7.0834593993741057360580271052e+00 + 3.3899702569039156457249800000e-16
461 data8 0x401C5576617B6C1F, 0x3CB86D6431213296
462 data8 0xA4A5FB49C094966B, 0x00003FDA //A20 = 9.3591760106637809309720130828e-12
463 data8 0xA9260DA0F51D7ED8, 0x00003FDD //A19 = 7.6919898428091669411809372180e-11
464 data8 0xA16441DFB14BD6E1, 0x00003FE0 //A18 = 5.8713933014370867331213494535e-10
465 data8 0x95F098D9C2234849, 0x00003FE3 //A17 = 4.3638234584169302324461091035e-09
466 data8 0x8581817400E5AD2B, 0x00003FE6 //A16 = 3.1084260332429955234755367839e-08
467 data8 0xE272940E373EBE15, 0x00003FE8 //A15 = 2.1089573544273993580820317236e-07
468 data8 0xB6B3391145D226FB, 0x00003FEB //A14 = 1.3612217421122787182942706259e-06
469 data8 0x8B9428C4DF95FCD5, 0x00003FEE //A13 = 8.3195416382628990683949003789e-06
470 //A2 = 1.2665135075272345943631080445e+01 + 9.8721896915973874255877000000e-16
471 data8 0x4029548C95A76F38, 0x3CD1C8BE715B8E13
472 //A1 = 1.6154969393303069580269948347e+01 + 9.6850518810678379641029000000e-16
473 data8 0x403027AC12FC1E1E, 0x3CD172711C15501B
474 //A0 = 1.1631728396567448058362970187e+01 + 8.7078125362814179268673000000e-16
475 data8 0x40274371E7866C65, 0x3CCF5F8A1A5FACA0
476 data8 0xC94A903114272C03, 0x00003FF0 //A12 = 4.7991576836334427243159066630e-05
477 data8 0x8844262960E04BE6, 0x00003FF3 //A11 = 2.5990716419283017929486175141e-04
478 data8 0xAC5418A76767678D, 0x00003FF5 //A10 = 1.3147621245497801180184809726e-03
479 data8 0xCA231B6EFE959132, 0x00003FF7 //A9 = 6.1687358811367989146517222415e-03
480 data8 0xDA38E39C13819D2A, 0x00003FF9 //A8 = 2.6638454961912040754759086920e-02
481 data8 0xD696DF8D8389FE53, 0x00003FFB //A7 = 1.0477995539298934056097943975e-01
482 data8 0xBDD5C153048BC435, 0x00003FFD //A6 = 3.7077144754791605130056406006e-01
483 //
484 // Polynomial coefficients for the tgammal(x), 6 <= |x| < 7
485 //A5 = 6.7169398121054200601065531373e+01 + 2.9481001527213915901489600000e-15
486 data8 0x4050CAD76B377BA0, 0x3CEA8DDB2B2DE93E
487 //A4 = 1.6115104376855398982115730178e+02 + 1.3422421925418824418257300000e-14
488 data8 0x406424D559BDC687, 0x3D0E397FDB5B33DC
489 //A3 = 3.1812194028053562533386866562e+02 + 3.9881709875858650942409600000e-14
490 data8 0x4073E1F377A6CF73, 0x3D26738F63FE9C4C
491 data8 0xD6E1B5FF90CAABD3, 0x00003FE1 //A20 = 1.5634700199277480081025480635e-09
492 data8 0xD451987B925DD37E, 0x00003FE4 //A19 = 1.2358576813211397717382327174e-08
493 data8 0xBFC151B67FA58E6B, 0x00003FE7 //A18 = 8.9292951435632759686382657901e-08
494 data8 0xA9034C5E1D67572E, 0x00003FEA //A17 = 6.2962205718327848327368724720e-07
495 data8 0x8E40F6EAA30A71EC, 0x00003FED //A16 = 4.2394926442967995119170095258e-06
496 data8 0xE3C3541B03A1C350, 0x00003FEF //A15 = 2.7151465666109594512258841637e-05
497 data8 0xACE2E58436B2DDCE, 0x00003FF2 //A14 = 1.6487723793339152877117376243e-04
498 data8 0xF7EAF8D8D1CAA3D1, 0x00003FF4 //A13 = 9.4573158112768812533636022369e-04
499 //A2 = 4.8664351544258869353143381886e+02 + 4.7424047995944376868895400000e-14
500 data8 0x407E6A4BD6D9463B, 0x3D2AB2868D79E192
501 //A1 = 5.1615277644992545447166776285e+02 + 3.0901956935588717379242200000e-14
502 data8 0x40802138E2DC003B, 0x3D216570FB601AEA
503 //A0 = 2.8788527781504433278314536437e+02 + 2.8213174117085164944959600000e-14
504 data8 0x4071FE2A1911F7D6, 0x3D1FC3E4CF4DB5AF
505 data8 0xA72B88E48D3D1BAB, 0x00003FF7 //A12 = 5.1016252919939028020562237471e-03
506 data8 0xD2EFB1067DB4FFB2, 0x00003FF9 //A11 = 2.5749059441230515023024615917e-02
507 data8 0xF788AF9522205C24, 0x00003FFB //A10 = 1.2086617635601742290221382521e-01
508 data8 0x861A6CE06CB29EAF, 0x00003FFE //A9 = 5.2384071807018493367136112163e-01
509 data8 0x84FBDE0947718B58, 0x00004000 //A8 = 2.0778727617851237754568261869e+00
510 data8 0xEEC1371E265A2C3A, 0x00004001 //A7 = 7.4610858525146049022238037342e+00
511 data8 0xBF514B9BE68ED59D, 0x00004003 //A6 = 2.3914694993947572859629197920e+01
512 //
513 // Polynomial coefficients for the tgammal(x), 8 <= |x| < 9
514 //A5 = 5.8487447114416836484451778233e+03 + 4.7365465221455983144182900000e-13
515 data8 0x40B6D8BEA568B6FD, 0x3D60AA4D44C2589B
516 //A4 = 1.2796464063087094473303295672e+04 + 1.2373341702514898266244200000e-12
517 data8 0x40C8FE3B666B532D, 0x3D75C4752C5B4783
518 //A3 = 2.2837606581322281272150576115e+04 + 2.6598064610627891398831000000e-13
519 data8 0x40D64D66D23A7764, 0x3D52B77B3A10EA5C
520 data8 0xB23418F75B0BE22A, 0x00003FE9 //A20 = 3.3192989594206801808678663868e-07
521 data8 0xA984A7BC8B856ED2, 0x00003FEC //A19 = 2.5260177918662350066375115788e-06
522 data8 0x921A49729416372C, 0x00003FEF //A18 = 1.7416797068239475136398213598e-05
523 data8 0xF5BB9415CC399CA4, 0x00003FF1 //A17 = 1.1717449586392814601938207599e-04
524 data8 0xC50B91A40B81F9DF, 0x00003FF4 //A16 = 7.5166775151159345732094429036e-04
525 data8 0x96002572326DB203, 0x00003FF7 //A15 = 4.5776541559407384162139204300e-03
526 data8 0xD81A1A595E4157BA, 0x00003FF9 //A14 = 2.6379634345126284099420760736e-02
527 data8 0x92B700D0CFECADD8, 0x00003FFC //A13 = 1.4327622675407940907282658100e-01
528 //A2 = 3.1237895525940199149772524834e+04 + 3.1280450505163186432331700000e-12
529 data8 0x40DE8179504C0878, 0x3D8B83BB33FBB766
530 //A1 = 2.9192841741344487672904506326e+04 + 7.9300780509779689630767000000e-13
531 data8 0x40DC8235DF171691, 0x3D6BE6C780EE54DF
532 //A0 = 1.4034407293483411194756627083e+04 + 1.4038139346291543309253700000e-12
533 data8 0x40CB693422315F90, 0x3D78B23746113FCE
534 data8 0xBAE50807548BC711, 0x00003FFE //A12 = 7.3005724123917935346868107005e-01
535 data8 0xDE28B1F57E68CFB6, 0x00004000 //A11 = 3.4712338349724065462763671443e+00
536 data8 0xF4DCA5A5FF901118, 0x00004002 //A10 = 1.5303868912154033908205911714e+01
537 data8 0xF85AAA1AD5E84E5E, 0x00004004 //A9 = 6.2088539523416399361048051373e+01
538 data8 0xE5AA8BB1BF02934D, 0x00004006 //A8 = 2.2966619406617480799195651466e+02
539 data8 0xBF6CFEFD67F59845, 0x00004008 //A7 = 7.6570306334640770654588802417e+02
540 data8 0x8DB5D2F001635C29, 0x0000400A //A6 = 2.2673639984182571062068713002e+03
541 //
542 // Polynomial coefficients for the tgammal(x), 10 <= |x| < 11
543 //A5 = 7.2546009516580589115619659424e+05 + 1.0343348865365065212891728822e-10
544 data8 0x412623A830B99290, 0x3DDC6E7C157611C4
545 //A4 = 1.4756292870840241666883230209e+06 + 8.1516565365333844166705674775e-11
546 data8 0x4136842D497E56AF, 0x3DD66837E4C3F9EE
547 //A3 = 2.4356116926500420086085796356e+06 + 3.5508860076560925641351069404e-10
548 data8 0x4142950DD8A8C1AF, 0x3DF866C8E3DD0980
549 data8 0xB7FD0D1EEAC38EB4, 0x00003FF1 //A20 = 8.7732544640091602721643775932e-05
550 data8 0xA9345C64AC750AE9, 0x00003FF4 //A19 = 6.4546407626804942279126469603e-04
551 data8 0x8BEABC81BE1E93C9, 0x00003FF7 //A18 = 4.2699261134524096128048819443e-03
552 data8 0xE1CD281EDD7315F8, 0x00003FF9 //A17 = 2.7563646660310313164706189622e-02
553 data8 0xAD8A5BA6D0FD9758, 0x00003FFC //A16 = 1.6947310643831556048460963841e-01
554 data8 0xFCDDA464AD3F182E, 0x00003FFE //A15 = 9.8775699098518676937088606052e-01
555 data8 0xAE0DCE2F7B60D1AE, 0x00004001 //A14 = 5.4391852309591064073782104822e+00
556 data8 0xE1745D9ABEB8D1A7, 0x00004003 //A13 = 2.8181819161363002758615770457e+01
557 //A2 = 3.0619656223573554307222366333e+06 + 1.0819940302945474471259520006e-10
558 data8 0x41475C66CFA967E4, 0x3DDDBDDB2A27334B
559 //A1 = 2.6099413018962685018777847290e+06 + 3.6851882860056025385268615240e-10
560 data8 0x4143E98AA6A48974, 0x3DF9530D42589AB6
561 //A0 = 1.1332783889487853739410638809e+06 + 1.9339350553312096248591829758e-10
562 data8 0x41314ADE639225C9, 0x3DEA946DD6C2C8D3
563 data8 0x88BCFAAE71812A1C, 0x00004006 //A12 = 1.3673820009490115307300592012e+02
564 data8 0x9A770F5AB540A326, 0x00004008 //A11 = 6.1786031215382040427126476507e+02
565 data8 0xA170C1D2C6B413FC, 0x0000400A //A10 = 2.5830473201524594051391525170e+03
566 data8 0x9AE56061CB02EB55, 0x0000400C //A9 = 9.9133441230507404119297200255e+03
567 data8 0x872390769650FBE2, 0x0000400E //A8 = 3.4595564309496661629764193479e+04
568 data8 0xD3E5E8D6923910C1, 0x0000400F //A7 = 1.0849181904819284819615140521e+05
569 data8 0x930D70602F50B754, 0x00004011 //A6 = 3.0116351174131169193070583741e+05
570 //
571 // Polynomial coefficients for the tgammal(x), 12 <= |x| < 13
572 //A5 = 1.2249876249976964294910430908e+08 + 6.0051348061679753770848000000e-09
573 data8 0x419D34BB29FFC39D, 0x3E39CAB72E01818D
574 //A4 = 2.3482765927605420351028442383e+08 + 1.1874729051592862323641700000e-08
575 data8 0x41ABFE5F168D56FA, 0x3E4980338AA7B04B
576 //A3 = 3.6407329688125067949295043945e+08 + 2.6657200942150363994658700000e-08
577 data8 0x41B5B35150E199A5, 0x3E5C9F79C0EB5300
578 data8 0xE89AE0F8D726329D, 0x00003FF9 //A20 = 2.8394164465429105626588451540e-02
579 data8 0xCF90981F86E38013, 0x00003FFC //A19 = 2.0270002071785908652476845915e-01
580 data8 0xA56C658079CA8C4A, 0x00003FFF //A18 = 1.2923704984019263122675412350e+00
581 data8 0x80AEF96A67C5615A, 0x00004002 //A17 = 8.0427183300456238315262463506e+00
582 data8 0xBE886D7529678931, 0x00004004 //A16 = 4.7633230047847868242503413461e+01
583 data8 0x858EDBA4CE2F7508, 0x00004007 //A15 = 2.6711607799594541057655957154e+02
584 data8 0xB0B0A3AF388274F0, 0x00004009 //A14 = 1.4135199810126975119809102782e+03
585 data8 0xDBA87137988751EF, 0x0000400B //A13 = 7.0290552818218513870879313985e+03
586 //A2 = 4.2828433593031734228134155273e+08 + 3.9760422293645854535247300000e-08
587 data8 0x41B98719AFEE2947, 0x3E6558A17E0D3007
588 //A1 = 3.4008253676084774732589721680e+08 + 1.2558352335001093116071000000e-09
589 data8 0x41B4453F68C2C6EB, 0x3E159338C5BC7EC3
590 //A0 = 1.3684336546556583046913146973e+08 + 2.6786516700381562934240300000e-08
591 data8 0x41A05020CAEE5EA5, 0x3E5CC3058A858579
592 data8 0xFF5E3940FB4BA576, 0x0000400D //A12 = 3.2687111823895439312116108631e+04
593 data8 0x8A08C124C7F74B6C, 0x00004010 //A11 = 1.4134701786994123329786229006e+05
594 data8 0x89D701953540BFFB, 0x00004012 //A10 = 5.6459209892773907605385652281e+05
595 data8 0xFC46344B3116C3AD, 0x00004013 //A9 = 2.0666305367147234406757715163e+06
596 data8 0xD183EBD7A400151F, 0x00004015 //A8 = 6.8653979211730981618367536737e+06
597 data8 0x9C083A40742112F4, 0x00004017 //A7 = 2.0451444503543981795037456447e+07
598 data8 0xCD3C475B1A8B6662, 0x00004018 //A6 = 5.3801245423495149598177886823e+07
599 LOCAL_OBJECT_END(Constants_Tgammal_poly)
600
601
602 LOCAL_OBJECT_START(Constants_Tgammal_poly_splitted)
603 // Per interval: A5..A3 as (hi, lo) double pairs, A20..A13 as significand + sign/exponent words, A2..A0 as (hi, lo) double pairs, A12..A6 as significand + sign/exponent words
604 // Polynomial coefficients for the tgammal(x), 1 <= |x| < 1.25
605 //A5 = -9.8199506890310417350775651357e-01+ -3.2546247786122976510752200000e-17
606 data8 0xBFEF6C80EC38B509, 0xBC82C2FA7A3DE3BD
607 //A4 = 9.8172808683439960475425323239e-01 + 4.4847611775298520359811400000e-17
608 data8 0x3FEF6A51055096B0, 0x3C89DA56DE95EFE4
609 //A3 = -9.0747907608088618225394839101e-01 +-1.0244057366544064435443970000e-16
610 data8 0xBFED0A118F324B62, 0xBC9D86C7B9EBCFFF
611 data8 0xB8E3FDAA66CC738E, 0x00003FFB //A20 = 9.0278608095877488976217714815e-02
612 data8 0xA76067AE1738699C, 0x0000BFFD //A19 =-3.2690738678103132837070881737e-01
613 data8 0x9D66B13718408C44, 0x00003FFE //A18 = 6.1484820933424283818320582920e-01
614 data8 0xD4AC67BBB4AE5599, 0x0000BFFE //A17 =-8.3075569470082063491389474937e-01
615 data8 0xF1426ED1C1488DB3, 0x00003FFE //A16 = 9.4241993542644505594957058785e-01
616 data8 0xFC12EB07AA6F4B6B, 0x0000BFFE //A15 =-9.8466366707947121954333549690e-01
617 data8 0xFF2B32CFE5B0DDC8, 0x00003FFE //A14 = 9.9675290656677214804168895915e-01
618 data8 0xFFD8E7E6FF3662EA, 0x0000BFFE //A13 =-9.9940347089360552383472582319e-01
619 //A2 = 9.8905599532797250361682017683e-01 + 5.1760162410376024240867300000e-17
620 data8 0x3FEFA658C23B1578, 0x3C8DD673A61F6FE7
621 //A1 = -5.7721566490153275452712478000e-01+ -1.0607935612223465065923310000e-16
622 data8 0xBFE2788CFC6FB618, 0xBC9E9346622D53B7
623 //A0 = 9.9999999999999988897769753748e-01 + 1.1102230245372554544790880000e-16
624 data8 0x3FEFFFFFFFFFFFFF, 0x3C9FFFFFFFF51E4E
625 data8 0xFFF360DF628F0BC9, 0x00003FFE //A12 = 9.9980740979895815468216470840e-01
626 data8 0xFFEF8F9A72B40480, 0x0000BFFE //A11 = -9.9974916001038145045939523470e-01
627 data8 0xFFE037B8C7E39952, 0x00003FFE //A10 = 9.9951504002809911822597567307e-01
628 data8 0xFFC01E08F348BED2, 0x0000BFFE //A9 = -9.9902522772325406705059517941e-01
629 data8 0xFF83DAC83119B52C, 0x00003FFE //A8 = 9.9810569179053383842734164901e-01
630 data8 0xFEF9F8AB891ABB24, 0x0000BFFE //A7 = -9.9600176036720260345608796766e-01
631 data8 0xFE3F0537573C8235, 0x00003FFE //A6 = 9.9314911461918778676646301341e-01
632 //
633 // Polynomial coefficients for the tgammal(x), 1.25 <= |x| < 1.75
634 //A5 = -7.7523052299853054125655660300e-02+ -1.2693512521686721504433600000e-17
635 data8 0xBFB3D88CFE50601B, 0xBC6D44ED60EE2170
636 //A4 = 1.4464535904462152982041800442e-01 + 2.5426820829345729856648800000e-17
637 data8 0x3FC283BD374EB2A9, 0x3C7D50AC436187C3
638 //A3 = -1.0729480456477220873257039102e-01+ -6.2429894945456418196551000000e-18
639 data8 0xBFBB77AC1CA2EBA5, 0xBC5CCA6BCC422D41
640 data8 0xF732D2689F323283, 0x00003FF2 //A20 = 2.3574688251652899567587145422e-04
641 data8 0xB6B00E23DE89D13A, 0x0000BFF3 //A19 =-3.4844916488842618776630058875e-04
642 data8 0xE98396FE4A1B2799, 0x00003FF3 //A18 = 4.4539265198744452020440735977e-04
643 data8 0xAF8D235A640DB1A2, 0x0000BFF4 //A17 =-6.6967514303333563295261178346e-04
644 data8 0x8513B736C918B261, 0x00003FF5 //A16 = 1.0152970456990865810615917715e-03
645 data8 0xC790A1A2C78D8E17, 0x0000BFF5 //A15 =-1.5225598630329403515321688394e-03
646 data8 0x959706CFA638CDE2, 0x00003FF6 //A14 = 2.2825614575133879623648932383e-03
647 data8 0xE050A6021E129860, 0x0000BFF6 //A13 =-3.4227757733947066666295285936e-03
648 //A2 = 4.1481345368830113695679528973e-01 + 3.1252439808354284892632100000e-17
649 data8 0x3FDA8C4DBA620D56, 0x3C82040BCB483C76
650 //A1 = 3.2338397448885010387886751460e-02 + 3.4437825798552300531443100000e-18
651 data8 0x3FA08EA88EE561B1, 0x3C4FC366D6C64806
652 //A0 = 8.8622692545275794095971377828e-01 + 7.2689375867553992399219000000e-17
653 data8 0x3FEC5BF891B4EF6A, 0x3C94F3877D311C0C
654 data8 0xA8275AADC09D16FC, 0x00003FF7 //A12 = 5.1316445128621071486146117136e-03
655 data8 0xFBFE2CE9215267A2, 0x0000BFF7 //A11= -7.6902121820788373000579382408e-03
656 data8 0xBCC8EEAB67ECD91D, 0x00003FF8 //A10 = 1.1522515369164312742737727262e-02
657 data8 0x8D1614BB97E5E8C2, 0x0000BFF9 //A9 = -1.7222443097804730395560633583e-02
658 data8 0xD3A963578BE291E3, 0x00003FF9 //A8 = 2.5837606456090186343624210891e-02
659 data8 0x9BA7EAE64C42FDF7, 0x0000BFFA //A7 = -3.8001935555045161419575037512e-02
660 data8 0xF0115BA1A77607E7, 0x00003FFA //A6 = 5.8610303817173477119764956736e-02
661 //
662 // Polynomial coefficients for the tgammal(x), 1.75 <= |x| < 2.0
663 //A5 = 2.6698206874501426502654943818e-04 + 3.4033756836921062797887300000e-20
664 data8 0x3F317F3740FE2A68, 0x3BE417093234B06E
665 //A4 = 7.4249010753513894345090307070e-02 + 3.9810018444482764697014200000e-18
666 data8 0x3FB301FBB0F25A92, 0x3C525BEFFABB622F
667 //A3 = -8.1576919247086265851720554565e-02+ -5.2716624487804746360745000000e-19
668 data8 0xBFB4E239984650AC, 0xBC2372F1C4F276FF
669 data8 0xFEF3AEE71038E9A3, 0x00003FEB //A20 = 1.8995395865421509009969188571e-06
670 data8 0xA11CFA2672BF876A, 0x0000BFEB //A19 =-1.2003868221414015771269244270e-06
671 data8 0xF8E107215DAE2164, 0x00003FEC //A18 = 3.7085863210303833432006027217e-06
672 data8 0xBCDDD3FC011EF7D6, 0x00003FEC //A17 = 2.8143303971756051015245433043e-06
673 data8 0x8683C4687FA22E68, 0x00003FEE //A16 = 8.0177018464360416764308252462e-06
674 data8 0xFDA09E5D33E32968, 0x00003FEE //A15 = 1.5117372062443781157389064848e-05
675 data8 0xFFB00D0CFF4089B4, 0x00003FEF //A14 = 3.0480348961227424242198174995e-05
676 data8 0xFEF6C39566785085, 0x00003FF0 //A13 = 6.0788135974125244644334004947e-05
677 //A2 = 4.1184033042643969357854416558e-01 + 1.2103396182129232634761000000e-18
678 data8 0x3FDA5B978B96BEBF, 0x3C3653AAD0A139E4
679 //A1 = -4.2278433509846713445057275749e-01+ -4.9429151528135657430413000000e-18
680 data8 0xBFDB0EE6072093CE, 0xBC56CB907027554F
681 //A0 = 1.0000000000000000000000000000e+00 + 1.0969171200000000000000000000e-31
682 data8 0x3FF0000000000000, 0x3981CC6A5B20B4D5
683 data8 0xFF2B7BA9A8D68C37, 0x00003FF1 //A12 = 1.2167446884801403650547161615e-04
684 data8 0xFCA53468E3692EF1, 0x00003FF2 //A11 = 2.4094136329542400976250900707e-04
685 data8 0x808D698A9C993615, 0x00003FF4 //A10 = 4.9038845704938303659791698883e-04
686 data8 0xF10F8E3FB8BB4AFB, 0x00003FF4 //A9 = 9.1957383840999861214472423976e-04
687 data8 0x89E224E42F93F005, 0x00003FF6 //A8 = 2.1039333407187324139473634747e-03
688 data8 0xBAF374824937A323, 0x00003FF6 //A7 = 2.8526458211545152218493600470e-03
689 data8 0xB6BF7564F52140C6, 0x00003FF8 //A6 = 1.1154045718131014476684982178e-02
690 //
691 // Polynomial coefficients for the tgammal(x), 0.0 <= |x| < 0.125
692 //A5 = -9.8199506890314514073736518185e-01+ -5.9363811993837985890950900000e-17
693 data8 0xBFEF6C80EC38B67A, 0xBC911C46B447C81F
694 //A4 = 9.8172808683440015986576554496e-01 + 2.7457414262802803699834200000e-17
695 data8 0x3FEF6A51055096B5, 0x3C7FA7FF90ACAD1F
696 //A3 = -9.0747907608088618225394839101e-01 + -1.0676255850934306734701780000e-16
697 data8 0xBFED0A118F324B62, 0xBC9EC5AFB633438D
698 data8 0x9217E83FA207CB80, 0x00003FFD //A20 = 2.8533864762086088781083621561e-01
699 data8 0xA8DABFA52FDF03EC, 0x0000BFFE //A19= -6.5958783896337186303285832783e-01
700 data8 0xE331ED293AF39F9B, 0x00003FFE //A18 = 8.8748056656454687449654731184e-01
701 data8 0xF9163C5DDB52419D, 0x0000BFFE //A17= -9.7299554149078295602977718525e-01
702 data8 0xFEC0A1C672CB9265, 0x00003FFE //A16 = 9.9512683005268190987854104489e-01
703 data8 0xFFD2D65B8EA7B5F4, 0x0000BFFE //A15= -9.9931087241443958201592847861e-01
704 data8 0xFFF93AA39EE53445, 0x00003FFE //A14 = 9.9989668364186884793382816496e-01
705 data8 0xFFFB99A9A3F5F480, 0x0000BFFE //A13= -9.9993286506283835663204999212e-01
706 //A2 = 9.8905599532797250361682017683e-01 + 5.1778575360788420716540100000e-17
707 data8 0x3FEFA658C23B1578, 0x3C8DD92B45408D07
708 //A1 = -5.7721566490153275452712478000e-01+ -1.0607938730998824663273110000e-16
709 data8 0xBFE2788CFC6FB618, 0xBC9E9346F8FDE55B
710 //A0 = 9.9999999999999988897769753748e-01 + 1.1102230246251564036631420000e-16
711 data8 0x3FEFFFFFFFFFFFFF, 0x3C9FFFFFFFFFFFFF
712 data8 0xFFF7FEBB545812C1, 0x00003FFE //A12 = 9.9987785409425126648628395084e-01
713 data8 0xFFF00C02E943A3F2, 0x0000BFFE //A11= -9.9975657530855116454438747397e-01
714 data8 0xFFE0420AADC53820, 0x00003FFE //A10 = 9.9951565514290485919027183699e-01
715 data8 0xFFC01EB42EF27EEB, 0x0000BFFE //A9 = -9.9902526759155739377365522320e-01
716 data8 0xFF83DAD0BF23FF12, 0x00003FFE //A8 = 9.9810569378236378800364235948e-01
717 data8 0xFEF9F8ABDBCDB2F3, 0x0000BFFE //A7 = -9.9600176044241699109053158187e-01
718 data8 0xFE3F05375988491D, 0x00003FFE //A6 = 9.9314911462127599008937257662e-01
719 LOCAL_OBJECT_END(Constants_Tgammal_poly_splitted)
720
721 .align 64
722 LOCAL_OBJECT_START(Constants_Tgammal_common)
723 // Four 8-byte double entries, then the positive overflow threshold at byte offset 0x20
724 data8 0x3FE0000000000000 // 0.5
725 data8 0x3FF8000000000000 // 1.5
726 data8 0x3FD0000000000000 // 0.25
727 data8 0x0000000000000000 // 0 (fourth 8-byte slot; the next entry therefore starts at offset 0x20)
728 data8 0xDB718C066B352E21, 0x00004009 // Positive overflow value (significand, exponent word); tgammal loads it with ldfe from GR_c_Table + 0x20
729 LOCAL_OBJECT_END(Constants_Tgammal_common)
730
731
732
733 //=======================================================
734 // Lgamma registers
735 // Note: some names below share one register number (r34, f10, f13, f15, f32, f33)
736 // General Purpose Registers
737 GR_l_Log_Table = r33
738 GR_l_Log_Table1 = r34
739 GR_l_BIAS = r34
740 GR_l_Index1 = r35
741 GR_l_Index2 = r36
742 GR_l_signif_Z = r37
743 GR_l_X_0 = r38
744 GR_l_X_1 = r39
745 GR_l_X_2 = r40
746 GR_l_Z_1 = r41
747 GR_l_Z_2 = r42
748 GR_l_N = r43
749 GR_l_Index3 = r44
750 GR_l_Stirling_Table = r45
751 GR_l_N_Unbiased = r46
752
753 // Floating Point Registers
754 FR_l_logl_X = f8
755
756 FR_l_h_3 = f10
757 FR_l_poly_hi = f10
758 FR_l_W = f11
759 FR_l_S = f12
760 FR_l_GS_hi = f13
761 FR_l_Y_lo = f13
762 FR_l_r_cor = f14
763 FR_l_G_1 = f15
764 FR_l_G = f15
765 FR_l_H_1 = f32
766 FR_l_H = f32
767 FR_l_h = f33
768 FR_l_h_1 = f33
769 FR_l_N = f33
770 FR_l_G_2 = f34
771 FR_l_H_2 = f35
772 FR_l_h_2 = f36
773 FR_l_G_3 = f37
774 FR_l_log2_hi = f38
775 FR_l_GS_lo = f39
776 FR_l_H_3 = f40
777 FR_l_float_N = f41
778 FR_l_Q_4 = f42
779 FR_l_Q_3 = f43
780 FR_l_Q_2 = f44
781 FR_l_Q_1 = f45
782 FR_l_Q_5 = f46
783 FR_l_Q_6 = f47
784 FR_l_log2_lo = f48
785 FR_l_r = f49
786 FR_l_poly_lo = f50
787 FR_l_poly = f51
788 FR_l_rsq = f52
789 FR_l_Y_lo_res = f53
790 // Reciprocal iteration terms (Y*, Q0, E*)
791 FR_l_Y0 = f55
792 FR_l_Q0 = f56
793 FR_l_E0 = f57
794 FR_l_E2 = f58
795 FR_l_E1 = f59
796 FR_l_Y1 = f60
797 FR_l_E3 = f61
798 FR_l_Y2 = f62
799 // Z and its even powers
800 FR_l_Z = f63
801 FR_l_Z2 = f64
802 FR_l_Z4 = f65
803 FR_l_Z8 = f66
804
805 FR_l_CH = f67
806 FR_l_CL = f68
807 // B2..B18 and Half are loaded from the Stirling table
808 FR_l_B2 = f69
809 FR_l_B4 = f70
810 FR_l_B6 = f71
811 FR_l_B8 = f72
812 FR_l_B10 = f73
813 FR_l_B12 = f74
814 FR_l_B14 = f75
815 FR_l_B16 = f76
816 FR_l_B18 = f77
817 FR_l_Half = f78
818 FR_l_SS = f79
819 FR_l_AbsX_m_Half = f80
820 FR_l_CXH = f81
821 FR_l_CXL = f82
822 FR_l_SSCXH = f83
823 FR_l_SSCXL = f84
824 FR_l_XYH = f85
825 FR_l_XYL = f86
826 FR_l_Temp = f87
827
828 FR_l_logl_YHi = f88
829 FR_l_logl_YLo = f89
830
831 FR_l_SignedXYH = f123
832 // |X|, set by fabs at tgammal entry
833 FR_l_AbsX = f127
834
835
836
837 //=======================================================
838 // Negative part registers
839
840 // General Purpose Registers
841 GR_n_sin_Table = r47
842 GR_n_XN = r48
843
844 // Floating Point Registers
845 FR_n_IXNS = f125
846 FR_n_IXN = f126
847
848 FR_n_XNS = f90
849 FR_n_XS = f91
850 FR_n_XS2 = f92
851 FR_n_XS2L = f93
852 FR_n_XS4 = f94
853 FR_n_XS7 = f95
854 FR_n_XS8 = f96
855 FR_n_TT = f97
856 FR_n_TH = f98
857 FR_n_TL = f99
858 // Coefficients loaded from the sin table (A2/A1 as H/L pairs)
859 FR_n_A2H = f100
860 FR_n_A2L = f101
861 FR_n_A1H = f102
862 FR_n_A1L = f103
863 FR_n_A9 = f104
864 FR_n_A8 = f105
865 FR_n_A7 = f106
866 FR_n_A6 = f107
867 FR_n_A5 = f108
868 FR_n_A4 = f109
869 FR_n_A3 = f110
870
871 FR_n_PolyH = f111
872 FR_n_PolyL = f112
873
874 FR_n_Poly1H = f113
875 FR_n_SinxH = f113 // the same as FR_n_Poly1H
876 FR_n_Poly1L = f114
877 FR_n_SinxL = f114 // the same as FR_n_Poly1L
878
879 FR_n_Tail = f115
880 FR_n_NegOne = f116
881
882 FR_n_Y0 = f117
883
884 FR_n_Q0 = f118
885 FR_n_E0 = f119
886
887 FR_n_E2 = f120
888 FR_n_E1 = f121
889 // f55-f66 below are the same register numbers as FR_l_Y0..FR_l_Z8
890 FR_n_Y1 = f55
891 FR_n_E3 = f56
892
893 FR_n_Y2 = f57
894 FR_n_R0 = f58
895
896 FR_n_E4 = f59
897 FR_n_RcpResH = f60
898
899 FR_n_Y3 = f61
900 FR_n_R1 = f62
901 FR_n_Temp = f63
902
903 FR_n_RcpResL = f64
904
905 FR_n_ResH = f65
906 FR_n_ResL = f66
907
908
909
910
911 //=======================================================
912 // Exp registers
913 // r33-r48 here are the same register numbers as the GR_l_* / GR_n_* names
914 // General Purpose Registers
915 GR_e_ad_Arg = r33
916 GR_e_ad_A = r34
917 GR_e_signexp_x = r35
918 GR_e_exp_x = r35
919 GR_e_exp_mask = r36
920 GR_e_ad_W1 = r37
921 GR_e_ad_W2 = r38
922 GR_e_M2 = r39
923 GR_e_M1 = r40
924 GR_e_K = r41
925 GR_e_exp_2_mk = r42
926 GR_e_exp_2_k = r43
927 GR_e_ad_T1 = r44
928 GR_e_ad_T2 = r45
929 GR_e_N_fix = r46
930 GR_e_one = r47
931 GR_e_exp_bias = r48
932 GR_e_sig_inv_ln2 = r49
933 GR_e_rshf_2to51 = r50
934 GR_e_exp_2tom51 = r51
935 GR_e_rshf = r52
936
937 // Floating Point Registers
938 FR_e_RSHF_2TO51 = f10
939 FR_e_INV_LN2_2TO63 = f11
940 FR_e_W_2TO51_RSH = f12
941 FR_e_2TOM51 = f13
942 FR_e_RSHF = f14
943 FR_e_Y_hi = f15
944 FR_e_Y_lo = f32
945 FR_e_scale = f33
946 FR_e_float_N = f34
947 FR_e_N_signif = f35
948 FR_e_L_hi = f36
949 FR_e_L_lo = f37
950 FR_e_r = f38
951 FR_e_W1 = f39
952 FR_e_T1 = f40
953 FR_e_W2 = f41
954 FR_e_T2 = f42
955 FR_e_W1_p1 = f43
956 FR_e_rsq = f44
957 FR_e_A2 = f45
958 FR_e_r4 = f46
959 FR_e_A3 = f47
960 FR_e_poly = f48
961 FR_e_T = f49
962 FR_e_W = f50
963 FR_e_Wp1 = f51
964 FR_e_r6 = f52
965 FR_e_2_mk = f53
966 FR_e_A1 = f54
967 FR_e_T_scale = f55
968 FR_e_result_lo = f56
969 FR_e_W_T_scale = f57
970 FR_e_Wp1_T_scale = f58
971 // Input and output X/Y names map to the same registers (f123, f124)
972 FR_e_expl_Input_X = f123
973 FR_e_expl_Input_Y = f124
974 FR_e_expl_Output_X = f123
975 FR_e_expl_Output_Y = f124
976
977
978 FR_e_expl_Input_AbsX = f122
979
980
981
982 //=======================================================
983 // Common registers
984
985 // General Purpose Registers
986 GR_c_Table = r53
987 GR_c_NegUnderflow = r54
988 GR_c_NegSingularity = r55
989 GR_c_X = r56
990 GR_c_SignBit = r57
991 GR_c_13 = r58
992
993 // f123 is also FR_l_SignedXYH / FR_e_expl_Input_X; f124 is also FR_e_expl_Input_Y
994 // Floating Point Registers
995 FR_c_PosOverflow = f123
996 FR_c_XN = f124
997
998
999 //=======================================================
1000 // Polynomial part registers
1001
1002 // General Purpose Registers
1003 GR_p_Table = r59
1004 GR_p_XN = r33
1005 GR_p_Table2 = r34
1006 GR_p_Int = r35
1007 GR_p_Offset = r36
1008 GR_p_Offset2 = r38
1009 GR_p_X_Sgnd = GR_l_signif_Z // = r37
1010 GR_p_Exp = r61
1011 GR_p_Bias = r62
1012 GR_p_0p75 = r63
1013
1014 // Floating Point Registers
1015 FR_p_AbsX = FR_l_AbsX // = f127
1016 FR_p_IXN = FR_n_IXN // = f126
1017 FR_p_XN = f32
1018 FR_p_0p5 = f33
1019 FR_p_1p5 = f34
1020 FR_p_AbsXM1 = f35
1021 FR_p_2 = f36
1022 // Coefficients A20..A6 (single registers) and A5..A0 (H/L pairs)
1023 FR_p_A20 = f37
1024 FR_p_A19 = f38
1025 FR_p_A18 = f39
1026 FR_p_A17 = f40
1027 FR_p_A16 = f41
1028 FR_p_A15 = f42
1029 FR_p_A14 = f43
1030 FR_p_A13 = f44
1031 FR_p_A12 = f45
1032 FR_p_A11 = f46
1033 FR_p_A10 = f47
1034 FR_p_A9 = f48
1035 FR_p_A8 = f49
1036 FR_p_A7 = f50
1037 FR_p_A6 = f51
1038 FR_p_A5H = f52
1039 FR_p_A5L = f53
1040 FR_p_A4H = f54
1041 FR_p_A4L = f55
1042 FR_p_A3H = f56
1043 FR_p_A3L = f57
1044 FR_p_A2H = f58
1045 FR_p_A2L = f59
1046 FR_p_A1H = f60
1047 FR_p_A1L = f61
1048 FR_p_A0H = f62
1049 FR_p_A0L = f63
1050 // Many names below reuse coefficient register numbers (e.g. FR_p_XR2L = f52 = FR_p_A5H)
1051 FR_p_XR = f64
1052 FR_p_XR2 = f65
1053 FR_p_XR2L = f52
1054
1055 FR_p_XR3 = f58
1056 FR_p_XR3L = f38
1057
1058 FR_p_XR4 = f42
1059 FR_p_XR6 = f40
1060 FR_p_XR8 = f37
1061
1062 FR_p_Poly5H = f66
1063 FR_p_Poly5L = f67
1064 FR_p_Poly4H = f53
1065 FR_p_Poly4L = f44
1066 FR_p_Poly3H = f41
1067 FR_p_Poly3L = f47
1068 FR_p_Poly2H = f68
1069 FR_p_Poly2L = f54
1070 FR_p_Poly1H = f55
1071 FR_p_Poly1L = f46
1072 FR_p_Poly0H = f39
1073 FR_p_Poly0L = f43
1074
1075 FR_p_Temp5H = f69
1076 FR_p_Temp5L = f70
1077 FR_p_Temp4H = f71
1078 FR_p_Temp4L = f60
1079 FR_p_Temp2H = f72
1080 FR_p_Temp2L = f73
1081 FR_p_Temp1H = f59
1082 FR_p_Temp1L = f61
1083 FR_p_Temp0H = f49
1084 FR_p_Temp0L = f48
1085 FR_p_PolyTail = f45
1086 FR_p_OddPoly0H = f56
1087 FR_p_OddPoly0L = f51
1088 // f73 is the same register number as FR_p_Temp2L
1089 FR_p_0p25 = f73
1090
1091
1092 //=======================================================
1093 // Negative polynomial part registers
1094 // General Purpose Registers
1095 GR_r_sin_Table = r47
1096 GR_r_sin_Table2 = r60
1097
1098 // Floating Point Registers
1099 FR_r_IXNS = FR_n_IXNS
1100 FR_r_IXN = FR_n_IXN
1101
1102 FR_r_AbsX = FR_l_AbsX
1103
1104 FR_r_A9 = f74
1105 FR_r_A8 = f75
1106 FR_r_A7 = f76
1107 FR_r_A6 = f77
1108 FR_r_A5 = f78
1109 FR_r_A4 = f79
1110 FR_r_A3 = f80
1111 FR_r_A2H = f81
1112 FR_r_A2L = f82
1113 FR_r_A1H = f83
1114 FR_r_A1L = f84
1115
1116 FR_r_XNS = f85
1117 FR_r_XS = f86
1118 FR_r_XS2 = f87
1119 FR_r_XS2L = f88
1120 FR_r_XS4 = f89
1121 FR_r_XS7 = f90
1122 FR_r_XS8 = f91
1123
1124 FR_r_Tail = f92
1125
1126 FR_r_TT = f93
1127 FR_r_TH = f94
1128 FR_r_TL = f95
1129
1130 FR_r_ResH = f96
1131 FR_r_ResL = f97
1132
1133 FR_r_Res3H = f98
1134 FR_r_Res3L = f99
1135
1136 FR_r_Res1H = f100
1137 FR_r_Res1L = f101
1138
1139
1140
1141 FR_r_Y0 = f102
1142 FR_r_Q0 = f103
1143 FR_r_E0 = f104
1144 FR_r_E2 = f105
1145 FR_r_E1 = f106
1146 FR_r_Y1 = f107
1147 FR_r_E3 = f108
1148 FR_r_Y2 = f109
1149 FR_r_R0 = f110
1150 FR_r_E4 = f111
1151 FR_r_ZH = f112
1152 FR_r_Y3 = f113
1153 FR_r_R1 = f114
1154 FR_r_ZHN = f115
1155 FR_r_ZL = f115 // same register number as FR_r_ZHN
1156 FR_r_NegOne = f116
1157 // FR_z_* names use the same register numbers as the matching FR_r_* names above
1158 FR_z_Y0 = f102
1159 FR_z_Q0 = f103
1160 FR_z_E0 = f104
1161 FR_z_E2 = f105
1162 FR_z_E1 = f106
1163 FR_z_Y1 = f107
1164 FR_z_E3 = f108
1165 FR_z_Y2 = f109
1166 FR_z_R0 = f110
1167 FR_z_E4 = f111
1168 FR_z_ZH = f112
1169 FR_z_Y3 = f113
1170 FR_z_R1 = f114
1171 FR_z_ZL = f115
1172
1173
1174 // General Purpose Registers
1175 GR_SAVE_PFS = r32 // r32 is the target of "alloc r32 = ar.pfs" at tgammal entry
1176 GR_DenOverflow = r33
1177 GR_u_XN = r34
1178
1179 GR_SAVE_B0 = r35
1180 GR_SAVE_GP = r36
1181 GR_SAVE_SP = r37
1182
1183 // Floating Point Registers
1184 FR_u_IXN = f34
1185
1186
1187 // ERROR HANDLER REGISTERS (r64-r67 are the 4 output registers of the alloc 0,32,4,0 at tgammal entry)
1188 GR_Parameter_X = r64
1189 GR_Parameter_Y = r65
1190 GR_Parameter_RESULT = r66
1191 GR_Parameter_TAG = r67
1192 // f8 is the register tgammal reads its argument from; f1 is the constant 1.0 register
1193 FR_RESULT = f8
1194 FR_X = f32
1195 FR_Y = f1
1196
1197
1198 .section .text
1199 GLOBAL_LIBM_ENTRY(tgammal)
1200 { .mfi
1201 alloc r32 = ar.pfs,0,32,4,0
1202 fabs FR_l_AbsX = f8 // Get absolute value of X
1203 addl GR_n_sin_Table = @ltoff(Constants_Tgammal_sin), gp
1204 }
1205 { .mfi
1206 addl GR_l_Log_Table=@ltoff(Constants_Tgammal_log_80_Z_G_H_h1#),gp
1207 nop.f 0
1208 addl GR_l_Stirling_Table = @ltoff(Constants_Tgammal_stirling), gp
1209 };;
1210
1211 { .mfi
1212 getf.sig GR_l_signif_Z = f8 // Significand of X
1213 fcvt.fx.s1 FR_n_IXNS = f8 // Convert to fixed point
1214 addl GR_c_Table = @ltoff(Constants_Tgammal_common), gp
1215 }
1216 { .mfi
1217 ld8 GR_l_Log_Table = [GR_l_Log_Table]
1218 nop.f 0
1219 addl GR_p_Table = @ltoff(Constants_Tgammal_poly), gp
1220 };;
1221
1222 { .mfi
1223 ld8 GR_n_sin_Table = [GR_n_sin_Table]
1224 fclass.m p6,p0 = f8,0x1EF // Check x for NaN, 0, INF, denorm
1225 // NatVal.
1226 addl GR_c_NegSingularity = 0x1003E, r0
1227 }
1228 { .mlx
1229 ld8 GR_l_Stirling_Table = [GR_l_Stirling_Table]
1230 movl GR_c_13 = 0x402A000000000000 // 13.0
1231 };;
1232
1233 { .mfi
1234 getf.d GR_c_X = f8 // Double prec. X to general register
1235 frcpa.s1 FR_z_Y0,p0 = f1,f8 // y = frcpa(x) (for negatives)
1236 extr.u GR_l_Index1 = GR_l_signif_Z, 59, 4 // = High 4 bits of Z
1237 }
1238 { .mlx
1239 ld8 GR_c_Table = [GR_c_Table]
1240 movl GR_c_SignBit = 0x8000000000000000 // High bit (sign)
1241 };;
1242
1243 { .mfi
1244 ld8 GR_p_Table = [GR_p_Table]
1245 fcmp.lt.s1 p15, p14 = f8,f0 // p14 - positive arg, p15 - negative
1246 shl GR_l_Index1 = GR_l_Index1,5 // Adjust Index1 ptr (x32)
1247 }
1248 { .mfb
1249 adds GR_c_NegUnderflow = 1765, r0
1250 nop.f 0
1251 (p6) br.cond.spnt tgammal_spec // Spec. values processing branch ////////////
1252 // (0s, INFs, NANs, NatVals, denormals) //////
1253 };;
1254
1255 { .mfi
1256 ldfpd FR_l_CH,FR_l_CL= [GR_l_Stirling_Table], 16 // Load CH, CL
1257 fcvt.fx.trunc.s1 FR_n_IXN = FR_l_AbsX // Abs arg to int by trunc
1258 extr.u GR_l_X_0 = GR_l_signif_Z, 49, 15 // High 15 bit of Z
1259 }
1260 { .mfi
1261 add GR_l_Index1 = GR_l_Index1,GR_l_Log_Table // Add offset
1262 fma.s1 FR_p_2 = f1, f1, f1 // 2.0
1263 andcm GR_c_X = GR_c_X, GR_c_SignBit // Remove sign
1264 };;
1265
1266 { .mfi
1267 addl GR_l_Log_Table = @ltoff(Constants_Tgammal_log_80_Z_G_H_h2#), gp
1268 fcmp.lt.s1 p10, p0 = FR_l_AbsX, f1 // If |X|<1 then p10 = 1
1269 nop.i 0
1270 }
1271 { .mlx
1272 ld2 GR_l_Z_1 = [GR_l_Index1],4 // load Z_1 from Index1
1273 movl GR_l_BIAS = 0x000000000000FFFF // Bias for exponent
1274 };;
1275
1276 { .mfi
1277 ld8 GR_l_Log_Table = [GR_l_Log_Table]
1278 frcpa.s1 FR_l_Y0, p0 = f1, FR_l_AbsX // y = frcpa(x)
1279 nop.i 0
1280 }
1281 { .mfi
1282 ldfs FR_l_G_1 = [GR_l_Index1],4 // Load G_1
1283 fsub.s1 FR_l_W = FR_l_AbsX, f1 // W = |X|-1
1284 nop.i 0
1285 };;
1286
1287 { .mfi
1288 getf.exp GR_l_N_Unbiased= FR_l_AbsX // exponent of |X|
1289 fmerge.se FR_l_S = f1, FR_l_AbsX // S = merging of X and 1.0
1290 cmp.gtu p11, p0 = GR_c_13, GR_c_X // If 1 <= |X| < 13
1291 // then p11 = 1
1292 }
1293 { .mfb
1294 ldfs FR_l_H_1 = [GR_l_Index1],8 // Load H_1
1295 fcvt.xf FR_n_XNS = FR_n_IXNS // Convert to FP repr. of int X
1296 (p10) br.cond.spnt tgamma_lt_1 // Branch to |X| < 1 path ///////////////////
1297 };;
1298
1299 { .mfi
1300 ldfpd FR_n_A2H, FR_n_A2L = [GR_n_sin_Table], 16
1301 nop.f 0
1302 pmpyshr2.u GR_l_X_1 = GR_l_X_0,GR_l_Z_1,15 // Adjust Index2 (x32)
1303 }
1304 { .mfb
1305 ldfe FR_l_B2 = [GR_l_Stirling_Table], 16
1306 nop.f 0
1307 (p11) br.cond.spnt tgamma_lt_13 // Branch to 1 <= |X| < 13 path ///////////////
1308 };;
1309
1310 { .mfi
1311 ldfe FR_l_h_1 = [GR_l_Index1],0
1312 nop.f 0
1313 sub GR_l_N = GR_l_N_Unbiased, GR_l_BIAS // N - BIAS
1314 }
1315 { .mib
1316 ldfpd FR_l_B4,FR_l_B6= [GR_l_Stirling_Table], 16 // Load C
1317 (p15) cmp.geu.unc p8,p0 = GR_l_N_Unbiased, GR_c_NegSingularity
1318 (p8) br.cond.spnt tgammal_singularity // Singularity for arg < to -2^63 //////
1319 };;
1320
1321 { .mmi
1322 (p15) ldfpd FR_n_A1H, FR_n_A1L = [GR_n_sin_Table], 16
1323 ldfpd FR_l_B8, FR_l_B10 = [GR_l_Stirling_Table], 16
1324 add GR_c_Table = 0x20, GR_c_Table
1325 };;
1326
1327 { .mfi
1328 (p15) ldfe FR_n_A9 = [GR_n_sin_Table], 16
1329 fma.s1 FR_l_Q0 = f1,FR_l_Y0,f0 // Q0 = Y0
1330 nop.i 0
1331 }
1332 { .mfi
1333 ldfpd FR_l_B12, FR_l_B14 = [GR_l_Stirling_Table], 16
1334 fnma.s1 FR_l_E0 = FR_l_Y0,FR_l_AbsX,f1 // e = 1-b*y
1335 nop.i 0
1336 };;
1337
1338 { .mfi
1339 (p15) ldfe FR_n_A8 = [GR_n_sin_Table], 16
1340 fcvt.xf FR_c_XN = FR_n_IXN // Convert to FP repr. of int X
1341 extr.u GR_l_Index2 = GR_l_X_1, 6, 4 // Extract Index2
1342 }
1343 { .mfi
1344 ldfpd FR_l_B16, FR_l_B18 = [GR_l_Stirling_Table], 16
1345 nop.f 0
1346 nop.i 0
1347 };;
1348
1349 { .mfi
1350 (p15) ldfe FR_n_A7 = [GR_n_sin_Table], 16
1351 fms.s1 FR_l_CXH = FR_l_CH, f1, FR_l_AbsX // CXH = CH+|X|
1352 shl GR_l_Index2 = GR_l_Index2,5
1353 }
1354 { .mfi
1355 ldfd FR_l_Half = [GR_l_Stirling_Table] // Load 0.5
1356 nop.f 0
1357 nop.i 0
1358 };;
1359
1360 { .mfi
1361 add GR_l_Index2 = GR_l_Index2, GR_l_Log_Table // Add offset
1362 nop.f 0
1363 nop.i 0
1364 }
1365 { .mfi
1366 (p15) ldfe FR_n_A6 = [GR_n_sin_Table], 16
1367 (p15) fma.s1 FR_n_XS = FR_l_AbsX , f1, FR_n_XNS // xs = x - int(x)
1368 nop.i 0
1369 };;
1370
1371 { .mmi
1372 ld2 GR_l_Z_2 = [GR_l_Index2],4
1373 addl GR_l_Log_Table = @ltoff(Constants_Tgammal_log_80_h3_G_H#),gp
1374 nop.i 0
1375 };;
1376
1377 { .mfi
1378 ld8 GR_l_Log_Table = [GR_l_Log_Table]
1379 fma.s1 FR_l_E2 = FR_l_E0,FR_l_E0,FR_l_E0 // e2 = e+e^2
1380 nop.i 0
1381 }
1382 { .mfi
1383 ldfs FR_l_G_2 = [GR_l_Index2],4
1384 fma.s1 FR_l_E1 = FR_l_E0,FR_l_E0,f0 // e1 = e^2
1385 nop.i 0
1386 };;
1387
1388 { .mmi
1389 ldfs FR_l_H_2 = [GR_l_Index2],8
1390 (p15) ldfe FR_n_A5 = [GR_n_sin_Table], 16
1391 nop.i 0
1392 };;
1393
1394 { .mfi
1395 setf.sig FR_l_float_N = GR_l_N // float_N = Make N a fp number
1396 nop.f 0
1397 pmpyshr2.u GR_l_X_2 = GR_l_X_1,GR_l_Z_2,15 // X_2 = X_1 * Z_2
1398 }
1399 { .mfi
1400 ldfe FR_l_h_2 = [GR_l_Index2],0
1401 fma.s1 FR_l_CXL = FR_l_AbsX, f1, FR_l_CXH // CXL = |X|+CXH
1402 add GR_l_Log_Table1= 0x200, GR_l_Log_Table
1403 };;
1404
1405 { .mfi
1406 (p15) ldfe FR_n_A4 = [GR_n_sin_Table], 16
1407 (p15) fcmp.eq.unc.s1 p9,p0 = FR_l_AbsX, FR_c_XN //if argument is integer
1408 // and negative
1409 nop.i 0
1410 }
1411 { .mfi
1412 ldfe FR_c_PosOverflow = [GR_c_Table],16 //Load pos overflow value
1413 (p15) fma.s1 FR_n_XS2 = FR_n_XS, FR_n_XS, f0 // xs^2 = xs*xs
1414 nop.i 0
1415 };;
1416
1417 { .mfi
1418 (p15) ldfe FR_n_A3 = [GR_n_sin_Table], 16
1419 nop.f 0
1420 nop.i 0
1421 };;
1422
1423 { .mfi
1424 (p15) getf.sig GR_n_XN = FR_n_IXN // int(x) to general reg
1425 fma.s1 FR_l_Y1 = FR_l_Y0,FR_l_E2,FR_l_Y0 // y1 = y+y*e2
1426 nop.i 0
1427 }
1428 { .mfb
1429 nop.m 0
1430 fma.s1 FR_l_E3 = FR_l_E1,FR_l_E1,FR_l_E0 // e3 = e+e1^2
1431 (p9) br.cond.spnt tgammal_singularity // Singularity for integer /////////////
1432 // and negative arguments //////////////
1433 };;
1434
1435 { .mfi
1436 nop.m 0
1437 fms.s1 FR_l_AbsX_m_Half = FR_l_AbsX, f1, FR_l_Half // |x|-0.5
1438 extr.u GR_l_Index2 = GR_l_X_2, 1, 5 // Get Index3
1439 };;
1440
1441 { .mfi
1442 shladd GR_l_Log_Table1= GR_l_Index2, 2, GR_l_Log_Table1
1443 nop.f 0
1444 shladd GR_l_Index3 = GR_l_Index2,4, GR_l_Log_Table // Index3
1445 }
1446 { .mfb
1447 (p15) cmp.gtu.unc p11, p0 = GR_n_XN, GR_c_NegUnderflow // X < -1765
1448 fms.s1 FR_l_CXL = FR_l_CH, f1, FR_l_CXL // CXL = CH - CXL
1449 (p11) br.cond.spnt tgammal_underflow // Singularity for negative argument //////
1450 // at underflow domain (X < -1765) //////
1451 };;
1452
1453 { .mfi
1454 addl GR_l_Log_Table = @ltoff(Constants_Tgammal_log_80_Q#), gp
1455 (p15) fma.s1 FR_n_TT = FR_n_A2L, FR_n_XS2, f0 // T=A2L*x^2
1456 tbit.nz.unc p13, p12 = GR_n_XN, 0x0 // whether [X] odd or even
1457 }
1458 { .mfi
1459 nop.m 0
1460 (p15) fms.s1 FR_n_XS2L = FR_n_XS, FR_n_XS, FR_n_XS2 // xs^2 Low part
1461 nop.i 0
1462 };;
1463
1464 { .mfi
1465 ld8 GR_l_Log_Table = [GR_l_Log_Table]
1466 (p15) fma.s1 FR_n_A7 = FR_n_A8, FR_n_XS2, FR_n_A7 // poly tail
1467 nop.i 0
1468 }
1469 { .mfi
1470 ldfe FR_l_h_3 = [GR_l_Index3],12
1471 (p15) fma.s1 FR_n_XS4 = FR_n_XS2, FR_n_XS2, f0 // xs^4 = xs^2*xs^2
1472 nop.i 0
1473 };;
1474
1475 { .mfi
1476 ldfs FR_l_H_3 = [GR_l_Log_Table1], 0
1477 fma.s1 FR_l_Y2 = FR_l_Y1, FR_l_E3, FR_l_Y0 // y2 = y+y1*e3
1478 nop.i 0
1479 }
1480 { .mfi
1481 ldfs FR_l_G_3 = [GR_l_Index3], 0
1482 fnma.s1 FR_l_Z = FR_l_AbsX,FR_l_Q0,f1 // r = a-b*q
1483 nop.i 0
1484 };;
1485
1486 { .mfi
1487 nop.m 0
1488 fmpy.s1 FR_l_G = FR_l_G_1, FR_l_G_2 // G = G1 * G_2
1489 nop.i 0
1490 }
1491 { .mfi
1492 nop.m 0
1493 fadd.s1 FR_l_H = FR_l_H_1, FR_l_H_2 // H = H_1 + H_2
1494 nop.i 0
1495 };;
1496
1497 { .mfi
1498 ldfe FR_l_log2_hi = [GR_l_Log_Table],16 // load log2_hi part
1499 fadd.s1 FR_l_h = FR_l_h_1, FR_l_h_2 // h = h_1 + h_2
1500 nop.i 0
1501 }
1502 { .mfi
1503 nop.m 0
1504 fcvt.xf FR_l_float_N = FR_l_float_N // int(N)
1505 nop.i 0
1506 };;
1507
1508 { .mfi
1509 ldfe FR_l_log2_lo = [GR_l_Log_Table],16 // Load log2_lo part
1510 fma.s1 FR_l_CXL = FR_l_CXL, f1, FR_l_CL
1511 nop.i 0
1512 }
1513 { .mfi
1514 nop.m 0
1515 (p15) fma.s1 FR_n_TT = FR_n_A2H, FR_n_XS2L, FR_n_TT // T=A2H*x2L+T
1516 nop.i 0
1517 };;
1518
1519 { .mfi
1520 ldfe FR_l_Q_6 = [GR_l_Log_Table],16
1521 (p15) fma.s1 FR_n_A3 = FR_n_A4, FR_n_XS2, FR_n_A3 // poly tail
1522 nop.i 0
1523 }
1524 { .mfi
1525 nop.m 0
1526 (p15) fma.s1 FR_n_A5 = FR_n_A6, FR_n_XS2, FR_n_A5 // poly tail
1527 nop.i 0
1528 };;
1529
1530 { .mfi
1531 ldfe FR_l_Q_5 = [GR_l_Log_Table],16
1532 (p15) fabs FR_n_XS = FR_n_XS // abs(xs)
1533 nop.i 0
1534 }
1535 { .mfi
1536 nop.m 0
1537 fma.s1 FR_l_Z = FR_l_Z,FR_l_Y2,FR_l_Q0 // x_hi = q+r*y2
1538 nop.i 0
1539 };;
1540
1541 { .mfi
1542 ldfe FR_l_Q_4 = [GR_l_Log_Table],16
1543 (p15) fma.s1 FR_n_A7 = FR_n_A9, FR_n_XS4, FR_n_A7 // poly tail
1544 nop.i 0
1545 }
1546 { .mfi
1547 nop.m 0
1548 (p15) fma.s1 FR_n_XS7 = FR_n_XS4, FR_n_XS2, f0 // = x^4*x^2
1549 nop.i 0
1550 };;
1551
1552 { .mfi
1553 ldfe FR_l_Q_3 = [GR_l_Log_Table],16
1554 fneg FR_n_NegOne = f1 // -1.0
1555 nop.i 0
1556 }
1557 { .mfi
1558 nop.m 0
1559 (p15) fma.s1 FR_n_XS8 = FR_n_XS4, FR_n_XS4, f0 // xs^8 = xs^4*xs^4
1560 nop.i 0
1561 };;
1562
1563 { .mfi
1564 ldfe FR_l_Q_2 = [GR_l_Log_Table],16
1565 fadd.s1 FR_l_h = FR_l_h, FR_l_h_3 // h = h_1 + h_2 + h_3
1566 nop.i 0
1567 }
1568 { .mfi
1569 nop.m 0
1570 (p15) fma.s1 FR_n_TH = FR_n_A2H, FR_n_XS2, FR_n_TT // A2H*xs2+T
1571 nop.i 0
1572 };;
1573
1574 { .mfi
1575 ldfe FR_l_Q_1 = [GR_l_Log_Table],16
1576 fmpy.s1 FR_l_G = FR_l_G, FR_l_G_3 // G = G_1 * G_2 * G_3
1577 nop.i 0
1578 }
1579 { .mfi
1580 nop.m 0
1581 fadd.s1 FR_l_H = FR_l_H, FR_l_H_3 // H = H_1 + H_2 + H_3
1582 nop.i 0
1583 };;
1584
1585 { .mfi
1586 nop.m 0
1587 fma.s1 FR_l_Z2 = FR_l_Z, FR_l_Z, f0 // Z^2
1588 nop.i 0
1589 }
1590 { .mfi
1591 nop.m 0
1592 (p15) fma.s1 FR_n_A3 = FR_n_A5, FR_n_XS4, FR_n_A3 // poly tail
1593 nop.i 0
1594 };;
1595
1596 { .mfi
1597 nop.m 0
1598 (p14) fcmp.gt.unc.s1 p7,p0 = FR_l_AbsX, FR_c_PosOverflow //X > 1755.5483
1599 // (overflow domain, result cannot be represented by normal value)
1600 nop.i 0
1601 }
1602 { .mfi
1603 nop.m 0
1604 (p15) fma.s1 FR_n_XS7 = FR_n_XS7, FR_n_XS, f0 // x^7 construction
1605 nop.i 0
1606 };;
1607
1608 { .mfi
1609 nop.m 0
1610 (p15) fms.s1 FR_n_TL = FR_n_A2H, FR_n_XS2, FR_n_TH // A2H*xs2+TH
1611 nop.i 0
1612 }
1613 { .mfi
1614 nop.m 0
1615 (p15) fma.s1 FR_n_PolyH = FR_n_TH, f1, FR_n_A1H // PolyH=TH+A1H
1616 nop.i 0
1617 };;
1618
1619 { .mfi
1620 nop.m 0
1621 fmpy.s1 FR_l_GS_hi = FR_l_G, FR_l_S // GS_hi = G*S
1622 nop.i 0
1623 }
1624 { .mfb
1625 nop.m 0
1626 fms.s1 FR_l_r = FR_l_G, FR_l_S, f1 // r = G*S -1
1627 (p7) br.cond.spnt tgammal_overflow // Overflow path for arg > 1755.5483 //////
1628 };;
1629
1630 { .mfi
1631 nop.m 0
1632 fma.s1 FR_l_B14 = FR_l_B16, FR_l_Z2, FR_l_B14// bernulli tail
1633 nop.i 0
1634 }
1635 { .mfi
1636 nop.m 0
1637 fma.s1 FR_l_Z4 = FR_l_Z2, FR_l_Z2, f0 // Z^4 = Z^2*Z^2
1638 nop.i 0
1639 };;
1640
1641 { .mfi
1642 nop.m 0
1643 fma.s1 FR_l_B2 = FR_l_B4, FR_l_Z2, FR_l_B2 // bernulli tail
1644 nop.i 0
1645 }
1646 { .mfi
1647 nop.m 0
1648 fma.s1 FR_l_B6 = FR_l_B8, FR_l_Z2, FR_l_B6 // bernulli tail
1649 nop.i 0
1650 };;
1651
1652 { .mfi
1653 nop.m 0
1654 fma.s1 FR_l_B10 = FR_l_B12, FR_l_Z2, FR_l_B10// bernulli tail
1655 nop.i 0
1656 }
1657 { .mfi
1658 nop.m 0
1659 (p15) fma.s1 FR_n_Tail = FR_n_A7, FR_n_XS8, FR_n_A3 // poly tail
1660 nop.i 0
1661 };;
1662
1663 { .mfi
1664 nop.m 0
1665 (p15) fma.s1 FR_n_TL = FR_n_TL, f1, FR_n_TT // TL = TL+T
1666 nop.i 0
1667 }
1668 { .mfi
1669 nop.m 0
1670 (p15) fms.s1 FR_n_PolyL = FR_n_A1H, f1, FR_n_PolyH // polyH+A1H
1671 nop.i 0
1672 };;
1673
1674 { .mfi
1675 nop.m 0
1676 fma.s1 FR_l_poly_lo = FR_l_r, FR_l_Q_6, FR_l_Q_5 // Q_5+r*Q_6
1677 nop.i 0
1678 }
1679 { .mfi
1680 nop.m 0
1681 fsub.s1 FR_l_r_cor = FR_l_GS_hi, f1 // r_cor = GS_hi -1
1682 nop.i 0
1683 };;
1684
1685 { .mfi
1686 nop.m 0
1687 fms.s1 FR_l_GS_lo = FR_l_G, FR_l_S, FR_l_GS_hi // G*S-GS_hi
1688 nop.i 0
1689 }
1690 { .mfi
1691 nop.m 0
1692 fma.s1 FR_l_poly = FR_l_r, FR_l_Q_2, FR_l_Q_1 //poly=r*Q2+Q1
1693 nop.i 0
1694 };;
1695
1696 { .mfi
1697 nop.m 0
1698 fmpy.s1 FR_l_rsq = FR_l_r, FR_l_r // rsq = r * r
1699 nop.i 0
1700 }
1701 { .mfi
1702 nop.m 0
1703 fma.s1 FR_l_G = FR_l_float_N, FR_l_log2_hi, FR_l_H // Tbl =
1704 // float_N*log2_hi + H
1705 nop.i 0
1706 };;
1707
1708 { .mfi
1709 nop.m 0
1710 fma.s1 FR_l_Y_lo = FR_l_float_N, FR_l_log2_lo, FR_l_h // Y_lo=
1711 // float_N*log2_lo + h
1712 nop.i 0
1713 }
1714 { .mfi
1715 nop.m 0
1716 fma.s1 FR_l_B14 = FR_l_B18, FR_l_Z4, FR_l_B14 //Bernoulli tail
1717 nop.i 0
1718 };;
1719
1720 { .mfi
1721 nop.m 0
1722 fma.s1 FR_l_B2 = FR_l_B6, FR_l_Z4, FR_l_B2 //Bernoulli tail
1723 nop.i 0
1724 }
1725 { .mfi
1726 nop.m 0
1727 fma.s1 FR_l_Z8 = FR_l_Z4, FR_l_Z4, f0 // Z^8 = Z^4*Z^4 (for Bernoulli tail)
1728 nop.i 0
1729 };;
1730
1731 { .mfi
1732 nop.m 0
1733 fma.s1 FR_l_poly_lo = FR_l_r, FR_l_poly_lo, FR_l_Q_4 // poly_lo =
1734 // Q_4 + r * poly_lo
1735 nop.i 0
1736 }
1737 { .mfi
1738 nop.m 0
1739 fsub.s1 FR_l_r_cor = FR_l_r_cor, FR_l_r // r_cor = r_cor - r
1740 nop.i 0
1741 };;
1742
1743 { .mfi
1744 nop.m 0
1745 (p15) fma.s1 FR_n_PolyL = FR_n_PolyL, f1, FR_n_TH // polyL+TH
1746 nop.i 0
1747 }
1748 { .mfi
1749 nop.m 0
1750 (p15) fma.s1 FR_n_TT = FR_n_TL, f1, FR_n_A1L // TL+A1L
1751 nop.i 0
1752 };;
1753
1754 { .mfi
1755 nop.m 0
1756 fadd.s1 FR_l_logl_YHi = FR_l_G, FR_l_r // Y_hi = Tbl + r
1757 nop.i 0
1758 };;
1759
1760 { .mfi
1761 nop.m 0
1762 fma.s1 FR_l_B10 = FR_l_B14, FR_l_Z4, FR_l_B10 //Bernoulli tail
1763 nop.i 0
1764 };;
1765
1766 { .mfi
1767 nop.m 0
1768 fma.s1 FR_l_poly_lo = FR_l_r, FR_l_poly_lo, FR_l_Q_3 // poly_lo =
1769 // Q_3 + r * poly_lo
1770 nop.i 0
1771 }
1772 { .mfi
1773 nop.m 0
1774 fadd.s1 FR_l_r_cor = FR_l_r_cor, FR_l_GS_lo // r_cor=r_cor+GS_lo
1775 nop.i 0
1776 };;
1777
1778 { .mfi
1779 nop.m 0
1780 (p15) fma.s1 FR_n_PolyL = FR_n_PolyL, f1, FR_n_TT // polyL+TT
1781 nop.i 0
1782 };;
1783
1784 { .mfi
1785 nop.m 0
1786 fsub.s1 FR_l_Y_lo_res = FR_l_G, FR_l_logl_YHi // Y_lo = Tbl - Y_hi
1787 nop.i 0
1788 }
1789 { .mfi
1790 nop.m 0
1791 fma.s1 FR_l_XYH = FR_l_logl_YHi, FR_l_AbsX_m_Half, f0 // XYH=
1792 // YHi*|x-0.5|
1793 nop.i 0
1794 };;
1795
1796 { .mfi
1797 nop.m 0
1798 fma.s1 FR_l_SS = FR_l_B10, FR_l_Z8, FR_l_B2 // Bernoulli tail
1799 nop.i 0
1800 };;
1801
1802 { .mfi
1803 nop.m 0
1804 fadd.s1 FR_l_r_cor = FR_l_r_cor, FR_l_Y_lo // r_cor = r_cor+Y_lo
1805 nop.i 0
1806 }
1807 { .mfi
1808 nop.m 0
1809 fma.s1 FR_l_poly = FR_l_rsq, FR_l_poly_lo, FR_l_poly //poly=
1810 // r^2*polyLo+poly
1811 nop.i 0
1812 };;
1813
1814 { .mfi
1815 nop.m 0
1816 (p15) fma.s1 FR_n_TT = FR_n_PolyL, FR_n_XS2, f0 // T=polyL*xs^2
1817 nop.i 0
1818 };;
1819
1820 { .mfi
1821 nop.m 0
1822 fadd.s1 FR_l_Y_lo = FR_l_Y_lo_res, FR_l_r // Y_lo = Y_lo + r
1823 nop.i 0
1824 }
1825 { .mfi
1826 nop.m 0
1827 fms.s1 FR_l_XYL = FR_l_logl_YHi, FR_l_AbsX_m_Half, FR_l_XYH
1828 // XYL = YHi*|x-0.5|-XYH
1829 nop.i 0
1830 };;
1831
1832 { .mfi
1833 nop.m 0
1834 fma.s1 FR_l_SSCXH = FR_l_SS, FR_l_Z, FR_l_CXH // SS*Z+CXH
1835 nop.i 0
1836 }
1837 { .mfi
1838 mov GR_e_exp_2tom51= 0xffff-51 // 2^-51
1839 (p15) fma.s1 FR_l_SignedXYH = FR_l_XYH, FR_n_NegOne, f0 // XYH = -XYH
1840 // for negatives
1841 nop.i 0
1842 };;
1843
1844 { .mlx
1845 nop.m 0
1846 movl GR_e_rshf_2to51 = 0x4718000000000000 // 1.10000 2^(63+51)
1847 }
1848 { .mlx
1849 nop.m 0
1850 movl GR_e_sig_inv_ln2 = 0xb8aa3b295c17f0bc //significand of 1/ln2
1851 };;
1852
1853 { .mfi
1854 nop.m 0
1855 fma.s1 FR_l_poly = FR_l_rsq, FR_l_poly, FR_l_r_cor // poly =
1856 // rsq * poly + r_cor
1857 nop.i 0
1858 };;
1859
1860 { .mfi
1861 addl GR_e_ad_Arg = @ltoff(Constants_Tgammal_exp_64_Arg#),gp
1862 (p15) fma.s1 FR_n_TT = FR_n_PolyH, FR_n_XS2L, FR_n_TT
1863 mov GR_e_exp_mask = 0x1FFFF // Form exponent mask
1864 }
1865 { .mlx
1866 nop.m 0
1867 movl GR_e_rshf = 0x43e8000000000000 // 1.10000 2^63 rshift
1868 };;
1869
1870
1871 { .mmi
1872 setf.sig FR_e_INV_LN2_2TO63 = GR_e_sig_inv_ln2 // form 1/ln2 * 2^63
1873 setf.d FR_e_RSHF_2TO51 = GR_e_rshf_2to51 // 1.1000 * 2^(63+51)
1874 nop.i 0
1875 };;
1876
1877 { .mfi
1878 nop.m 0
1879 fms.s1 FR_l_SSCXL = FR_l_CXH, f1, FR_l_SSCXH // CXH-(SS*Z+CXH)
1880 nop.i 0
1881 }
1882 { .mfi
1883 nop.m 0
1884 fma.s1 FR_e_expl_Input_AbsX = FR_l_XYH, f1, FR_l_SSCXH // HI EXP
1885 nop.i 0
1886 };;
1887
1888 .pred.rel "mutex",p14,p15
1889 { .mfi
1890 nop.m 0
1891 (p14) fma.s1 FR_e_expl_Input_X = FR_l_XYH, f1, FR_l_SSCXH // HI EXP
1892 mov GR_e_exp_bias = 0x0FFFF // Set exponent bias
1893 }
1894 { .mfi
1895 ld8 GR_e_ad_Arg = [GR_e_ad_Arg] // Point to Arg table
1896 (p15) fms.s1 FR_e_expl_Input_X = FR_l_SignedXYH, f1, FR_l_SSCXH // HI EXP
1897 nop.i 0
1898 };;
1899
1900 { .mfi
1901 nop.m 0
1902 fadd.s1 FR_l_logl_YLo = FR_l_Y_lo, FR_l_poly // YLo = YLo+poly
1903 nop.i 0
1904 };;
1905
1906 { .mfi
1907 setf.exp FR_e_2TOM51 = GR_e_exp_2tom51 //2^-51 for scaling float_N
1908 (p15) fma.s1 FR_n_TH = FR_n_PolyH, FR_n_XS2, FR_n_TT // TH=
1909 // polyH*xs^2+T
1910 nop.i 0
1911 }
1912 { .mib
1913 setf.d FR_e_RSHF = GR_e_rshf // Right shift const 1.1000*2^63
1914 nop.i 0
1915 nop.b 0
1916 };;
1917
1918 { .mfi
1919 add GR_e_ad_A = 0x20, GR_e_ad_Arg // Point to A table
1920 nop.f 0
1921 add GR_e_ad_T1 = 0x50, GR_e_ad_Arg // Point to T1 table
1922 }
1923 { .mfi
1924 add GR_e_ad_T2 = 0x150, GR_e_ad_Arg // Point to T2 table
1925 nop.f 0
1926 nop.i 0
1927 };;
1928
1929 { .mfi
1930 nop.m 0
1931 fma.s1 FR_l_SSCXL = FR_l_SS, FR_l_Z, FR_l_SSCXL
1932 nop.i 0
1933 }
1934 { .mfi
1935 nop.m 0
1936 fms.s1 FR_e_expl_Input_Y = FR_l_XYH, f1, FR_e_expl_Input_AbsX
1937 nop.i 0
1938 };;
1939
1940 { .mfi
1941 ldfe FR_e_L_hi = [GR_e_ad_Arg],16 // Get L_hi
1942 nop.f 0
1943 nop.i 0
1944 };;
1945
1946 { .mfi
1947 nop.m 0
1948 fma.s1 FR_l_XYL = FR_l_logl_YLo, FR_l_AbsX_m_Half, FR_l_XYL
1949 // XYL = YLo*|x-0.5|+XYL
1950 nop.i 0
1951 };;
1952
1953 { .mfi
1954 ldfe FR_e_L_lo = [GR_e_ad_Arg],16 // Get L_lo
1955 (p15) fms.s1 FR_n_TL = FR_n_PolyH, FR_n_XS2, FR_n_TH // TL =
1956 // = polyH*xs^2-TH
1957 add GR_e_ad_W1 = 0x100, GR_e_ad_T2 // Point to W1 table
1958 }
1959 { .mfi
1960 nop.m 0
1961 (p15) fma.s1 FR_n_Poly1H = FR_n_TH, f1, f1 // poly1H = TH+1
1962 add GR_e_ad_W2 = 0x300, GR_e_ad_T2 // Point to W2 table
1963 };;
1964
1965 { .mmi
1966 getf.exp GR_e_signexp_x = FR_e_expl_Input_X // Extract sign and exp
1967 ldfe FR_e_A3 = [GR_e_ad_A],16 // Get A3
1968 nop.i 0
1969 };;
1970
1971 { .mfi
1972 nop.m 0
1973 fma.s1 FR_l_SSCXL = FR_l_SSCXL, f1, FR_l_CXL
1974 nop.i 0
1975 }
1976 { .mfi
1977 nop.m 0
1978 fma.s1 FR_e_expl_Input_Y = FR_e_expl_Input_Y, f1, FR_l_SSCXH
1979 nop.i 0
1980 };;
1981
1982 { .mfi
1983 nop.m 0
1984 fma.s1 FR_e_N_signif=FR_e_expl_Input_X,FR_e_INV_LN2_2TO63,FR_e_RSHF_2TO51
1985 and GR_e_exp_x = GR_e_signexp_x, GR_e_exp_mask
1986 };;
1987
1988 { .mmi
1989 sub GR_e_exp_x = GR_e_exp_x, GR_e_exp_bias // Get exponent
1990 ldfe FR_e_A2 = [GR_e_ad_A],16 // Get A2 for main path
1991 nop.i 0
1992 };;
1993
1994 { .mfi
1995 nop.m 0
1996 (p15) fma.s1 FR_n_PolyH = FR_n_Poly1H, FR_n_XS, f0//sin(Pi*x) poly
1997 nop.i 0
1998 }
1999 { .mfi
2000 nop.m 0
2001 (p15) fms.s1 FR_n_Poly1L = f1, f1, FR_n_Poly1H//sin(Pi*x) poly
2002 nop.i 0
2003 };;
2004
2005 { .mfi
2006 nop.m 0
2007 (p15) fma.s1 FR_n_TL = FR_n_TL, f1, FR_n_TT//sin(Pi*x) poly
2008 nop.i 0
2009 };;
2010
2011 { .mfi
2012 nop.m 0
2013 fma.s1 FR_l_Temp = FR_l_XYL, f1, FR_l_SSCXL // XYL+SSCXL
2014 nop.i 0
2015 }
2016 { .mfi
2017 nop.m 0
2018 (p15) fma.s1 FR_e_expl_Input_Y = FR_e_expl_Input_Y, FR_n_NegOne, f0
2019 // Negate lo part of exp argument for negative input values
2020 nop.i 0
2021 };;
2022
2023 { .mfi
2024 ldfe FR_e_A1 = [GR_e_ad_A],16 // Get A1
2025 nop.f 0
2026 nop.i 0
2027 }
2028 { .mfi
2029 nop.m 0
2030 fms.s1 FR_e_float_N = FR_e_N_signif, FR_e_2TOM51, FR_e_RSHF
2031 // Get float N = signd*2^-51-RSHIFTER
2032 nop.i 0
2033 };;
2034
2035 { .mfi
2036 nop.m 0
2037 (p15) fma.s1 FR_n_Poly1L = FR_n_Poly1L, f1, FR_n_TH //sin(Pi*x) poly
2038 nop.i 0
2039 }
2040 { .mfi
2041 nop.m 0
2042 (p15) fms.s1 FR_n_PolyL = FR_n_Poly1H, FR_n_XS, FR_n_PolyH//sin(Pi*x)
2043 nop.i 0
2044 };;
2045
2046 { .mfi
2047 getf.sig GR_e_N_fix = FR_e_N_signif // Get N from significand
2048 nop.f 0
2049 nop.i 0
2050 };;
2051
2052 .pred.rel "mutex",p14,p15
2053 { .mfi
2054 nop.m 0
2055 (p14) fma.s1 FR_e_expl_Input_Y = FR_e_expl_Input_Y, f1, FR_l_Temp
2056 nop.i 0
2057 }
2058 { .mfi
2059 nop.m 0
2060 (p15) fms.s1 FR_e_expl_Input_Y = FR_e_expl_Input_Y, f1, FR_l_Temp
2061 // arguments for exp computation
2062 nop.i 0
2063 };;
2064
2065 { .mfi
2066 nop.m 0
2067 fnma.s1 FR_e_r = FR_e_L_hi, FR_e_float_N, FR_e_expl_Input_X
2068 // r = -L_hi * float_N + x
2069 extr.u GR_e_M1 = GR_e_N_fix, 6, 6 // Extract index M_1
2070 };;
2071
2072 { .mfi
2073 nop.m 0
2074 (p15) fma.s1 FR_n_Poly1L = FR_n_Poly1L, f1, FR_n_TL //sin(Pi*x) poly
2075 nop.i 0
2076 };;
2077
2078
2079 { .mmf
2080 nop.m 0
2081 nop.m 0
2082 fma.s1 FR_e_r = FR_e_r, f1, FR_e_expl_Input_Y
2083 // r = r + FR_e_expl_Input_Y
2084 };;
2085
2086 { .mmi
2087 shladd GR_e_ad_W1 = GR_e_M1,3,GR_e_ad_W1 // Point to W1
2088 shladd GR_e_ad_T1 = GR_e_M1,2,GR_e_ad_T1 // Point to T1
2089 extr.u GR_e_M2 = GR_e_N_fix, 0, 6 // Extract index M_2
2090 };;
2091
2092
2093 { .mfi
2094 ldfs FR_e_T1 = [GR_e_ad_T1],0 // Get T1
2095 nop.f 0
2096 extr GR_e_K = GR_e_N_fix, 12, 32 //Extract limit range K
2097 }
2098 { .mfi
2099 shladd GR_e_ad_T2 = GR_e_M2,2,GR_e_ad_T2 // Point to T2
2100 (p15) fma.s1 FR_n_PolyL = FR_n_Poly1L, FR_n_XS, FR_n_PolyL
2101 //sin(Pi*x) poly
2102 shladd GR_e_ad_W2 = GR_e_M2,3,GR_e_ad_W2 // Point to W2
2103 };;
2104
2105 { .mfi
2106 ldfs FR_e_T2 = [GR_e_ad_T2],0 // Get T2
2107 nop.f 0
2108 add GR_e_exp_2_k = GR_e_exp_bias, GR_e_K // exp of 2^k
2109 }
2110 { .mfi
2111 ldfd FR_e_W1 = [GR_e_ad_W1],0 // Get W1
2112 nop.f 0
2113 sub GR_e_exp_2_mk = GR_e_exp_bias, GR_e_K // exp of 2^-k
2114 };;
2115
2116 { .mmi
2117 ldfd FR_e_W2 = [GR_e_ad_W2],0 // Get W2
2118 nop.m 0
2119 nop.i 0
2120 };;
2121
2122 { .mmf
2123 setf.exp FR_e_scale = GR_e_exp_2_k // Set scale = 2^k
2124 setf.exp FR_e_2_mk = GR_e_exp_2_mk // Form 2^-k
2125 fnma.s1 FR_e_r = FR_e_L_lo, FR_e_float_N, FR_e_r
2126 // r = -L_lo * float_N + r
2127 };;
2128
2129 { .mfi
2130 nop.m 0
2131 (p15) fma.s1 FR_n_PolyL = FR_n_Tail, FR_n_XS7, FR_n_PolyL
2132 //sin(Pi*x) poly
2133 nop.i 0
2134 };;
2135
2136 { .mfi
2137 nop.m 0
2138 fma.s1 FR_e_poly = FR_e_r, FR_e_A3, FR_e_A2 // poly=r*A3+A2
2139 nop.i 0
2140 }
2141 { .mfi
2142 nop.m 0
2143 fmpy.s1 FR_e_rsq = FR_e_r, FR_e_r // rsq = r * r
2144 nop.i 0
2145 };;
2146
2147 { .mfi
2148 nop.m 0
2149 fmpy.s1 FR_e_T = FR_e_T1, FR_e_T2 // T = T1 * T2
2150 nop.i 0
2151 }
2152 { .mfi
2153 nop.m 0
2154 fadd.s1 FR_e_W1_p1 = FR_e_W1, f1 // W1_p1 = W1 + 1.0
2155 nop.i 0
2156 };;
2157
2158 { .mfi
2159 nop.m 0
2160 (p15) fma.s1 FR_n_TT = FR_n_PolyL, FR_l_AbsX, f0 //sin(Pi*x) poly
2161 nop.i 0
2162 };;
2163
2164 { .mfi
2165 nop.m 0
2166 fma.s1 FR_e_poly = FR_e_r, FR_e_poly, FR_e_A1
2167 // poly = r * poly + A1
2168 nop.i 0
2169 };;
2170
2171 { .mfi
2172 nop.m 0
2173 fma.s1 FR_e_T_scale = FR_e_T, FR_e_scale, f0 // T_scale=T*scale
2174 nop.i 0
2175 }
2176 { .mfi
2177 nop.m 0
2178 fma.s1 FR_e_W = FR_e_W2, FR_e_W1_p1, FR_e_W1
2179 // W = W2 * (W1+1.0) + W1
2180 nop.i 0
2181 };;
2182
2183 { .mfi
2184 nop.m 0
2185 (p15) fma.s1 FR_n_SinxH = FR_n_PolyH, FR_l_AbsX, FR_n_TT
2186 // sin(Pi*x) poly
2187 nop.i 0
2188 };;
2189
2190 { .mfi
2191 nop.m 0
2192 mov FR_e_Y_hi = FR_e_T // Assume Y_hi = T
2193 nop.i 0
2194 };;
2195
2196 { .mfi
2197 nop.m 0
2198 fma.s1 FR_e_poly = FR_e_rsq, FR_e_poly, FR_e_r
2199 // poly = rsq * poly + r
2200 nop.i 0
2201 };;
2202
2203 { .mfi
2204 nop.m 0
2205 fma.s1 FR_e_Wp1_T_scale = FR_e_W, FR_e_T_scale, FR_e_T_scale
2206 // (W+1)*T*scale
2207 nop.i 0
2208 }
2209 { .mfi
2210 nop.m 0
2211 fma.s1 FR_e_W_T_scale = FR_e_W, FR_e_T_scale, f0 // W*T*scale
2212 nop.i 0
2213 };;
2214
2215 { .mfi
2216 nop.m 0
2217 (p15) fms.s1 FR_n_SinxL = FR_n_PolyH, FR_l_AbsX, FR_n_SinxH
2218 // Low part of sin
2219 nop.i 0
2220 };;
2221
2222 { .mfi
2223 nop.m 0
2224 (p15) frcpa.s1 FR_n_Y0, p0 = f1, FR_n_SinxH // y = frcpa(b)
2225 nop.i 0
2226 };;
2227
2228 { .mfi
2229 nop.m 0
2230 fma.s1 FR_e_result_lo = FR_e_Wp1_T_scale, FR_e_poly, FR_e_W_T_scale
2231 // Low part of exp result
2232 nop.i 0
2233 };;
2234
2235 { .mfi
2236 nop.m 0
2237 (p15) fma.s1 FR_n_SinxL = FR_n_SinxL, f1, FR_n_TT // sin low result
2238 nop.i 0
2239 };;
2240
2241 { .mfi
2242 nop.m 0
2243 (p15) fma.s1 FR_n_Q0 = f1,FR_n_Y0,f0 // q = y
2244 nop.i 0
2245 }
2246 { .mfi
2247 nop.m 0
2248 (p15) fnma.s1 FR_n_E0 = FR_n_Y0, FR_n_SinxH, f1 // e = 1-b*y
2249 nop.i 0
2250 };;
2251
2252
2253 { .mfb
2254 nop.m 0
2255 (p14) fma.s0 f8 = FR_e_Y_hi, FR_e_scale, FR_e_result_lo
2256 (p14) br.ret.spnt b0 // Exit for positive Stirling path //////////////////////
2257 };;
2258
2259 { .mfi
2260 nop.m 0
2261 fma.s1 FR_e_expl_Output_X = FR_e_Y_hi, FR_e_scale, f0 // exp result
2262 nop.i 0
2263 }
2264 { .mfi
2265 nop.m 0
2266 fma.s1 FR_e_expl_Output_Y = FR_e_result_lo, f1, f0// exp lo result
2267 nop.i 0
2268 };;
2269
2270 { .mfi
2271 nop.m 0
2272 fma.s1 FR_n_E2 = FR_n_E0,FR_n_E0,FR_n_E0 // e2 = e+e^2
2273 nop.i 0
2274 }
2275 { .mfi
2276 nop.m 0
2277 fma.s1 FR_n_E1 = FR_n_E0,FR_n_E0,f0 // e1 = e^2
2278 nop.i 0
2279 };;
2280
2281 { .mfi
2282 nop.m 0
2283 fma.s1 FR_n_Y1 = FR_n_Y0,FR_n_E2,FR_n_Y0 // y1 = y+y*e2
2284 nop.i 0
2285 }
2286 { .mfi
2287 nop.m 0
2288 fma.s1 FR_n_E3 = FR_n_E1,FR_n_E1,FR_n_E0 // e3 = e+e1^2
2289 nop.i 0
2290 };;
2291
2292 { .mfi
2293 nop.m 0
2294 fma.s1 FR_n_Y2 = FR_n_Y1,FR_n_E3,FR_n_Y0 // y2 = y+y1*e3
2295 nop.i 0
2296 }
2297 { .mfi
2298 nop.m 0
2299 fnma.s1 FR_n_R0 = FR_n_SinxH,FR_n_Q0,f1 // r = a-b*q
2300 nop.i 0
2301 };;
2302
2303 { .mfi
2304 nop.m 0
2305 fnma.s1 FR_n_E4 = FR_n_SinxH,FR_n_Y2,f1 // e4 = 1-b*y2
2306 nop.i 0
2307 }
2308 { .mfi
2309 nop.m 0
2310 fma.s1 FR_n_RcpResH = FR_n_R0,FR_n_Y2,FR_n_Q0 // x = q+r*y2
2311 nop.i 0
2312 };;
2313
2314 { .mfi
2315 nop.m 0
2316 fma.s1 FR_n_Y3 = FR_n_Y2,FR_n_E4,FR_n_Y2 // y3 = y2+y2*e4
2317 nop.i 0
2318 }
2319 { .mfi
2320 nop.m 0
2321 fnma.s1 FR_n_R1 = FR_n_SinxH,FR_n_RcpResH,f1 // r1 = a-b*x
2322 nop.i 0
2323 };;
2324
2325 { .mfi
2326 nop.m 0
2327 fnma.s1 FR_n_R1 = FR_n_SinxL,FR_n_RcpResH,FR_n_R1
2328 // r1 = r1 - b_lo*X
2329 nop.i 0
2330 };;
2331
2332 { .mfi
2333 nop.m 0
2334 fma.s1 FR_n_RcpResL = FR_n_R1,FR_n_Y3,f0 // x_lo = r1*y3
2335 nop.i 0
2336 }
2337 { .mfi
2338 nop.m 0
2339 fma.s1 FR_n_Temp = FR_n_RcpResH, FR_e_expl_Output_Y, f0
2340 // Multiplying exp and sin result
2341 nop.i 0
2342 };;
2343
2344 { .mfi
2345 nop.m 0
2346 fma.s1 FR_n_Temp = FR_n_RcpResL, FR_e_expl_Output_X, FR_n_Temp
2347 // Multiplying exp and sin result
2348 nop.i 0
2349 };;
2350
2351 { .mfi
2352 nop.m 0
2353 fma.s1 FR_n_ResH = FR_n_RcpResH, FR_e_expl_Output_X, FR_n_Temp
2354 // Multiplying exp and sin result
2355 nop.i 0
2356 };;
2357
2358 { .mfi
2359 nop.m 0
2360 fms.s1 FR_n_ResL = FR_n_RcpResH, FR_e_expl_Output_X, FR_n_ResH
2361 // Multiplying exp and sin result
2362 nop.i 0
2363 }
2364 { .mfi
2365 nop.m 0
2366 (p12) fma.s1 FR_n_ResH = FR_n_ResH, FR_n_NegOne, f0 // Negate
2367 nop.i 0
2368 };;
2369
2370 { .mfi
2371 nop.m 0
2372 fma.s1 FR_n_ResL = FR_n_ResL, f1, FR_n_Temp
2373 // Multiplying exp and sin result - low result obtained
2374 nop.i 0
2375 };;
2376
2377 .pred.rel "mutex",p12,p13
2378 { .mfi
2379 nop.m 0
2380 (p13) fma.s0 f8 = FR_n_ResH, f1, FR_n_ResL // For odd
2381 nop.i 0
2382 }
2383 { .mfb
2384 nop.m 0
2385 (p12) fms.s0 f8 = FR_n_ResH, f1, FR_n_ResL // For even
2386 br.ret.sptk b0 // Exit for negative Stirling path //////////////////////
2387 };;
2388
2389
2390 //////////// 1 <= |X| < 13 path ////////////////////////////////////////////////
2391 //------------------------------------------------------------------------------
2392 .align 64
2393 tgamma_lt_13:
2394 { .mfi
2395 getf.sig GR_p_XN = FR_p_IXN // Get significand
2396 fcvt.xf FR_p_XN = FR_p_IXN // xn = [x]
2397 add GR_r_sin_Table2= 0x40, GR_r_sin_Table // Shifted table addr.
2398 }
2399 { .mfi
2400 ldfpd FR_p_0p5, FR_p_1p5 = [GR_c_Table], 16 // 0.5 & 1.5
2401 fms.s1 FR_p_AbsXM1 = FR_p_AbsX, f1, f1 // X-1
2402 add GR_p_Table2 = 0xB0, GR_p_Table
2403 };;
2404
2405 { .mfi
2406 add GR_r_sin_Table = -16, GR_r_sin_Table // For compensation
2407 fcvt.xf FR_r_XNS = FR_r_IXNS // Convert int repr to float
2408 shr.u GR_p_X_Sgnd = GR_p_X_Sgnd, 59 // Keep only the top 5 bits of signd
2409 };;
2410
2411 { .mfi
2412 ldfpd FR_r_A2H,FR_r_A2L = [GR_r_sin_Table], 16 // Load A2
2413 nop.f 0
2414 add GR_p_Int = -2, GR_p_XN // int = int - 2
2415 }
2416 { .mfi
2417 ldfe FR_r_A6 = [GR_r_sin_Table2], 16
2418 nop.f 0
2419 cmp.gtu p11, p12 = 0x2, GR_p_XN // p11: x < 2 (split intervals),
2420 // p12: x >= 2 (base intervals)
2421 };;
2422
2423 { .mfi
2424 ldfpd FR_r_A1H, FR_r_A1L = [GR_r_sin_Table], 16
2425 nop.f 0
2426 shr GR_p_Int = GR_p_Int, 1 // int/2
2427 }
2428 { .mfi
2429 ldfe FR_r_A5 = [GR_r_sin_Table2], 16
2430 nop.f 0
2431 (p11) cmp.gtu.unc p10, p11 = 0x1C, GR_p_X_Sgnd // sgnd(x) < 0.75
2432 };;
2433
2434 { .mfi
2435 ldfe FR_r_A9 = [GR_r_sin_Table], 16
2436 nop.f 0
2437 shl GR_p_Offset = GR_p_Int, 4 // offset = int*16
2438 }
2439 { .mfi
2440 ldfe FR_r_A4 = [GR_r_sin_Table2], 16
2441 nop.f 0
2442 (p10) cmp.gtu.unc p9, p10 = 0x14, GR_p_X_Sgnd // sgnd(x) < 0.25
2443 };;
2444
2445
2446 { .mfi
2447 ldfe FR_r_A8 = [GR_r_sin_Table], 16
2448 nop.f 0
2449 (p12) tbit.nz.unc p13, p12 = GR_p_XN, 0x0 // p13: recurrent computations
2450 // X is in one of the intervals [3;4], [5;6], [7;8], ...
2451 }
2452 { .mfi
2453 ldfe FR_r_A3 = [GR_r_sin_Table2], 16
2454 nop.f 0
2455 shladd GR_p_Offset = GR_p_Int, 2, GR_p_Offset // +int*4
2456 };;
2457
2458 .pred.rel "mutex",p9,p11
2459 { .mfi
2460 add GR_p_Offset = GR_p_Int, GR_p_Offset
2461 // +int, so offset = int*21
2462 (p9) fms.s1 FR_p_XR = FR_p_AbsX, f1, f1 // r = x-1
2463 nop.i 0
2464 }
2465 { .mfi
2466 ldfe FR_r_A7 = [GR_r_sin_Table], 16
2467 (p11) fms.s1 FR_p_XR = FR_p_2, f1, FR_p_AbsX
2468 // r = 2-x for 1.75 < x < 2
2469 nop.i 0
2470 };;
2471
2472 .pred.rel "mutex",p9,p10
2473 .pred.rel "mutex",p10,p11
2474 .pred.rel "mutex",p9,p11
2475 { .mfi
2476 (p9) add GR_p_Offset = 126, r0 // 1.0 < x < 1.25 table
2477 (p15) fcmp.eq.unc.s1 p7,p0 = FR_p_AbsX, FR_p_XN
2478 // If arg is integer and negative - singularity branch
2479 nop.i 0
2480 }
2481 { .mfi
2482 (p10) add GR_p_Offset = 147, r0 // 1.25 < x < 1.75 table
2483 nop.f 0
2484 (p11) add GR_p_Offset = 168, r0 // 1.75 < x < 2.0 table
2485 };;
2486
2487 { .mmf
2488 shladd GR_p_Table = GR_p_Offset, 4, GR_p_Table
2489 shladd GR_p_Table2 = GR_p_Offset, 4, GR_p_Table2
2490 fma.s1 FR_r_XS = FR_r_AbsX , f1, FR_r_XNS // xs = x - [x]
2491 };;
2492
2493 { .mmb
2494 ldfpd FR_p_A5H, FR_p_A5L = [GR_p_Table], 16
2495 ldfpd FR_p_A2H, FR_p_A2L = [GR_p_Table2], 16
2496 (p7) br.cond.spnt tgammal_singularity // Singularity for integer /////////////
2497 // and negative argument ///////////////
2498 };;
2499
2500 { .mfi
2501 ldfpd FR_p_A4H, FR_p_A4L = [GR_p_Table], 16
2502 fma.s1 FR_p_XN = FR_p_XN, f1, FR_p_0p5 // xn = xn+0.5
2503 nop.i 0
2504 }
2505 { .mfi
2506 ldfpd FR_p_A1H, FR_p_A1L = [GR_p_Table2], 16
2507 (p10) fms.s1 FR_p_XR = FR_p_AbsX, f1, FR_p_1p5 // r = x - 1.5
2508 nop.i 0
2509 };;
2510
2511 { .mmi
2512 ldfpd FR_p_A3H, FR_p_A3L = [GR_p_Table], 16
2513 ldfpd FR_p_A0H, FR_p_A0L = [GR_p_Table2], 16
2514 nop.i 0
2515 };;
2516
2517 { .mmi
2518 ldfe FR_p_A20 = [GR_p_Table], 16
2519 ldfe FR_p_A12 = [GR_p_Table2], 16
2520 nop.i 0
2521 };;
2522
2523 { .mmf
2524 ldfe FR_p_A19 = [GR_p_Table], 16
2525 ldfe FR_p_A11 = [GR_p_Table2], 16
2526 fma.s1 FR_r_XS2 = FR_r_XS, FR_r_XS, f0 // xs2 = xs*xs
2527 };;
2528
2529 { .mmi
2530 ldfe FR_p_A18 = [GR_p_Table], 16
2531 ldfe FR_p_A10 = [GR_p_Table2], 16
2532 nop.i 0
2533 };;
2534
2535 .pred.rel "mutex",p12,p13
2536 { .mfi
2537 ldfe FR_p_A17 = [GR_p_Table], 16
2538 (p12) fms.s1 FR_p_XR = FR_p_AbsX, f1, FR_p_XN // r = x - xn
2539 nop.i 0
2540 }
2541 { .mfi
2542 ldfe FR_p_A9 = [GR_p_Table2], 16
2543 (p13) fms.s1 FR_p_XR = FR_p_AbsX, f1, FR_p_XN
2544 nop.i 0
2545 };;
2546
2547 { .mmi
2548 ldfe FR_p_A16 = [GR_p_Table], 16
2549 ldfe FR_p_A8 = [GR_p_Table2], 16
2550 (p9) cmp.eq p12, p0 = r0, r0 // set p12 (r0 == r0 is always true)
2551 };;
2552
2553 { .mmi
2554 ldfe FR_p_A15 = [GR_p_Table], 16
2555 ldfe FR_p_A7 = [GR_p_Table2], 16
2556 (p10) cmp.eq p12, p0 = r0, r0 // set p12 (r0 == r0 is always true)
2557 };;
2558
2559 { .mfi
2560 ldfe FR_p_A14 = [GR_p_Table], 16
2561 fma.s1 FR_r_TH = FR_r_A2H, FR_r_XS2, f0 // sin for neg
2562 (p11) cmp.eq p12, p0 = r0, r0 // set p12 (r0 == r0 is always true)
2563 }
2564 { .mfi
2565 ldfe FR_p_A6 = [GR_p_Table2], 16
2566 fma.s1 FR_r_TL = FR_r_A2L, FR_r_XS2, f0 // sin for neg
2567 nop.i 0
2568 };;
2569
2570 { .mfi
2571 ldfe FR_p_A13 = [GR_p_Table], 16
2572 fms.s1 FR_r_XS2L = FR_r_XS, FR_r_XS, FR_r_XS2 // x2Lo part
2573 nop.i 0
2574 };;
2575
2576 { .mfi
2577 nop.m 0
2578 fma.s1 FR_p_Temp5H = FR_p_A5H, FR_p_XR, f0 // A5H*r
2579 // 'Low poly'
2580 nop.i 0
2581 }
2582 { .mfi
2583 nop.m 0
2584 fma.s1 FR_p_XR2 = FR_p_XR, FR_p_XR, f0 // r^2 = r*r
2585 nop.i 0
2586 };;
2587
2588 { .mfi
2589 nop.m 0
2590 fabs FR_r_XS = FR_r_XS // abs(xs)
2591 nop.i 0
2592 }
2593 { .mfi
2594 nop.m 0
2595 fma.s1 FR_p_Temp2H = FR_p_A2H, FR_p_XR, f0 // A2H*r
2596 // 'High poly'
2597 nop.i 0
2598 };;
2599
2600 { .mfi
2601 nop.m 0
2602 fms.s1 FR_r_TT = FR_r_A2H, FR_r_XS2, FR_r_TH // sin for neg
2603 nop.i 0
2604 }
2605 { .mfi
2606 nop.m 0
2607 fma.s1 FR_r_ResH = FR_r_TH, f1, FR_r_A1H // sin for neg
2608 nop.i 0
2609 };;
2610
2611 { .mfi
2612 nop.m 0
2613 fma.s1 FR_r_TL = FR_r_A2H, FR_r_XS2L, FR_r_TL // sin for neg
2614 nop.i 0
2615 };;
2616
2617 { .mfi
2618 nop.m 0
2619 fms.s1 FR_p_Temp5L = FR_p_A5H,FR_p_XR,FR_p_Temp5H //A5H*r delta
2620 // 'Low poly'
2621 nop.i 0
2622 }
2623 { .mfi
2624 nop.m 0
2625 fma.s1 FR_p_Poly5H = FR_p_Temp5H, f1, FR_p_A4H // A5H*r+A4H
2626 // 'Low poly'
2627 nop.i 0
2628 };;
2629
2630 { .mfi
2631 nop.m 0
2632 fms.s1 FR_p_Temp2L = FR_p_A2H, FR_p_XR, FR_p_Temp2H//A2H*r delta
2633 //'High poly'
2634 nop.i 0
2635 }
2636 { .mfi
2637 nop.m 0
2638 fma.s1 FR_p_Poly2H = FR_p_Temp2H, f1, FR_p_A1H // A2H*r+A1H
2639 //'High poly'
2640 nop.i 0
2641 };;
2642
2643 { .mfi
2644 nop.m 0
2645 fma.s1 FR_p_XR3 = FR_p_XR2, FR_p_XR, f0 // r^3 = r^2*r
2646 nop.i 0
2647 }
2648 { .mfi
2649 nop.m 0
2650 fms.s1 FR_p_XR2L = FR_p_XR, FR_p_XR, FR_p_XR2 // r^2 delta
2651 nop.i 0
2652 };;
2653
2654 { .mfi
2655 nop.m 0
2656 fma.s1 FR_p_A18 = FR_p_A19, FR_p_XR, FR_p_A18 // Poly tail
2657 nop.i 0
2658 }
2659 { .mfi
2660 nop.m 0
2661 fma.s1 FR_p_A14 = FR_p_A15, FR_p_XR, FR_p_A14 // Poly tail
2662 nop.i 0
2663 };;
2664
2665 { .mfi
2666 nop.m 0
2667 fma.s1 FR_p_XR4 = FR_p_XR2, FR_p_XR2, f0 // r^4 = r^2*r^2
2668 nop.i 0
2669 };;
2670
2671 { .mfi
2672 nop.m 0
2673 fma.s1 FR_p_Temp5L = FR_p_A5L, FR_p_XR, FR_p_Temp5L// Low part
2674 // of A5*r+A4
2675 nop.i 0
2676 }
2677 { .mfi
2678 nop.m 0
2679 fms.s1 FR_p_Poly5L = FR_p_A4H, f1, FR_p_Poly5H // Low part
2680 // of A5*r+A4
2681 nop.i 0
2682 };;
2683
2684 { .mfi
2685 nop.m 0
2686 fma.s1 FR_p_Temp4H = FR_p_Poly5H, FR_p_XR, f0 // (A5H*r+A4H)*r
2687 nop.i 0
2688 }
2689 { .mfi
2690 nop.m 0
2691 fma.s1 FR_p_Temp2L = FR_p_A2L, FR_p_XR, FR_p_Temp2L // A2*r low
2692 nop.i 0
2693 };;
2694
2695 { .mfi
2696 nop.m 0
2697 fms.s1 FR_p_Poly2L = FR_p_A1H, f1, FR_p_Poly2H // High poly
2698 nop.i 0
2699 }
2700 { .mfi
2701 nop.m 0
2702 fma.s1 FR_p_Temp1H = FR_p_Poly2H, FR_p_XR, f0 // High poly
2703 nop.i 0
2704 };;
2705
2706 { .mfi
2707 nop.m 0
2708 fms.s1 FR_p_XR3L = FR_p_XR2, FR_p_XR, FR_p_XR3 // x^3 delta
2709 nop.i 0
2710 }
2711 { .mfi
2712 nop.m 0
2713 fma.s1 FR_p_A16 = FR_p_A17, FR_p_XR, FR_p_A16 // Poly tail
2714 nop.i 0
2715 };;
2716
2717 { .mfi
2718 nop.m 0
2719 fms.s1 FR_r_ResL = FR_r_A1H, f1, FR_r_ResH // sin for neg
2720 nop.i 0
2721 }
2722 { .mfi
2723 nop.m 0
2724 fma.s1 FR_r_TL = FR_r_TL, f1, FR_r_TT // sin for neg
2725 nop.i 0
2726 };;
2727
2728 { .mfi
2729 nop.m 0
2730 fma.s1 FR_p_Temp5L = FR_p_Temp5L, f1, FR_p_A4L // Low poly
2731 nop.i 0
2732 }
2733 { .mfi
2734 nop.m 0
2735 fma.s1 FR_p_Poly5L = FR_p_Poly5L, f1, FR_p_Temp5H // Low poly
2736 nop.i 0
2737 };;
2738
2739 { .mfi
2740 nop.m 0
2741 fms.s1 FR_p_Temp4L = FR_p_Poly5H,FR_p_XR,FR_p_Temp4H //Low poly
2742 nop.i 0
2743 }
2744 { .mfi
2745 nop.m 0
2746 fma.s1 FR_p_Poly4H = FR_p_Temp4H, f1, FR_p_A3H // Low poly
2747 nop.i 0
2748 };;
2749
2750 { .mfi
2751 nop.m 0
2752 fma.s1 FR_p_Temp2L = FR_p_Temp2L, f1, FR_p_A1L // High poly
2753 nop.i 0
2754 }
2755 { .mfi
2756 nop.m 0
2757 fma.s1 FR_p_Poly2L = FR_p_Poly2L, f1, FR_p_Temp2H // High poly
2758 nop.i 0
2759 };;
2760
2761 { .mfi
2762 nop.m 0
2763 fms.s1 FR_p_Temp1L = FR_p_Poly2H,FR_p_XR,FR_p_Temp1H //High poly
2764 nop.i 0
2765 }
2766 { .mfi
2767 nop.m 0
2768 fma.s1 FR_p_Poly1H = FR_p_Temp1H, f1, FR_p_A0H // High poly
2769 nop.i 0
2770 };;
2771
2772 { .mfi
2773 nop.m 0
2774 fma.s1 FR_p_A12 = FR_p_A13, FR_p_XR, FR_p_A12 // Poly tail
2775 nop.i 0
2776 }
2777 { .mfi
2778 nop.m 0
2779 fma.s1 FR_p_XR3L = FR_p_XR2L, FR_p_XR, FR_p_XR3L // x^3 low
2780 nop.i 0
2781 };;
2782
2783 { .mfi
2784 nop.m 0
2785 fma.s1 FR_p_Poly5L = FR_p_Poly5L, f1, FR_p_Temp5L // Low poly
2786 nop.i 0
2787 }
2788 { .mfi
2789 nop.m 0
2790 fma.s1 FR_p_A10 = FR_p_A11, FR_p_XR, FR_p_A10 // Poly tail
2791 nop.i 0
2792 };;
2793
2794 { .mfi
2795 nop.m 0
2796 fms.s1 FR_p_Poly4L = FR_p_A3H, f1, FR_p_Poly4H // Low poly
2797 nop.i 0
2798 }
2799 { .mfi
2800 nop.m 0
2801 fma.s1 FR_p_A6 = FR_p_A7, FR_p_XR, FR_p_A6 // Poly tail
2802 nop.i 0
2803 };;
2804
2805 { .mfi
2806 nop.m 0
2807 fma.s1 FR_p_A8 = FR_p_A9, FR_p_XR, FR_p_A8 // Poly tail
2808 nop.i 0
2809 }
2810 { .mfi
2811 nop.m 0
2812 fma.s1 FR_p_XR6 = FR_p_XR4, FR_p_XR2, f0 // Poly tail
2813 nop.i 0
2814 };;
2815
2816 { .mfi
2817 nop.m 0
2818 fma.s1 FR_p_Poly2L = FR_p_Poly2L, f1, FR_p_Temp2L // High poly
2819 nop.i 0
2820 }
2821 { .mfi
2822 nop.m 0
2823 fms.s1 FR_p_Poly1L = FR_p_A0H, f1, FR_p_Poly1H // High poly
2824 nop.i 0
2825 };;
2826
2827 { .mfi
2828 nop.m 0
2829 fma.s1 FR_r_ResL = FR_r_ResL, f1, FR_r_TH // sin for neg
2830 nop.i 0
2831 }
2832 { .mfi
2833 nop.m 0
2834 fma.s1 FR_r_TT = FR_r_TL, f1, FR_r_A1L // sin for neg
2835 nop.i 0
2836 };;
2837
2838 { .mfi
2839 nop.m 0
2840 fma.s1 FR_p_Temp4L = FR_p_Poly5L,FR_p_XR,FR_p_Temp4L // Low poly
2841 nop.i 0
2842 }
2843 { .mfi
2844 nop.m 0
2845 fma.s1 FR_p_A18 = FR_p_A20, FR_p_XR2, FR_p_A18 // Poly tail
2846 nop.i 0
2847 };;
2848
2849 { .mfi
2850 nop.m 0
2851 fma.s1 FR_p_Poly4L = FR_p_Poly4L, f1, FR_p_Temp4H // Low poly
2852 nop.i 0
2853 }
2854 { .mfi
2855 nop.m 0
2856 fma.s1 FR_p_A14 = FR_p_A16, FR_p_XR2, FR_p_A14 // Poly tail
2857 nop.i 0
2858 };;
2859
2860 { .mfi
2861 nop.m 0
2862 fma.s1 FR_p_A6 = FR_p_A8, FR_p_XR2, FR_p_A6 // Poly tail
2863 nop.i 0
2864 }
2865 { .mfi
2866 nop.m 0
2867 fma.s1 FR_p_A10 = FR_p_A12, FR_p_XR2, FR_p_A10 // Poly tail
2868 nop.i 0
2869 };;
2870
2871 { .mfi
2872 nop.m 0
2873 fma.s1 FR_p_Temp1L = FR_p_Poly2L,FR_p_XR,FR_p_Temp1L //High poly
2874 nop.i 0
2875 }
2876 { .mfi
2877 nop.m 0
2878 fma.s1 FR_p_Poly1L = FR_p_Poly1L, f1, FR_p_Temp1H // High poly
2879 nop.i 0
2880 };;
2881
2882 { .mfi
2883 nop.m 0
2884 fma.s1 FR_r_ResL = FR_r_ResL, f1, FR_r_TT // sin for neg
2885 nop.i 0
2886 }
2887 { .mfi
2888 nop.m 0
2889 fma.s1 FR_r_TH = FR_r_ResH, FR_r_XS2, f0 // sin for neg
2890 nop.i 0
2891 };;
2892
2893 { .mfi
2894 nop.m 0
2895 fma.s1 FR_p_Temp4L = FR_p_Temp4L, f1, FR_p_A3L // Low poly
2896 nop.i 0
2897 }
2898 { .mfi
2899 nop.m 0
2900 fma.s1 FR_p_Poly3H = FR_p_Poly4H, FR_p_XR3, f0 // Low poly
2901 nop.i 0
2902 };;
2903
2904 { .mfi
2905 nop.m 0
2906 fma.s1 FR_p_A14 = FR_p_A18, FR_p_XR4, FR_p_A14 // Poly tail
2907 nop.i 0
2908 }
2909 { .mfi
2910 nop.m 0
2911 fma.s1 FR_p_XR8 = FR_p_XR4, FR_p_XR4, f0 // Poly tail
2912 nop.i 0
2913 };;
2914
2915 { .mfi
2916 nop.m 0
2917 fma.s1 FR_r_TL = FR_r_ResH, FR_r_XS2L, f0 // sin for neg
2918 nop.i 0
2919 };;
2920
2921 { .mfi
2922 nop.m 0
2923 fma.s1 FR_p_Temp1L = FR_p_Temp1L, f1, FR_p_A0L // High poly
2924 nop.i 0
2925 }
2926 { .mfi
2927 nop.m 0
2928 fma.s1 FR_p_A6 = FR_p_A10, FR_p_XR4, FR_p_A6 // Poly tail
2929 nop.i 0
2930 };;
2931
2932 { .mfi
2933 nop.m 0
2934 fms.s1 FR_r_TT = FR_r_ResH, FR_r_XS2, FR_r_TH // sin for neg
2935 nop.i 0
2936 }
2937 { .mfi
2938 nop.m 0
2939 fma.s1 FR_r_Res3H = FR_r_TH, f1, f1 // sin for neg
2940 nop.i 0
2941 };;
2942
2943 { .mfi
2944 nop.m 0
2945 fma.s1 FR_p_Poly4L = FR_p_Poly4L, f1, FR_p_Temp4L // Low poly
2946 nop.i 0
2947 }
2948 { .mfi
2949 nop.m 0
2950 fma.s1 FR_p_Poly3L = FR_p_Poly4H, FR_p_XR3L, f0 // Low poly
2951 nop.i 0
2952 };;
2953
2954 { .mfi
2955 nop.m 0
2956 fma.s1 FR_p_Poly0H = FR_p_Poly3H,f1,FR_p_Poly1H //Low & High add
2957 nop.i 0
2958 }
2959 { .mfi
2960 nop.m 0
2961 fma.s1 FR_r_A7 = FR_r_A8, FR_r_XS2, FR_r_A7 // sin for neg
2962 nop.i 0
2963 };;
2964
2965 { .mfi
2966 nop.m 0
2967 fma.s1 FR_r_TL = FR_r_ResL, FR_r_XS2, FR_r_TL // sin for neg
2968 nop.i 0
2969 }
2970 { .mfi
2971 nop.m 0
2972 fma.s1 FR_r_XS4 = FR_r_XS2, FR_r_XS2, f0 // sin for neg
2973 nop.i 0
2974 };;
2975
2976 { .mfi
2977 nop.m 0
2978 fma.s1 FR_p_Poly1L = FR_p_Poly1L, f1, FR_p_Temp1L // High poly
2979 nop.i 0
2980 }
2981 { .mfi
2982 nop.m 0
2983 fma.s1 FR_p_PolyTail = FR_p_A14, FR_p_XR8, FR_p_A6 // Poly tail
2984 nop.i 0
2985 };;
2986
2987 { .mfi
2988 nop.m 0
2989 fms.s1 FR_r_Res3L = f1, f1, FR_r_Res3H // sin for neg
2990 nop.i 0
2991 }
2992 { .mfi
2993 nop.m 0
2994 fma.s1 FR_r_ResH = FR_r_Res3H, FR_r_XS, f0 // sin for neg
2995 nop.i 0
2996 };;
2997
2998 { .mfi
2999 nop.m 0
3000 fms.s1 FR_p_Temp0L = FR_p_Poly4H,FR_p_XR3,FR_p_Poly3H //Low poly
3001 nop.i 0
3002 }
3003 { .mfi
3004 nop.m 0
3005 fma.s1 FR_p_Poly3L = FR_p_Poly4L,FR_p_XR3,FR_p_Poly3L //Low poly
3006 nop.i 0
3007 };;
3008
3009 { .mfi
3010 nop.m 0
3011 fms.s1 FR_p_Poly0L = FR_p_Poly1H,f1,FR_p_Poly0H //Low & High add
3012 nop.i 0
3013 }
3014 { .mfi
3015 nop.m 0
3016 (p13) fma.s1 FR_p_OddPoly0H = FR_p_Poly0H, FR_p_AbsXM1, f0
3017 // Recurrent computations - multiplying by X-1
3018 nop.i 0
3019 };;
3020
3021 { .mfi
3022 nop.m 0
3023 fma.s1 FR_r_TL = FR_r_TL, f1, FR_r_TT // sin for neg
3024 nop.i 0
3025 }
3026 { .mfi
3027 nop.m 0
3028 fma.s1 FR_r_A3 = FR_r_A4, FR_r_XS2, FR_r_A3 // sin for neg
3029 nop.i 0
3030 };;
3031
3032 { .mfi
3033 nop.m 0
3034 fma.s1 FR_p_Poly1L = FR_p_PolyTail,FR_p_XR6,FR_p_Poly1L//High
3035 nop.i 0
3036 }
3037 { .mfi
3038 nop.m 0
3039 fma.s1 FR_r_A5 = FR_r_A6, FR_r_XS2, FR_r_A5 // sin for neg
3040 nop.i 0
3041 };;
3042
3043 { .mfi
3044 nop.m 0
3045 fma.s1 FR_r_Res3L = FR_r_Res3L, f1, FR_r_TH // sin for neg
3046 nop.i 0
3047 }
3048 { .mfi
3049 nop.m 0
3050 fms.s1 FR_r_ResL = FR_r_Res3H, FR_r_XS, FR_r_ResH//sin for neg
3051 nop.i 0
3052 };;
3053
3054 { .mfi
3055 nop.m 0
3056 fma.s1 FR_p_Poly3L = FR_p_Poly3L, f1, FR_p_Temp0L // Low poly
3057 nop.i 0
3058 }
3059 { .mfi
3060 nop.m 0
3061 fma.s1 FR_r_A7 = FR_r_A9, FR_r_XS4, FR_r_A7 // sin for neg
3062 nop.i 0
3063 };;
3064
3065 { .mfi
3066 nop.m 0
3067 fma.s1 FR_p_Poly0L = FR_p_Poly0L,f1,FR_p_Poly3H //Low & High add
3068 nop.i 0
3069 }
3070 { .mfi
3071 nop.m 0
3072 (p13) fms.s1 FR_p_OddPoly0L = FR_p_Poly0H, FR_p_AbsXM1, FR_p_OddPoly0H
3073 // Recurrent computations - multiplying by X-1 (low part)
3074 nop.i 0
3075 };;
3076
3077 { .mfi
3078 nop.m 0
3079 fma.s1 FR_r_A3 = FR_r_A5, FR_r_XS4, FR_r_A3 // sin for neg
3080 nop.i 0
3081 }
3082 { .mfi
3083 nop.m 0
3084 fma.s1 FR_r_XS7 = FR_r_XS4, FR_r_XS2, f0 // xs^6
3085 nop.i 0
3086 };;
3087
3088 { .mfi
3089 nop.m 0
3090 fma.s1 FR_r_Res3L = FR_r_Res3L, f1, FR_r_TL // sin for neg
3091 nop.i 0
3092 }
3093 { .mfi
3094 nop.m 0
3095 fma.s1 FR_r_XS8 = FR_r_XS4, FR_r_XS4, f0 // sin for neg
3096 nop.i 0
3097 };;
3098
3099 { .mfi
3100 nop.m 0
3101 fma.s1 FR_p_Temp0H = FR_p_Poly3L,f1,FR_p_Poly1L //Low & High add
3102 nop.i 0
3103 };;
3104
3105 { .mfi
3106 nop.m 0
3107 fma.s1 FR_r_XS7 = FR_r_XS7, FR_r_XS, f0 // xs^7
3108 nop.i 0
3109 };;
3110
3111 { .mfi
3112 nop.m 0
3113 fma.s1 FR_r_ResL = FR_r_Res3L, FR_r_XS, FR_r_ResL//sin for neg
3114 nop.i 0
3115 }
3116 { .mfi
3117 nop.m 0
3118 fma.s1 FR_r_Tail = FR_r_A7, FR_r_XS8, FR_r_A3 // sin tail res
3119 nop.i 0
3120 };;
3121
3122 { .mfi
3123 nop.m 0
3124 fma.s1 FR_p_Poly0L = FR_p_Poly0L,f1,FR_p_Temp0H //Low & High add
3125 nop.i 0
3126 };;
3127
3128
3129 { .mfi
3130 nop.m 0
3131 fma.s1 FR_r_ResL = FR_r_Tail,FR_r_XS7,FR_r_ResL //sin for neg
3132 nop.i 0
3133 };;
3134
3135 { .mfi
3136 nop.m 0
3137 (p13) fma.s1 FR_p_OddPoly0L = FR_p_Poly0L, FR_p_AbsXM1, FR_p_OddPoly0L
3138 // Recurrent computations - multiplying by X-1 (low part)
3139 nop.i 0
3140 };;
3141
3142 { .mfi
3143 nop.m 0
3144 fma.s1 FR_r_TT = FR_r_ResL, FR_r_AbsX, f0 // X*sin
3145 nop.i 0
3146 };;
3147
3148 .pred.rel "mutex",p12,p13
3149 { .mfi
3150 nop.m 0
3151 (p12) fma.s0 f8 = FR_p_Poly0H, f1, FR_p_Poly0L // Even
3152 nop.i 0
3153 }
3154 { .mfb
3155 nop.m 0
3156 (p13) fma.s0 f8 = FR_p_OddPoly0H, f1, FR_p_OddPoly0L // Odd
3157 (p14) br.ret.spnt b0 // Exit for 1 <= |X| < 13 path (positive arguments)/////
3158 };;
3159
3160 { .mfi
3161 nop.m 0
3162 (p13) fma.s1 FR_p_Poly0H = FR_p_OddPoly0H, f1, f0
3163 // Recurrent computations
3164 nop.i 0
3165 }
3166 { .mfi
3167 nop.m 0
3168 (p13) fma.s1 FR_p_Poly0L = FR_p_OddPoly0L, f1, f0
3169 // Recurrent computations
3170 nop.i 0
3171 };;
3172
3173 { .mfi
3174 nop.m 0
3175 fma.s1 FR_r_Res1H = FR_r_ResH, FR_r_AbsX, FR_r_TT // X*sin
3176 (p11) cmp.eq p13, p12 = r0, r0
3177 };;
3178
3179 { .mfi
3180 nop.m 0
3181 fms.s1 FR_r_Res1L = FR_r_ResH,FR_r_AbsX,FR_r_Res1H// X*sin
3182 (p9) cmp.eq p13, p12 = r0, r0
3183 };;
3184
3185 { .mfi
3186 nop.m 0
3187 fma.s1 FR_r_Res1L = FR_r_Res1L, f1, FR_r_TT // sin for neg
3188 (p10) cmp.eq p13, p12 = r0, r0
3189 }
3190 { .mfi
3191 nop.m 0
3192 fma.s1 FR_r_TL = FR_p_Poly0L, FR_r_Res1H, f0 // mult by sin
3193 nop.i 0
3194 };;
3195
3196 { .mfi
3197 nop.m 0
3198 fma.s1 FR_r_TL = FR_p_Poly0H,FR_r_Res1L,FR_r_TL//mult by sin
3199 nop.i 0
3200 };;
3201
3202 { .mfi
3203 nop.m 0
3204 fma.s1 FR_r_ResH = FR_p_Poly0H,FR_r_Res1H,FR_r_TL//mult by sin
3205 nop.i 0
3206 };;
3207
3208 { .mfi
3209 nop.m 0
3210 fms.s1 FR_r_ResL = FR_p_Poly0H,FR_r_Res1H,FR_r_ResH//sin mult
3211 nop.i 0
3212 };;
3213
3214 { .mfi
3215 nop.m 0
3216 frcpa.s1 FR_r_Y0,p0 = f1,FR_r_ResH // y = frcpa(b)
3217 nop.i 0
3218 };;
3219
3220 { .mfi
3221 nop.m 0
3222 fneg FR_r_NegOne = f1 // Form -1.0
3223 nop.i 0
3224 }
3225 { .mfi
3226 nop.m 0
3227 fma.s1 FR_r_ResL = FR_r_ResL, f1, FR_r_TL //Low result of mult
3228 nop.i 0
3229 };;
3230
3231 { .mfi
3232 nop.m 0
3233 fma.s1 FR_r_Q0 = f1,FR_r_Y0,f0 // q = a*y
3234 nop.i 0
3235 }
3236 { .mfi
3237 nop.m 0
3238 fnma.s1 FR_r_E0 = FR_r_Y0,FR_r_ResH,f1 // e = 1-b*y
3239 nop.i 0
3240 };;
3241
3242 { .mfi
3243 nop.m 0
3244 fma.s1 FR_r_E2 = FR_r_E0,FR_r_E0,FR_r_E0 // e2 = e+e^2
3245 nop.i 0
3246 }
3247 { .mfi
3248 nop.m 0
3249 fma.s1 FR_r_E1 = FR_r_E0,FR_r_E0,f0 // e1 = e^2
3250 nop.i 0
3251 };;
3252
3253 { .mfi
3254 nop.m 0
3255 fma.s1 FR_r_Y1 = FR_r_Y0,FR_r_E2,FR_r_Y0 // y1 = y+y*e2
3256 nop.i 0
3257 }
3258 { .mfi
3259 nop.m 0
3260 fma.s1 FR_r_E3 = FR_r_E1,FR_r_E1,FR_r_E0 // e3 = e+e1^2
3261 nop.i 0
3262 };;
3263
3264 { .mfi
3265 nop.m 0
3266 fma.s1 FR_r_Y2 = FR_r_Y1,FR_r_E3,FR_r_Y0 // y2 = y+y1*e3
3267 nop.i 0
3268 }
3269 { .mfi
3270 nop.m 0
3271 fnma.s1 FR_r_R0 = FR_r_ResH,FR_r_Q0,f1 // r = a-b*q
3272 nop.i 0
3273 };;
3274
3275 { .mfi
3276 nop.m 0
3277 fnma.s1 FR_r_E4 = FR_r_ResH,FR_r_Y2,f1 // e4 = 1-b*y2
3278 nop.i 0
3279 }
3280 { .mfi
3281 nop.m 0
3282 fma.s1 FR_r_ZH = FR_r_R0,FR_r_Y2,FR_r_Q0 // x = q+r*y2
3283 nop.i 0
3284 };;
3285
3286 { .mfi
3287 nop.m 0
3288 fma.s1 FR_r_Y3 = FR_r_Y2,FR_r_E4,FR_r_Y2 // y3 = y2+y2*e4
3289 nop.i 0
3290 }
3291 { .mfi
3292 nop.m 0
3293 fnma.s1 FR_r_R1 = FR_r_ResH,FR_r_ZH,f1 // r1 = a-b*x
3294 nop.i 0
3295 };;
3296
3297 { .mfi
3298 nop.m 0
3299 fnma.s1 FR_r_R1 = FR_r_ResL,FR_r_ZH,FR_r_R1 // r1=r1-b_lo*X
3300 nop.i 0
3301 }
3302 { .mfi
3303 nop.m 0
3304 (p12) fma.s1 FR_r_ZHN = FR_r_ZH,FR_r_NegOne, f0 // Negate for evens
3305 nop.i 0
3306 };;
3307
3308 .pred.rel "mutex",p13,p12
3309 { .mfi
3310 nop.m 0
3311 (p13) fma.s0 f8 = FR_r_R1,FR_r_Y3,FR_r_ZH // Final result
3312 nop.i 0
3313 }
3314 { .mfb
3315 nop.m 0
3316 (p12) fnma.s0 f8 = FR_r_R1,FR_r_Y3,FR_r_ZHN // Final result
3317 br.ret.sptk b0 // Exit for 1 <= |X| < 13 path (negative arguments)//////
3318 };;
3319 // Path summary: refine 1/x (FR_z_* chain, b = f8), select a coefficient table by |x|
3320 // interval, and begin the sine series (FR_r_*) that the negative-argument exit needs.
3321 //////////// |X| < 1 path /////////////////////////////////////////////////////
3322 //------------------------------------------------------------------------------
3323 .align 64
3324 tgamma_lt_1:
3325 { .mfi
3326 getf.exp GR_p_Exp = FR_p_AbsX // exp of abs X
3327 fma.s1 FR_z_Q0 = f1,FR_z_Y0,f0 // q = a*y, with a = f1
3328 add GR_r_sin_Table2= 0x50, GR_r_sin_Table // Table2 = Table + 0x50
3329 }
3330 { .mfi
3331 ldfpd FR_p_0p5, FR_p_1p5 = [GR_c_Table], 16 // constant pair (0.5, 1.5 per names)
3332 fnma.s1 FR_z_E0 = FR_z_Y0,f8,f1 // e = 1-b*y
3333 add GR_p_Table2 = 0xB0, GR_p_Table // Table2 = Table + 0xB0
3334 };;
3335
3336 { .mfi
3337 ldfd FR_p_0p25 = [GR_c_Table]
3338 fcvt.xf FR_r_XNS = FR_r_IXNS // Convert int repr to float
3339 shr.u GR_p_X_Sgnd = GR_p_X_Sgnd, 60
3340 // Obtain only 4 bits of significand
3341 }
3342 { .mfi
3343 nop.m 0
3344 nop.f 0
3345 add GR_p_Bias = 0xffff, r0 // Set bias
3346 };;
3347
3348 { .mfi
3349 ldfpd FR_r_A2H, FR_r_A2L = [GR_r_sin_Table], 16
3350 nop.f 0
3351 shl GR_p_XN = GR_p_Exp, 4
3352 // Shift exp to 4 bits left to set place for significand
3353 }
3354 { .mlx
3355 ldfe FR_r_A6 = [GR_r_sin_Table2], 16
3356 movl GR_p_0p75 = 0xfffec // 0.75 encoded as (exp << 4) | top 4 significand bits
3357 };;
3358
3359 { .mfi
3360 ldfpd FR_r_A1H, FR_r_A1L = [GR_r_sin_Table], 16
3361 nop.f 0
3362 or GR_p_XN = GR_p_XN, GR_p_X_Sgnd
3363 // Combine exp with 4 high bits of significand
3364 }
3365 { .mfi
3366 ldfe FR_r_A5 = [GR_r_sin_Table2], 16
3367 nop.f 0
3368 sub GR_p_Exp = GR_p_Exp, GR_p_Bias // Unbiased exp
3369 };;
3370
3371 { .mmi
3372 ldfe FR_r_A9 = [GR_r_sin_Table], 16
3373 ldfe FR_r_A4 = [GR_r_sin_Table2], 16
3374 cmp.gtu.unc p10, p11 = GR_p_0p75, GR_p_XN // p10: |x| < 0.75, p11: |x| >= 0.75
3375 };;
3376
3377 { .mfi
3378 ldfe FR_r_A8 = [GR_r_sin_Table], 16
3379 fma.s1 FR_z_E2 = FR_z_E0,FR_z_E0,FR_z_E0 // e2 = e+e^2
3380 (p10) cmp.gt.unc p9, p10 = -2, GR_p_Exp // p9: |x| < 0.25, otherwise p10 stays set
3381 }
3382 { .mfi
3383 ldfe FR_r_A3 = [GR_r_sin_Table2], 16
3384 fma.s1 FR_z_E1 = FR_z_E0,FR_z_E0,f0 // e1 = e^2
3385 (p11) add GR_p_Offset = 168, r0 // [0.75;1] interval
3386 };;
3387
3388 { .mmi
3389 (p10) add GR_p_Offset = 147, r0 // [0.25;0.75] interval
3390 ldfe FR_r_A7 = [GR_r_sin_Table], 16
3391 (p9) cmp.gt.unc p8, p9 = -3, GR_p_Exp // p8: |x| < 0.125, otherwise p9 stays set
3392 };;
3393
3394 .pred.rel "mutex",p9,p8
3395 { .mmi
3396 (p9) add GR_p_Offset = 126, r0 // [0.125;0.25] interval
3397 (p8) add GR_p_Offset = 189, r0 // [0.;0.125] interval
3398 nop.i 0
3399 };;
3400
3401 { .mmf
3402 shladd GR_p_Table = GR_p_Offset, 4, GR_p_Table //Make addresses (Offset*16 + Table)
3403 shladd GR_p_Table2 = GR_p_Offset, 4, GR_p_Table2
3404 fma.s1 FR_r_XS = FR_r_AbsX , f1, FR_r_XNS // xs = |x|-[x]
3405 };;
3406
3407 .pred.rel "mutex",p8,p11
3408 { .mfi
3409 ldfpd FR_p_A5H, FR_p_A5L = [GR_p_Table], 16
3410 (p11) fms.s1 FR_p_XR = f1, f1, FR_p_AbsX // r = 1 - |x|
3411 // for [0.75;1] interval
3412 nop.i 0
3413 }
3414 { .mfi
3415 ldfpd FR_p_A2H, FR_p_A2L = [GR_p_Table2], 16
3416 (p8) fms.s1 FR_p_XR = FR_p_AbsX, f1, f0 // r = |x|
3417 // for [0.;0.125] interval
3418 nop.i 0
3419 };;
3420
3421 { .mfi
3422 ldfpd FR_p_A4H, FR_p_A4L = [GR_p_Table], 16
3423 fma.s1 FR_z_Y1 = FR_z_Y0,FR_z_E2,FR_z_Y0 // y1 = y+y*e2
3424 nop.i 0
3425 }
3426 { .mfi
3427 ldfpd FR_p_A1H, FR_p_A1L = [GR_p_Table2], 16
3428 fma.s1 FR_z_E3 = FR_z_E1,FR_z_E1,FR_z_E0 // e3 = e+e1^2
3429 nop.i 0
3430 };;
3431
3432 .pred.rel "mutex",p9,p10
3433 { .mfi
3434 ldfpd FR_p_A3H, FR_p_A3L = [GR_p_Table], 16
3435 (p9) fms.s1 FR_p_XR = FR_p_AbsX, f1, f0 // r = |x|
3436 // for [0.125;0.25] interval
3437 nop.i 0
3438 }
3439 { .mfi
3440 ldfpd FR_p_A0H, FR_p_A0L = [GR_p_Table2], 16
3441 (p10) fms.s1 FR_p_XR = FR_p_AbsX, f1, FR_p_0p5 // r = |x| - 0.5
3442 // for [0.25;0.75] interval
3443 nop.i 0
3444 };;
3445
3446 { .mmi
3447 ldfe FR_p_A20 = [GR_p_Table], 16
3448 ldfe FR_p_A12 = [GR_p_Table2], 16
3449 nop.i 0
3450 };;
3451
3452 { .mfi
3453 ldfe FR_p_A19 = [GR_p_Table], 16
3454 fma.s1 FR_r_XS2 = FR_r_XS, FR_r_XS, f0 // xs^2
3455 nop.i 0
3456 }
3457 { .mfi
3458 ldfe FR_p_A11 = [GR_p_Table2], 16
3459 nop.f 0
3460 nop.i 0
3461 };;
3462
3463 { .mmi
3464 ldfe FR_p_A18 = [GR_p_Table], 16
3465 ldfe FR_p_A10 = [GR_p_Table2], 16
3466 nop.i 0
3467 };;
3468
3469 .pred.rel "mutex",p12,p13
3470 { .mfi
3471 ldfe FR_p_A17 = [GR_p_Table], 16
3472 fma.s1 FR_z_Y2 = FR_z_Y1,FR_z_E3,FR_z_Y0 // y2 = y+y1*e3
3473 nop.i 0
3474 }
3475 { .mfi
3476 ldfe FR_p_A9 = [GR_p_Table2], 16
3477 fnma.s1 FR_z_R0 = f8,FR_z_Q0,f1 // r = a-b*q
3478 nop.i 0
3479 };;
3480
3481 { .mmi
3482 ldfe FR_p_A16 = [GR_p_Table], 16
3483 ldfe FR_p_A8 = [GR_p_Table2], 16
3484 nop.i 0
3485 };;
3486
3487 { .mmi
3488 ldfe FR_p_A15 = [GR_p_Table], 16
3489 ldfe FR_p_A7 = [GR_p_Table2], 16
3490 nop.i 0
3491 };;
3492
3493 { .mfi
3494 ldfe FR_p_A14 = [GR_p_Table], 16
3495 fma.s1 FR_r_TH = FR_r_A2H, FR_r_XS2, f0 // neg sin
3496 nop.i 0
3497 }
3498 { .mfi
3499 ldfe FR_p_A6 = [GR_p_Table2], 16
3500 fma.s1 FR_r_TL = FR_r_A2L, FR_r_XS2, f0 // neg sin
3501 nop.i 0
3502 };;
3503
3504 { .mfi
3505 ldfe FR_p_A13 = [GR_p_Table], 16
3506 fms.s1 FR_r_XS2L = FR_r_XS, FR_r_XS, FR_r_XS2 // xs^2 delta
3507 nop.i 0
3508 };;
3509 // High/low (H/L) multiply-add steps: gamma polynomial in r (FR_p_*) interleaved with the sine series in xs (FR_r_*).
3510 { .mfi
3511 nop.m 0
3512 fma.s1 FR_p_Temp5H = FR_p_A5H, FR_p_XR, f0 // Low poly
3513 nop.i 0
3514 }
3515 { .mfi
3516 nop.m 0
3517 fma.s1 FR_p_XR2 = FR_p_XR, FR_p_XR, f0 // poly tail
3518 nop.i 0
3519 };;
3520
3521 { .mfi
3522 nop.m 0
3523 fabs FR_r_XS = FR_r_XS // Absolute value of xs
3524 nop.i 0
3525 }
3526 { .mfi
3527 nop.m 0
3528 fma.s1 FR_p_Temp2H = FR_p_A2H, FR_p_XR, f0 // High poly
3529 nop.i 0
3530 };;
3531
3532 { .mfi
3533 nop.m 0
3534 fnma.s1 FR_z_E4 = f8,FR_z_Y2,f1 // e4 = 1-b*y2
3535 nop.i 0
3536 }
3537 { .mfi
3538 nop.m 0
3539 fma.s1 FR_z_ZH = FR_z_R0,FR_z_Y2,FR_z_Q0 // 1/x = q+r*y2
3540 nop.i 0
3541 };;
3542
3543 { .mfi
3544 nop.m 0
3545 fms.s1 FR_r_TT = FR_r_A2H, FR_r_XS2, FR_r_TH // neg sin
3546 nop.i 0
3547 }
3548 { .mfi
3549 nop.m 0
3550 fma.s1 FR_r_ResH = FR_r_TH, f1, FR_r_A1H // neg sin
3551 nop.i 0
3552 };;
3553
3554 { .mfi
3555 nop.m 0
3556 fma.s1 FR_r_TL = FR_r_A2H, FR_r_XS2L, FR_r_TL // neg sin
3557 nop.i 0
3558 };;
3559
3560 { .mfi
3561 nop.m 0
3562 fms.s1 FR_p_Temp5L = FR_p_A5H, FR_p_XR, FR_p_Temp5H // Low poly: rounding error of A5H*r
3563 nop.i 0
3564 }
3565 { .mfi
3566 nop.m 0
3567 fma.s1 FR_p_Poly5H = FR_p_Temp5H, f1, FR_p_A4H // Low poly
3568 nop.i 0
3569 };;
3570
3571 { .mfi
3572 nop.m 0
3573 fms.s1 FR_p_Temp2L = FR_p_A2H, FR_p_XR, FR_p_Temp2H // High poly: rounding error of A2H*r
3574 nop.i 0
3575 }
3576 { .mfi
3577 nop.m 0
3578 fma.s1 FR_p_Poly2H = FR_p_Temp2H, f1, FR_p_A1H // High poly
3579 nop.i 0
3580 };;
3581
3582 { .mfi
3583 nop.m 0
3584 fma.s1 FR_p_XR3 = FR_p_XR2, FR_p_XR, f0 // r^3
3585 nop.i 0
3586 }
3587 { .mfi
3588 nop.m 0
3589 fms.s1 FR_p_XR2L = FR_p_XR, FR_p_XR, FR_p_XR2 // r^2 delta
3590 nop.i 0
3591 };;
3592
3593 { .mfi
3594 nop.m 0
3595 fma.s1 FR_p_A18 = FR_p_A19, FR_p_XR, FR_p_A18 // poly tail
3596 nop.i 0
3597 }
3598 { .mfi
3599 nop.m 0
3600 fma.s1 FR_p_A14 = FR_p_A15, FR_p_XR, FR_p_A14 // poly tail
3601 nop.i 0
3602 };;
3603
3604 { .mfi
3605 nop.m 0
3606 fma.s1 FR_p_XR4 = FR_p_XR2, FR_p_XR2, f0 // poly tail
3607 nop.i 0
3608 }
3609 { .mfi
3610 nop.m 0
3611 fma.s1 FR_z_Y3 = FR_z_Y2,FR_z_E4,FR_z_Y2 // y3 = y2+y2*e4
3612 nop.i 0
3613 };;
3614
3615 { .mfi
3616 nop.m 0
3617 fma.s1 FR_p_Temp5L = FR_p_A5L, FR_p_XR, FR_p_Temp5L // Low poly
3618 nop.i 0
3619 }
3620 { .mfi
3621 nop.m 0
3622 fms.s1 FR_p_Poly5L = FR_p_A4H, f1, FR_p_Poly5H // Low poly
3623 nop.i 0
3624 };;
3625
3626 { .mfi
3627 nop.m 0
3628 fma.s1 FR_p_Temp4H = FR_p_Poly5H, FR_p_XR, f0 // Low poly
3629 nop.i 0
3630 }
3631 { .mfi
3632 nop.m 0
3633 fma.s1 FR_p_Temp2L = FR_p_A2L, FR_p_XR, FR_p_Temp2L // High poly
3634 nop.i 0
3635 };;
3636
3637 { .mfi
3638 nop.m 0
3639 fms.s1 FR_p_Poly2L = FR_p_A1H, f1, FR_p_Poly2H // High poly
3640 nop.i 0
3641 }
3642 { .mfi
3643 nop.m 0
3644 fma.s1 FR_p_Temp1H = FR_p_Poly2H, FR_p_XR, f0 // High poly
3645 nop.i 0
3646 };;
3647
3648 { .mfi
3649 nop.m 0
3650 fms.s1 FR_p_XR3L = FR_p_XR2, FR_p_XR, FR_p_XR3 // r^3 delta
3651 nop.i 0
3652 }
3653 { .mfi
3654 nop.m 0
3655 fma.s1 FR_p_A16 = FR_p_A17, FR_p_XR, FR_p_A16 //poly tail
3656 nop.i 0
3657 };;
3658
3659 { .mfi
3660 nop.m 0
3661 fms.s1 FR_r_ResL = FR_r_A1H, f1, FR_r_ResH // neg sin
3662 nop.i 0
3663 }
3664 { .mfi
3665 nop.m 0
3666 fma.s1 FR_r_TL = FR_r_TL, f1, FR_r_TT // neg sin
3667 nop.i 0
3668 };;
3669
3670 { .mfi
3671 nop.m 0
3672 fma.s1 FR_p_Temp5L = FR_p_Temp5L, f1, FR_p_A4L // Low poly
3673 nop.i 0
3674 }
3675 { .mfi
3676 nop.m 0
3677 fma.s1 FR_p_Poly5L = FR_p_Poly5L, f1, FR_p_Temp5H //Low poly
3678 nop.i 0
3679 };;
3680
3681 { .mfi
3682 nop.m 0
3683 fms.s1 FR_p_Temp4L = FR_p_Poly5H, FR_p_XR, FR_p_Temp4H//Low poly
3684 nop.i 0
3685 }
3686 { .mfi
3687 nop.m 0
3688 fma.s1 FR_p_Poly4H = FR_p_Temp4H, f1, FR_p_A3H // Low poly
3689 nop.i 0
3690 };;
3691
3692 { .mfi
3693 nop.m 0
3694 fma.s1 FR_p_Temp2L = FR_p_Temp2L, f1, FR_p_A1L // High poly
3695 nop.i 0
3696 }
3697 { .mfi
3698 nop.m 0
3699 fma.s1 FR_p_Poly2L = FR_p_Poly2L, f1, FR_p_Temp2H // High poly
3700 nop.i 0
3701 };;
3702
3703 { .mfi
3704 nop.m 0
3705 fms.s1 FR_p_Temp1L = FR_p_Poly2H,FR_p_XR,FR_p_Temp1H //High poly
3706 nop.i 0
3707 }
3708 { .mfi
3709 nop.m 0
3710 fma.s1 FR_p_Poly1H = FR_p_Temp1H, f1, FR_p_A0H // High poly
3711 nop.i 0
3712 };;
3713
3714 { .mfi
3715 nop.m 0
3716 fma.s1 FR_p_A12 = FR_p_A13, FR_p_XR, FR_p_A12 // poly tail
3717 nop.i 0
3718 }
3719 { .mfi
3720 nop.m 0
3721 fma.s1 FR_p_XR3L = FR_p_XR2L, FR_p_XR, FR_p_XR3L // r^3 low
3722 nop.i 0
3723 };;
3724
3725 { .mfi
3726 nop.m 0
3727 fma.s1 FR_p_Poly5L = FR_p_Poly5L, f1, FR_p_Temp5L //Low poly
3728 nop.i 0
3729 }
3730 { .mfi
3731 nop.m 0
3732 fma.s1 FR_p_A10 = FR_p_A11, FR_p_XR, FR_p_A10 //poly tail
3733 nop.i 0
3734 };;
3735
3736 { .mfi
3737 nop.m 0
3738 fms.s1 FR_p_Poly4L = FR_p_A3H, f1, FR_p_Poly4H // Low poly
3739 nop.i 0
3740 }
3741 { .mfi
3742 nop.m 0
3743 fma.s1 FR_p_A6 = FR_p_A7, FR_p_XR, FR_p_A6 // poly tail
3744 nop.i 0
3745 };;
3746
3747 { .mfi
3748 nop.m 0
3749 fma.s1 FR_p_A8 = FR_p_A9, FR_p_XR, FR_p_A8 // poly tail
3750 nop.i 0
3751 }
3752 { .mfi
3753 nop.m 0
3754 fma.s1 FR_p_XR6 = FR_p_XR4, FR_p_XR2, f0 // r^6
3755 nop.i 0
3756 };;
3757
3758 { .mfi
3759 nop.m 0
3760 fma.s1 FR_p_Poly2L = FR_p_Poly2L, f1, FR_p_Temp2L // High poly
3761 nop.i 0
3762 }
3763 { .mfi
3764 nop.m 0
3765 fms.s1 FR_p_Poly1L = FR_p_A0H, f1, FR_p_Poly1H // High poly
3766 nop.i 0
3767 };;
3768
3769 { .mfi
3770 nop.m 0
3771 fma.s1 FR_r_ResL = FR_r_ResL, f1, FR_r_TH // neg sin
3772 nop.i 0
3773 }
3774 { .mfi
3775 nop.m 0
3776 fma.s1 FR_r_TT = FR_r_TL, f1, FR_r_A1L // neg sin
3777 nop.i 0
3778 };;
3779
3780 { .mfi
3781 nop.m 0
3782 fma.s1 FR_p_Temp4L = FR_p_Poly5L,FR_p_XR,FR_p_Temp4L //Low poly
3783 nop.i 0
3784 }
3785 { .mfi
3786 nop.m 0
3787 fma.s1 FR_p_A18 = FR_p_A20, FR_p_XR2, FR_p_A18 // poly tail
3788 nop.i 0
3789 };;
3790
3791 { .mfi
3792 nop.m 0
3793 fma.s1 FR_p_Poly4L = FR_p_Poly4L, f1, FR_p_Temp4H // Low poly
3794 nop.i 0
3795 }
3796 { .mfi
3797 nop.m 0
3798 fma.s1 FR_p_A14 = FR_p_A16, FR_p_XR2, FR_p_A14 // poly tail
3799 nop.i 0
3800 };;
3801
3802 { .mfi
3803 nop.m 0
3804 fma.s1 FR_p_A6 = FR_p_A8, FR_p_XR2, FR_p_A6 // poly tail
3805 nop.i 0
3806 }
3807 { .mfi
3808 nop.m 0
3809 fma.s1 FR_p_A10 = FR_p_A12, FR_p_XR2, FR_p_A10 // poly tail
3810 nop.i 0
3811 };;
3812
3813 { .mfi
3814 nop.m 0
3815 fma.s1 FR_p_Temp1L = FR_p_Poly2L,FR_p_XR,FR_p_Temp1L //High poly
3816 nop.i 0
3817 }
3818 { .mfi
3819 nop.m 0
3820 fma.s1 FR_p_Poly1L = FR_p_Poly1L, f1, FR_p_Temp1H // High poly
3821 nop.i 0
3822 };;
3823
3824 { .mfi
3825 nop.m 0
3826 fma.s1 FR_r_ResL = FR_r_ResL, f1, FR_r_TT // neg sin
3827 nop.i 0
3828 }
3829 { .mfi
3830 nop.m 0
3831 fma.s1 FR_r_TH = FR_r_ResH, FR_r_XS2, f0 // neg sin
3832 nop.i 0
3833 };;
3834
3835 { .mfi
3836 nop.m 0
3837 fma.s1 FR_p_Temp4L = FR_p_Temp4L, f1, FR_p_A3L // Low poly
3838 nop.i 0
3839 }
3840 { .mfi
3841 nop.m 0
3842 fma.s1 FR_p_Poly3H = FR_p_Poly4H, FR_p_XR3, f0 // Low poly
3843 nop.i 0
3844 };;
3845
3846 { .mfi
3847 nop.m 0
3848 fma.s1 FR_p_A14 = FR_p_A18, FR_p_XR4, FR_p_A14 // poly tail
3849 nop.i 0
3850 }
3851 { .mfi
3852 nop.m 0
3853 fma.s1 FR_p_XR8 = FR_p_XR4, FR_p_XR4, f0 // r^8
3854 nop.i 0
3855 };;
3856
3857 { .mfi
3858 nop.m 0
3859 fma.s1 FR_r_TL = FR_r_ResH, FR_r_XS2L, f0 // neg sin
3860 nop.i 0
3861 }
3862 { .mfi
3863 nop.m 0
3864 fnma.s1 FR_z_R1 = f8,FR_z_ZH,f1 // r1 = a-b*x
3865 nop.i 0
3866 };;
3867
3868 { .mfi
3869 nop.m 0
3870 fma.s1 FR_p_Temp1L = FR_p_Temp1L, f1, FR_p_A0L // High poly
3871 nop.i 0
3872 }
3873 { .mfi
3874 nop.m 0
3875 fma.s1 FR_p_A6 = FR_p_A10, FR_p_XR4, FR_p_A6 // poly tail
3876 nop.i 0
3877 };;
3878
3879 { .mfi
3880 nop.m 0
3881 fms.s1 FR_r_TT = FR_r_ResH, FR_r_XS2, FR_r_TH // neg sin
3882 nop.i 0
3883 }
3884 { .mfi
3885 nop.m 0
3886 fma.s1 FR_r_Res3H = FR_r_TH, f1, f1 // neg sin: TH + 1
3887 nop.i 0
3888 };;
3889
3890 { .mfi
3891 nop.m 0
3892 fma.s1 FR_p_Poly4L = FR_p_Poly4L, f1, FR_p_Temp4L // Low poly
3893 nop.i 0
3894 }
3895 { .mfi
3896 nop.m 0
3897 fma.s1 FR_p_Poly3L = FR_p_Poly4H, FR_p_XR3L, f0 // Low poly
3898 nop.i 0
3899 };;
3900
3901 { .mfi
3902 nop.m 0
3903 fma.s1 FR_p_Poly0H = FR_p_Poly3H, f1, FR_p_Poly1H // Result
3904 nop.i 0
3905 }
3906 { .mfi
3907 nop.m 0
3908 fma.s1 FR_r_A7 = FR_r_A8, FR_r_XS2, FR_r_A7 // neg sin
3909 nop.i 0
3910 };;
3911
3912 { .mfi
3913 nop.m 0
3914 fma.s1 FR_r_TL = FR_r_ResL, FR_r_XS2, FR_r_TL // neg sin
3915 nop.i 0
3916 }
3917 { .mfi
3918 nop.m 0
3919 fma.s1 FR_r_XS4 = FR_r_XS2, FR_r_XS2, f0 // xs^4
3920 nop.i 0
3921 };;
3922
3923 { .mfi
3924 nop.m 0
3925 fma.s1 FR_p_Poly1L = FR_p_Poly1L, f1, FR_p_Temp1L // High poly
3926 nop.i 0
3927 }
3928 { .mfi
3929 nop.m 0
3930 fma.s1 FR_p_PolyTail = FR_p_A14, FR_p_XR8, FR_p_A6 // poly tail
3931 nop.i 0
3932 };;
3933
3934 { .mfi
3935 nop.m 0
3936 fms.s1 FR_r_Res3L = f1, f1, FR_r_Res3H // neg sin: 1 - Res3H
3937 nop.i 0
3938 }
3939 { .mfi
3940 nop.m 0
3941 fma.s1 FR_r_ResH = FR_r_Res3H, FR_r_XS, f0 // neg sin
3942 nop.i 0
3943 };;
3944
3945 { .mfi
3946 nop.m 0
3947 fms.s1 FR_p_Temp0L = FR_p_Poly4H,FR_p_XR3,FR_p_Poly3H //Low poly
3948 nop.i 0
3949 }
3950 { .mfi
3951 nop.m 0
3952 fma.s1 FR_p_Poly3L = FR_p_Poly4L,FR_p_XR3,FR_p_Poly3L //Low poly
3953 nop.i 0
3954 };;
3955
3956 { .mfi
3957 nop.m 0
3958 fms.s1 FR_p_Poly0L = FR_p_Poly1H, f1, FR_p_Poly0H // Result
3959 nop.i 0
3960 }
3961 { .mfi
3962 nop.m 0
3963 fma.s1 FR_z_ZL = FR_z_R1,FR_z_Y3, f0 // x_lo = r1*y3
3964 nop.i 0
3965 };;
3966
3967 { .mfi
3968 nop.m 0
3969 fma.s1 FR_r_TL = FR_r_TL, f1, FR_r_TT // neg sin
3970 nop.i 0
3971 }
3972 { .mfi
3973 nop.m 0
3974 fma.s1 FR_r_A3 = FR_r_A4, FR_r_XS2, FR_r_A3 // neg sin
3975 nop.i 0
3976 };;
3977
3978 { .mfi
3979 nop.m 0
3980 fma.s1 FR_p_Poly1L = FR_p_PolyTail,FR_p_XR6,FR_p_Poly1L // High
3981 nop.i 0
3982 }
3983 { .mfi
3984 nop.m 0
3985 fma.s1 FR_r_A5 = FR_r_A6, FR_r_XS2, FR_r_A5 // neg sin
3986 nop.i 0
3987 };;
3988
3989 { .mfi
3990 nop.m 0
3991 fma.s1 FR_r_Res3L = FR_r_Res3L, f1, FR_r_TH // neg sin
3992 nop.i 0
3993 }
3994 { .mfi
3995 nop.m 0
3996 fms.s1 FR_r_ResL = FR_r_Res3H, FR_r_XS, FR_r_ResH // neg sin
3997 nop.i 0
3998 };;
3999
4000 { .mfi
4001 nop.m 0
4002 fma.s1 FR_p_Poly3L = FR_p_Poly3L, f1, FR_p_Temp0L // Low poly
4003 nop.i 0
4004 }
4005 { .mfi
4006 nop.m 0
4007 fma.s1 FR_r_A7 = FR_r_A9, FR_r_XS4, FR_r_A7 // neg sin
4008 nop.i 0
4009 };;
4010
4011 { .mfi
4012 nop.m 0
4013 fma.s1 FR_p_Poly0L = FR_p_Poly0L, f1, FR_p_Poly3H // result
4014 nop.i 0
4015 };;
4016 // Scale the polynomial by 1/x (FR_z_ZH/ZL) and return for positive x (p14); also finish the sine tail.
4017 { .mfi
4018 nop.m 0
4019 (p14) fma.s1 f8 = FR_p_Poly0H, FR_z_ZH, f0 // z*poly
4020 nop.i 0
4021 }
4022 { .mfi
4023 nop.m 0
4024 fma.s1 FR_p_Temp1L = FR_p_Poly0H, FR_z_ZL, f0 // z*poly low
4025 nop.i 0
4026 };;
4027
4028 { .mfi
4029 nop.m 0
4030 fma.s1 FR_r_A3 = FR_r_A5, FR_r_XS4, FR_r_A3 // sin tail
4031 nop.i 0
4032 }
4033 { .mfi
4034 nop.m 0
4035 fma.s1 FR_r_XS7 = FR_r_XS4, FR_r_XS2, f0 // xs^6 (times xs below gives xs^7)
4036 nop.i 0
4037 };;
4038
4039 { .mfi
4040 nop.m 0
4041 fma.s1 FR_r_Res3L = FR_r_Res3L, f1, FR_r_TL // sin low
4042 nop.i 0
4043 }
4044 { .mfi
4045 nop.m 0
4046 fma.s1 FR_r_XS8 = FR_r_XS4, FR_r_XS4, f0 // xs^8
4047 nop.i 0
4048 };;
4049
4050 { .mfi
4051 nop.m 0
4052 fma.s1 FR_p_Temp0H = FR_p_Poly3L, f1, FR_p_Poly1L // result
4053 nop.i 0
4054 };;
4055
4056 { .mfi
4057 nop.m 0
4058 (p14) fms.s1 FR_p_Temp1H = FR_p_Poly0H, FR_z_ZH, f8 // rounding error of the hi product
4059 nop.i 0
4060 };;
4061
4062 { .mfi
4063 nop.m 0
4064 fma.s1 FR_r_XS7 = FR_r_XS7, FR_r_XS, f0 // xs^7
4065 nop.i 0
4066 };;
4067
4068 { .mfi
4069 nop.m 0
4070 fma.s1 FR_r_ResL = FR_r_Res3L, FR_r_XS, FR_r_ResL // lo result
4071 nop.i 0
4072 }
4073 { .mfi
4074 nop.m 0
4075 fma.s1 FR_r_Tail = FR_r_A7, FR_r_XS8, FR_r_A3 // tail result
4076 nop.i 0
4077 };;
4078
4079 { .mfi
4080 nop.m 0
4081 fma.s1 FR_p_Poly0L = FR_p_Poly0L, f1, FR_p_Temp0H // lo result
4082 nop.i 0
4083 };;
4084
4085 { .mfi
4086 nop.m 0
4087 fma.s1 FR_r_ResL = FR_r_Tail, FR_r_XS7, FR_r_ResL // lo result
4088 nop.i 0
4089 };;
4090
4091 { .mfi
4092 nop.m 0
4093 (p14) fma.s1 FR_p_Temp1L = FR_p_Poly0L,FR_z_ZH,FR_p_Temp1L // lo result: + poly_lo*z_hi
4094 nop.i 0
4095 };;
4096
4097 { .mfi
4098 nop.m 0
4099 fma.s1 FR_r_TT = FR_r_ResL, f1, f0 // for low result
4100 nop.i 0
4101 };;
4102
4103 .pred.rel "mutex",p12,p13
4104 { .mfi
4105 nop.m 0
4106 (p14) fma.s1 FR_p_Temp1L = FR_p_Temp1L, f1, FR_p_Temp1H // for lo res
4107 nop.i 0
4108 };;
4109
4110 { .mfi
4111 (p10) cmp.eq p13, p12 = r0, r0 // set p13, clear p12
4112 fma.s1 FR_r_Res1H = FR_r_ResH, f1, FR_r_TT // hi res
4113 nop.i 0
4114 };;
4115
4116 { .mfb
4117 (p9) cmp.eq p13, p12 = r0, r0 // set p13, clear p12
4118 (p14) fma.s0 f8 = f8, f1, FR_p_Temp1L // Final result
4119 (p14) br.ret.spnt b0 // Exit for 0 < |X| < 1 path (positive arguments)///////
4120 };;
4121 // Negative x: multiply the polynomial by the sine value, take the reciprocal (frcpa + refinement), return it negated.
4122 { .mfi
4123 (p11) cmp.eq p13, p12 = r0, r0 // set p13, clear p12
4124 fms.s1 FR_r_Res1L = FR_r_ResH, f1, FR_r_Res1H // Low sin result
4125 nop.i 0
4126 };;
4127
4128 { .mfi
4129 nop.m 0
4130 fma.s1 FR_r_Res1L = FR_r_Res1L, f1, FR_r_TT // Low sin result
4131 nop.i 0
4132 }
4133 { .mfi
4134 nop.m 0
4135 fma.s1 FR_r_TL = FR_p_Poly0L,FR_r_Res1H,f0 //Low sin result
4136 nop.i 0
4137 };;
4138
4139 { .mfi
4140 nop.m 0
4141 fma.s1 FR_r_TL = FR_p_Poly0H, FR_r_Res1L, FR_r_TL //Low sin
4142 nop.i 0
4143 };;
4144
4145 { .mfi
4146 nop.m 0
4147 fma.s1 FR_r_ResH = FR_p_Poly0H, FR_r_Res1H, FR_r_TL //High sin
4148 nop.i 0
4149 };;
4150
4151 { .mfi
4152 nop.m 0
4153 fms.s1 FR_r_ResL = FR_p_Poly0H,FR_r_Res1H,FR_r_ResH //Low res
4154 nop.i 0
4155 };;
4156
4157 { .mfi
4158 nop.m 0
4159 frcpa.s1 FR_r_Y0,p0 = f1,FR_r_ResH // y = frcpa(b), b = ResH
4160 nop.i 0
4161 };;
4162
4163 { .mfi
4164 nop.m 0
4165 fneg FR_r_NegOne = f1 // Construct -1.0
4166 nop.i 0
4167 }
4168 { .mfi
4169 nop.m 0
4170 fma.s1 FR_r_ResL = FR_r_ResL, f1, FR_r_TL // low sin
4171 nop.i 0
4172 };;
4173
4174 { .mfi
4175 nop.m 0
4176 fma.s1 FR_r_Q0 = f1,FR_r_Y0,f0 // q = a*y
4177 nop.i 0
4178 }
4179 { .mfi
4180 nop.m 0
4181 fnma.s1 FR_r_E0 = FR_r_Y0,FR_r_ResH,f1 // e = 1-b*y
4182 nop.i 0
4183 };;
4184
4185 { .mfi
4186 nop.m 0
4187 fma.s1 FR_r_E2 = FR_r_E0,FR_r_E0,FR_r_E0 // e2 = e+e^2
4188 nop.i 0
4189 }
4190 { .mfi
4191 nop.m 0
4192 fma.s1 FR_r_E1 = FR_r_E0,FR_r_E0,f0 // e1 = e^2
4193 nop.i 0
4194 };;
4195
4196 { .mfi
4197 nop.m 0
4198 fma.s1 FR_r_Y1 = FR_r_Y0,FR_r_E2,FR_r_Y0 // y1 = y+y*e2
4199 nop.i 0
4200 }
4201 { .mfi
4202 nop.m 0
4203 fma.s1 FR_r_E3 = FR_r_E1,FR_r_E1,FR_r_E0 // e3 = e+e1^2
4204 nop.i 0
4205 };;
4206
4207 { .mfi
4208 nop.m 0
4209 fma.s1 FR_r_Y2 = FR_r_Y1,FR_r_E3,FR_r_Y0 // y2 = y+y1*e3
4210 nop.i 0
4211 }
4212 { .mfi
4213 nop.m 0
4214 fnma.s1 FR_r_R0 = FR_r_ResH,FR_r_Q0,f1 // r = a-b*q
4215 nop.i 0
4216 };;
4217
4218 { .mfi
4219 nop.m 0
4220 fnma.s1 FR_r_E4 = FR_r_ResH,FR_r_Y2,f1 // e4 = 1-b*y2
4221 nop.i 0
4222 }
4223 { .mfi
4224 nop.m 0
4225 fma.s1 FR_r_ZH = FR_r_R0,FR_r_Y2,FR_r_Q0 // x = q+r*y2
4226 nop.i 0
4227 };;
4228
4229 { .mfi
4230 nop.m 0
4231 fma.s1 FR_r_Y3 = FR_r_Y2,FR_r_E4,FR_r_Y2 // y3 = y2+y2*e4
4232 nop.i 0
4233 }
4234 { .mfi
4235 nop.m 0
4236 fnma.s1 FR_r_R1 = FR_r_ResH,FR_r_ZH,f1 // r1 = a-b*x
4237 nop.i 0
4238 };;
4239
4240 { .mfi
4241 nop.m 0
4242 fnma.s1 FR_r_R1 = FR_r_ResL,FR_r_ZH,FR_r_R1 // r1=r1 - b_lo*X
4243 nop.i 0
4244 }
4245 { .mfi
4246 nop.m 0
4247 fma.s1 FR_r_ZHN = FR_r_ZH,FR_r_NegOne, f0 // Negate: ZHN = -ZH
4248 nop.i 0
4249 };;
4250
4251 .pred.rel "mutex",p13,p12
4252 { .mfb
4253 nop.m 0
4254 fnma.s0 f8 = FR_r_R1,FR_r_Y3,FR_r_ZHN // Result for neg: -(ZH + R1*Y3)
4255 br.ret.sptk b0 // Exit for 0 < |X| < 1 path (negative arguments)//////
4256 };;
4257
4258
4259
4260 // Dispatch for special inputs: denormals go to tgamma_lt_1 or overflow; NaN/NaTVal/+INF return x+x; the rest go to error paths.
4261 // SPECIALS (x for natval, nan, +/-inf or +/-0) ///////////////////////////////
4262 //------------------------------------------------------------------------------
4263 .align 32
4264 tgammal_spec:
4265 { .mlx
4266 nop.m 0
4267 movl GR_DenOverflow = 0x2000000000000001 // significand threshold for denormal overflow
4268 }
4269 { .mfi
4270 nop.m 0
4271 fclass.m p9,p0 = f8,0xB // +/-denormals
4272 nop.i 0
4273 };;
4274 { .mfi
4275 nop.m 0
4276 fclass.m p6,p0 = f8,0x1E1 // Test x for natval, nan, +inf
4277 nop.i 0
4278 };;
4279 { .mfi
4280 nop.m 0
4281 fclass.m p7,p8 = f8,0x7 // p7: +/-0, p8: anything else
4282 nop.i 0
4283 }
4284
4285 { .mfi
4286 (p9) cmp.ltu.unc p10,p11 = GR_l_signif_Z, GR_DenOverflow // p10: below threshold, p11: not below
4287 (p9) fnorm.s0 f8 = f8
4288 nop.i 0
4289 };;
4290
4291 { .mfb
4292 nop.m 0
4293 (p9) fcvt.fx.trunc.s1 FR_n_IXN = FR_l_AbsX // Round by truncate
4294 (p11) br.cond.sptk tgamma_lt_1 // Return to gamma ('good' denormal)////////////
4295 };;
4296
4297 { .mfb
4298 nop.m 0
4299 nop.f 0
4300 (p10) br.cond.spnt tgammal_overflow // "Bad" denormal - overflow! /////////////
4301 };;
4302
4303 { .mfi
4304 nop.m 0
4305 mov FR_X = f8 // for error handler
4306 nop.i 0
4307 }
4308 { .mfb
4309 nop.m 0
4310 (p6) fma.s0 f8 = f8,f1,f8 // res = x + x
4311 (p6) br.ret.spnt b0 // Exit for NAN, INF and NatVals ////////////////////////
4312 };;
4313 .pred.rel "mutex",p7,p8
4314 { .mfi
4315 (p7) mov GR_Parameter_TAG = 256 // negative
4316 (p7) frcpa.s0 f8,p0 = f1,f8 // f8 = 1/(+/-0); NOTE(review): said "Raise V flag", presumably Z (zero divide) - confirm
4317 nop.i 0
4318 }
4319 { .mfb
4320 nop.m 0
4321 nop.f 0
4322 (p8) br.cond.spnt tgammal_singularity // p8 = not +/-0 (presumably -INF; comment used to say +ZERO) //
4323 };;
4324
4325 { .mfb
4326 nop.m 0
4327 nop.f 0
4328 br.cond.spnt tgammal_libm_err // Fall-through is the p7 case: +/-ZERO //////////
4329 };;
4330
4331
4332
4333 // Saves x in FR_X, sets tag 256, forms f8 = frcpa(0/0) and jumps to the common error handler.
4334 // SINGULARITY (x is negative integer or 0) ////////////////////////////////////
4335 //------------------------------------------------------------------------------
4336 .align 32
4337 tgammal_singularity:
4338 { .mfi
4339 nop.m 0
4340 mov FR_X = f8 // For error handler
4341 mov GR_Parameter_TAG = 256 // negative
4342 }
4343 { .mfb
4344 nop.m 0
4345 frcpa.s0 f8,p0 = f0,f0 // Raise V flag (0/0 in user status field s0)
4346 br.cond.sptk tgammal_libm_err // Call error handler /////////////////////
4347 // with singularity error /////////////////
4348 };;
4349
4350
4351
4352 // Builds a signed infinite result by squaring a huge value (sign taken from x), sets tag 255, jumps to the error handler.
4353 // OVERFLOW (result is too big and cannot be represented by normal value) //////
4354 // ( X > 1755.54 and for denormals with abs value less than 0x2000000000000001 )
4355 //------------------------------------------------------------------------------
4356 .align 32
4357 tgammal_overflow:
4358 { .mfi
4359 addl r8 = 0x1FFFE, r0 // Exp of INF (NOTE(review): presumably the largest finite exponent - confirm)
4360 fcmp.lt.s1 p15,p14 = f8,f0 // p14 - pos arg, p15 - neg arg
4361 nop.i 0
4362 };;
4363
4364 { .mfi
4365 setf.exp f9 = r8 // f9 = value with that exponent
4366 mov FR_X = f8 // For error handler
4367 mov GR_Parameter_TAG = 255 // overflow
4368 };;
4369
4370 .pred.rel "mutex",p14,p15
4371 { .mfi
4372 nop.m 0
4373 (p14) fma.s0 f8 = f9,f9,f0 // Set I,O and +INF result (f9*f9)
4374 nop.i 0
4375 }
4376 { .mfb
4377 nop.m 0
4378 (p15) fnma.s0 f8 = f9,f9,f0 // Set I,O and -INF result (-(f9*f9))
4379 br.cond.sptk tgammal_libm_err // Call error handler /////////////////////
4380 // with overflow error ////////////////////
4381 };;
4382
4383
4384
4385
4386 // Returns a tiny value whose sign depends on the parity of trunc(x); no error handler is called on this path.
4387 // UNDERFLOW (x is negative noninteger with big absolute value) ////////////////
4388 //------------------------------------------------------------------------------
4389 .align 32
4390 tgammal_underflow:
4391 { .mfi
4392 nop.m 0
4393 fcvt.fx.trunc.s1 FR_u_IXN = f8 // Convert arg to int repres. in FR
4394 nop.i 0
4395 };;
4396
4397 { .mmi
4398 getf.sig GR_u_XN = FR_u_IXN // integer value into a general register
4399 mov r11 = 0x00001 // biased exponent 1
4400 nop.i 0
4401 };;
4402
4403 { .mfi
4404 setf.exp f9 = r11 // f9 = value with biased exponent 1
4405 nop.f 0
4406 nop.i 0
4407 };;
4408
4409 { .mfi
4410 nop.m 0
4411 nop.f 0
4412 tbit.z p6,p7 = GR_u_XN,0 // p6: bit 0 clear (even), p7: bit 0 set (odd)
4413 };;
4414
4415 .pred.rel "mutex",p6,p7
4416 { .mfi
4417 nop.m 0
4418 (p6) fms.s0 f8 = f9,f9,f9 // even trunc(x): f9*f9 - f9, negative result
4419 nop.i 0
4420 }
4421 { .mfb
4422 nop.m 0
4423 (p7) fma.s0 f8 = f9,f9,f9 // odd trunc(x): f9*f9 + f9, positive result
4424 br.ret.sptk b0 // Exit for underflow path //////////////////////////////
4425 };;
4426
4427
4428 GLOBAL_LIBM_END(tgammal)
4429
4430
4431
4432 // Stores X, Y and the provisional result in a 64-byte frame, calls __libm_error_support, then reloads f8 from the result slot.
4433 ////////////////// Tgammal error handler ///////////////////////////////////////
4434 //------------------------------------------------------------------------------
4435 LOCAL_LIBM_ENTRY(__libm_error_region)
4436 tgammal_libm_err:
4437 .prologue
4438 { .mfi
4439 add GR_Parameter_Y=-32,sp // Parameter 2 address (old sp - 32)
4440 nop.f 0
4441 .save ar.pfs,GR_SAVE_PFS
4442 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
4443 }
4444 { .mfi
4445 .fframe 64
4446 add sp=-64,sp // Create new stack
4447 nop.f 0
4448 mov GR_SAVE_GP=gp // Save gp
4449 };;
4450 { .mmi
4451 stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack, then pointer += 16
4452 add GR_Parameter_X = 16,sp // Parameter 1 address (new sp + 16)
4453 .save b0, GR_SAVE_B0
4454 mov GR_SAVE_B0=b0 // Save b0
4455 };;
4456 .body
4457 { .mib
4458 stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
4459 add GR_Parameter_RESULT = 0,GR_Parameter_Y // copy of advanced pointer (new sp + 48)
4460 nop.b 0 // Parameter 3 address
4461 }
4462 { .mib
4463 stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
4464 add GR_Parameter_Y = -16,GR_Parameter_Y // back to the Parameter 2 slot
4465 br.call.sptk b0=__libm_error_support# // Call error handling function
4466 };;
4467 { .mmi
4468 nop.m 999
4469 nop.m 999
4470 add GR_Parameter_RESULT = 48,sp // recompute result slot address after the call
4471 };;
4472 { .mmi
4473 ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
4474 .restore sp
4475 add sp = 64,sp // Restore stack pointer
4476 mov b0 = GR_SAVE_B0 // Restore return address
4477 };;
4478 { .mib
4479 mov gp = GR_SAVE_GP // Restore gp
4480 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
4481 br.ret.sptk b0 // Return
4482 };;
4483
4484 LOCAL_LIBM_END(__libm_error_region#)
4485
4486 .type __libm_error_support#,@function
4487 .global __libm_error_support#