]> git.ipfire.org Git - thirdparty/glibc.git/blame - ports/sysdeps/ia64/fpu/w_tgammal.S
Fix typos.
[thirdparty/glibc.git] / ports / sysdeps / ia64 / fpu / w_tgammal.S
CommitLineData
d5efd131
MF
1.file "tgammal.s"
2
3
4// Copyright (c) 2002 - 2005, Intel Corporation
5// All rights reserved.
6//
7// Contributed 2002 by the Intel Numerics Group, Intel Corporation
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// * Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// * Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// * The name of Intel Corporation may not be used to endorse or promote
21// products derived from this software without specific prior written
22// permission.
23
0347518d
MF
24// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
d5efd131 26// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
0347518d 27// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
d5efd131 28// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
0347518d
MF
29// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
30// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
31// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
d5efd131 32// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
0347518d
MF
33// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35//
d5efd131 36// Intel Corporation is the author of this code, and requests that all
0347518d 37// problem reports or change requests be submitted to it directly at
d5efd131
MF
38// http://www.intel.com/software/products/opensource/libraries/num.htm.
39//
40// History
41//==============================================================
42// 01/16/02 Initial version
43// 05/20/02 Cleaned up namespace and sf0 syntax
44// 02/10/03 Reordered header: .section, .global, .proc, .align;
45// used data8 for long double table values
46// 03/17/03 Moved tgammal_libm_err label into .proc region
47// 04/10/03 Changed error codes for overflow and negative integers
48// 03/31/05 Reformatted delimiters between data tables
49//
50// API
51//==============================================================
52// long double tgammal(long double)
53//
54// Resources Used:
55//
56// Floating-Point Registers: f8-f15
57// f32-f127
58//
0347518d 59// General Purpose Registers: r32-r67
d5efd131
MF
60//
61// Predicate Registers: p6-p15
62//
63//*********************************************************************
64//
65// IEEE Special Conditions:
66//
67// tgammal(+inf) = +inf
0347518d
MF
68// tgammal(-inf) = QNaN
69// tgammal(+/-0) = +/-inf
d5efd131
MF
70// tgammal(x<0, x - integer) = QNaN
71// tgammal(SNaN) = QNaN
72// tgammal(QNaN) = QNaN
73//
74//*********************************************************************
75// Overview of operation
76//==============================================================
77//
78// Algorithm description
79// ---------------------
80//
0347518d 81// There are 3 main paths in the implementation
d5efd131
MF
82// (and additional special values branches)
83//
84// 1) |X| >= 13 - Stirling formula computation
85// a) Positive arguments:
0347518d
MF
86// TGAMMAL(X) = exp((X-0.5)*ln(X) - X + C + S(Z)),
87// where C = 0.5*ln(2*Pi), Z = 1/X, S(Z) - Bernoulli polynomial
d5efd131 88// (up to 'B18' term).
0347518d
MF
89// Some of these calculations are done in multiprecision.
90// Ln returns a multiprecision result too,
d5efd131 91// and exp also accepts and returns a pair of values.
0347518d 92//
d5efd131
MF
93// b) Negative arguments
94// TGAMMAL(-X) = PI/(X*TGAMMAL(X)*sin(PI*X)).
95// (X*sin(PI*X))/PI calculated in parallel with TGAMMAL.
96// Here we use polynomial of 9th degree with 2 multiprecision steps.
0347518d 97// Argument range reduction is:
d5efd131
MF
98// N = [x] with round to nearest, r = x - N, -0.5 <= r < 0.5
99// After ((X-0.5)*ln(X) - X + C + S(Z)) is computed we just negate
100// its result and compute exp of the negated argument (1/exp(x)=exp(-x)).
101// Then we multiply the exp result by PI/(X*sin(PI*X)).
102//
103// 2) 1 <= |X| < 13 - Polynomial part
104// a) Positive arguments:
105// All values are split into unit intervals:
106// #0->[2;3], #1->[3;4], #2->[4;5]...
107// For even intervals we just use polynomial computation with degree 20
108// and first 6 multiprecision computations.
109// Range reduction looks like
110// N = [x] with truncate, r = x - N - 0.5, -0.5 <= r < 0.5
0347518d 111// For odd intervals we use the recurrence formula:
d5efd131 112// TGAMMAL(X) = TGAMMAL(X-1)*(X-1)
0347518d 113// The [1;2] interval is split into 3 subranges:
d5efd131
MF
114// [1;1.25], [1.25;1.75], [1.75;2] with the same polynomial forms
115//
116// b) Negative arguments
117// TGAMMAL(-X) = PI/(X*TGAMMAL(X)*sin(PI*X)).
118// (X*sin(PI*X))/PI calculated in parallel with TGAMMAL.
119// After multiplication by TGAMMAL(X) result we calculate reciprocal
120// and get final result.
121//
122// 3) 0 < |X| < 1 - Near 0 part
123// a) Here we use the recurrence formula TGAMMAL(X) = TGAMMAL(X+1)/X
0347518d 124// TGAMMAL(X+1) calculated as shown above,
d5efd131
MF
125// The 1/X result is obtained in parallel. Then we just multiply these values.
126// There is one additional separate subrange, [0;0.125], with its own
127// set of polynomial constants.
128//
129// b) Negative arguments
130// TGAMMAL(-X) = PI/(TGAMMAL(X+1)*sin(PI*X)).
131// There is no need to compute 1/X.
132
133
134
135RODATA
136
137.align 16
138LOCAL_OBJECT_START(Constants_Tgammal_log_80_Q)
0347518d 139// log2_hi, log2_lo, Q_6, Q_5, Q_4, Q_3, Q_2, Q_1
d5efd131
MF
140data4 0x00000000,0xB1721800,0x00003FFE,0x00000000
141data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
142data4 0xA51BE0AF,0x92492453,0x00003FFC,0x00000000
143data4 0xA0CFD29F,0xAAAAAB73,0x0000BFFC,0x00000000
144data4 0xCCCE3872,0xCCCCCCCC,0x00003FFC,0x00000000
145data4 0xFFFFB4FB,0xFFFFFFFF,0x0000BFFC,0x00000000
146data4 0xAAAAAAAB,0xAAAAAAAA,0x00003FFD,0x00000000
0347518d 147data4 0x00000000,0x80000000,0x0000BFFE,0x00000000
d5efd131
MF
148LOCAL_OBJECT_END(Constants_Tgammal_log_80_Q)
149
150.align 64
151LOCAL_OBJECT_START(Constants_Tgammal_log_80_Z_G_H_h1)
0347518d 152// Z1 - 16 bit fixed, G1 and H1 IEEE single, h1 IEEE double extended (second row of each pair)
d5efd131 153data4 0x00008000,0x3F800000,0x00000000,0x00000000
0347518d 154data4 0x00000000,0x00000000,0x00000000,0x00000000
d5efd131
MF
155data4 0x00007879,0x3F70F0F0,0x3D785196,0x00000000
156data4 0xEBA0E0D1,0x8B1D330B,0x00003FDA,0x00000000
157data4 0x000071C8,0x3F638E38,0x3DF13843,0x00000000
158data4 0x9EADD553,0xE2AF365E,0x00003FE2,0x00000000
159data4 0x00006BCB,0x3F579430,0x3E2FF9A0,0x00000000
0347518d 160data4 0x752F34A2,0xF585FEC3,0x0000BFE3,0x00000000
d5efd131 161data4 0x00006667,0x3F4CCCC8,0x3E647FD6,0x00000000
0347518d
MF
162data4 0x893B03F3,0xF3546435,0x00003FE2,0x00000000
163data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000
164data4 0x39CDD2AC,0xBABA62E0,0x00003FE4,0x00000000
165data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000
d5efd131 166data4 0x457978A1,0x8718789F,0x00003FE2,0x00000000
0347518d
MF
167data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000
168data4 0x3185E56A,0x9442DF96,0x0000BFE4,0x00000000
169data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000
170data4 0x2BBE2CBD,0xCBF9A4BF,0x00003FE4,0x00000000
171data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000
172data4 0x852D5935,0xF3537535,0x00003FE3,0x00000000
173data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000
174data4 0x46CDF32F,0xA1F1E699,0x0000BFDF,0x00000000
175data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000
176data4 0xD8484CE3,0x84A61856,0x00003FE4,0x00000000
d5efd131 177data4 0x00004925,0x3F124920,0x3F0F4303,0x00000000
0347518d
MF
178data4 0xFF28821B,0xC7DD97E0,0x0000BFE2,0x00000000
179data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000
180data4 0xEF1FD32F,0xD3C4A887,0x00003FE3,0x00000000
181data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000
182data4 0x464C76DA,0x84672BE6,0x00003FE5,0x00000000
d5efd131 183data4 0x00004211,0x3F042108,0x3F29516A,0x00000000
0347518d 184data4 0x18835FB9,0x9A43A511,0x0000BFE5,0x00000000
d5efd131
MF
185LOCAL_OBJECT_END(Constants_Tgammal_log_80_Z_G_H_h1)
186
187.align 64
188LOCAL_OBJECT_START(Constants_Tgammal_log_80_Z_G_H_h2)
189// Z2 - 16 bit fixed, G2 and H2 IEEE single, h2 IEEE double extended (second row of each pair)
0347518d
MF
190data4 0x00008000,0x3F800000,0x00000000,0x00000000
191data4 0x00000000,0x00000000,0x00000000,0x00000000
192data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000
d5efd131 193data4 0x211398BF,0xAD08B116,0x00003FDB,0x00000000
0347518d
MF
194data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000
195data4 0xC376958E,0xB106790F,0x00003FDE,0x00000000
196data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000
197data4 0x79A7679A,0xFD03F242,0x0000BFDA,0x00000000
198data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000
199data4 0x05E7AE08,0xF03F81C3,0x0000BFDF,0x00000000
200data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000
d5efd131 201data4 0x049EB22F,0xD1B87D3C,0x00003FDE,0x00000000
0347518d
MF
202data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000
203data4 0x3A9E81E0,0xFABC8B95,0x00003FDF,0x00000000
d5efd131 204data4 0x00007C98,0x3F792FB0,0x3CDCFE47,0x00000000
0347518d
MF
205data4 0x7C4B5443,0xF5F3653F,0x00003FDF,0x00000000
206data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000
207data4 0xF65A1773,0xE78AB204,0x00003FE0,0x00000000
208data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000
209data4 0x7B8EF695,0xDB7CBFFF,0x0000BFE0,0x00000000
210data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000
211data4 0xCF773FB3,0xC0241AEA,0x0000BFE0,0x00000000
212data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000
213data4 0xC9539FDF,0xFC8F4D48,0x00003FE1,0x00000000
214data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000
215data4 0x954665C2,0x9CD035FB,0x0000BFE1,0x00000000
216data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000
217data4 0xDD367A30,0xEC9017C7,0x00003FE1,0x00000000
218data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000
219data4 0xCB11189C,0xEE6625D3,0x0000BFE1,0x00000000
220data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000
221data4 0xBE11C424,0xA49C8DB5,0x0000BFE0,0x00000000
d5efd131
MF
222LOCAL_OBJECT_END(Constants_Tgammal_log_80_Z_G_H_h2)
223
224.align 64
225LOCAL_OBJECT_START(Constants_Tgammal_log_80_h3_G_H)
0347518d
MF
226// h3 IEEE double extended, H3 and G3 IEEE single
227data4 0x112666B0,0xAAACAAB1,0x00003FD3,0x3F7FFC00
d5efd131 228data4 0x9B7FAD21,0x90051030,0x00003FD8,0x3F7FF400
0347518d
MF
229data4 0xF4D783C4,0xA6B46F46,0x00003FDA,0x3F7FEC00
230data4 0x11C6DDCA,0xDA148D88,0x0000BFD8,0x3F7FE400
d5efd131 231data4 0xCA964D95,0xCE65C1D8,0x0000BFD8,0x3F7FDC00
0347518d
MF
232data4 0x23412D13,0x883838EE,0x0000BFDB,0x3F7FD400
233data4 0x983ED687,0xB7E5CFA1,0x00003FDB,0x3F7FCC08
234data4 0xE3C3930B,0xDBE23B16,0x0000BFD9,0x3F7FC408
235data4 0x48AA4DFC,0x9B92F1FC,0x0000BFDC,0x3F7FBC10
236data4 0xCE9C8F7E,0x9A8CEB15,0x0000BFD9,0x3F7FB410
237data4 0x0DECE74A,0x8C220879,0x00003FDC,0x3F7FAC18
d5efd131 238data4 0x2F053150,0xB25CA912,0x0000BFDA,0x3F7FA420
0347518d
MF
239data4 0xD9A5BE20,0xA5876555,0x00003FDB,0x3F7F9C20
240data4 0x2053F087,0xC919BB6E,0x00003FD9,0x3F7F9428
241data4 0x041E9A77,0xB70BDA79,0x00003FDC,0x3F7F8C30
242data4 0xEA1C9C30,0xF18A5C08,0x00003FDA,0x3F7F8438
243data4 0x796D89E5,0xA3790D84,0x0000BFDD,0x3F7F7C40
244data4 0xA2915A3A,0xE1852369,0x0000BFDD,0x3F7F7448
245data4 0xA39ED868,0xD803858F,0x00003FDC,0x3F7F6C50
246data4 0x9417EBB7,0xB2EEE356,0x0000BFDD,0x3F7F6458
247data4 0x9BB0D07F,0xED5C1F8A,0x0000BFDC,0x3F7F5C68
248data4 0xE87C740A,0xD6D201A0,0x0000BFDD,0x3F7F5470
249data4 0x1CA74025,0xE8DEBF5E,0x00003FDC,0x3F7F4C78
d5efd131 250data4 0x1F34A7EB,0x9A995A97,0x0000BFDC,0x3F7F4488
0347518d
MF
251data4 0x359EED97,0x9CB0F742,0x0000BFDA,0x3F7F3C90
252data4 0xBBC6A1C8,0xD6F833C2,0x0000BFDD,0x3F7F34A0
253data4 0xE71090EC,0xE1F68F2A,0x00003FDC,0x3F7F2CA8
254data4 0xC160A74F,0xD1881CF1,0x0000BFDB,0x3F7F24B8
255data4 0xD78CB5A4,0x9AD05AE2,0x00003FD6,0x3F7F1CC8
256data4 0x9A77DC4B,0xE658CB8E,0x0000BFDD,0x3F7F14D8
257data4 0x6BD6D312,0xBA281296,0x00003FDC,0x3F7F0CE0
258data4 0xF95210D0,0xB478BBEB,0x0000BFDB,0x3F7F04F0
259data4 0x38800100,0x39400480,0x39A00640,0x39E00C41 // H's start here
260data4 0x3A100A21,0x3A300F22,0x3A4FF51C,0x3A6FFC1D
d5efd131 261data4 0x3A87F20B,0x3A97F68B,0x3AA7EB86,0x3AB7E101
0347518d
MF
262data4 0x3AC7E701,0x3AD7DD7B,0x3AE7D474,0x3AF7CBED
263data4 0x3B03E1F3,0x3B0BDE2F,0x3B13DAAA,0x3B1BD766
264data4 0x3B23CC5C,0x3B2BC997,0x3B33C711,0x3B3BBCC6
265data4 0x3B43BAC0,0x3B4BB0F4,0x3B53AF6D,0x3B5BA620
266data4 0x3B639D12,0x3B6B9444,0x3B7393BC,0x3B7B8B6D
d5efd131
MF
267LOCAL_OBJECT_END(Constants_Tgammal_log_80_h3_G_H)
268
0347518d 269.align 64
d5efd131
MF
270LOCAL_OBJECT_START(Constants_Tgammal_stirling)
271//0.5*ln(2*Pi)=9.1893853320467266954096885e-01 + 7.2239360881843238220057778e-17
272data8 0x3FED67F1C864BEB4, 0x3C94D252F2400510
0347518d 273// Stirling series coefficients from Bernoulli numbers: "B2k" below = B(2k)/(2k*(2k-1))
d5efd131
MF
274data8 0xAAAAAAAAAAAAAAAB, 0x00003FFB //B2 = 8.3333333333333333333333333333e-02
275data8 0xBF66C16C16C16C17 //B4 = -2.7777777777777777777777777778e-03
276data8 0x3F4A01A01A01A01A //B6 = 7.9365079365079365079365079365e-04
277data8 0xBF43813813813814 //B8 = -5.9523809523809523809523809524e-04
278data8 0x3F4B951E2B18FF23 //B10 = 8.4175084175084175084175084175e-04
279data8 0xBF5F6AB0D9993C7D //B12 = -1.9175269175269175269175269175e-03
280data8 0x3F7A41A41A41A41A //B14 = 6.4102564102564102564102564103e-03
281data8 0xBF9E4286CB0F5398 //B16 = -2.9550653594771241830065359477e-02
282data8 0x3FC6FE96381E0680 //B18 = 1.7964437236883057316493849002e-01
283data8 0x3FE0000000000000 // 0.5
284LOCAL_OBJECT_END(Constants_Tgammal_stirling)
285
0347518d 286.align 64
d5efd131 287LOCAL_OBJECT_START(Constants_Tgammal_sin)
0347518d 288// Polynomial coefficients for the sin(Pi*x)/Pi, 0 <= |x| < 0.5
d5efd131
MF
289//A2 = 8.1174242528335360802316245099e-01 + 5.1302254650266899774269946201e-18
290data8 0x3FE9F9CB402BC46C, 0x3C57A8B3819B7CEC
291//A1 = -1.6449340668482264060656916627e+00 + -3.0210280454695477893051351574e-17
292data8 0xBFFA51A6625307D3, 0xBC816A402079D0EF
293data8 0xF3AEF1FFCCE6C813, 0x0000BFE3 //A9 = -7.0921197799923779127089910470e-09
294data8 0x87D54408E6D4BB9D, 0x00003FE9 //A8 = 2.5300880778252693946712766029e-07
295data8 0xEA12033DCE7B8ED9, 0x0000BFED //A7 = -6.9758403885461690048189307819e-06
296data8 0x9BA38C952A59D1A8, 0x00003FF2 //A6 = 1.4842878710882320255092707181e-04
297data8 0x99C0B55178FF0E38, 0x0000BFF6 //A5 = -2.3460810348048124421268761990e-03
298data8 0xD63402E798FEC896, 0x00003FF9 //A4 = 2.6147847817611456327417812320e-02
299data8 0xC354723906D95E92, 0x0000BFFC //A3 = -1.9075182412208257558294507774e-01
300LOCAL_OBJECT_END(Constants_Tgammal_sin)
301
0347518d 302.align 64
d5efd131
MF
303LOCAL_OBJECT_START(Constants_Tgammal_exp_64_Arg)
304data4 0x00000000,0xB17217F4,0x00003FF2,0x00000000 // L_hi = hi part log(2)/2^12
305data4 0xF278ECE6,0xF473DE6A,0x00003FD4,0x00000000 // L_lo = lo part log(2)/2^12
306LOCAL_OBJECT_END(Constants_Tgammal_exp_64_Arg)
307
308LOCAL_OBJECT_START(Constants_Tgammal_exp_64_A)
309data4 0xB1B736A0,0xAAAAAAAB,0x00003FFA,0x00000000 // A3
310data4 0x90CD6327,0xAAAAAAAB,0x00003FFC,0x00000000 // A2
311data4 0xFFFFFFFF,0xFFFFFFFF,0x00003FFD,0x00000000 // A1
312LOCAL_OBJECT_END(Constants_Tgammal_exp_64_A)
313
314LOCAL_OBJECT_START(Constants_Tgammal_exp_64_T1)
0347518d
MF
315data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29
316data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5
d5efd131
MF
317data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC
318data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D
319data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA
320data4 0x3F9EF532,0x3FA0B051,0x3FA27043,0x3FA43516
321data4 0x3FA5FED7,0x3FA7CD94,0x3FA9A15B,0x3FAB7A3A
322data4 0x3FAD583F,0x3FAF3B79,0x3FB123F6,0x3FB311C4
323data4 0x3FB504F3,0x3FB6FD92,0x3FB8FBAF,0x3FBAFF5B
324data4 0x3FBD08A4,0x3FBF179A,0x3FC12C4D,0x3FC346CD
325data4 0x3FC5672A,0x3FC78D75,0x3FC9B9BE,0x3FCBEC15
326data4 0x3FCE248C,0x3FD06334,0x3FD2A81E,0x3FD4F35B
327data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5
328data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A
329data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177
330data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C
331LOCAL_OBJECT_END(Constants_Tgammal_exp_64_T1)
332
333LOCAL_OBJECT_START(Constants_Tgammal_exp_64_T2)
0347518d
MF
334data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4
335data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7
336data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E
337data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349
338data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987
339data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA
340data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610
341data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A
342data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8
343data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA
344data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50
345data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA
346data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07
347data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269
348data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE
d5efd131
MF
349data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37
350LOCAL_OBJECT_END(Constants_Tgammal_exp_64_T2)
351
352LOCAL_OBJECT_START(Constants_Tgammal_exp_64_W1)
353data8 0x0000000000000000, 0xBE384454171EC4B4
354data8 0xBE6947414AA72766, 0xBE5D32B6D42518F8
355data8 0x3E68D96D3A319149, 0xBE68F4DA62415F36
356data8 0xBE6DDA2FC9C86A3B, 0x3E6B2E50F49228FE
357data8 0xBE49C0C21188B886, 0x3E64BFC21A4C2F1F
358data8 0xBE6A2FBB2CB98B54, 0x3E5DC5DE9A55D329
359data8 0x3E69649039A7AACE, 0x3E54728B5C66DBA5
360data8 0xBE62B0DBBA1C7D7D, 0x3E576E0409F1AF5F
361data8 0x3E6125001A0DD6A1, 0xBE66A419795FBDEF
362data8 0xBE5CDE8CE1BD41FC, 0xBE621376EA54964F
363data8 0x3E6370BE476E76EE, 0x3E390D1A3427EB92
364data8 0x3E1336DE2BF82BF8, 0xBE5FF1CBD0F7BD9E
365data8 0xBE60A3550CEB09DD, 0xBE5CA37E0980F30D
366data8 0xBE5C541B4C082D25, 0xBE5BBECA3B467D29
367data8 0xBE400D8AB9D946C5, 0xBE5E2A0807ED374A
368data8 0xBE66CB28365C8B0A, 0x3E3AAD5BD3403BCA
369data8 0x3E526055C7EA21E0, 0xBE442C75E72880D6
370data8 0x3E58B2BB85222A43, 0xBE5AAB79522C42BF
371data8 0xBE605CB4469DC2BC, 0xBE589FA7A48C40DC
372data8 0xBE51C2141AA42614, 0xBE48D087C37293F4
373data8 0x3E367A1CA2D673E0, 0xBE51BEBB114F7A38
374data8 0xBE6348E5661A4B48, 0xBDF526431D3B9962
375data8 0x3E3A3B5E35A78A53, 0xBE46C46C1CECD788
376data8 0xBE60B7EC7857D689, 0xBE594D3DD14F1AD7
377data8 0xBE4F9C304C9A8F60, 0xBE52187302DFF9D2
378data8 0xBE5E4C8855E6D68F, 0xBE62140F667F3DC4
379data8 0xBE36961B3BF88747, 0x3E602861C96EC6AA
380data8 0xBE3B5151D57FD718, 0x3E561CD0FC4A627B
381data8 0xBE3A5217CA913FEA, 0x3E40A3CC9A5D193A
382data8 0xBE5AB71310A9C312, 0x3E4FDADBC5F57719
383data8 0x3E361428DBDF59D5, 0x3E5DB5DB61B4180D
384data8 0xBE42AD5F7408D856, 0x3E2A314831B2B707
385LOCAL_OBJECT_END(Constants_Tgammal_exp_64_W1)
386
387LOCAL_OBJECT_START(Constants_Tgammal_exp_64_W2)
388data8 0x0000000000000000, 0xBE641F2537A3D7A2
389data8 0xBE68DD57AD028C40, 0xBE5C77D8F212B1B6
390data8 0x3E57878F1BA5B070, 0xBE55A36A2ECAE6FE
391data8 0xBE620608569DFA3B, 0xBE53B50EA6D300A3
392data8 0x3E5B5EF2223F8F2C, 0xBE56A0D9D6DE0DF4
393data8 0xBE64EEF3EAE28F51, 0xBE5E5AE2367EA80B
394data8 0x3E47CB1A5FCBC02D, 0xBE656BA09BDAFEB7
395data8 0x3E6E70C6805AFEE7, 0xBE6E0509A3415EBA
396data8 0xBE56856B49BFF529, 0x3E66DD3300508651
397data8 0x3E51165FC114BC13, 0x3E53333DC453290F
398data8 0x3E6A072B05539FDA, 0xBE47CD877C0A7696
399data8 0xBE668BF4EB05C6D9, 0xBE67C3E36AE86C93
400data8 0xBE533904D0B3E84B, 0x3E63E8D9556B53CE
401data8 0x3E212C8963A98DC8, 0xBE33138F032A7A22
402data8 0x3E530FA9BC584008, 0xBE6ADF82CCB93C97
403data8 0x3E5F91138370EA39, 0x3E5443A4FB6A05D8
404data8 0x3E63DACD181FEE7A, 0xBE62B29DF0F67DEC
405data8 0x3E65C4833DDE6307, 0x3E5BF030D40A24C1
406data8 0x3E658B8F14E437BE, 0xBE631C29ED98B6C7
407data8 0x3E6335D204CF7C71, 0x3E529EEDE954A79D
408data8 0x3E5D9257F64A2FB8, 0xBE6BED1B854ED06C
409data8 0x3E5096F6D71405CB, 0xBE3D4893ACB9FDF5
410data8 0xBDFEB15801B68349, 0x3E628D35C6A463B9
411data8 0xBE559725ADE45917, 0xBE68C29C042FC476
412data8 0xBE67593B01E511FA, 0xBE4A4313398801ED
413data8 0x3E699571DA7C3300, 0x3E5349BE08062A9E
414data8 0x3E5229C4755BB28E, 0x3E67E42677A1F80D
415data8 0xBE52B33F6B69C352, 0xBE6B3550084DA57F
416data8 0xBE6DB03FD1D09A20, 0xBE60CBC42161B2C1
417data8 0x3E56ED9C78A2B771, 0xBE508E319D0FA795
418data8 0xBE59482AFD1A54E9, 0xBE2A17CEB07FD23E
419data8 0x3E68BF5C17365712, 0x3E3956F9B3785569
420LOCAL_OBJECT_END(Constants_Tgammal_exp_64_W2)
421
422
423
424LOCAL_OBJECT_START(Constants_Tgammal_poly)
425
0347518d 426// Polynomial coefficients for the tgammal(x), 2 <= |x| < 3
d5efd131
MF
427//A5 = 2.8360780594841213109180699803e-02 + 2.2504152891014320704380000000e-19
428data8 0x3F9D0A9BC49353D2, 0x3C109AEA0F23CE2D
429//A4 = 1.0967323400216015538699565468e-01 + 9.9225166000430644587276000000e-18
430data8 0x3FBC138B89492C5B, 0x3C66E138506D5652
431//A3 = 2.5387124684114281691904579930e-01 + 2.2667777637607113205546600000e-17
432data8 0x3FD03F6D2FA4F4F8, 0x3C7A2258DA8CD8B1
433data8 0xC5866457328BC39B, 0x00003FE3 //A20 = 5.7487331964156762795056629138e-09
434data8 0xE93D9F1ACD59C929, 0x0000BFE4 //A19= -1.3576396100397317396956445658e-08
435data8 0xE33389C8F6CBA813, 0x00003FE5 //A18 = 2.6449714924964597501721434271e-08
436data8 0x8FE7B25B9CD26D2A, 0x0000BFE7 //A17= -6.7011017946055513660266853311e-08
437data8 0xB89F4721BFBC15B0, 0x00003FE8 //A16 = 1.7194280320370423615174419192e-07
438data8 0xE49CBDC1874EBABA, 0x0000BFE9 //A15= -4.2582353660153782928729466776e-07
439data8 0x913AF50A336129CA, 0x00003FEB //A14 = 1.0820500665257088283172211622e-06
440data8 0xABCF0F7313B3B332, 0x0000BFEC //A13= -2.5601510627710417669568115706e-06
441//A2 = 6.5455857798133676439533701341e-01 + 1.3292075193155190798867000000e-18
442data8 0x3FE4F224D4B7E01C, 0x3C3885014A2B8319
443//A1 = 9.3473452162608550164435428087e-01 + 3.2785154201417136611642400000e-17
444data8 0x3FEDE9585F1A7093, 0x3C82E63C1B5028BF
445//A0 = 1.3293403881791368004172682049e+00 + 2.2005689328949279282607500000e-16
446data8 0x3FF544FA6D47B38F, 0x3CAFB6AA9829E81F
447data8 0xF3668F799997C76D, 0x00003FED //A12 = 7.2539039479124273660331538367e-06
448data8 0xD6C6BBD54CDEAEB1, 0x0000BFEE //A11= -1.2801665282681088568639378920e-05
449data8 0x809E4763B06F6883, 0x00003FF1 //A10 = 6.1329973609906572700697893187e-05
450data8 0x8443B000F8F9A71A, 0x00003FED //A9 = 3.9417864189995544394564413428e-06
451data8 0xC5C7E6D62A6991D8, 0x00003FF4 //A8 = 7.5447412886334708803357581519e-04
452data8 0xD2AF690725C62D88, 0x00003FF5 //A7 = 1.6074004848394703022110823298e-03
453data8 0xAA44E635D4B7B682, 0x00003FF8 //A6 = 1.0392403425906843901680697839e-02
454//
0347518d 455// Polynomial coefficients for the tgammal(x), 4 <= |x| < 5
d5efd131
MF
456//A5 = 1.1600674810589555185913468449e+00 + 3.0229979112715124660731000000e-17
457data8 0x3FF28FA2EB44D22E, 0x3C816D285234C815
458//A4 = 3.1374268565470946334983182169e+00 + 1.3694868953995008497659600000e-16
459data8 0x400919734073B1E1, 0x3CA3BC83CD7E9565
460//A3 = 7.0834593993741057360580271052e+00 + 3.3899702569039156457249800000e-16
461data8 0x401C5576617B6C1F, 0x3CB86D6431213296
462data8 0xA4A5FB49C094966B, 0x00003FDA //A20 = 9.3591760106637809309720130828e-12
463data8 0xA9260DA0F51D7ED8, 0x00003FDD //A19 = 7.6919898428091669411809372180e-11
464data8 0xA16441DFB14BD6E1, 0x00003FE0 //A18 = 5.8713933014370867331213494535e-10
465data8 0x95F098D9C2234849, 0x00003FE3 //A17 = 4.3638234584169302324461091035e-09
466data8 0x8581817400E5AD2B, 0x00003FE6 //A16 = 3.1084260332429955234755367839e-08
467data8 0xE272940E373EBE15, 0x00003FE8 //A15 = 2.1089573544273993580820317236e-07
468data8 0xB6B3391145D226FB, 0x00003FEB //A14 = 1.3612217421122787182942706259e-06
469data8 0x8B9428C4DF95FCD5, 0x00003FEE //A13 = 8.3195416382628990683949003789e-06
470//A2 = 1.2665135075272345943631080445e+01 + 9.8721896915973874255877000000e-16
471data8 0x4029548C95A76F38, 0x3CD1C8BE715B8E13
472//A1 = 1.6154969393303069580269948347e+01 + 9.6850518810678379641029000000e-16
473data8 0x403027AC12FC1E1E, 0x3CD172711C15501B
474//A0 = 1.1631728396567448058362970187e+01 + 8.7078125362814179268673000000e-16
475data8 0x40274371E7866C65, 0x3CCF5F8A1A5FACA0
476data8 0xC94A903114272C03, 0x00003FF0 //A12 = 4.7991576836334427243159066630e-05
477data8 0x8844262960E04BE6, 0x00003FF3 //A11 = 2.5990716419283017929486175141e-04
478data8 0xAC5418A76767678D, 0x00003FF5 //A10 = 1.3147621245497801180184809726e-03
479data8 0xCA231B6EFE959132, 0x00003FF7 //A9 = 6.1687358811367989146517222415e-03
480data8 0xDA38E39C13819D2A, 0x00003FF9 //A8 = 2.6638454961912040754759086920e-02
481data8 0xD696DF8D8389FE53, 0x00003FFB //A7 = 1.0477995539298934056097943975e-01
482data8 0xBDD5C153048BC435, 0x00003FFD //A6 = 3.7077144754791605130056406006e-01
483//
0347518d 484// Polynomial coefficients for the tgammal(x), 6 <= |x| < 7
d5efd131
MF
485//A5 = 6.7169398121054200601065531373e+01 + 2.9481001527213915901489600000e-15
486data8 0x4050CAD76B377BA0, 0x3CEA8DDB2B2DE93E
487//A4 = 1.6115104376855398982115730178e+02 + 1.3422421925418824418257300000e-14
488data8 0x406424D559BDC687, 0x3D0E397FDB5B33DC
489//A3 = 3.1812194028053562533386866562e+02 + 3.9881709875858650942409600000e-14
490data8 0x4073E1F377A6CF73, 0x3D26738F63FE9C4C
491data8 0xD6E1B5FF90CAABD3, 0x00003FE1 //A20 = 1.5634700199277480081025480635e-09
492data8 0xD451987B925DD37E, 0x00003FE4 //A19 = 1.2358576813211397717382327174e-08
493data8 0xBFC151B67FA58E6B, 0x00003FE7 //A18 = 8.9292951435632759686382657901e-08
494data8 0xA9034C5E1D67572E, 0x00003FEA //A17 = 6.2962205718327848327368724720e-07
495data8 0x8E40F6EAA30A71EC, 0x00003FED //A16 = 4.2394926442967995119170095258e-06
496data8 0xE3C3541B03A1C350, 0x00003FEF //A15 = 2.7151465666109594512258841637e-05
497data8 0xACE2E58436B2DDCE, 0x00003FF2 //A14 = 1.6487723793339152877117376243e-04
498data8 0xF7EAF8D8D1CAA3D1, 0x00003FF4 //A13 = 9.4573158112768812533636022369e-04
499//A2 = 4.8664351544258869353143381886e+02 + 4.7424047995944376868895400000e-14
500data8 0x407E6A4BD6D9463B, 0x3D2AB2868D79E192
501//A1 = 5.1615277644992545447166776285e+02 + 3.0901956935588717379242200000e-14
502data8 0x40802138E2DC003B, 0x3D216570FB601AEA
503//A0 = 2.8788527781504433278314536437e+02 + 2.8213174117085164944959600000e-14
504data8 0x4071FE2A1911F7D6, 0x3D1FC3E4CF4DB5AF
505data8 0xA72B88E48D3D1BAB, 0x00003FF7 //A12 = 5.1016252919939028020562237471e-03
506data8 0xD2EFB1067DB4FFB2, 0x00003FF9 //A11 = 2.5749059441230515023024615917e-02
507data8 0xF788AF9522205C24, 0x00003FFB //A10 = 1.2086617635601742290221382521e-01
508data8 0x861A6CE06CB29EAF, 0x00003FFE //A9 = 5.2384071807018493367136112163e-01
509data8 0x84FBDE0947718B58, 0x00004000 //A8 = 2.0778727617851237754568261869e+00
510data8 0xEEC1371E265A2C3A, 0x00004001 //A7 = 7.4610858525146049022238037342e+00
511data8 0xBF514B9BE68ED59D, 0x00004003 //A6 = 2.3914694993947572859629197920e+01
512//
0347518d 513// Polynomial coefficients for the tgammal(x), 8 <= |x| < 9
d5efd131
MF
514//A5 = 5.8487447114416836484451778233e+03 + 4.7365465221455983144182900000e-13
515data8 0x40B6D8BEA568B6FD, 0x3D60AA4D44C2589B
516//A4 = 1.2796464063087094473303295672e+04 + 1.2373341702514898266244200000e-12
517data8 0x40C8FE3B666B532D, 0x3D75C4752C5B4783
518//A3 = 2.2837606581322281272150576115e+04 + 2.6598064610627891398831000000e-13
519data8 0x40D64D66D23A7764, 0x3D52B77B3A10EA5C
520data8 0xB23418F75B0BE22A, 0x00003FE9 //A20 = 3.3192989594206801808678663868e-07
521data8 0xA984A7BC8B856ED2, 0x00003FEC //A19 = 2.5260177918662350066375115788e-06
522data8 0x921A49729416372C, 0x00003FEF //A18 = 1.7416797068239475136398213598e-05
523data8 0xF5BB9415CC399CA4, 0x00003FF1 //A17 = 1.1717449586392814601938207599e-04
524data8 0xC50B91A40B81F9DF, 0x00003FF4 //A16 = 7.5166775151159345732094429036e-04
525data8 0x96002572326DB203, 0x00003FF7 //A15 = 4.5776541559407384162139204300e-03
526data8 0xD81A1A595E4157BA, 0x00003FF9 //A14 = 2.6379634345126284099420760736e-02
527data8 0x92B700D0CFECADD8, 0x00003FFC //A13 = 1.4327622675407940907282658100e-01
528//A2 = 3.1237895525940199149772524834e+04 + 3.1280450505163186432331700000e-12
529data8 0x40DE8179504C0878, 0x3D8B83BB33FBB766
530//A1 = 2.9192841741344487672904506326e+04 + 7.9300780509779689630767000000e-13
531data8 0x40DC8235DF171691, 0x3D6BE6C780EE54DF
532//A0 = 1.4034407293483411194756627083e+04 + 1.4038139346291543309253700000e-12
533data8 0x40CB693422315F90, 0x3D78B23746113FCE
534data8 0xBAE50807548BC711, 0x00003FFE //A12 = 7.3005724123917935346868107005e-01
535data8 0xDE28B1F57E68CFB6, 0x00004000 //A11 = 3.4712338349724065462763671443e+00
536data8 0xF4DCA5A5FF901118, 0x00004002 //A10 = 1.5303868912154033908205911714e+01
537data8 0xF85AAA1AD5E84E5E, 0x00004004 //A9 = 6.2088539523416399361048051373e+01
538data8 0xE5AA8BB1BF02934D, 0x00004006 //A8 = 2.2966619406617480799195651466e+02
539data8 0xBF6CFEFD67F59845, 0x00004008 //A7 = 7.6570306334640770654588802417e+02
540data8 0x8DB5D2F001635C29, 0x0000400A //A6 = 2.2673639984182571062068713002e+03
541//
0347518d 542// Polynomial coefficients for the tgammal(x), 10 <= |x| < 11
d5efd131
MF
543//A5 = 7.2546009516580589115619659424e+05 + 1.0343348865365065212891728822e-10
544data8 0x412623A830B99290, 0x3DDC6E7C157611C4
545//A4 = 1.4756292870840241666883230209e+06 + 8.1516565365333844166705674775e-11
546data8 0x4136842D497E56AF, 0x3DD66837E4C3F9EE
547//A3 = 2.4356116926500420086085796356e+06 + 3.5508860076560925641351069404e-10
548data8 0x4142950DD8A8C1AF, 0x3DF866C8E3DD0980
549data8 0xB7FD0D1EEAC38EB4, 0x00003FF1 //A20 = 8.7732544640091602721643775932e-05
550data8 0xA9345C64AC750AE9, 0x00003FF4 //A19 = 6.4546407626804942279126469603e-04
551data8 0x8BEABC81BE1E93C9, 0x00003FF7 //A18 = 4.2699261134524096128048819443e-03
552data8 0xE1CD281EDD7315F8, 0x00003FF9 //A17 = 2.7563646660310313164706189622e-02
553data8 0xAD8A5BA6D0FD9758, 0x00003FFC //A16 = 1.6947310643831556048460963841e-01
554data8 0xFCDDA464AD3F182E, 0x00003FFE //A15 = 9.8775699098518676937088606052e-01
555data8 0xAE0DCE2F7B60D1AE, 0x00004001 //A14 = 5.4391852309591064073782104822e+00
556data8 0xE1745D9ABEB8D1A7, 0x00004003 //A13 = 2.8181819161363002758615770457e+01
557//A2 = 3.0619656223573554307222366333e+06 + 1.0819940302945474471259520006e-10
558data8 0x41475C66CFA967E4, 0x3DDDBDDB2A27334B
559//A1 = 2.6099413018962685018777847290e+06 + 3.6851882860056025385268615240e-10
560data8 0x4143E98AA6A48974, 0x3DF9530D42589AB6
561//A0 = 1.1332783889487853739410638809e+06 + 1.9339350553312096248591829758e-10
562data8 0x41314ADE639225C9, 0x3DEA946DD6C2C8D3
563data8 0x88BCFAAE71812A1C, 0x00004006 //A12 = 1.3673820009490115307300592012e+02
564data8 0x9A770F5AB540A326, 0x00004008 //A11 = 6.1786031215382040427126476507e+02
565data8 0xA170C1D2C6B413FC, 0x0000400A //A10 = 2.5830473201524594051391525170e+03
566data8 0x9AE56061CB02EB55, 0x0000400C //A9 = 9.9133441230507404119297200255e+03
567data8 0x872390769650FBE2, 0x0000400E //A8 = 3.4595564309496661629764193479e+04
568data8 0xD3E5E8D6923910C1, 0x0000400F //A7 = 1.0849181904819284819615140521e+05
569data8 0x930D70602F50B754, 0x00004011 //A6 = 3.0116351174131169193070583741e+05
570//
0347518d 571// Polynomial coefficients for the tgammal(x), 12 <= |x| < 13
d5efd131
MF
572//A5 = 1.2249876249976964294910430908e+08 + 6.0051348061679753770848000000e-09
573data8 0x419D34BB29FFC39D, 0x3E39CAB72E01818D
574//A4 = 2.3482765927605420351028442383e+08 + 1.1874729051592862323641700000e-08
575data8 0x41ABFE5F168D56FA, 0x3E4980338AA7B04B
576//A3 = 3.6407329688125067949295043945e+08 + 2.6657200942150363994658700000e-08
577data8 0x41B5B35150E199A5, 0x3E5C9F79C0EB5300
578data8 0xE89AE0F8D726329D, 0x00003FF9 //A20 = 2.8394164465429105626588451540e-02
579data8 0xCF90981F86E38013, 0x00003FFC //A19 = 2.0270002071785908652476845915e-01
580data8 0xA56C658079CA8C4A, 0x00003FFF //A18 = 1.2923704984019263122675412350e+00
581data8 0x80AEF96A67C5615A, 0x00004002 //A17 = 8.0427183300456238315262463506e+00
582data8 0xBE886D7529678931, 0x00004004 //A16 = 4.7633230047847868242503413461e+01
583data8 0x858EDBA4CE2F7508, 0x00004007 //A15 = 2.6711607799594541057655957154e+02
584data8 0xB0B0A3AF388274F0, 0x00004009 //A14 = 1.4135199810126975119809102782e+03
585data8 0xDBA87137988751EF, 0x0000400B //A13 = 7.0290552818218513870879313985e+03
586//A2 = 4.2828433593031734228134155273e+08 + 3.9760422293645854535247300000e-08
587data8 0x41B98719AFEE2947, 0x3E6558A17E0D3007
588//A1 = 3.4008253676084774732589721680e+08 + 1.2558352335001093116071000000e-09
589data8 0x41B4453F68C2C6EB, 0x3E159338C5BC7EC3
590//A0 = 1.3684336546556583046913146973e+08 + 2.6786516700381562934240300000e-08
591data8 0x41A05020CAEE5EA5, 0x3E5CC3058A858579
592data8 0xFF5E3940FB4BA576, 0x0000400D //A12 = 3.2687111823895439312116108631e+04
593data8 0x8A08C124C7F74B6C, 0x00004010 //A11 = 1.4134701786994123329786229006e+05
594data8 0x89D701953540BFFB, 0x00004012 //A10 = 5.6459209892773907605385652281e+05
595data8 0xFC46344B3116C3AD, 0x00004013 //A9 = 2.0666305367147234406757715163e+06
596data8 0xD183EBD7A400151F, 0x00004015 //A8 = 6.8653979211730981618367536737e+06
597data8 0x9C083A40742112F4, 0x00004017 //A7 = 2.0451444503543981795037456447e+07
598data8 0xCD3C475B1A8B6662, 0x00004018 //A6 = 5.3801245423495149598177886823e+07
599LOCAL_OBJECT_END(Constants_Tgammal_poly)
600
601
602LOCAL_OBJECT_START(Constants_Tgammal_poly_splitted)
603
// Coefficient table with four consecutive ranges, each holding A0..A20 of a
// degree-20 polynomial.  Inside every range the storage order is:
//   A5, A4, A3   - two data8 words each: leading value and a small
//                  correction term (the "hi + lo" sums in the comments)
//   A20 .. A13   - two data8 words each: 64-bit significand followed by the
//                  sign/exponent word
//   A2, A1, A0   - hi + lo pairs again
//   A12 .. A6    - significand + sign/exponent entries
// Every coefficient therefore occupies 16 bytes, 21 * 16 = 336 bytes per
// range.  The values in the comments are the decimal equivalents of the hex
// words; the hex words are what gets assembled.
//
// NOTE(review): A1 = -0.57721566490153... in this range equals minus the
// Euler-Mascheroni constant to the digits shown, which is consistent with an
// expansion of Gamma(1+t) around t = 0 - confirm against the evaluation code.
0347518d 604// Polynomial coefficients for the tgammal(x), 1 <= |x| < 1.25
d5efd131
MF
605//A5 = -9.8199506890310417350775651357e-01+ -3.2546247786122976510752200000e-17
606data8 0xBFEF6C80EC38B509, 0xBC82C2FA7A3DE3BD
607//A4 = 9.8172808683439960475425323239e-01 + 4.4847611775298520359811400000e-17
608data8 0x3FEF6A51055096B0, 0x3C89DA56DE95EFE4
609//A3 = -9.0747907608088618225394839101e-01 +-1.0244057366544064435443970000e-16
610data8 0xBFED0A118F324B62, 0xBC9D86C7B9EBCFFF
611data8 0xB8E3FDAA66CC738E, 0x00003FFB //A20 = 9.0278608095877488976217714815e-02
612data8 0xA76067AE1738699C, 0x0000BFFD //A19 =-3.2690738678103132837070881737e-01
613data8 0x9D66B13718408C44, 0x00003FFE //A18 = 6.1484820933424283818320582920e-01
614data8 0xD4AC67BBB4AE5599, 0x0000BFFE //A17 =-8.3075569470082063491389474937e-01
615data8 0xF1426ED1C1488DB3, 0x00003FFE //A16 = 9.4241993542644505594957058785e-01
616data8 0xFC12EB07AA6F4B6B, 0x0000BFFE //A15 =-9.8466366707947121954333549690e-01
617data8 0xFF2B32CFE5B0DDC8, 0x00003FFE //A14 = 9.9675290656677214804168895915e-01
618data8 0xFFD8E7E6FF3662EA, 0x0000BFFE //A13 =-9.9940347089360552383472582319e-01
619//A2 = 9.8905599532797250361682017683e-01 + 5.1760162410376024240867300000e-17
620data8 0x3FEFA658C23B1578, 0x3C8DD673A61F6FE7
621//A1 = -5.7721566490153275452712478000e-01+ -1.0607935612223465065923310000e-16
622data8 0xBFE2788CFC6FB618, 0xBC9E9346622D53B7
623//A0 = 9.9999999999999988897769753748e-01 + 1.1102230245372554544790880000e-16
624data8 0x3FEFFFFFFFFFFFFF, 0x3C9FFFFFFFF51E4E
625data8 0xFFF360DF628F0BC9, 0x00003FFE //A12 = 9.9980740979895815468216470840e-01
626data8 0xFFEF8F9A72B40480, 0x0000BFFE //A11 = -9.9974916001038145045939523470e-01
627data8 0xFFE037B8C7E39952, 0x00003FFE //A10 = 9.9951504002809911822597567307e-01
628data8 0xFFC01E08F348BED2, 0x0000BFFE //A9 = -9.9902522772325406705059517941e-01
629data8 0xFF83DAC83119B52C, 0x00003FFE //A8 = 9.9810569179053383842734164901e-01
630data8 0xFEF9F8AB891ABB24, 0x0000BFFE //A7 = -9.9600176036720260345608796766e-01
631data8 0xFE3F0537573C8235, 0x00003FFE //A6 = 9.9314911461918778676646301341e-01
632//
// NOTE(review): A0 = 0.88622692545275... in this range matches
// Gamma(1.5) = sqrt(pi)/2, so the polynomial is presumably centred on 1.5 -
// confirm against the evaluation code.
0347518d 633// Polynomial coefficients for the tgammal(x), 1.25 <= |x| < 1.75
d5efd131
MF
634//A5 = -7.7523052299853054125655660300e-02+ -1.2693512521686721504433600000e-17
635data8 0xBFB3D88CFE50601B, 0xBC6D44ED60EE2170
636//A4 = 1.4464535904462152982041800442e-01 + 2.5426820829345729856648800000e-17
637data8 0x3FC283BD374EB2A9, 0x3C7D50AC436187C3
638//A3 = -1.0729480456477220873257039102e-01+ -6.2429894945456418196551000000e-18
639data8 0xBFBB77AC1CA2EBA5, 0xBC5CCA6BCC422D41
640data8 0xF732D2689F323283, 0x00003FF2 //A20 = 2.3574688251652899567587145422e-04
641data8 0xB6B00E23DE89D13A, 0x0000BFF3 //A19 =-3.4844916488842618776630058875e-04
642data8 0xE98396FE4A1B2799, 0x00003FF3 //A18 =4.4539265198744452020440735977e-04
643data8 0xAF8D235A640DB1A2, 0x0000BFF4 //A17 =-6.6967514303333563295261178346e-04
644data8 0x8513B736C918B261, 0x00003FF5 //A16 = 1.0152970456990865810615917715e-03
645data8 0xC790A1A2C78D8E17, 0x0000BFF5 //A15 =-1.5225598630329403515321688394e-03
646data8 0x959706CFA638CDE2, 0x00003FF6 //A14 = 2.2825614575133879623648932383e-03
647data8 0xE050A6021E129860, 0x0000BFF6 //A13 =-3.4227757733947066666295285936e-03
648//A2 = 4.1481345368830113695679528973e-01 + 3.1252439808354284892632100000e-17
649data8 0x3FDA8C4DBA620D56, 0x3C82040BCB483C76
650//A1 = 3.2338397448885010387886751460e-02 + 3.4437825798552300531443100000e-18
651data8 0x3FA08EA88EE561B1, 0x3C4FC366D6C64806
652//A0 = 8.8622692545275794095971377828e-01 + 7.2689375867553992399219000000e-17
653data8 0x3FEC5BF891B4EF6A, 0x3C94F3877D311C0C
654data8 0xA8275AADC09D16FC, 0x00003FF7 //A12 = 5.1316445128621071486146117136e-03
655data8 0xFBFE2CE9215267A2, 0x0000BFF7 //A11= -7.6902121820788373000579382408e-03
656data8 0xBCC8EEAB67ECD91D, 0x00003FF8 //A10 = 1.1522515369164312742737727262e-02
657data8 0x8D1614BB97E5E8C2, 0x0000BFF9 //A9 = -1.7222443097804730395560633583e-02
658data8 0xD3A963578BE291E3, 0x00003FF9 //A8 = 2.5837606456090186343624210891e-02
659data8 0x9BA7EAE64C42FDF7, 0x0000BFFA //A7 = -3.8001935555045161419575037512e-02
660data8 0xF0115BA1A77607E7, 0x00003FFA //A6 = 5.8610303817173477119764956736e-02
661//
// NOTE(review): A0 = 1 and A1 = -0.42278433509846... = -(1 - Euler's
// constant) in this range; that pattern fits Gamma(2 - t) around t = 0, so
// the reduced argument is presumably measured down from 2 - confirm against
// the evaluation code.
0347518d 662// Polynomial coefficients for the tgammal(x), 1.75 <= |x| < 2.0
d5efd131
MF
663//A5 = 2.6698206874501426502654943818e-04 + 3.4033756836921062797887300000e-20
664data8 0x3F317F3740FE2A68, 0x3BE417093234B06E
665//A4 = 7.4249010753513894345090307070e-02 + 3.9810018444482764697014200000e-18
666data8 0x3FB301FBB0F25A92, 0x3C525BEFFABB622F
667//A3 = -8.1576919247086265851720554565e-02+ -5.2716624487804746360745000000e-19
668data8 0xBFB4E239984650AC, 0xBC2372F1C4F276FF
669data8 0xFEF3AEE71038E9A3, 0x00003FEB //A20 = 1.8995395865421509009969188571e-06
670data8 0xA11CFA2672BF876A, 0x0000BFEB //A19 =-1.2003868221414015771269244270e-06
671data8 0xF8E107215DAE2164, 0x00003FEC //A18 = 3.7085863210303833432006027217e-06
672data8 0xBCDDD3FC011EF7D6, 0x00003FEC //A17 = 2.8143303971756051015245433043e-06
673data8 0x8683C4687FA22E68, 0x00003FEE //A16 = 8.0177018464360416764308252462e-06
674data8 0xFDA09E5D33E32968, 0x00003FEE //A15 = 1.5117372062443781157389064848e-05
675data8 0xFFB00D0CFF4089B4, 0x00003FEF //A14 = 3.0480348961227424242198174995e-05
676data8 0xFEF6C39566785085, 0x00003FF0 //A13 = 6.0788135974125244644334004947e-05
677//A2 = 4.1184033042643969357854416558e-01 + 1.2103396182129232634761000000e-18
678data8 0x3FDA5B978B96BEBF, 0x3C3653AAD0A139E4
679//A1 = -4.2278433509846713445057275749e-01+ -4.9429151528135657430413000000e-18
680data8 0xBFDB0EE6072093CE, 0xBC56CB907027554F
681//A0 = 1.0000000000000000000000000000e+00 + 1.0969171200000000000000000000e-31
682data8 0x3FF0000000000000, 0x3981CC6A5B20B4D5
683data8 0xFF2B7BA9A8D68C37, 0x00003FF1 //A12 = 1.2167446884801403650547161615e-04
684data8 0xFCA53468E3692EF1, 0x00003FF2 //A11 = 2.4094136329542400976250900707e-04
685data8 0x808D698A9C993615, 0x00003FF4 //A10 = 4.9038845704938303659791698883e-04
686data8 0xF10F8E3FB8BB4AFB, 0x00003FF4 //A9 = 9.1957383840999861214472423976e-04
687data8 0x89E224E42F93F005, 0x00003FF6 //A8 = 2.1039333407187324139473634747e-03
688data8 0xBAF374824937A323, 0x00003FF6 //A7 = 2.8526458211545152218493600470e-03
689data8 0xB6BF7564F52140C6, 0x00003FF8 //A6 = 1.1154045718131014476684982178e-02
690//
// The leading words of A0..A3 in this range are bit-identical to those of the
// 1 <= |x| < 1.25 range above; the two sets differ in the low-order
// correction words and in the higher-degree coefficients.
0347518d 691// Polynomial coefficients for the tgammal(x), 0.0 <= |x| < 0.125
d5efd131
MF
692//A5 = -9.8199506890314514073736518185e-01+ -5.9363811993837985890950900000e-17
693data8 0xBFEF6C80EC38B67A, 0xBC911C46B447C81F
694//A4 = 9.8172808683440015986576554496e-01 + 2.7457414262802803699834200000e-17
695data8 0x3FEF6A51055096B5, 0x3C7FA7FF90ACAD1F
696//A3 = -9.0747907608088618225394839101e-01 + -1.0676255850934306734701780000e-16
697data8 0xBFED0A118F324B62, 0xBC9EC5AFB633438D
698data8 0x9217E83FA207CB80, 0x00003FFD //A20 = 2.8533864762086088781083621561e-01
699data8 0xA8DABFA52FDF03EC, 0x0000BFFE //A19= -6.5958783896337186303285832783e-01
700data8 0xE331ED293AF39F9B, 0x00003FFE //A18 = 8.8748056656454687449654731184e-01
701data8 0xF9163C5DDB52419D, 0x0000BFFE //A17= -9.7299554149078295602977718525e-01
702data8 0xFEC0A1C672CB9265, 0x00003FFE //A16 = 9.9512683005268190987854104489e-01
703data8 0xFFD2D65B8EA7B5F4, 0x0000BFFE //A15= -9.9931087241443958201592847861e-01
704data8 0xFFF93AA39EE53445, 0x00003FFE //A14 = 9.9989668364186884793382816496e-01
705data8 0xFFFB99A9A3F5F480, 0x0000BFFE //A13= -9.9993286506283835663204999212e-01
706//A2 = 9.8905599532797250361682017683e-01 + 5.1778575360788420716540100000e-17
707data8 0x3FEFA658C23B1578, 0x3C8DD92B45408D07
708//A1 = -5.7721566490153275452712478000e-01+ -1.0607938730998824663273110000e-16
709data8 0xBFE2788CFC6FB618, 0xBC9E9346F8FDE55B
710//A0 = 9.9999999999999988897769753748e-01 + 1.1102230246251564036631420000e-16
711data8 0x3FEFFFFFFFFFFFFF, 0x3C9FFFFFFFFFFFFF
712data8 0xFFF7FEBB545812C1, 0x00003FFE //A12 = 9.9987785409425126648628395084e-01
713data8 0xFFF00C02E943A3F2, 0x0000BFFE //A11= -9.9975657530855116454438747397e-01
714data8 0xFFE0420AADC53820, 0x00003FFE //A10 = 9.9951565514290485919027183699e-01
715data8 0xFFC01EB42EF27EEB, 0x0000BFFE //A9 = -9.9902526759155739377365522320e-01
716data8 0xFF83DAD0BF23FF12, 0x00003FFE //A8 = 9.9810569378236378800364235948e-01
717data8 0xFEF9F8ABDBCDB2F3, 0x0000BFFE //A7 = -9.9600176044241699109053158187e-01
718data8 0xFE3F05375988491D, 0x00003FFE //A6 = 9.9314911462127599008937257662e-01
719LOCAL_OBJECT_END(Constants_Tgammal_poly_splitted)
720
721.align 64
722LOCAL_OBJECT_START(Constants_Tgammal_common)
// Small shared constants.  Offsets 0x00-0x1F hold four 8-byte words with the
// bit patterns of the doubles 0.5, 1.5, 0.25 and 0.  The 16-byte positive
// overflow threshold follows at offset 0x20: the tgammal entry code adds 0x20
// to GR_c_Table before loading it with ldfe into FR_c_PosOverflow.
723// Double-precision constants (offsets 0x00 - 0x1F)
724data8 0x3FE0000000000000 // 0.5
725data8 0x3FF8000000000000 // 1.5
726data8 0x3FD0000000000000 // 0.25
727data8 0x0000000000000000 // 0
// Offset 0x20: significand word, then sign/exponent word.  The comparison
// that consumes it is annotated "X > 1755.5483" in the entry code.
728data8 0xDB718C066B352E21, 0x00004009 // Positive overflow value
729LOCAL_OBJECT_END(Constants_Tgammal_common)
730
731
732
733//=======================================================
734// Lgamma registers
// Symbolic names (prefix "l_") for the registers used together with the log
// tables (Constants_Tgammal_log_80_*) and the Stirling table in the tgammal
// entry code.  Several names below map to the same physical register; such
// sharing is only safe while the live ranges do not overlap.
// NOTE(review): non-overlap is assumed, not checked here - verify before
// reordering code that uses two names of one register.
735
736// General Purpose Registers
737GR_l_Log_Table = r33
738GR_l_Log_Table1 = r34 // r34 is also GR_l_BIAS
739GR_l_BIAS = r34 // loaded with 0xFFFF (exponent bias) by the entry code
740GR_l_Index1 = r35
741GR_l_Index2 = r36
0347518d 742GR_l_signif_Z = r37
d5efd131
MF
743GR_l_X_0 = r38
744GR_l_X_1 = r39
745GR_l_X_2 = r40
746GR_l_Z_1 = r41
747GR_l_Z_2 = r42
748GR_l_N = r43
749GR_l_Index3 = r44
750GR_l_Stirling_Table = r45
751GR_l_N_Unbiased = r46
752
753// Floating Point Registers
754FR_l_logl_X = f8
755
// f10, f13, f15, f32 and f33 each carry more than one name below.
756FR_l_h_3 = f10
757FR_l_poly_hi = f10
758FR_l_W = f11
759FR_l_S = f12
760FR_l_GS_hi = f13
761FR_l_Y_lo = f13
762FR_l_r_cor = f14
763FR_l_G_1 = f15
764FR_l_G = f15
765FR_l_H_1 = f32
766FR_l_H = f32
767FR_l_h = f33
768FR_l_h_1 = f33
769FR_l_N = f33
770FR_l_G_2 = f34
771FR_l_H_2 = f35
772FR_l_h_2 = f36
773FR_l_G_3 = f37
774FR_l_log2_hi = f38
775FR_l_GS_lo = f39
776FR_l_H_3 = f40
777FR_l_float_N = f41
778FR_l_Q_4 = f42
779FR_l_Q_3 = f43
780FR_l_Q_2 = f44
781FR_l_Q_1 = f45
782FR_l_Q_5 = f46
783FR_l_Q_6 = f47
784FR_l_log2_lo = f48
785FR_l_r = f49
786FR_l_poly_lo = f50
787FR_l_poly = f51
788FR_l_rsq = f52
789FR_l_Y_lo_res = f53
790
0347518d
MF
// Reciprocal refinement of 1/|x|: the entry code starts from
// frcpa(|x|) in FR_l_Y0 and builds the E*/Y* terms from it.
791FR_l_Y0 = f55
792FR_l_Q0 = f56
793FR_l_E0 = f57
794FR_l_E2 = f58
795FR_l_E1 = f59
796FR_l_Y1 = f60
797FR_l_E3 = f61
798FR_l_Y2 = f62
799
// FR_l_Z receives the refined reciprocal; Z2 and Z4 hold its powers in the
// entry code.
800FR_l_Z = f63
801FR_l_Z2 = f64
802FR_l_Z4 = f65
803FR_l_Z8 = f66
804
// CH/CL and B2..B18 are loaded from the Stirling table in this order by the
// entry code; FR_l_Half is the final ldfd from the same table ("Load 0.5").
805FR_l_CH = f67
806FR_l_CL = f68
807
808FR_l_B2 = f69
809FR_l_B4 = f70
810FR_l_B6 = f71
811FR_l_B8 = f72
812FR_l_B10 = f73
813FR_l_B12 = f74
814FR_l_B14 = f75
815FR_l_B16 = f76
816FR_l_B18 = f77
817FR_l_Half = f78
d5efd131
MF
818FR_l_SS = f79
819FR_l_AbsX_m_Half = f80
820FR_l_CXH = f81
821FR_l_CXL = f82
822FR_l_SSCXH = f83
823FR_l_SSCXL = f84
824FR_l_XYH = f85
825FR_l_XYL = f86
826FR_l_Temp = f87
827
828FR_l_logl_YHi = f88
829FR_l_logl_YLo = f89
830
831FR_l_SignedXYH = f123
832
// |x|, produced by the first fabs in tgammal; FR_p_AbsX and FR_r_AbsX are
// further names for this register.
833FR_l_AbsX = f127
834
835
836
837//=======================================================
838// Negative part registers
// Names (prefix "n_") used by the instructions predicated on p15, which the
// entry code sets when the argument is negative.
839
840// General Purpose Registers
841GR_n_sin_Table = r47 // address of Constants_Tgammal_sin (loaded at entry)
842GR_n_XN = r48 // integer bits of FR_n_IXN (getf.sig in the entry code)
843
844// Float point registers
845FR_n_IXNS = f125 // fcvt.fx of the signed argument
846FR_n_IXN = f126 // fcvt.fx.trunc of |x|
847
848FR_n_XNS = f90
849FR_n_XS = f91
850FR_n_XS2 = f92
851FR_n_XS2L = f93
852FR_n_XS4 = f94
853FR_n_XS7 = f95
854FR_n_XS8 = f96
855FR_n_TT = f97
856FR_n_TH = f98
857FR_n_TL = f99
858
0347518d
MF
// Coefficients read sequentially from the sin table by the entry code:
// A2H/A2L and A1H/A1L as ldfpd pairs, then A9 down to A3 with ldfe.
859FR_n_A2H = f100
860FR_n_A2L = f101
861FR_n_A1H = f102
862FR_n_A1L = f103
863FR_n_A9 = f104
864FR_n_A8 = f105
865FR_n_A7 = f106
866FR_n_A6 = f107
867FR_n_A5 = f108
868FR_n_A4 = f109
869FR_n_A3 = f110
d5efd131
MF
870
871FR_n_PolyH = f111
872FR_n_PolyL = f112
873
874FR_n_Poly1H = f113
875FR_n_SinxH = f113 // the same as FR_n_Poly1H
876FR_n_Poly1L = f114
877FR_n_SinxL = f114 // the same as FR_n_Poly1L
878
879FR_n_Tail = f115
880FR_n_NegOne = f116 // -1.0 (fneg of f1 in the entry code)
881
882FR_n_Y0 = f117
883
0347518d
MF
884FR_n_Q0 = f118
885FR_n_E0 = f119
886
887FR_n_E2 = f120
888FR_n_E1 = f121
889
// From here on the names reuse f55-f66, the same register numbers that the
// Lgamma section assigns to FR_l_Y0 .. FR_l_Z8.
// NOTE(review): presumably those FR_l_* values are dead by the time these
// are written - verify against the code that uses them.
890FR_n_Y1 = f55
891FR_n_E3 = f56
892
893FR_n_Y2 = f57
894FR_n_R0 = f58
d5efd131
MF
895
896FR_n_E4 = f59
897FR_n_RcpResH = f60
898
899FR_n_Y3 = f61
900FR_n_R1 = f62
901FR_n_Temp = f63
902
903FR_n_RcpResL = f64
904
905FR_n_ResH = f65
906FR_n_ResL = f66
907
908
909
910
911//=======================================================
912// Exp registers
// Names with prefix "e_".  The general registers start again at r33 and the
// floating-point registers at f10, i.e. they overlap the register numbers
// already named in the Lgamma section.
// NOTE(review): presumably the exp stage runs after the log stage has
// finished with those registers - verify against the code that uses them.
913
914// General Purpose Registers
915GR_e_ad_Arg = r33
916GR_e_ad_A = r34
917GR_e_signexp_x = r35 // r35 is also GR_e_exp_x
918GR_e_exp_x = r35
919GR_e_exp_mask = r36
920GR_e_ad_W1 = r37
921GR_e_ad_W2 = r38
922GR_e_M2 = r39
923GR_e_M1 = r40
924GR_e_K = r41
925GR_e_exp_2_mk = r42
926GR_e_exp_2_k = r43
927GR_e_ad_T1 = r44
928GR_e_ad_T2 = r45
929GR_e_N_fix = r46
930GR_e_one = r47
931GR_e_exp_bias = r48
932GR_e_sig_inv_ln2 = r49
933GR_e_rshf_2to51 = r50
934GR_e_exp_2tom51 = r51
935GR_e_rshf = r52
936
937// Floating Point Registers
938FR_e_RSHF_2TO51 = f10
939FR_e_INV_LN2_2TO63 = f11
940FR_e_W_2TO51_RSH = f12
941FR_e_2TOM51 = f13
942FR_e_RSHF = f14
943FR_e_Y_hi = f15
944FR_e_Y_lo = f32
945FR_e_scale = f33
946FR_e_float_N = f34
947FR_e_N_signif = f35
948FR_e_L_hi = f36
949FR_e_L_lo = f37
950FR_e_r = f38
951FR_e_W1 = f39
952FR_e_T1 = f40
953FR_e_W2 = f41
954FR_e_T2 = f42
955FR_e_W1_p1 = f43
956FR_e_rsq = f44
957FR_e_A2 = f45
958FR_e_r4 = f46
959FR_e_A3 = f47
960FR_e_poly = f48
961FR_e_T = f49
962FR_e_W = f50
963FR_e_Wp1 = f51
964FR_e_r6 = f52
965FR_e_2_mk = f53
966FR_e_A1 = f54
967FR_e_T_scale = f55
968FR_e_result_lo = f56
969FR_e_W_T_scale = f57
970FR_e_Wp1_T_scale = f58
971
// Input and output names map to the same registers: X in f123, Y in f124.
972FR_e_expl_Input_X = f123
973FR_e_expl_Input_Y = f124
974FR_e_expl_Output_X = f123
975FR_e_expl_Output_Y = f124
976
977
978FR_e_expl_Input_AbsX = f122
979
980
981
982//=======================================================
983// Common registers
// Names with prefix "c_": values set up at the top of tgammal and used for
// range classification.
984
985// General Purpose Registers
986GR_c_Table = r53 // address of Constants_Tgammal_common
987GR_c_NegUnderflow = r54 // 1765, compared against int(|x|) for negative x
988GR_c_NegSingularity = r55 // 0x1003E, compared against the exponent of |x|
989GR_c_X = r56 // getf.d image of x with the sign bit cleared
990GR_c_SignBit = r57 // 0x8000000000000000
991GR_c_13 = r58 // 0x402A000000000000, the double 13.0
992
993
994// Floating Point Registers
// f123 and f124 are also named FR_e_expl_Input_X / FR_e_expl_Input_Y in the
// Exp section, and f123 is FR_l_SignedXYH in the Lgamma section.
995FR_c_PosOverflow = f123 // overflow threshold loaded from GR_c_Table
996FR_c_XN = f124 // fcvt.xf of FR_n_IXN, i.e. trunc(|x|) as a float
997
998
999//=======================================================
1000// Polynomial part registers
// Names with prefix "p_".  FR_p_A20 .. FR_p_A0L name the 21 coefficients of
// one polynomial range (A5..A0 as H/L pairs).  Many of the temporaries
// declared after the coefficient list reuse coefficient register numbers
// (for example FR_p_XR2L = f52 = FR_p_A5H, FR_p_XR8 = f37 = FR_p_A20).
// NOTE(review): presumably each coefficient is consumed before its register
// is overwritten - verify against the evaluation code before reordering it.
1001
1002// General Purpose Registers
1003GR_p_Table = r59 // address of Constants_Tgammal_poly (loaded at entry)
1004GR_p_XN = r33
1005GR_p_Table2 = r34
1006GR_p_Int = r35
1007GR_p_Offset = r36
1008GR_p_Offset2 = r38
1009GR_p_X_Sgnd = GR_l_signif_Z // = r37
1010GR_p_Exp = r61
1011GR_p_Bias = r62
1012GR_p_0p75 = r63
1013
1014// Floating Point Registers
1015FR_p_AbsX = FR_l_AbsX // = f127
1016FR_p_IXN = FR_n_IXN // = f126
1017FR_p_XN = f32
1018FR_p_0p5 = f33
1019FR_p_1p5 = f34
1020FR_p_AbsXM1 = f35
1021FR_p_2 = f36 // 2.0, formed as f1*f1+f1 in the entry code
1022
0347518d
MF
1023FR_p_A20 = f37
1024FR_p_A19 = f38
1025FR_p_A18 = f39
1026FR_p_A17 = f40
1027FR_p_A16 = f41
1028FR_p_A15 = f42
1029FR_p_A14 = f43
1030FR_p_A13 = f44
1031FR_p_A12 = f45
1032FR_p_A11 = f46
1033FR_p_A10 = f47
1034FR_p_A9 = f48
1035FR_p_A8 = f49
1036FR_p_A7 = f50
1037FR_p_A6 = f51
1038FR_p_A5H = f52
1039FR_p_A5L = f53
1040FR_p_A4H = f54
1041FR_p_A4L = f55
1042FR_p_A3H = f56
1043FR_p_A3L = f57
1044FR_p_A2H = f58
1045FR_p_A2L = f59
1046FR_p_A1H = f60
1047FR_p_A1L = f61
1048FR_p_A0H = f62
1049FR_p_A0L = f63
d5efd131
MF
1050
// Only the registers numbered f64 and above in the list below (XR, XR2,
// Poly5H/L, Poly2H, Temp5H/L, Temp4H, Temp2H/L, 0p25) are outside the
// coefficient range f37-f63.
1051FR_p_XR = f64
0347518d
MF
1052FR_p_XR2 = f65
1053FR_p_XR2L = f52
1054
1055FR_p_XR3 = f58
1056FR_p_XR3L = f38
1057
1058FR_p_XR4 = f42
1059FR_p_XR6 = f40
1060FR_p_XR8 = f37
1061
1062FR_p_Poly5H = f66
1063FR_p_Poly5L = f67
1064FR_p_Poly4H = f53
1065FR_p_Poly4L = f44
1066FR_p_Poly3H = f41
1067FR_p_Poly3L = f47
1068FR_p_Poly2H = f68
1069FR_p_Poly2L = f54
1070FR_p_Poly1H = f55
1071FR_p_Poly1L = f46
1072FR_p_Poly0H = f39
1073FR_p_Poly0L = f43
1074
1075FR_p_Temp5H = f69
1076FR_p_Temp5L = f70
1077FR_p_Temp4H = f71
1078FR_p_Temp4L = f60
1079FR_p_Temp2H = f72
1080FR_p_Temp2L = f73
1081FR_p_Temp1H = f59
1082FR_p_Temp1L = f61
1083FR_p_Temp0H = f49
1084FR_p_Temp0L = f48
1085FR_p_PolyTail = f45
1086FR_p_OddPoly0H = f56
1087FR_p_OddPoly0L = f51
d5efd131
MF
1088
1089FR_p_0p25 = f73 // f73 is also FR_p_Temp2L
1090
1091
1092//=======================================================
1093// Negative polynomial part registers
// Names with prefixes "r_" and "z_".  The FR_z_* list at the end repeats the
// FR_r_Y0 .. FR_r_ZL register numbers one for one (f102 .. f115), so each
// FR_z_ name is an alias of the FR_r_ name with the same suffix.
1094// General Purpose Registers
0347518d
MF
1095GR_r_sin_Table = r47 // same register as GR_n_sin_Table
1096GR_r_sin_Table2 = r60
d5efd131
MF
1097
1098// Floating Point Registers
0347518d
MF
1099FR_r_IXNS = FR_n_IXNS
1100FR_r_IXN = FR_n_IXN
d5efd131
MF
1101
1102FR_r_AbsX = FR_l_AbsX
1103
0347518d
MF
1104FR_r_A9 = f74
1105FR_r_A8 = f75
1106FR_r_A7 = f76
1107FR_r_A6 = f77
1108FR_r_A5 = f78
1109FR_r_A4 = f79
1110FR_r_A3 = f80
1111FR_r_A2H = f81
1112FR_r_A2L = f82
1113FR_r_A1H = f83
1114FR_r_A1L = f84
1115
1116FR_r_XNS = f85
1117FR_r_XS = f86
1118FR_r_XS2 = f87
1119FR_r_XS2L = f88
1120FR_r_XS4 = f89
1121FR_r_XS7 = f90
1122FR_r_XS8 = f91
d5efd131
MF
1123
1124FR_r_Tail = f92
1125
0347518d
MF
1126FR_r_TT = f93
1127FR_r_TH = f94
1128FR_r_TL = f95
d5efd131
MF
1129
1130FR_r_ResH = f96
1131FR_r_ResL = f97
1132
0347518d
MF
1133FR_r_Res3H = f98
1134FR_r_Res3L = f99
1135
1136FR_r_Res1H = f100
1137FR_r_Res1L = f101
1138
1139
1140
1141FR_r_Y0 = f102
1142FR_r_Q0 = f103
1143FR_r_E0 = f104
1144FR_r_E2 = f105
1145FR_r_E1 = f106
1146FR_r_Y1 = f107
1147FR_r_E3 = f108
1148FR_r_Y2 = f109
1149FR_r_R0 = f110
1150FR_r_E4 = f111
1151FR_r_ZH = f112
1152FR_r_Y3 = f113
1153FR_r_R1 = f114
d5efd131
MF
// FR_r_ZHN and FR_r_ZL are two names for f115.
// NOTE(review): looks intentional (non-overlapping use) - confirm.
1154FR_r_ZHN = f115
1155FR_r_ZL = f115
1156FR_r_NegOne = f116 // same register as FR_n_NegOne
1157
0347518d
MF
// FR_z_Y0 is written by the frcpa of the signed argument in the tgammal
// entry code ("for negatives").
1158FR_z_Y0 = f102
1159FR_z_Q0 = f103
1160FR_z_E0 = f104
1161FR_z_E2 = f105
1162FR_z_E1 = f106
1163FR_z_Y1 = f107
1164FR_z_E3 = f108
1165FR_z_Y2 = f109
1166FR_z_R0 = f110
1167FR_z_E4 = f111
1168FR_z_ZH = f112
1169FR_z_Y3 = f113
1170FR_z_R1 = f114
1171FR_z_ZL = f115
d5efd131
MF
1172
1173
// Save and error-reporting registers.
// GR_SAVE_PFS is r32, the register that the alloc at the top of tgammal
// writes ar.pfs into.  The GR_SAVE_* and GR_DenOverflow / GR_u_XN names
// reuse r33-r37, which carry other names in the sections above.
// NOTE(review): presumably these are only used on paths where the
// computational values in r33-r37 are no longer needed - verify.
1174// General Purpose Registers
1175GR_SAVE_PFS = r32
1176GR_DenOverflow = r33
1177GR_u_XN = r34
1178
0347518d
MF
1179GR_SAVE_B0 = r35
1180GR_SAVE_GP = r36
1181GR_SAVE_SP = r37
d5efd131
MF
1182
1183// Floating Point Registers
1184FR_u_IXN = f34
1185
1186
// r64-r67 are the four output registers requested by the alloc in tgammal
// (0 inputs, 32 locals, 4 outputs).
1187// ERROR HANDLER REGISTERS
1188GR_Parameter_X = r64
1189GR_Parameter_Y = r65
1190GR_Parameter_RESULT = r66
1191GR_Parameter_TAG = r67
1192
1193FR_RESULT = f8 // f8 also holds the incoming argument at entry
0347518d 1194FR_X = f32
d5efd131
MF
1195FR_Y = f1 // f1 is used as the constant 1.0 throughout the entry code
1196
1197
1198.section .text
1199GLOBAL_LIBM_ENTRY(tgammal)
1200{ .mfi
1201 alloc r32 = ar.pfs,0,32,4,0
1202 fabs FR_l_AbsX = f8 // Get absolute value of X
0347518d 1203 addl GR_n_sin_Table = @ltoff(Constants_Tgammal_sin), gp
d5efd131 1204}
0347518d 1205{ .mfi
d5efd131
MF
1206 addl GR_l_Log_Table=@ltoff(Constants_Tgammal_log_80_Z_G_H_h1#),gp
1207 nop.f 0
1208 addl GR_l_Stirling_Table = @ltoff(Constants_Tgammal_stirling), gp
1209};;
1210
1211{ .mfi
1212 getf.sig GR_l_signif_Z = f8 // Significand of X
1213 fcvt.fx.s1 FR_n_IXNS = f8 // Convert to fixed point
1214 addl GR_c_Table = @ltoff(Constants_Tgammal_common), gp
1215}
1216{ .mfi
1217 ld8 GR_l_Log_Table = [GR_l_Log_Table]
1218 nop.f 0
1219 addl GR_p_Table = @ltoff(Constants_Tgammal_poly), gp
1220};;
1221
1222{ .mfi
0347518d 1223 ld8 GR_n_sin_Table = [GR_n_sin_Table]
d5efd131
MF
1224 fclass.m p6,p0 = f8,0x1EF // Check x for NaN, 0, INF, denorm
1225 // NatVal.
1226 addl GR_c_NegSingularity = 0x1003E, r0
1227}
1228{ .mlx
1229 ld8 GR_l_Stirling_Table = [GR_l_Stirling_Table]
1230 movl GR_c_13 = 0x402A000000000000 // 13.0
1231};;
1232
1233{ .mfi
1234 getf.d GR_c_X = f8 // Double prec. X to general register
1235 frcpa.s1 FR_z_Y0,p0 = f1,f8 // y = frcpa(x) (for negatives)
1236 extr.u GR_l_Index1 = GR_l_signif_Z, 59, 4 // = High 4 bits of Z
1237}
1238{ .mlx
1239 ld8 GR_c_Table = [GR_c_Table]
1240 movl GR_c_SignBit = 0x8000000000000000 // High bit (sign)
1241};;
1242
1243{ .mfi
0347518d 1244 ld8 GR_p_Table = [GR_p_Table]
d5efd131 1245 fcmp.lt.s1 p15, p14 = f8,f0 // p14 - positive arg, p15 - negative
0347518d 1246 shl GR_l_Index1 = GR_l_Index1,5 // Adjust Index1 ptr (x32)
d5efd131
MF
1247}
1248{ .mfb
1249 adds GR_c_NegUnderflow = 1765, r0
1250 nop.f 0
1251(p6) br.cond.spnt tgammal_spec // Spec. values processing branch ////////////
1252 // (0s, INFs, NANs, NatVals, denormals) //////
1253};;
1254
1255{ .mfi
1256 ldfpd FR_l_CH,FR_l_CL= [GR_l_Stirling_Table], 16 // Load CH, CL
1257 fcvt.fx.trunc.s1 FR_n_IXN = FR_l_AbsX // Abs arg to int by trunc
1258 extr.u GR_l_X_0 = GR_l_signif_Z, 49, 15 // High 15 bit of Z
1259}
1260{ .mfi
1261 add GR_l_Index1 = GR_l_Index1,GR_l_Log_Table // Add offset
1262 fma.s1 FR_p_2 = f1, f1, f1 // 2.0
1263 andcm GR_c_X = GR_c_X, GR_c_SignBit // Remove sign
1264};;
1265
0347518d 1266{ .mfi
d5efd131
MF
1267 addl GR_l_Log_Table = @ltoff(Constants_Tgammal_log_80_Z_G_H_h2#), gp
1268 fcmp.lt.s1 p10, p0 = FR_l_AbsX, f1 // If |X|<1 then p10 = 1
1269 nop.i 0
1270}
1271{ .mlx
1272 ld2 GR_l_Z_1 = [GR_l_Index1],4 // load Z_1 from Index1
1273 movl GR_l_BIAS = 0x000000000000FFFF // Bias for exponent
1274};;
1275
0347518d 1276{ .mfi
d5efd131
MF
1277 ld8 GR_l_Log_Table = [GR_l_Log_Table]
1278 frcpa.s1 FR_l_Y0, p0 = f1, FR_l_AbsX // y = frcpa(x)
1279 nop.i 0
1280}
1281{ .mfi
0347518d 1282 ldfs FR_l_G_1 = [GR_l_Index1],4 // Load G_1
d5efd131
MF
1283 fsub.s1 FR_l_W = FR_l_AbsX, f1 // W = |X|-1
1284 nop.i 0
1285};;
1286
0347518d 1287{ .mfi
d5efd131
MF
1288 getf.exp GR_l_N_Unbiased= FR_l_AbsX // exponent of |X|
1289 fmerge.se FR_l_S = f1, FR_l_AbsX // S = merging of X and 1.0
0347518d 1290 cmp.gtu p11, p0 = GR_c_13, GR_c_X // If 1 <= |X| < 13
d5efd131
MF
1291 // then p11 = 1
1292}
1293{ .mfb
1294 ldfs FR_l_H_1 = [GR_l_Index1],8 // Load H_1
1295 fcvt.xf FR_n_XNS = FR_n_IXNS // Convert to FP repr. of int X
1296(p10) br.cond.spnt tgamma_lt_1 // Branch to |X| < 1 path ///////////////////
1297};;
1298
0347518d
MF
1299{ .mfi
1300 ldfpd FR_n_A2H, FR_n_A2L = [GR_n_sin_Table], 16
d5efd131
MF
1301 nop.f 0
1302 pmpyshr2.u GR_l_X_1 = GR_l_X_0,GR_l_Z_1,15 // Adjust Index2 (x32)
1303}
0347518d
MF
1304{ .mfb
1305 ldfe FR_l_B2 = [GR_l_Stirling_Table], 16
d5efd131
MF
1306 nop.f 0
1307(p11) br.cond.spnt tgamma_lt_13 // Branch to 1 <= |X| < 13 path ///////////////
1308};;
1309
0347518d
MF
1310{ .mfi
1311 ldfe FR_l_h_1 = [GR_l_Index1],0
d5efd131
MF
1312 nop.f 0
1313 sub GR_l_N = GR_l_N_Unbiased, GR_l_BIAS // N - BIAS
1314}
0347518d 1315{ .mib
d5efd131
MF
1316 ldfpd FR_l_B4,FR_l_B6= [GR_l_Stirling_Table], 16 // Load C
1317(p15) cmp.geu.unc p8,p0 = GR_l_N_Unbiased, GR_c_NegSingularity
1318(p8) br.cond.spnt tgammal_singularity // Singularity for arg < to -2^63 //////
1319};;
1320
0347518d
MF
1321{ .mmi
1322(p15) ldfpd FR_n_A1H, FR_n_A1L = [GR_n_sin_Table], 16
d5efd131 1323 ldfpd FR_l_B8, FR_l_B10 = [GR_l_Stirling_Table], 16
0347518d 1324 add GR_c_Table = 0x20, GR_c_Table
d5efd131
MF
1325};;
1326
1327{ .mfi
0347518d
MF
1328(p15) ldfe FR_n_A9 = [GR_n_sin_Table], 16
1329 fma.s1 FR_l_Q0 = f1,FR_l_Y0,f0 // Q0 = Y0
d5efd131
MF
1330 nop.i 0
1331}
0347518d
MF
1332{ .mfi
1333 ldfpd FR_l_B12, FR_l_B14 = [GR_l_Stirling_Table], 16
1334 fnma.s1 FR_l_E0 = FR_l_Y0,FR_l_AbsX,f1 // e = 1-b*y
d5efd131
MF
1335 nop.i 0
1336};;
1337
0347518d
MF
1338{ .mfi
1339(p15) ldfe FR_n_A8 = [GR_n_sin_Table], 16
d5efd131 1340 fcvt.xf FR_c_XN = FR_n_IXN // Convert to FP repr. of int X
0347518d 1341 extr.u GR_l_Index2 = GR_l_X_1, 6, 4 // Extract Index2
d5efd131 1342}
0347518d 1343{ .mfi
d5efd131
MF
1344 ldfpd FR_l_B16, FR_l_B18 = [GR_l_Stirling_Table], 16
1345 nop.f 0
1346 nop.i 0
1347};;
1348
0347518d
MF
1349{ .mfi
1350(p15) ldfe FR_n_A7 = [GR_n_sin_Table], 16
d5efd131
MF
1351 fms.s1 FR_l_CXH = FR_l_CH, f1, FR_l_AbsX // CXH = CH+|X|
1352 shl GR_l_Index2 = GR_l_Index2,5
1353}
0347518d 1354{ .mfi
d5efd131
MF
1355 ldfd FR_l_Half = [GR_l_Stirling_Table] // Load 0.5
1356 nop.f 0
1357 nop.i 0
1358};;
1359
0347518d 1360{ .mfi
d5efd131
MF
1361 add GR_l_Index2 = GR_l_Index2, GR_l_Log_Table // Add offset
1362 nop.f 0
1363 nop.i 0
1364}
0347518d
MF
1365{ .mfi
1366(p15) ldfe FR_n_A6 = [GR_n_sin_Table], 16
d5efd131
MF
1367(p15) fma.s1 FR_n_XS = FR_l_AbsX , f1, FR_n_XNS // xs = x - int(x)
1368 nop.i 0
1369};;
1370
0347518d
MF
1371{ .mmi
1372 ld2 GR_l_Z_2 = [GR_l_Index2],4
d5efd131
MF
1373 addl GR_l_Log_Table = @ltoff(Constants_Tgammal_log_80_h3_G_H#),gp
1374 nop.i 0
1375};;
1376
0347518d 1377{ .mfi
d5efd131
MF
1378 ld8 GR_l_Log_Table = [GR_l_Log_Table]
1379 fma.s1 FR_l_E2 = FR_l_E0,FR_l_E0,FR_l_E0 // e2 = e+e^2
1380 nop.i 0
1381}
0347518d
MF
1382{ .mfi
1383 ldfs FR_l_G_2 = [GR_l_Index2],4
d5efd131
MF
1384 fma.s1 FR_l_E1 = FR_l_E0,FR_l_E0,f0 // e1 = e^2
1385 nop.i 0
1386};;
1387
0347518d
MF
1388{ .mmi
1389 ldfs FR_l_H_2 = [GR_l_Index2],8
1390(p15) ldfe FR_n_A5 = [GR_n_sin_Table], 16
d5efd131
MF
1391 nop.i 0
1392};;
1393
0347518d 1394{ .mfi
d5efd131
MF
1395 setf.sig FR_l_float_N = GR_l_N // float_N = Make N a fp number
1396 nop.f 0
0347518d 1397 pmpyshr2.u GR_l_X_2 = GR_l_X_1,GR_l_Z_2,15 // X_2 = X_1 * Z_2
d5efd131 1398}
0347518d
MF
1399{ .mfi
1400 ldfe FR_l_h_2 = [GR_l_Index2],0
d5efd131
MF
1401 fma.s1 FR_l_CXL = FR_l_AbsX, f1, FR_l_CXH // CXL = |X|+CXH
1402 add GR_l_Log_Table1= 0x200, GR_l_Log_Table
1403};;
1404
0347518d
MF
1405{ .mfi
1406(p15) ldfe FR_n_A4 = [GR_n_sin_Table], 16
d5efd131
MF
1407(p15) fcmp.eq.unc.s1 p9,p0 = FR_l_AbsX, FR_c_XN //if argument is integer
1408 // and negative
1409 nop.i 0
1410}
0347518d 1411{ .mfi
d5efd131
MF
1412 ldfe FR_c_PosOverflow = [GR_c_Table],16 //Load pos overflow value
1413(p15) fma.s1 FR_n_XS2 = FR_n_XS, FR_n_XS, f0 // xs^2 = xs*xs
1414 nop.i 0
1415};;
1416
0347518d
MF
1417{ .mfi
1418(p15) ldfe FR_n_A3 = [GR_n_sin_Table], 16
1419 nop.f 0
d5efd131
MF
1420 nop.i 0
1421};;
1422
0347518d 1423{ .mfi
d5efd131
MF
1424(p15) getf.sig GR_n_XN = FR_n_IXN // int(x) to general reg
1425 fma.s1 FR_l_Y1 = FR_l_Y0,FR_l_E2,FR_l_Y0 // y1 = y+y*e2
0347518d 1426 nop.i 0
d5efd131 1427}
0347518d
MF
1428{ .mfb
1429 nop.m 0
d5efd131
MF
1430 fma.s1 FR_l_E3 = FR_l_E1,FR_l_E1,FR_l_E0 // e3 = e+e1^2
1431(p9) br.cond.spnt tgammal_singularity // Singularity for integer /////////////
1432 // and negative arguments //////////////
1433};;
1434
0347518d 1435{ .mfi
d5efd131
MF
1436 nop.m 0
1437 fms.s1 FR_l_AbsX_m_Half = FR_l_AbsX, f1, FR_l_Half // |x|-0.5
1438 extr.u GR_l_Index2 = GR_l_X_2, 1, 5 // Get Index3
1439};;
1440
0347518d
MF
1441{ .mfi
1442 shladd GR_l_Log_Table1= GR_l_Index2, 2, GR_l_Log_Table1
d5efd131
MF
1443 nop.f 0
1444 shladd GR_l_Index3 = GR_l_Index2,4, GR_l_Log_Table // Index3
1445}
1446{ .mfb
1447(p15) cmp.gtu.unc p11, p0 = GR_n_XN, GR_c_NegUnderflow // X < -1765
1448 fms.s1 FR_l_CXL = FR_l_CH, f1, FR_l_CXL // CXL = CH - CXL
1449(p11) br.cond.spnt tgammal_underflow // Singularity for negative argument //////
1450 // at underflow domain (X < -1765) //////
1451};;
1452
0347518d
MF
1453{ .mfi
1454 addl GR_l_Log_Table = @ltoff(Constants_Tgammal_log_80_Q#), gp
d5efd131 1455(p15) fma.s1 FR_n_TT = FR_n_A2L, FR_n_XS2, f0 // T=A2L*x^2
0347518d 1456 tbit.nz.unc p13, p12 = GR_n_XN, 0x0 // whether [X] odd or even
d5efd131
MF
1457}
1458{ .mfi
1459 nop.m 0
1460(p15) fms.s1 FR_n_XS2L = FR_n_XS, FR_n_XS, FR_n_XS2 // xs^2 Low part
1461 nop.i 0
1462};;
1463
0347518d
MF
1464{ .mfi
1465 ld8 GR_l_Log_Table = [GR_l_Log_Table]
d5efd131 1466(p15) fma.s1 FR_n_A7 = FR_n_A8, FR_n_XS2, FR_n_A7 // poly tail
0347518d 1467 nop.i 0
d5efd131 1468}
0347518d
MF
1469{ .mfi
1470 ldfe FR_l_h_3 = [GR_l_Index3],12
d5efd131 1471(p15) fma.s1 FR_n_XS4 = FR_n_XS2, FR_n_XS2, f0 // xs^4 = xs^2*xs^2
0347518d 1472 nop.i 0
d5efd131
MF
1473};;
1474
0347518d
MF
1475{ .mfi
1476 ldfs FR_l_H_3 = [GR_l_Log_Table1], 0
d5efd131 1477 fma.s1 FR_l_Y2 = FR_l_Y1, FR_l_E3, FR_l_Y0 // y2 = y+y1*e3
0347518d 1478 nop.i 0
d5efd131 1479}
0347518d
MF
1480{ .mfi
1481 ldfs FR_l_G_3 = [GR_l_Index3], 0
d5efd131 1482 fnma.s1 FR_l_Z = FR_l_AbsX,FR_l_Q0,f1 // r = a-b*q
0347518d 1483 nop.i 0
d5efd131
MF
1484};;
1485
0347518d 1486{ .mfi
d5efd131
MF
1487 nop.m 0
1488 fmpy.s1 FR_l_G = FR_l_G_1, FR_l_G_2 // G = G1 * G_2
1489 nop.i 0
1490}
0347518d 1491{ .mfi
d5efd131
MF
1492 nop.m 0
1493 fadd.s1 FR_l_H = FR_l_H_1, FR_l_H_2 // H = H_1 + H_2
1494 nop.i 0
1495};;
1496
0347518d 1497{ .mfi
d5efd131 1498 ldfe FR_l_log2_hi = [GR_l_Log_Table],16 // load log2_hi part
0347518d 1499 fadd.s1 FR_l_h = FR_l_h_1, FR_l_h_2 // h = h_1 + h_2
d5efd131
MF
1500 nop.i 0
1501}
1502{ .mfi
0347518d 1503 nop.m 0
d5efd131 1504 fcvt.xf FR_l_float_N = FR_l_float_N // int(N)
0347518d 1505 nop.i 0
d5efd131
MF
1506};;
1507
0347518d 1508{ .mfi
d5efd131
MF
1509 ldfe FR_l_log2_lo = [GR_l_Log_Table],16 // Load log2_lo part
1510 fma.s1 FR_l_CXL = FR_l_CXL, f1, FR_l_CL
0347518d 1511 nop.i 0
d5efd131
MF
1512}
1513{ .mfi
0347518d 1514 nop.m 0
d5efd131 1515(p15) fma.s1 FR_n_TT = FR_n_A2H, FR_n_XS2L, FR_n_TT // T=A2H*x2L+T
0347518d 1516 nop.i 0
d5efd131
MF
1517};;
1518
0347518d
MF
1519{ .mfi
1520 ldfe FR_l_Q_6 = [GR_l_Log_Table],16
d5efd131 1521(p15) fma.s1 FR_n_A3 = FR_n_A4, FR_n_XS2, FR_n_A3 // poly tail
0347518d 1522 nop.i 0
d5efd131
MF
1523}
1524{ .mfi
0347518d 1525 nop.m 0
d5efd131 1526(p15) fma.s1 FR_n_A5 = FR_n_A6, FR_n_XS2, FR_n_A5 // poly tail
0347518d 1527 nop.i 0
d5efd131
MF
1528};;
1529
0347518d
MF
1530{ .mfi
1531 ldfe FR_l_Q_5 = [GR_l_Log_Table],16
d5efd131 1532(p15) fabs FR_n_XS = FR_n_XS // abs(xs)
0347518d 1533 nop.i 0
d5efd131
MF
1534}
1535{ .mfi
0347518d 1536 nop.m 0
d5efd131 1537 fma.s1 FR_l_Z = FR_l_Z,FR_l_Y2,FR_l_Q0 // x_hi = q+r*y2
0347518d 1538 nop.i 0
d5efd131
MF
1539};;
1540
0347518d
MF
1541{ .mfi
1542 ldfe FR_l_Q_4 = [GR_l_Log_Table],16
d5efd131 1543(p15) fma.s1 FR_n_A7 = FR_n_A9, FR_n_XS4, FR_n_A7 // poly tail
0347518d 1544 nop.i 0
d5efd131
MF
1545}
1546{ .mfi
0347518d 1547 nop.m 0
d5efd131 1548(p15) fma.s1 FR_n_XS7 = FR_n_XS4, FR_n_XS2, f0 // = x^4*x^2
0347518d 1549 nop.i 0
d5efd131
MF
1550};;
1551
0347518d
MF
1552{ .mfi
1553 ldfe FR_l_Q_3 = [GR_l_Log_Table],16
d5efd131 1554 fneg FR_n_NegOne = f1 // -1.0
0347518d 1555 nop.i 0
d5efd131
MF
1556}
1557{ .mfi
0347518d 1558 nop.m 0
d5efd131 1559(p15) fma.s1 FR_n_XS8 = FR_n_XS4, FR_n_XS4, f0 // xs^8 = xs^4*xs^4
0347518d 1560 nop.i 0
d5efd131
MF
1561};;
1562
0347518d
MF
1563{ .mfi
1564 ldfe FR_l_Q_2 = [GR_l_Log_Table],16
1565 fadd.s1 FR_l_h = FR_l_h, FR_l_h_3 // h = h_1 + h_2 + h_3
1566 nop.i 0
d5efd131
MF
1567}
1568{ .mfi
0347518d 1569 nop.m 0
d5efd131 1570(p15) fma.s1 FR_n_TH = FR_n_A2H, FR_n_XS2, FR_n_TT // A2H*xs2+T
0347518d 1571 nop.i 0
d5efd131
MF
1572};;
1573
0347518d
MF
1574{ .mfi
1575 ldfe FR_l_Q_1 = [GR_l_Log_Table],16
1576 fmpy.s1 FR_l_G = FR_l_G, FR_l_G_3 // G = G_1 * G_2 * G_3
1577 nop.i 0
d5efd131
MF
1578}
1579{ .mfi
1580 nop.m 0
0347518d
MF
1581 fadd.s1 FR_l_H = FR_l_H, FR_l_H_3 // H = H_1 + H_2 + H_3
1582 nop.i 0
d5efd131
MF
1583};;
1584
0347518d 1585{ .mfi
d5efd131
MF
1586 nop.m 0
1587 fma.s1 FR_l_Z2 = FR_l_Z, FR_l_Z, f0 // Z^2
1588 nop.i 0
1589}
1590{ .mfi
1591 nop.m 0
1592(p15) fma.s1 FR_n_A3 = FR_n_A5, FR_n_XS4, FR_n_A3 // poly tail
1593 nop.i 0
1594};;
1595
0347518d 1596{ .mfi
d5efd131
MF
1597 nop.m 0
1598(p14) fcmp.gt.unc.s1 p7,p0 = FR_l_AbsX, FR_c_PosOverflow //X > 1755.5483
1599 // (overflow domain, result cannot be represented by normal value)
1600 nop.i 0
1601}
0347518d 1602{ .mfi
d5efd131
MF
1603 nop.m 0
1604(p15) fma.s1 FR_n_XS7 = FR_n_XS7, FR_n_XS, f0 // x^7 construction
1605 nop.i 0
1606};;
1607
0347518d 1608{ .mfi
d5efd131
MF
1609 nop.m 0
1610(p15) fms.s1 FR_n_TL = FR_n_A2H, FR_n_XS2, FR_n_TH // A2H*xs2-TH
1611 nop.i 0
1612}
1613{ .mfi
1614 nop.m 0
1615(p15) fma.s1 FR_n_PolyH = FR_n_TH, f1, FR_n_A1H // PolyH=TH+A1H
0347518d 1616 nop.i 0
d5efd131
MF
1617};;
1618
0347518d 1619{ .mfi
d5efd131
MF
1620 nop.m 0
1621 fmpy.s1 FR_l_GS_hi = FR_l_G, FR_l_S // GS_hi = G*S
1622 nop.i 0
1623}
1624{ .mfb
1625 nop.m 0
1626 fms.s1 FR_l_r = FR_l_G, FR_l_S, f1 // r = G*S -1
1627(p7) br.cond.spnt tgammal_overflow // Overflow path for arg > 1755.5483 //////
1628};;
1629
0347518d 1630{ .mfi
d5efd131 1631 nop.m 0
6f65e668 1632 fma.s1 FR_l_B14 = FR_l_B16, FR_l_Z2, FR_l_B14// Bernoulli tail
d5efd131
MF
1633 nop.i 0
1634}
0347518d 1635{ .mfi
d5efd131
MF
1636 nop.m 0
1637 fma.s1 FR_l_Z4 = FR_l_Z2, FR_l_Z2, f0 // Z^4 = Z^2*Z^2
1638 nop.i 0
1639};;
1640
0347518d 1641{ .mfi
d5efd131 1642 nop.m 0
6f65e668 1643 fma.s1 FR_l_B2 = FR_l_B4, FR_l_Z2, FR_l_B2 // Bernoulli tail
d5efd131
MF
1644 nop.i 0
1645}
1646{ .mfi
1647 nop.m 0
6f65e668 1648 fma.s1 FR_l_B6 = FR_l_B8, FR_l_Z2, FR_l_B6 // Bernoulli tail
d5efd131
MF
1649 nop.i 0
1650};;
1651
0347518d 1652{ .mfi
d5efd131 1653 nop.m 0
6f65e668 1654 fma.s1 FR_l_B10 = FR_l_B12, FR_l_Z2, FR_l_B10// Bernoulli tail
d5efd131
MF
1655 nop.i 0
1656}
1657{ .mfi
1658 nop.m 0
1659(p15) fma.s1 FR_n_Tail = FR_n_A7, FR_n_XS8, FR_n_A3 // poly tail
1660 nop.i 0
1661};;
1662
0347518d 1663{ .mfi
d5efd131
MF
1664 nop.m 0
1665(p15) fma.s1 FR_n_TL = FR_n_TL, f1, FR_n_TT // TL = TL+T
1666 nop.i 0
1667}
1668{ .mfi
1669 nop.m 0
1670(p15) fms.s1 FR_n_PolyL = FR_n_A1H, f1, FR_n_PolyH // A1H-polyH
1671 nop.i 0
1672};;
1673
0347518d 1674{ .mfi
d5efd131
MF
1675 nop.m 0
1676 fma.s1 FR_l_poly_lo = FR_l_r, FR_l_Q_6, FR_l_Q_5 // Q_5+r*Q_6
1677 nop.i 0
1678}
1679{ .mfi
1680 nop.m 0
1681 fsub.s1 FR_l_r_cor = FR_l_GS_hi, f1 // r_cor = GS_hi -1
0347518d 1682 nop.i 0
d5efd131
MF
1683};;
1684
0347518d 1685{ .mfi
d5efd131
MF
1686 nop.m 0
1687 fms.s1 FR_l_GS_lo = FR_l_G, FR_l_S, FR_l_GS_hi // G*S-GS_hi
1688 nop.i 0
1689}
1690{ .mfi
1691 nop.m 0
1692 fma.s1 FR_l_poly = FR_l_r, FR_l_Q_2, FR_l_Q_1 //poly=r*Q2+Q1
0347518d 1693 nop.i 0
d5efd131
MF
1694};;
1695
0347518d 1696{ .mfi
d5efd131
MF
1697 nop.m 0
1698 fmpy.s1 FR_l_rsq = FR_l_r, FR_l_r // rsq = r * r
1699 nop.i 0
1700}
1701{ .mfi
1702 nop.m 0
1703 fma.s1 FR_l_G = FR_l_float_N, FR_l_log2_hi, FR_l_H // Tbl =
1704 // float_N*log2_hi + H
0347518d 1705 nop.i 0
d5efd131
MF
1706};;
1707
0347518d
MF
1708{ .mfi
1709 nop.m 0
d5efd131
MF
1710 fma.s1 FR_l_Y_lo = FR_l_float_N, FR_l_log2_lo, FR_l_h // Y_lo=
1711 // float_N*log2_lo + h
0347518d 1712 nop.i 0
d5efd131
MF
1713}
1714{ .mfi
0347518d 1715 nop.m 0
d5efd131 1716 fma.s1 FR_l_B14 = FR_l_B18, FR_l_Z4, FR_l_B14 //Bernoulli tail
0347518d 1717 nop.i 0
d5efd131
MF
1718};;
1719
0347518d
MF
1720{ .mfi
1721 nop.m 0
d5efd131 1722 fma.s1 FR_l_B2 = FR_l_B6, FR_l_Z4, FR_l_B2 //Bernoulli tail
0347518d 1723 nop.i 0
d5efd131
MF
1724}
1725{ .mfi
0347518d 1726 nop.m 0
d5efd131 1727 fma.s1 FR_l_Z8 = FR_l_Z4, FR_l_Z4, f0 // Z^8 = Z^4*Z^4 (Bernoulli tail)
0347518d 1728 nop.i 0
d5efd131
MF
1729};;
1730
0347518d 1731{ .mfi
d5efd131
MF
1732 nop.m 0
1733 fma.s1 FR_l_poly_lo = FR_l_r, FR_l_poly_lo, FR_l_Q_4 // poly_lo =
1734 // Q_4 + r * poly_lo
1735 nop.i 0
1736}
1737{ .mfi
1738 nop.m 0
1739 fsub.s1 FR_l_r_cor = FR_l_r_cor, FR_l_r // r_cor = r_cor - r
0347518d 1740 nop.i 0
d5efd131
MF
1741};;
1742
0347518d 1743{ .mfi
d5efd131
MF
1744 nop.m 0
1745(p15) fma.s1 FR_n_PolyL = FR_n_PolyL, f1, FR_n_TH // polyL+TH
1746 nop.i 0
1747}
1748{ .mfi
1749 nop.m 0
1750(p15) fma.s1 FR_n_TT = FR_n_TL, f1, FR_n_A1L // TL+A1L
1751 nop.i 0
1752};;
1753
0347518d 1754{ .mfi
d5efd131 1755 nop.m 0
0347518d
MF
1756 fadd.s1 FR_l_logl_YHi = FR_l_G, FR_l_r // Y_hi = Tbl + r
1757 nop.i 0
d5efd131
MF
1758};;
1759
0347518d 1760{ .mfi
d5efd131
MF
1761 nop.m 0
1762 fma.s1 FR_l_B10 = FR_l_B14, FR_l_Z4, FR_l_B10 //Bernoulli tail
0347518d 1763 nop.i 0
d5efd131
MF
1764};;
1765
0347518d 1766{ .mfi
d5efd131
MF
1767 nop.m 0
1768 fma.s1 FR_l_poly_lo = FR_l_r, FR_l_poly_lo, FR_l_Q_3 // poly_lo =
1769 // Q_3 + r * poly_lo
1770 nop.i 0
1771}
1772{ .mfi
1773 nop.m 0
1774 fadd.s1 FR_l_r_cor = FR_l_r_cor, FR_l_GS_lo // r_cor=r_cor+GS_lo
0347518d 1775 nop.i 0
d5efd131
MF
1776};;
1777
0347518d 1778{ .mfi
d5efd131
MF
1779 nop.m 0
1780(p15) fma.s1 FR_n_PolyL = FR_n_PolyL, f1, FR_n_TT // polyL+TT
1781 nop.i 0
1782};;
1783
0347518d 1784{ .mfi
d5efd131 1785 nop.m 0
0347518d 1786 fsub.s1 FR_l_Y_lo_res = FR_l_G, FR_l_logl_YHi // Y_lo = Tbl - Y_hi
d5efd131
MF
1787 nop.i 0
1788}
1789{ .mfi
1790 nop.m 0
1791 fma.s1 FR_l_XYH = FR_l_logl_YHi, FR_l_AbsX_m_Half, f0 // XYH=
1792 // YHi*|x-0.5|
1793 nop.i 0
1794};;
1795
0347518d 1796{ .mfi
d5efd131 1797 nop.m 0
6f65e668 1798 fma.s1 FR_l_SS = FR_l_B10, FR_l_Z8, FR_l_B2 // Bernoulli tail
d5efd131
MF
1799 nop.i 0
1800};;
1801
0347518d 1802{ .mfi
d5efd131
MF
1803 nop.m 0
1804 fadd.s1 FR_l_r_cor = FR_l_r_cor, FR_l_Y_lo // r_cor = r_cor+Y_lo
1805 nop.i 0
1806}
1807{ .mfi
1808 nop.m 0
1809 fma.s1 FR_l_poly = FR_l_rsq, FR_l_poly_lo, FR_l_poly //poly=
1810 // r^2*polyLo+poly
0347518d 1811 nop.i 0
d5efd131
MF
1812};;
1813
0347518d 1814{ .mfi
d5efd131
MF
1815 nop.m 0
1816(p15) fma.s1 FR_n_TT = FR_n_PolyL, FR_n_XS2, f0 // T=polyL*xs^2
0347518d 1817 nop.i 0
d5efd131
MF
1818};;
1819
0347518d 1820{ .mfi
d5efd131
MF
1821 nop.m 0
1822 fadd.s1 FR_l_Y_lo = FR_l_Y_lo_res, FR_l_r // Y_lo = Y_lo + r
1823 nop.i 0
1824}
1825{ .mfi
1826 nop.m 0
1827 fms.s1 FR_l_XYL = FR_l_logl_YHi, FR_l_AbsX_m_Half, FR_l_XYH
1828 // XYL = YHi*|x-0.5|-XYH
1829 nop.i 0
1830};;
1831
0347518d 1832{ .mfi
d5efd131
MF
1833 nop.m 0
1834 fma.s1 FR_l_SSCXH = FR_l_SS, FR_l_Z, FR_l_CXH // SS*Z+CXH
1835 nop.i 0
1836}
1837{ .mfi
1838 mov GR_e_exp_2tom51= 0xffff-51 // 2^-51
1839(p15) fma.s1 FR_l_SignedXYH = FR_l_XYH, FR_n_NegOne, f0 // XYH = -XYH
1840 // for negatives
1841 nop.i 0
1842};;
1843
0347518d 1844{ .mlx
d5efd131
MF
1845 nop.m 0
1846 movl GR_e_rshf_2to51 = 0x4718000000000000 // 1.10000 2^(63+51)
1847}
1848{ .mlx
1849 nop.m 0
1850 movl GR_e_sig_inv_ln2 = 0xb8aa3b295c17f0bc //significand of 1/ln2
1851};;
1852
1853{ .mfi
1854 nop.m 0
0347518d 1855 fma.s1 FR_l_poly = FR_l_rsq, FR_l_poly, FR_l_r_cor // poly =
d5efd131 1856 // rsq * poly + r_cor
0347518d 1857 nop.i 0
d5efd131
MF
1858};;
1859
0347518d 1860{ .mfi
d5efd131 1861 addl GR_e_ad_Arg = @ltoff(Constants_Tgammal_exp_64_Arg#),gp
0347518d 1862(p15) fma.s1 FR_n_TT = FR_n_PolyH, FR_n_XS2L, FR_n_TT
d5efd131
MF
1863 mov GR_e_exp_mask = 0x1FFFF // Form exponent mask
1864}
1865{ .mlx
1866 nop.m 0
1867 movl GR_e_rshf = 0x43e8000000000000 // 1.10000 2^63 rshift
1868};;
1869
1870
0347518d 1871{ .mmi
d5efd131
MF
1872 setf.sig FR_e_INV_LN2_2TO63 = GR_e_sig_inv_ln2 // form 1/ln2 * 2^63
1873 setf.d FR_e_RSHF_2TO51 = GR_e_rshf_2to51 // 1.1000 * 2^(63+51)
1874 nop.i 0
1875};;
1876
0347518d 1877{ .mfi
d5efd131
MF
1878 nop.m 0
1879 fms.s1 FR_l_SSCXL = FR_l_CXH, f1, FR_l_SSCXH // CXH-SSCXH
1880 nop.i 0
1881}
1882{ .mfi
1883 nop.m 0
0347518d 1884 fma.s1 FR_e_expl_Input_AbsX = FR_l_XYH, f1, FR_l_SSCXH // HI EXP
d5efd131
MF
1885 nop.i 0
1886};;
1887
1888.pred.rel "mutex",p14,p15
0347518d 1889{ .mfi
d5efd131
MF
1890 nop.m 0
1891(p14) fma.s1 FR_e_expl_Input_X = FR_l_XYH, f1, FR_l_SSCXH // HI EXP
1892 mov GR_e_exp_bias = 0x0FFFF // Set exponent bias
1893}
1894{ .mfi
1895 ld8 GR_e_ad_Arg = [GR_e_ad_Arg] // Point to Arg table
1896(p15) fms.s1 FR_e_expl_Input_X = FR_l_SignedXYH, f1, FR_l_SSCXH // HI EXP
1897 nop.i 0
1898};;
1899
0347518d 1900{ .mfi
d5efd131
MF
1901 nop.m 0
1902 fadd.s1 FR_l_logl_YLo = FR_l_Y_lo, FR_l_poly // YLo = YLo+poly
1903 nop.i 0
1904};;
1905
0347518d 1906{ .mfi
d5efd131
MF
1907 setf.exp FR_e_2TOM51 = GR_e_exp_2tom51 //2^-51 for scaling float_N
1908(p15) fma.s1 FR_n_TH = FR_n_PolyH, FR_n_XS2, FR_n_TT // TH=
1909 // polyH*xs^2+T
1910 nop.i 0
1911}
1912{ .mib
1913 setf.d FR_e_RSHF = GR_e_rshf // Right shift const 1.1000*2^63
1914 nop.i 0
1915 nop.b 0
1916};;
1917
0347518d 1918{ .mfi
d5efd131
MF
1919 add GR_e_ad_A = 0x20, GR_e_ad_Arg // Point to A table
1920 nop.f 0
1921 add GR_e_ad_T1 = 0x50, GR_e_ad_Arg // Point to T1 table
1922}
1923{ .mfi
1924 add GR_e_ad_T2 = 0x150, GR_e_ad_Arg // Point to T2 table
1925 nop.f 0
1926 nop.i 0
1927};;
1928
0347518d 1929{ .mfi
d5efd131
MF
1930 nop.m 0
1931 fma.s1 FR_l_SSCXL = FR_l_SS, FR_l_Z, FR_l_SSCXL
1932 nop.i 0
1933}
1934{ .mfi
1935 nop.m 0
1936 fms.s1 FR_e_expl_Input_Y = FR_l_XYH, f1, FR_e_expl_Input_AbsX
1937 nop.i 0
1938};;
1939
0347518d 1940{ .mfi
d5efd131
MF
1941 ldfe FR_e_L_hi = [GR_e_ad_Arg],16 // Get L_hi
1942 nop.f 0
1943 nop.i 0
1944};;
1945
0347518d 1946{ .mfi
d5efd131
MF
1947 nop.m 0
1948 fma.s1 FR_l_XYL = FR_l_logl_YLo, FR_l_AbsX_m_Half, FR_l_XYL
1949 // XYL = YLo*|x-0.5|+XYL
1950 nop.i 0
1951};;
1952
0347518d 1953{ .mfi
d5efd131 1954 ldfe FR_e_L_lo = [GR_e_ad_Arg],16 // Get L_lo
0347518d 1955(p15) fms.s1 FR_n_TL = FR_n_PolyH, FR_n_XS2, FR_n_TH // TL =
d5efd131
MF
1956 // = polyH*xs^2-TH
1957 add GR_e_ad_W1 = 0x100, GR_e_ad_T2 // Point to W1 table
1958}
1959{ .mfi
1960 nop.m 0
1961(p15) fma.s1 FR_n_Poly1H = FR_n_TH, f1, f1 // poly1H = TH+1
1962 add GR_e_ad_W2 = 0x300, GR_e_ad_T2 // Point to W2 table
1963};;
1964
0347518d 1965{ .mmi
d5efd131 1966 getf.exp GR_e_signexp_x = FR_e_expl_Input_X // Extract sign and exp
0347518d 1967 ldfe FR_e_A3 = [GR_e_ad_A],16 // Get A3
d5efd131
MF
1968 nop.i 0
1969};;
1970
0347518d 1971{ .mfi
d5efd131
MF
1972 nop.m 0
1973 fma.s1 FR_l_SSCXL = FR_l_SSCXL, f1, FR_l_CXL
1974 nop.i 0
1975}
1976{ .mfi
1977 nop.m 0
1978 fma.s1 FR_e_expl_Input_Y = FR_e_expl_Input_Y, f1, FR_l_SSCXH
1979 nop.i 0
1980};;
1981
0347518d 1982{ .mfi
d5efd131
MF
1983 nop.m 0
1984 fma.s1 FR_e_N_signif=FR_e_expl_Input_X,FR_e_INV_LN2_2TO63,FR_e_RSHF_2TO51
1985 and GR_e_exp_x = GR_e_signexp_x, GR_e_exp_mask
1986};;
1987
0347518d 1988{ .mmi
d5efd131
MF
1989 sub GR_e_exp_x = GR_e_exp_x, GR_e_exp_bias // Get exponent
1990 ldfe FR_e_A2 = [GR_e_ad_A],16 // Get A2 for main path
1991 nop.i 0
1992};;
1993
0347518d 1994{ .mfi
d5efd131
MF
1995 nop.m 0
1996(p15) fma.s1 FR_n_PolyH = FR_n_Poly1H, FR_n_XS, f0//sin(Pi*x) poly
1997 nop.i 0
1998}
1999{ .mfi
2000 nop.m 0
2001(p15) fms.s1 FR_n_Poly1L = f1, f1, FR_n_Poly1H//sin(Pi*x) poly
2002 nop.i 0
2003};;
2004
0347518d 2005{ .mfi
d5efd131
MF
2006 nop.m 0
2007(p15) fma.s1 FR_n_TL = FR_n_TL, f1, FR_n_TT//sin(Pi*x) poly
2008 nop.i 0
2009};;
2010
0347518d 2011{ .mfi
d5efd131
MF
2012 nop.m 0
2013 fma.s1 FR_l_Temp = FR_l_XYL, f1, FR_l_SSCXL // XYL+SS*CXL
2014 nop.i 0
2015}
2016{ .mfi
2017 nop.m 0
2018(p15) fma.s1 FR_e_expl_Input_Y = FR_e_expl_Input_Y, FR_n_NegOne, f0
2019 // Negate lo part of exp argument for negative input values
2020 nop.i 0
2021};;
2022
0347518d 2023{ .mfi
d5efd131
MF
2024 ldfe FR_e_A1 = [GR_e_ad_A],16 // Get A1
2025 nop.f 0
2026 nop.i 0
2027}
2028{ .mfi
2029 nop.m 0
0347518d 2030 fms.s1 FR_e_float_N = FR_e_N_signif, FR_e_2TOM51, FR_e_RSHF
d5efd131
MF
2031 // Get float N = signd*2^-51-RSHIFTER
2032 nop.i 0
2033};;
2034
0347518d 2035{ .mfi
d5efd131
MF
2036 nop.m 0
2037(p15) fma.s1 FR_n_Poly1L = FR_n_Poly1L, f1, FR_n_TH //sin(Pi*x) poly
2038 nop.i 0
2039}
2040{ .mfi
2041 nop.m 0
2042(p15) fms.s1 FR_n_PolyL = FR_n_Poly1H, FR_n_XS, FR_n_PolyH//sin(Pi*x)
2043 nop.i 0
2044};;
2045
0347518d 2046{ .mfi
d5efd131
MF
2047 getf.sig GR_e_N_fix = FR_e_N_signif // Get N from significand
2048 nop.f 0
2049 nop.i 0
2050};;
2051
2052.pred.rel "mutex",p14,p15
0347518d 2053{ .mfi
d5efd131 2054 nop.m 0
0347518d 2055(p14) fma.s1 FR_e_expl_Input_Y = FR_e_expl_Input_Y, f1, FR_l_Temp
d5efd131
MF
2056 nop.i 0
2057}
0347518d 2058{ .mfi
d5efd131 2059 nop.m 0
0347518d 2060(p15) fms.s1 FR_e_expl_Input_Y = FR_e_expl_Input_Y, f1, FR_l_Temp
d5efd131
MF
2061 // arguments for exp computation
2062 nop.i 0
2063};;
2064
0347518d 2065{ .mfi
d5efd131
MF
2066 nop.m 0
2067 fnma.s1 FR_e_r = FR_e_L_hi, FR_e_float_N, FR_e_expl_Input_X
2068 // r = -L_hi * float_N + x
2069 extr.u GR_e_M1 = GR_e_N_fix, 6, 6 // Extract index M_1
2070};;
2071
0347518d 2072{ .mfi
d5efd131
MF
2073 nop.m 0
2074(p15) fma.s1 FR_n_Poly1L = FR_n_Poly1L, f1, FR_n_TL //sin(Pi*x) poly
2075 nop.i 0
2076};;
2077
2078
0347518d 2079{ .mmf
d5efd131
MF
2080 nop.m 0
2081 nop.m 0
0347518d 2082 fma.s1 FR_e_r = FR_e_r, f1, FR_e_expl_Input_Y
d5efd131
MF
2083 // r = r + FR_e_expl_Input_Y
2084};;
2085
0347518d 2086{ .mmi
d5efd131
MF
2087 shladd GR_e_ad_W1 = GR_e_M1,3,GR_e_ad_W1 // Point to W1
2088 shladd GR_e_ad_T1 = GR_e_M1,2,GR_e_ad_T1 // Point to T1
2089 extr.u GR_e_M2 = GR_e_N_fix, 0, 6 // Extract index M_2
2090};;
2091
2092
0347518d 2093{ .mfi
d5efd131
MF
2094 ldfs FR_e_T1 = [GR_e_ad_T1],0 // Get T1
2095 nop.f 0
2096 extr GR_e_K = GR_e_N_fix, 12, 32 //Extract limit range K
2097}
2098{ .mfi
2099 shladd GR_e_ad_T2 = GR_e_M2,2,GR_e_ad_T2 // Point to T2
2100(p15) fma.s1 FR_n_PolyL = FR_n_Poly1L, FR_n_XS, FR_n_PolyL
2101 //sin(Pi*x) poly
2102 shladd GR_e_ad_W2 = GR_e_M2,3,GR_e_ad_W2 // Point to W2
2103};;
2104
0347518d 2105{ .mfi
d5efd131
MF
2106 ldfs FR_e_T2 = [GR_e_ad_T2],0 // Get T2
2107 nop.f 0
2108 add GR_e_exp_2_k = GR_e_exp_bias, GR_e_K // exp of 2^k
2109}
2110{ .mfi
2111 ldfd FR_e_W1 = [GR_e_ad_W1],0 // Get W1
2112 nop.f 0
2113 sub GR_e_exp_2_mk = GR_e_exp_bias, GR_e_K // exp of 2^-k
2114};;
2115
0347518d 2116{ .mmi
d5efd131
MF
2117 ldfd FR_e_W2 = [GR_e_ad_W2],0 // Get W2
2118 nop.m 0
2119 nop.i 0
2120};;
2121
0347518d 2122{ .mmf
d5efd131
MF
2123 setf.exp FR_e_scale = GR_e_exp_2_k // Set scale = 2^k
2124 setf.exp FR_e_2_mk = GR_e_exp_2_mk // Form 2^-k
0347518d 2125 fnma.s1 FR_e_r = FR_e_L_lo, FR_e_float_N, FR_e_r
d5efd131
MF
2126 // r = -L_lo * float_N + r
2127};;
2128
0347518d 2129{ .mfi
d5efd131 2130 nop.m 0
0347518d 2131(p15) fma.s1 FR_n_PolyL = FR_n_Tail, FR_n_XS7, FR_n_PolyL
d5efd131
MF
2132 //sin(Pi*x) poly
2133 nop.i 0
2134};;
2135
0347518d 2136{ .mfi
d5efd131
MF
2137 nop.m 0
2138 fma.s1 FR_e_poly = FR_e_r, FR_e_A3, FR_e_A2 // poly=r*A3+A2
2139 nop.i 0
2140}
2141{ .mfi
2142 nop.m 0
2143 fmpy.s1 FR_e_rsq = FR_e_r, FR_e_r // rsq = r * r
2144 nop.i 0
2145};;
2146
0347518d 2147{ .mfi
d5efd131
MF
2148 nop.m 0
2149 fmpy.s1 FR_e_T = FR_e_T1, FR_e_T2 // T = T1 * T2
2150 nop.i 0
2151}
2152{ .mfi
2153 nop.m 0
2154 fadd.s1 FR_e_W1_p1 = FR_e_W1, f1 // W1_p1 = W1 + 1.0
2155 nop.i 0
2156};;
2157
0347518d 2158{ .mfi
d5efd131
MF
2159 nop.m 0
2160(p15) fma.s1 FR_n_TT = FR_n_PolyL, FR_l_AbsX, f0 //sin(Pi*x) poly
2161 nop.i 0
2162};;
2163
0347518d 2164{ .mfi
d5efd131 2165 nop.m 0
0347518d 2166 fma.s1 FR_e_poly = FR_e_r, FR_e_poly, FR_e_A1
d5efd131
MF
2167 // poly = r * poly + A1
2168 nop.i 0
2169};;
2170
0347518d 2171{ .mfi
d5efd131
MF
2172 nop.m 0
2173 fma.s1 FR_e_T_scale = FR_e_T, FR_e_scale, f0 // T_scale=T*scale
2174 nop.i 0
2175}
2176{ .mfi
2177 nop.m 0
0347518d 2178 fma.s1 FR_e_W = FR_e_W2, FR_e_W1_p1, FR_e_W1
d5efd131
MF
2179 // W = W2 * (W1+1.0) + W1
2180 nop.i 0
2181};;
2182
0347518d 2183{ .mfi
d5efd131 2184 nop.m 0
0347518d 2185(p15) fma.s1 FR_n_SinxH = FR_n_PolyH, FR_l_AbsX, FR_n_TT
d5efd131
MF
2186 // sin(Pi*x) poly
2187 nop.i 0
2188};;
2189
0347518d 2190{ .mfi
d5efd131
MF
2191 nop.m 0
2192 mov FR_e_Y_hi = FR_e_T // Assume Y_hi = T
2193 nop.i 0
2194};;
2195
0347518d 2196{ .mfi
d5efd131 2197 nop.m 0
0347518d 2198 fma.s1 FR_e_poly = FR_e_rsq, FR_e_poly, FR_e_r
d5efd131
MF
2199 // poly = rsq * poly + r
2200 nop.i 0
2201};;
2202
0347518d 2203{ .mfi
d5efd131 2204 nop.m 0
0347518d 2205 fma.s1 FR_e_Wp1_T_scale = FR_e_W, FR_e_T_scale, FR_e_T_scale
d5efd131
MF
2206 // (W+1)*T*scale
2207 nop.i 0
2208}
2209{ .mfi
2210 nop.m 0
2211 fma.s1 FR_e_W_T_scale = FR_e_W, FR_e_T_scale, f0 // W*T*scale
2212 nop.i 0
2213};;
2214
0347518d 2215{ .mfi
d5efd131
MF
2216 nop.m 0
2217(p15) fms.s1 FR_n_SinxL = FR_n_PolyH, FR_l_AbsX, FR_n_SinxH
2218 // Low part of sin
2219 nop.i 0
2220};;
2221
0347518d 2222{ .mfi
d5efd131
MF
2223 nop.m 0
2224(p15) frcpa.s1 FR_n_Y0, p0 = f1, FR_n_SinxH // y = frcpa(b)
2225 nop.i 0
2226};;
2227
0347518d 2228{ .mfi
d5efd131
MF
2229 nop.m 0
2230 fma.s1 FR_e_result_lo = FR_e_Wp1_T_scale, FR_e_poly, FR_e_W_T_scale
2231 // Low part of exp result
2232 nop.i 0
2233};;
2234
0347518d 2235{ .mfi
d5efd131
MF
2236 nop.m 0
2237(p15) fma.s1 FR_n_SinxL = FR_n_SinxL, f1, FR_n_TT // sin low result
2238 nop.i 0
2239};;
2240
0347518d 2241{ .mfi
d5efd131
MF
2242 nop.m 0
2243(p15) fma.s1 FR_n_Q0 = f1,FR_n_Y0,f0 // q = y
2244 nop.i 0
2245}
0347518d 2246{ .mfi
d5efd131 2247 nop.m 0
0347518d 2248(p15) fnma.s1 FR_n_E0 = FR_n_Y0, FR_n_SinxH, f1 // e = 1-b*y
d5efd131
MF
2249 nop.i 0
2250};;
2251
2252
0347518d 2253{ .mfb
d5efd131
MF
2254 nop.m 0
2255(p14) fma.s0 f8 = FR_e_Y_hi, FR_e_scale, FR_e_result_lo
2256(p14) br.ret.spnt b0 // Exit for positive Stirling path //////////////////////
2257};;
2258
0347518d 2259{ .mfi
d5efd131
MF
2260 nop.m 0
2261 fma.s1 FR_e_expl_Output_X = FR_e_Y_hi, FR_e_scale, f0 // exp result
2262 nop.i 0
2263}
2264{ .mfi
2265 nop.m 0
2266 fma.s1 FR_e_expl_Output_Y = FR_e_result_lo, f1, f0// exp lo result
2267 nop.i 0
2268};;
2269
0347518d 2270{ .mfi
d5efd131
MF
2271 nop.m 0
2272 fma.s1 FR_n_E2 = FR_n_E0,FR_n_E0,FR_n_E0 // e2 = e+e^2
2273 nop.i 0
2274}
0347518d 2275{ .mfi
d5efd131
MF
2276 nop.m 0
2277 fma.s1 FR_n_E1 = FR_n_E0,FR_n_E0,f0 // e1 = e^2
2278 nop.i 0
2279};;
2280
0347518d 2281{ .mfi
d5efd131
MF
2282 nop.m 0
2283 fma.s1 FR_n_Y1 = FR_n_Y0,FR_n_E2,FR_n_Y0 // y1 = y+y*e2
2284 nop.i 0
2285}
0347518d 2286{ .mfi
d5efd131
MF
2287 nop.m 0
2288 fma.s1 FR_n_E3 = FR_n_E1,FR_n_E1,FR_n_E0 // e3 = e+e1^2
2289 nop.i 0
2290};;
2291
0347518d 2292{ .mfi
d5efd131
MF
2293 nop.m 0
2294 fma.s1 FR_n_Y2 = FR_n_Y1,FR_n_E3,FR_n_Y0 // y2 = y+y1*e3
2295 nop.i 0
2296}
0347518d 2297{ .mfi
d5efd131
MF
2298 nop.m 0
2299 fnma.s1 FR_n_R0 = FR_n_SinxH,FR_n_Q0,f1 // r = a-b*q
2300 nop.i 0
2301};;
2302
0347518d 2303{ .mfi
d5efd131
MF
2304 nop.m 0
2305 fnma.s1 FR_n_E4 = FR_n_SinxH,FR_n_Y2,f1 // e4 = 1-b*y2
2306 nop.i 0
2307}
0347518d 2308{ .mfi
d5efd131
MF
2309 nop.m 0
2310 fma.s1 FR_n_RcpResH = FR_n_R0,FR_n_Y2,FR_n_Q0 // x = q+r*y2
2311 nop.i 0
2312};;
2313
0347518d 2314{ .mfi
d5efd131
MF
2315 nop.m 0
2316 fma.s1 FR_n_Y3 = FR_n_Y2,FR_n_E4,FR_n_Y2 // y3 = y2+y2*e4
2317 nop.i 0
2318}
0347518d 2319{ .mfi
d5efd131
MF
2320 nop.m 0
2321 fnma.s1 FR_n_R1 = FR_n_SinxH,FR_n_RcpResH,f1 // r1 = a-b*x
2322 nop.i 0
2323};;
2324
0347518d 2325{ .mfi
d5efd131 2326 nop.m 0
0347518d 2327 fnma.s1 FR_n_R1 = FR_n_SinxL,FR_n_RcpResH,FR_n_R1
d5efd131
MF
2328 // r1 = r1 - b_lo*X
2329 nop.i 0
2330};;
2331
0347518d 2332{ .mfi
d5efd131
MF
2333 nop.m 0
2334 fma.s1 FR_n_RcpResL = FR_n_R1,FR_n_Y3,f0 // x_lo = r1*y3
2335 nop.i 0
2336}
0347518d 2337{ .mfi
d5efd131
MF
2338 nop.m 0
2339 fma.s1 FR_n_Temp = FR_n_RcpResH, FR_e_expl_Output_Y, f0
2340 // Multiplying exp and sin result
2341 nop.i 0
2342};;
2343
0347518d 2344{ .mfi
d5efd131
MF
2345 nop.m 0
2346 fma.s1 FR_n_Temp = FR_n_RcpResL, FR_e_expl_Output_X, FR_n_Temp
2347 // Multiplying exp and sin result
2348 nop.i 0
2349};;
2350
0347518d 2351{ .mfi
d5efd131
MF
2352 nop.m 0
2353 fma.s1 FR_n_ResH = FR_n_RcpResH, FR_e_expl_Output_X, FR_n_Temp
2354 // Multiplying exp and sin result
2355 nop.i 0
2356};;
2357
0347518d 2358{ .mfi
d5efd131
MF
2359 nop.m 0
2360 fms.s1 FR_n_ResL = FR_n_RcpResH, FR_e_expl_Output_X, FR_n_ResH
2361 // Multiplying exp and sin result
2362 nop.i 0
2363}
0347518d 2364{ .mfi
d5efd131
MF
2365 nop.m 0
2366(p12) fma.s1 FR_n_ResH = FR_n_ResH, FR_n_NegOne, f0 // Negate
2367 nop.i 0
2368};;
2369
0347518d 2370{ .mfi
d5efd131
MF
2371 nop.m 0
2372 fma.s1 FR_n_ResL = FR_n_ResL, f1, FR_n_Temp
2373 // Multiplying exp and sin result - low result obtained
2374 nop.i 0
2375};;
2376
2377.pred.rel "mutex",p12,p13
0347518d 2378{ .mfi
d5efd131
MF
2379 nop.m 0
2380(p13) fma.s0 f8 = FR_n_ResH, f1, FR_n_ResL // For odd
2381 nop.i 0
2382}
0347518d 2383{ .mfb
d5efd131
MF
2384 nop.m 0
2385(p12) fms.s0 f8 = FR_n_ResH, f1, FR_n_ResL // For even
2386 br.ret.sptk b0 // Exit for negative Stirling path //////////////////////
2387};;
2388
2389
2390//////////// 1 <= |X| < 13 path ////////////////////////////////////////////////
2391//------------------------------------------------------------------------------
2392.align 64
2393tgamma_lt_13:
0347518d 2394{ .mfi
d5efd131
MF
2395 getf.sig GR_p_XN = FR_p_IXN // Get significand
2396 fcvt.xf FR_p_XN = FR_p_IXN // xn = [x]
2397 add GR_r_sin_Table2= 0x40, GR_r_sin_Table // Shifted table addr.
0347518d 2398}
d5efd131
MF
2399{ .mfi
2400 ldfpd FR_p_0p5, FR_p_1p5 = [GR_c_Table], 16 // 0.5 & 1.5
2401 fms.s1 FR_p_AbsXM1 = FR_p_AbsX, f1, f1 // X-1
2402 add GR_p_Table2 = 0xB0, GR_p_Table
0347518d 2403};;
d5efd131 2404
0347518d 2405{ .mfi
d5efd131
MF
2406 add GR_r_sin_Table = -16, GR_r_sin_Table // For compensation
2407 fcvt.xf FR_r_XNS = FR_r_IXNS // Convert int repr to float
2408 shr.u GR_p_X_Sgnd = GR_p_X_Sgnd, 59 // Get only 5 bit of signd
0347518d 2409};;
d5efd131 2410
0347518d 2411{ .mfi
d5efd131
MF
2412 ldfpd FR_r_A2H,FR_r_A2L = [GR_r_sin_Table], 16 // Load A2
2413 nop.f 0
0347518d
MF
2414 add GR_p_Int = -2, GR_p_XN // int = int - 2
2415}
d5efd131 2416{ .mfi
0347518d 2417 ldfe FR_r_A6 = [GR_r_sin_Table2], 16
d5efd131
MF
2418 nop.f 0
2419 cmp.gtu p11, p12 = 0x2, GR_p_XN // p11: x < 2 (split intervals),
2420 // p12: x > 2 (base intervals)
0347518d 2421};;
d5efd131 2422
0347518d
MF
2423{ .mfi
2424 ldfpd FR_r_A1H, FR_r_A1L = [GR_r_sin_Table], 16
d5efd131
MF
2425 nop.f 0
2426 shr GR_p_Int = GR_p_Int, 1 // int/2
0347518d 2427}
d5efd131 2428{ .mfi
0347518d 2429 ldfe FR_r_A5 = [GR_r_sin_Table2], 16
d5efd131
MF
2430 nop.f 0
2431(p11) cmp.gtu.unc p10, p11 = 0x1C, GR_p_X_Sgnd // sgnd(x) < 0.75
0347518d 2432};;
d5efd131 2433
0347518d
MF
2434{ .mfi
2435 ldfe FR_r_A9 = [GR_r_sin_Table], 16
d5efd131
MF
2436 nop.f 0
2437 shl GR_p_Offset = GR_p_Int, 4 // offset = int*16
0347518d 2438}
d5efd131 2439{ .mfi
0347518d 2440 ldfe FR_r_A4 = [GR_r_sin_Table2], 16
d5efd131
MF
2441 nop.f 0
2442(p10) cmp.gtu.unc p9, p10 = 0x14, GR_p_X_Sgnd // sgnd(x) < 0.25
0347518d 2443};;
d5efd131
MF
2444
2445
0347518d
MF
2446{ .mfi
2447 ldfe FR_r_A8 = [GR_r_sin_Table], 16
d5efd131
MF
2448 nop.f 0
2449(p12) tbit.nz.unc p13, p12 = GR_p_XN, 0x0 // p13: recurrent computations
2450 // X is at [3;4], [5;6], [7;8]... interval
0347518d 2451}
d5efd131 2452{ .mfi
0347518d 2453 ldfe FR_r_A3 = [GR_r_sin_Table2], 16
d5efd131
MF
2454 nop.f 0
2455 shladd GR_p_Offset = GR_p_Int, 2, GR_p_Offset // +int*4
0347518d 2456};;
d5efd131
MF
2457
2458.pred.rel "mutex",p9,p11
0347518d
MF
2459{ .mfi
2460 add GR_p_Offset = GR_p_Int, GR_p_Offset
d5efd131
MF
2461 // +int, so offset = int*21
2462(p9) fms.s1 FR_p_XR = FR_p_AbsX, f1, f1 // r = x-1
0347518d
MF
2463 nop.i 0
2464}
d5efd131 2465{ .mfi
0347518d
MF
2466 ldfe FR_r_A7 = [GR_r_sin_Table], 16
2467(p11) fms.s1 FR_p_XR = FR_p_2, f1, FR_p_AbsX
d5efd131 2468 // r = 2-x for 1.75 < x < 2
0347518d
MF
2469 nop.i 0
2470};;
d5efd131
MF
2471
2472.pred.rel "mutex",p9,p10
2473.pred.rel "mutex",p10,p11
2474.pred.rel "mutex",p9,p11
0347518d 2475{ .mfi
d5efd131 2476(p9) add GR_p_Offset = 126, r0 // 1.0 < x < 1.25 table
0347518d 2477(p15) fcmp.eq.unc.s1 p7,p0 = FR_p_AbsX, FR_p_XN
d5efd131 2478 // If arg is integer and negative - singularity branch
0347518d 2479 nop.i 0
d5efd131 2480}
0347518d 2481{ .mfi
d5efd131
MF
2482(p10) add GR_p_Offset = 147, r0 // 1.25 < x < 1.75 table
2483 nop.f 0
2484(p11) add GR_p_Offset = 168, r0 // 1.75 < x < 2.0 table
0347518d 2485};;
d5efd131 2486
0347518d
MF
2487{ .mmf
2488 shladd GR_p_Table = GR_p_Offset, 4, GR_p_Table
d5efd131
MF
2489 shladd GR_p_Table2 = GR_p_Offset, 4, GR_p_Table2
2490 fma.s1 FR_r_XS = FR_r_AbsX , f1, FR_r_XNS // xs = x - [x]
0347518d 2491};;
d5efd131 2492
0347518d
MF
2493{ .mmb
2494 ldfpd FR_p_A5H, FR_p_A5L = [GR_p_Table], 16
2495 ldfpd FR_p_A2H, FR_p_A2L = [GR_p_Table2], 16
d5efd131
MF
2496(p7) br.cond.spnt tgammal_singularity // Singularity for integer /////////////
2497 // and negative argument ///////////////
2498};;
2499
0347518d
MF
2500{ .mfi
2501 ldfpd FR_p_A4H, FR_p_A4L = [GR_p_Table], 16
d5efd131 2502 fma.s1 FR_p_XN = FR_p_XN, f1, FR_p_0p5 // xn = xn+0.5
0347518d 2503 nop.i 0
d5efd131 2504}
0347518d
MF
2505{ .mfi
2506 ldfpd FR_p_A1H, FR_p_A1L = [GR_p_Table2], 16
d5efd131 2507(p10) fms.s1 FR_p_XR = FR_p_AbsX, f1, FR_p_1p5 // r = x - 1.5
0347518d 2508 nop.i 0
d5efd131
MF
2509};;
2510
0347518d
MF
2511{ .mmi
2512 ldfpd FR_p_A3H, FR_p_A3L = [GR_p_Table], 16
2513 ldfpd FR_p_A0H, FR_p_A0L = [GR_p_Table2], 16
d5efd131
MF
2514 nop.i 0
2515};;
2516
0347518d
MF
2517{ .mmi
2518 ldfe FR_p_A20 = [GR_p_Table], 16
2519 ldfe FR_p_A12 = [GR_p_Table2], 16
d5efd131
MF
2520 nop.i 0
2521};;
2522
0347518d
MF
2523{ .mmf
2524 ldfe FR_p_A19 = [GR_p_Table], 16
2525 ldfe FR_p_A11 = [GR_p_Table2], 16
d5efd131
MF
2526 fma.s1 FR_r_XS2 = FR_r_XS, FR_r_XS, f0 // xs2 = xs*xs
2527};;
2528
0347518d
MF
2529{ .mmi
2530 ldfe FR_p_A18 = [GR_p_Table], 16
2531 ldfe FR_p_A10 = [GR_p_Table2], 16
d5efd131
MF
2532 nop.i 0
2533};;
2534
2535.pred.rel "mutex",p12,p13
0347518d
MF
2536{ .mfi
2537 ldfe FR_p_A17 = [GR_p_Table], 16
d5efd131 2538(p12) fms.s1 FR_p_XR = FR_p_AbsX, f1, FR_p_XN // r = x - xn
0347518d 2539 nop.i 0
d5efd131 2540}
0347518d 2541{ .mfi
d5efd131
MF
2542 ldfe FR_p_A9 = [GR_p_Table2], 16
2543(p13) fms.s1 FR_p_XR = FR_p_AbsX, f1, FR_p_XN
0347518d 2544 nop.i 0
d5efd131
MF
2545};;
2546
0347518d
MF
2547{ .mmi
2548 ldfe FR_p_A16 = [GR_p_Table], 16
2549 ldfe FR_p_A8 = [GR_p_Table2], 16
d5efd131
MF
2550(p9) cmp.eq p12, p0 = r0, r0 // set p12 (r0 == r0 is always true)
2551};;
2552
0347518d
MF
2553{ .mmi
2554 ldfe FR_p_A15 = [GR_p_Table], 16
2555 ldfe FR_p_A7 = [GR_p_Table2], 16
d5efd131
MF
2556(p10) cmp.eq p12, p0 = r0, r0 // set p12 (r0 == r0 is always true)
2557};;
2558
0347518d
MF
2559{ .mfi
2560 ldfe FR_p_A14 = [GR_p_Table], 16
d5efd131
MF
2561 fma.s1 FR_r_TH = FR_r_A2H, FR_r_XS2, f0 // sin for neg
2562(p11) cmp.eq p12, p0 = r0, r0 // set p12 (r0 == r0 is always true)
2563}
0347518d 2564{ .mfi
d5efd131
MF
2565 ldfe FR_p_A6 = [GR_p_Table2], 16
2566 fma.s1 FR_r_TL = FR_r_A2L, FR_r_XS2, f0 // sin for neg
0347518d 2567 nop.i 0
d5efd131
MF
2568};;
2569
0347518d 2570{ .mfi
d5efd131
MF
2571 ldfe FR_p_A13 = [GR_p_Table], 16
2572 fms.s1 FR_r_XS2L = FR_r_XS, FR_r_XS, FR_r_XS2 // x2Lo part
0347518d 2573 nop.i 0
d5efd131
MF
2574};;
2575
0347518d
MF
2576{ .mfi
2577 nop.m 0
d5efd131
MF
2578 fma.s1 FR_p_Temp5H = FR_p_A5H, FR_p_XR, f0 // A5H*r
2579 // 'Low poly'
0347518d 2580 nop.i 0
d5efd131 2581}
0347518d
MF
2582{ .mfi
2583 nop.m 0
d5efd131 2584 fma.s1 FR_p_XR2 = FR_p_XR, FR_p_XR, f0 // r^2 = r*r
0347518d 2585 nop.i 0
d5efd131
MF
2586};;
2587
0347518d
MF
2588{ .mfi
2589 nop.m 0
d5efd131 2590 fabs FR_r_XS = FR_r_XS // abs(xs)
0347518d 2591 nop.i 0
d5efd131 2592}
0347518d
MF
2593{ .mfi
2594 nop.m 0
2595 fma.s1 FR_p_Temp2H = FR_p_A2H, FR_p_XR, f0 // A2H*r
d5efd131 2596 // 'High poly'
0347518d 2597 nop.i 0
d5efd131
MF
2598};;
2599
0347518d
MF
2600{ .mfi
2601 nop.m 0
d5efd131 2602 fms.s1 FR_r_TT = FR_r_A2H, FR_r_XS2, FR_r_TH // sin for neg
0347518d 2603 nop.i 0
d5efd131 2604}
0347518d
MF
2605{ .mfi
2606 nop.m 0
d5efd131 2607 fma.s1 FR_r_ResH = FR_r_TH, f1, FR_r_A1H // sin for neg
0347518d 2608 nop.i 0
d5efd131
MF
2609};;
2610
0347518d
MF
2611{ .mfi
2612 nop.m 0
d5efd131 2613 fma.s1 FR_r_TL = FR_r_A2H, FR_r_XS2L, FR_r_TL // sin for neg
0347518d 2614 nop.i 0
d5efd131
MF
2615};;
2616
0347518d
MF
2617{ .mfi
2618 nop.m 0
d5efd131
MF
2619 fms.s1 FR_p_Temp5L = FR_p_A5H,FR_p_XR,FR_p_Temp5H //A5H*r delta
2620 // 'Low poly'
0347518d 2621 nop.i 0
d5efd131 2622}
0347518d
MF
2623{ .mfi
2624 nop.m 0
d5efd131
MF
2625 fma.s1 FR_p_Poly5H = FR_p_Temp5H, f1, FR_p_A4H // A5H*r+A4H
2626 // 'Low poly'
0347518d 2627 nop.i 0
d5efd131
MF
2628};;
2629
0347518d
MF
2630{ .mfi
2631 nop.m 0
d5efd131
MF
2632 fms.s1 FR_p_Temp2L = FR_p_A2H, FR_p_XR, FR_p_Temp2H//A2H*r delta
2633 //'High poly'
0347518d 2634 nop.i 0
d5efd131 2635}
0347518d
MF
2636{ .mfi
2637 nop.m 0
d5efd131
MF
2638 fma.s1 FR_p_Poly2H = FR_p_Temp2H, f1, FR_p_A1H // A2H*r+A1H
2639 //'High poly'
0347518d 2640 nop.i 0
d5efd131
MF
2641};;
2642
0347518d
MF
2643{ .mfi
2644 nop.m 0
d5efd131 2645 fma.s1 FR_p_XR3 = FR_p_XR2, FR_p_XR, f0 // r^3 = r^2*r
0347518d 2646 nop.i 0
d5efd131 2647}
0347518d
MF
2648{ .mfi
2649 nop.m 0
d5efd131 2650 fms.s1 FR_p_XR2L = FR_p_XR, FR_p_XR, FR_p_XR2 // r^2 delta
0347518d 2651 nop.i 0
d5efd131
MF
2652};;
2653
0347518d
MF
2654{ .mfi
2655 nop.m 0
d5efd131 2656 fma.s1 FR_p_A18 = FR_p_A19, FR_p_XR, FR_p_A18 // Poly tail
0347518d 2657 nop.i 0
d5efd131 2658}
0347518d
MF
2659{ .mfi
2660 nop.m 0
d5efd131 2661 fma.s1 FR_p_A14 = FR_p_A15, FR_p_XR, FR_p_A14 // Poly tail
0347518d 2662 nop.i 0
d5efd131
MF
2663};;
2664
0347518d
MF
2665{ .mfi
2666 nop.m 0
d5efd131 2667 fma.s1 FR_p_XR4 = FR_p_XR2, FR_p_XR2, f0 // r^4 = r^2*r^2
0347518d 2668 nop.i 0
d5efd131
MF
2669};;
2670
0347518d
MF
2671{ .mfi
2672 nop.m 0
d5efd131
MF
2673 fma.s1 FR_p_Temp5L = FR_p_A5L, FR_p_XR, FR_p_Temp5L// Low part
2674 // of A5*r+A4
0347518d 2675 nop.i 0
d5efd131 2676}
0347518d
MF
2677{ .mfi
2678 nop.m 0
d5efd131
MF
2679 fms.s1 FR_p_Poly5L = FR_p_A4H, f1, FR_p_Poly5H // Low part
2680 // of A5*r+A4
0347518d 2681 nop.i 0
d5efd131
MF
2682};;
2683
0347518d
MF
2684{ .mfi
2685 nop.m 0
d5efd131 2686 fma.s1 FR_p_Temp4H = FR_p_Poly5H, FR_p_XR, f0 // (A5H*r+A4H)*r
0347518d 2687 nop.i 0
d5efd131 2688}
0347518d
MF
2689{ .mfi
2690 nop.m 0
d5efd131 2691 fma.s1 FR_p_Temp2L = FR_p_A2L, FR_p_XR, FR_p_Temp2L // A2*r low
0347518d 2692 nop.i 0
d5efd131
MF
2693};;
2694
0347518d
MF
2695{ .mfi
2696 nop.m 0
d5efd131 2697 fms.s1 FR_p_Poly2L = FR_p_A1H, f1, FR_p_Poly2H // High poly
0347518d 2698 nop.i 0
d5efd131 2699}
0347518d
MF
2700{ .mfi
2701 nop.m 0
d5efd131 2702 fma.s1 FR_p_Temp1H = FR_p_Poly2H, FR_p_XR, f0 // High poly
0347518d 2703 nop.i 0
d5efd131
MF
2704};;
2705
0347518d
MF
2706{ .mfi
2707 nop.m 0
d5efd131 2708 fms.s1 FR_p_XR3L = FR_p_XR2, FR_p_XR, FR_p_XR3 // x^3 delta
0347518d 2709 nop.i 0
d5efd131 2710}
0347518d
MF
2711{ .mfi
2712 nop.m 0
d5efd131 2713 fma.s1 FR_p_A16 = FR_p_A17, FR_p_XR, FR_p_A16 // Poly tail
0347518d 2714 nop.i 0
d5efd131
MF
2715};;
2716
0347518d
MF
2717{ .mfi
2718 nop.m 0
d5efd131 2719 fms.s1 FR_r_ResL = FR_r_A1H, f1, FR_r_ResH // sin for neg
0347518d 2720 nop.i 0
d5efd131 2721}
0347518d
MF
2722{ .mfi
2723 nop.m 0
d5efd131 2724 fma.s1 FR_r_TL = FR_r_TL, f1, FR_r_TT // sin for neg
0347518d 2725 nop.i 0
d5efd131
MF
2726};;
2727
0347518d
MF
2728{ .mfi
2729 nop.m 0
d5efd131 2730 fma.s1 FR_p_Temp5L = FR_p_Temp5L, f1, FR_p_A4L // Low poly
0347518d 2731 nop.i 0
d5efd131 2732}
0347518d
MF
2733{ .mfi
2734 nop.m 0
d5efd131 2735 fma.s1 FR_p_Poly5L = FR_p_Poly5L, f1, FR_p_Temp5H // Low poly
0347518d 2736 nop.i 0
d5efd131
MF
2737};;
2738
0347518d
MF
2739{ .mfi
2740 nop.m 0
d5efd131 2741 fms.s1 FR_p_Temp4L = FR_p_Poly5H,FR_p_XR,FR_p_Temp4H //Low poly
0347518d 2742 nop.i 0
d5efd131 2743}
0347518d
MF
2744{ .mfi
2745 nop.m 0
d5efd131 2746 fma.s1 FR_p_Poly4H = FR_p_Temp4H, f1, FR_p_A3H // Low poly
0347518d 2747 nop.i 0
d5efd131
MF
2748};;
2749
0347518d
MF
2750{ .mfi
2751 nop.m 0
d5efd131 2752 fma.s1 FR_p_Temp2L = FR_p_Temp2L, f1, FR_p_A1L // High poly
0347518d 2753 nop.i 0
d5efd131 2754}
0347518d
MF
2755{ .mfi
2756 nop.m 0
d5efd131 2757 fma.s1 FR_p_Poly2L = FR_p_Poly2L, f1, FR_p_Temp2H // High poly
0347518d 2758 nop.i 0
d5efd131
MF
2759};;
2760
0347518d
MF
2761{ .mfi
2762 nop.m 0
d5efd131 2763 fms.s1 FR_p_Temp1L = FR_p_Poly2H,FR_p_XR,FR_p_Temp1H //High poly
0347518d 2764 nop.i 0
d5efd131 2765}
0347518d
MF
2766{ .mfi
2767 nop.m 0
d5efd131 2768 fma.s1 FR_p_Poly1H = FR_p_Temp1H, f1, FR_p_A0H // High poly
0347518d 2769 nop.i 0
d5efd131
MF
2770};;
2771
0347518d
MF
2772{ .mfi
2773 nop.m 0
d5efd131 2774 fma.s1 FR_p_A12 = FR_p_A13, FR_p_XR, FR_p_A12 // Poly tail
0347518d 2775 nop.i 0
d5efd131 2776}
0347518d
MF
2777{ .mfi
2778 nop.m 0
d5efd131 2779 fma.s1 FR_p_XR3L = FR_p_XR2L, FR_p_XR, FR_p_XR3L // x^3 low
0347518d 2780 nop.i 0
d5efd131
MF
2781};;
2782
0347518d
MF
2783{ .mfi
2784 nop.m 0
d5efd131 2785 fma.s1 FR_p_Poly5L = FR_p_Poly5L, f1, FR_p_Temp5L // Low poly
0347518d 2786 nop.i 0
d5efd131 2787}
0347518d
MF
2788{ .mfi
2789 nop.m 0
d5efd131 2790 fma.s1 FR_p_A10 = FR_p_A11, FR_p_XR, FR_p_A10 // Poly tail
0347518d 2791 nop.i 0
d5efd131
MF
2792};;
2793
0347518d
MF
2794{ .mfi
2795 nop.m 0
d5efd131 2796 fms.s1 FR_p_Poly4L = FR_p_A3H, f1, FR_p_Poly4H // Low poly
0347518d 2797 nop.i 0
d5efd131 2798}
0347518d
MF
2799{ .mfi
2800 nop.m 0
d5efd131 2801 fma.s1 FR_p_A6 = FR_p_A7, FR_p_XR, FR_p_A6 // Poly tail
0347518d 2802 nop.i 0
d5efd131
MF
2803};;
2804
0347518d
MF
2805{ .mfi
2806 nop.m 0
d5efd131 2807 fma.s1 FR_p_A8 = FR_p_A9, FR_p_XR, FR_p_A8 // Poly tail
0347518d 2808 nop.i 0
d5efd131 2809}
0347518d
MF
2810{ .mfi
2811 nop.m 0
d5efd131 2812 fma.s1 FR_p_XR6 = FR_p_XR4, FR_p_XR2, f0 // Poly tail
0347518d 2813 nop.i 0
d5efd131
MF
2814};;
2815
0347518d
MF
2816{ .mfi
2817 nop.m 0
d5efd131 2818 fma.s1 FR_p_Poly2L = FR_p_Poly2L, f1, FR_p_Temp2L // High poly
0347518d 2819 nop.i 0
d5efd131 2820}
0347518d
MF
2821{ .mfi
2822 nop.m 0
d5efd131 2823 fms.s1 FR_p_Poly1L = FR_p_A0H, f1, FR_p_Poly1H // High poly
0347518d 2824 nop.i 0
d5efd131
MF
2825};;
2826
0347518d
MF
2827{ .mfi
2828 nop.m 0
d5efd131 2829 fma.s1 FR_r_ResL = FR_r_ResL, f1, FR_r_TH // sin for neg
0347518d 2830 nop.i 0
d5efd131 2831}
0347518d
MF
2832{ .mfi
2833 nop.m 0
d5efd131 2834 fma.s1 FR_r_TT = FR_r_TL, f1, FR_r_A1L // sin for neg
0347518d 2835 nop.i 0
d5efd131
MF
2836};;
2837
0347518d
MF
2838{ .mfi
2839 nop.m 0
d5efd131 2840 fma.s1 FR_p_Temp4L = FR_p_Poly5L,FR_p_XR,FR_p_Temp4L // Low poly
0347518d 2841 nop.i 0
d5efd131 2842}
0347518d
MF
2843{ .mfi
2844 nop.m 0
d5efd131 2845 fma.s1 FR_p_A18 = FR_p_A20, FR_p_XR2, FR_p_A18 // Poly tail
0347518d 2846 nop.i 0
d5efd131
MF
2847};;
2848
0347518d
MF
2849{ .mfi
2850 nop.m 0
d5efd131 2851 fma.s1 FR_p_Poly4L = FR_p_Poly4L, f1, FR_p_Temp4H // Low poly
0347518d 2852 nop.i 0
d5efd131 2853}
0347518d
MF
2854{ .mfi
2855 nop.m 0
d5efd131 2856 fma.s1 FR_p_A14 = FR_p_A16, FR_p_XR2, FR_p_A14 // Poly tail
0347518d 2857 nop.i 0
d5efd131
MF
2858};;
2859
0347518d
MF
2860{ .mfi
2861 nop.m 0
d5efd131 2862 fma.s1 FR_p_A6 = FR_p_A8, FR_p_XR2, FR_p_A6 // Poly tail
0347518d 2863 nop.i 0
d5efd131 2864}
0347518d
MF
2865{ .mfi
2866 nop.m 0
d5efd131 2867 fma.s1 FR_p_A10 = FR_p_A12, FR_p_XR2, FR_p_A10 // Poly tail
0347518d 2868 nop.i 0
d5efd131
MF
2869};;
2870
0347518d
MF
2871{ .mfi
2872 nop.m 0
d5efd131 2873 fma.s1 FR_p_Temp1L = FR_p_Poly2L,FR_p_XR,FR_p_Temp1L //High poly
0347518d 2874 nop.i 0
d5efd131 2875}
0347518d
MF
2876{ .mfi
2877 nop.m 0
d5efd131 2878 fma.s1 FR_p_Poly1L = FR_p_Poly1L, f1, FR_p_Temp1H // High poly
0347518d 2879 nop.i 0
d5efd131
MF
2880};;
2881
0347518d
MF
2882{ .mfi
2883 nop.m 0
d5efd131 2884 fma.s1 FR_r_ResL = FR_r_ResL, f1, FR_r_TT // sin for neg
0347518d 2885 nop.i 0
d5efd131 2886}
0347518d
MF
2887{ .mfi
2888 nop.m 0
d5efd131 2889 fma.s1 FR_r_TH = FR_r_ResH, FR_r_XS2, f0 // sin for neg
0347518d 2890 nop.i 0
d5efd131
MF
2891};;
2892
0347518d
MF
2893{ .mfi
2894 nop.m 0
d5efd131 2895 fma.s1 FR_p_Temp4L = FR_p_Temp4L, f1, FR_p_A3L // Low poly
0347518d 2896 nop.i 0
d5efd131 2897}
0347518d
MF
2898{ .mfi
2899 nop.m 0
d5efd131 2900 fma.s1 FR_p_Poly3H = FR_p_Poly4H, FR_p_XR3, f0 // Low poly
0347518d 2901 nop.i 0
d5efd131
MF
2902};;
2903
0347518d
MF
2904{ .mfi
2905 nop.m 0
d5efd131 2906 fma.s1 FR_p_A14 = FR_p_A18, FR_p_XR4, FR_p_A14 // Poly tail
0347518d 2907 nop.i 0
d5efd131 2908}
0347518d
MF
2909{ .mfi
2910 nop.m 0
d5efd131 2911 fma.s1 FR_p_XR8 = FR_p_XR4, FR_p_XR4, f0 // Poly tail
0347518d 2912 nop.i 0
d5efd131
MF
2913};;
2914
0347518d
MF
2915{ .mfi
2916 nop.m 0
d5efd131 2917 fma.s1 FR_r_TL = FR_r_ResH, FR_r_XS2L, f0 // sin for neg
0347518d 2918 nop.i 0
d5efd131
MF
2919};;
2920
0347518d
MF
2921{ .mfi
2922 nop.m 0
d5efd131 2923 fma.s1 FR_p_Temp1L = FR_p_Temp1L, f1, FR_p_A0L // High poly
0347518d 2924 nop.i 0
d5efd131 2925}
0347518d
MF
2926{ .mfi
2927 nop.m 0
d5efd131 2928 fma.s1 FR_p_A6 = FR_p_A10, FR_p_XR4, FR_p_A6 // Poly tail
0347518d 2929 nop.i 0
d5efd131
MF
2930};;
2931
0347518d
MF
2932{ .mfi
2933 nop.m 0
d5efd131 2934 fms.s1 FR_r_TT = FR_r_ResH, FR_r_XS2, FR_r_TH // sin for neg
0347518d 2935 nop.i 0
d5efd131 2936}
0347518d
MF
2937{ .mfi
2938 nop.m 0
d5efd131 2939 fma.s1 FR_r_Res3H = FR_r_TH, f1, f1 // sin for neg
0347518d 2940 nop.i 0
d5efd131
MF
2941};;
2942
0347518d
MF
2943{ .mfi
2944 nop.m 0
d5efd131 2945 fma.s1 FR_p_Poly4L = FR_p_Poly4L, f1, FR_p_Temp4L // Low poly
0347518d 2946 nop.i 0
d5efd131 2947}
0347518d
MF
2948{ .mfi
2949 nop.m 0
d5efd131 2950 fma.s1 FR_p_Poly3L = FR_p_Poly4H, FR_p_XR3L, f0 // Low poly
0347518d 2951 nop.i 0
d5efd131
MF
2952};;
2953
0347518d
MF
2954{ .mfi
2955 nop.m 0
d5efd131 2956 fma.s1 FR_p_Poly0H = FR_p_Poly3H,f1,FR_p_Poly1H //Low & High add
0347518d 2957 nop.i 0
d5efd131 2958}
0347518d
MF
2959{ .mfi
2960 nop.m 0
d5efd131 2961 fma.s1 FR_r_A7 = FR_r_A8, FR_r_XS2, FR_r_A7 // sin for neg
0347518d 2962 nop.i 0
d5efd131
MF
2963};;
2964
0347518d
MF
2965{ .mfi
2966 nop.m 0
d5efd131 2967 fma.s1 FR_r_TL = FR_r_ResL, FR_r_XS2, FR_r_TL // sin for neg
0347518d 2968 nop.i 0
d5efd131 2969}
0347518d
MF
2970{ .mfi
2971 nop.m 0
d5efd131 2972 fma.s1 FR_r_XS4 = FR_r_XS2, FR_r_XS2, f0 // sin for neg
0347518d 2973 nop.i 0
d5efd131
MF
2974};;
2975
0347518d
MF
2976{ .mfi
2977 nop.m 0
d5efd131 2978 fma.s1 FR_p_Poly1L = FR_p_Poly1L, f1, FR_p_Temp1L // High poly
0347518d 2979 nop.i 0
d5efd131 2980}
0347518d
MF
2981{ .mfi
2982 nop.m 0
d5efd131 2983 fma.s1 FR_p_PolyTail = FR_p_A14, FR_p_XR8, FR_p_A6 // Poly tail
0347518d 2984 nop.i 0
d5efd131
MF
2985};;
2986
0347518d
MF
2987{ .mfi
2988 nop.m 0
d5efd131 2989 fms.s1 FR_r_Res3L = f1, f1, FR_r_Res3H // sin for neg
0347518d 2990 nop.i 0
d5efd131 2991}
0347518d
MF
2992{ .mfi
2993 nop.m 0
d5efd131 2994 fma.s1 FR_r_ResH = FR_r_Res3H, FR_r_XS, f0 // sin for neg
0347518d 2995 nop.i 0
d5efd131
MF
2996};;
2997
0347518d
MF
2998{ .mfi
2999 nop.m 0
d5efd131 3000 fms.s1 FR_p_Temp0L = FR_p_Poly4H,FR_p_XR3,FR_p_Poly3H //Low poly
0347518d 3001 nop.i 0
d5efd131 3002}
0347518d
MF
3003{ .mfi
3004 nop.m 0
d5efd131 3005 fma.s1 FR_p_Poly3L = FR_p_Poly4L,FR_p_XR3,FR_p_Poly3L //Low poly
0347518d 3006 nop.i 0
d5efd131
MF
3007};;
3008
3009{ .mfi
0347518d 3010 nop.m 0
d5efd131 3011 fms.s1 FR_p_Poly0L = FR_p_Poly1H,f1,FR_p_Poly0H //Low & High add
0347518d 3012 nop.i 0
d5efd131 3013}
0347518d
MF
3014{ .mfi
3015 nop.m 0
3016(p13) fma.s1 FR_p_OddPoly0H = FR_p_Poly0H, FR_p_AbsXM1, f0
d5efd131 3017 // Recurrent computations - multiplying by X-1
0347518d 3018 nop.i 0
d5efd131
MF
3019};;
3020
0347518d
MF
3021{ .mfi
3022 nop.m 0
d5efd131 3023 fma.s1 FR_r_TL = FR_r_TL, f1, FR_r_TT // sin for neg
0347518d 3024 nop.i 0
d5efd131 3025}
0347518d
MF
3026{ .mfi
3027 nop.m 0
d5efd131 3028 fma.s1 FR_r_A3 = FR_r_A4, FR_r_XS2, FR_r_A3 // sin for neg
0347518d 3029 nop.i 0
d5efd131
MF
3030};;
3031
0347518d
MF
3032{ .mfi
3033 nop.m 0
d5efd131 3034 fma.s1 FR_p_Poly1L = FR_p_PolyTail,FR_p_XR6,FR_p_Poly1L//High
0347518d 3035 nop.i 0
d5efd131 3036}
0347518d
MF
3037{ .mfi
3038 nop.m 0
d5efd131 3039 fma.s1 FR_r_A5 = FR_r_A6, FR_r_XS2, FR_r_A5 // sin for neg
0347518d 3040 nop.i 0
d5efd131
MF
3041};;
3042
0347518d
MF
3043{ .mfi
3044 nop.m 0
d5efd131 3045 fma.s1 FR_r_Res3L = FR_r_Res3L, f1, FR_r_TH // sin for neg
0347518d 3046 nop.i 0
d5efd131 3047}
0347518d
MF
3048{ .mfi
3049 nop.m 0
d5efd131 3050 fms.s1 FR_r_ResL = FR_r_Res3H, FR_r_XS, FR_r_ResH//sin for neg
0347518d 3051 nop.i 0
d5efd131
MF
3052};;
3053
3054{ .mfi
0347518d 3055 nop.m 0
d5efd131 3056 fma.s1 FR_p_Poly3L = FR_p_Poly3L, f1, FR_p_Temp0L // Low poly
0347518d 3057 nop.i 0
d5efd131 3058}
0347518d
MF
3059{ .mfi
3060 nop.m 0
d5efd131 3061 fma.s1 FR_r_A7 = FR_r_A9, FR_r_XS4, FR_r_A7 // sin for neg
0347518d 3062 nop.i 0
d5efd131
MF
3063};;
3064
3065{ .mfi
0347518d 3066 nop.m 0
d5efd131 3067 fma.s1 FR_p_Poly0L = FR_p_Poly0L,f1,FR_p_Poly3H //Low & High add
0347518d 3068 nop.i 0
d5efd131 3069}
0347518d
MF
3070{ .mfi
3071 nop.m 0
d5efd131
MF
3072(p13) fms.s1 FR_p_OddPoly0L = FR_p_Poly0H, FR_p_AbsXM1, FR_p_OddPoly0H
3073 // Recurrent computations - multiplying by X-1 (low part)
0347518d 3074 nop.i 0
d5efd131
MF
3075};;
3076
0347518d
MF
3077{ .mfi
3078 nop.m 0
d5efd131 3079 fma.s1 FR_r_A3 = FR_r_A5, FR_r_XS4, FR_r_A3 // sin for neg
0347518d 3080 nop.i 0
d5efd131 3081}
0347518d
MF
3082{ .mfi
3083 nop.m 0
d5efd131 3084 fma.s1 FR_r_XS7 = FR_r_XS4, FR_r_XS2, f0 // xs^6
0347518d 3085 nop.i 0
d5efd131
MF
3086};;
3087
0347518d
MF
3088{ .mfi
3089 nop.m 0
d5efd131 3090 fma.s1 FR_r_Res3L = FR_r_Res3L, f1, FR_r_TL // sin for neg
0347518d 3091 nop.i 0
d5efd131 3092}
0347518d
MF
3093{ .mfi
3094 nop.m 0
d5efd131 3095 fma.s1 FR_r_XS8 = FR_r_XS4, FR_r_XS4, f0 // sin for neg
0347518d 3096 nop.i 0
d5efd131
MF
3097};;
3098
0347518d
MF
3099{ .mfi
3100 nop.m 0
d5efd131 3101 fma.s1 FR_p_Temp0H = FR_p_Poly3L,f1,FR_p_Poly1L //Low & High add
0347518d 3102 nop.i 0
d5efd131
MF
3103};;
3104
0347518d
MF
3105{ .mfi
3106 nop.m 0
d5efd131 3107 fma.s1 FR_r_XS7 = FR_r_XS7, FR_r_XS, f0 // xs^7
0347518d 3108 nop.i 0
d5efd131
MF
3109};;
3110
0347518d
MF
3111{ .mfi
3112 nop.m 0
d5efd131 3113 fma.s1 FR_r_ResL = FR_r_Res3L, FR_r_XS, FR_r_ResL//sin for neg
0347518d 3114 nop.i 0
d5efd131 3115}
0347518d
MF
3116{ .mfi
3117 nop.m 0
d5efd131 3118 fma.s1 FR_r_Tail = FR_r_A7, FR_r_XS8, FR_r_A3 // sin tail res
0347518d 3119 nop.i 0
d5efd131
MF
3120};;
3121
0347518d
MF
3122{ .mfi
3123 nop.m 0
d5efd131 3124 fma.s1 FR_p_Poly0L = FR_p_Poly0L,f1,FR_p_Temp0H //Low & High add
0347518d 3125 nop.i 0
d5efd131
MF
3126};;
3127
3128
0347518d
MF
3129{ .mfi
3130 nop.m 0
d5efd131 3131 fma.s1 FR_r_ResL = FR_r_Tail,FR_r_XS7,FR_r_ResL //sin for neg
0347518d 3132 nop.i 0
d5efd131
MF
3133};;
3134
0347518d
MF
3135{ .mfi
3136 nop.m 0
d5efd131
MF
3137(p13) fma.s1 FR_p_OddPoly0L = FR_p_Poly0L, FR_p_AbsXM1, FR_p_OddPoly0L
3138 // Recurrent computations - multiplying by X-1 (low part)
0347518d 3139 nop.i 0
d5efd131
MF
3140};;
3141
0347518d
MF
3142{ .mfi
3143 nop.m 0
d5efd131 3144 fma.s1 FR_r_TT = FR_r_ResL, FR_r_AbsX, f0 // X*sin
0347518d 3145 nop.i 0
d5efd131
MF
3146};;
3147
3148.pred.rel "mutex",p12,p13
0347518d
MF
3149{ .mfi
3150 nop.m 0
d5efd131 3151(p12) fma.s0 f8 = FR_p_Poly0H, f1, FR_p_Poly0L // Even
0347518d 3152 nop.i 0
d5efd131 3153}
0347518d
MF
3154{ .mfb
3155 nop.m 0
d5efd131
MF
3156(p13) fma.s0 f8 = FR_p_OddPoly0H, f1, FR_p_OddPoly0L // Odd
3157(p14) br.ret.spnt b0 // Exit for 1 <= |X| < 13 path (positive arguments)/////
3158};;
3159
0347518d
MF
3160{ .mfi
3161 nop.m 0
3162(p13) fma.s1 FR_p_Poly0H = FR_p_OddPoly0H, f1, f0
d5efd131 3163 // Recurrent computations
0347518d 3164 nop.i 0
d5efd131 3165}
0347518d
MF
3166{ .mfi
3167 nop.m 0
3168(p13) fma.s1 FR_p_Poly0L = FR_p_OddPoly0L, f1, f0
d5efd131 3169 // Recurrent computations
0347518d 3170 nop.i 0
d5efd131
MF
3171};;
3172
0347518d
MF
3173{ .mfi
3174 nop.m 0
d5efd131
MF
3175 fma.s1 FR_r_Res1H = FR_r_ResH, FR_r_AbsX, FR_r_TT // X*sin
3176(p11) cmp.eq p13, p12 = r0, r0
3177};;
3178
0347518d
MF
3179{ .mfi
3180 nop.m 0
d5efd131
MF
3181 fms.s1 FR_r_Res1L = FR_r_ResH,FR_r_AbsX,FR_r_Res1H// X*sin
3182(p9) cmp.eq p13, p12 = r0, r0
3183};;
3184
0347518d
MF
3185{ .mfi
3186 nop.m 0
d5efd131
MF
3187 fma.s1 FR_r_Res1L = FR_r_Res1L, f1, FR_r_TT // sin for neg
3188(p10) cmp.eq p13, p12 = r0, r0
3189}
0347518d
MF
3190{ .mfi
3191 nop.m 0
d5efd131 3192 fma.s1 FR_r_TL = FR_p_Poly0L, FR_r_Res1H, f0 // mult by sin
0347518d 3193 nop.i 0
d5efd131
MF
3194};;
3195
0347518d
MF
3196{ .mfi
3197 nop.m 0
d5efd131 3198 fma.s1 FR_r_TL = FR_p_Poly0H,FR_r_Res1L,FR_r_TL//mult by sin
0347518d 3199 nop.i 0
d5efd131
MF
3200};;
3201
0347518d
MF
3202{ .mfi
3203 nop.m 0
d5efd131 3204 fma.s1 FR_r_ResH = FR_p_Poly0H,FR_r_Res1H,FR_r_TL//mult by sin
0347518d 3205 nop.i 0
d5efd131
MF
3206};;
3207
0347518d
MF
3208{ .mfi
3209 nop.m 0
d5efd131 3210 fms.s1 FR_r_ResL = FR_p_Poly0H,FR_r_Res1H,FR_r_ResH//sin mult
0347518d 3211 nop.i 0
d5efd131
MF
3212};;
3213
0347518d
MF
3214{ .mfi
3215 nop.m 0
d5efd131 3216 frcpa.s1 FR_r_Y0,p0 = f1,FR_r_ResH // y = frcpa(b)
0347518d 3217 nop.i 0
d5efd131
MF
3218};;
3219
0347518d
MF
3220{ .mfi
3221 nop.m 0
d5efd131 3222 fneg FR_r_NegOne = f1 // Form -1.0
0347518d 3223 nop.i 0
d5efd131 3224}
0347518d
MF
3225{ .mfi
3226 nop.m 0
d5efd131 3227 fma.s1 FR_r_ResL = FR_r_ResL, f1, FR_r_TL //Low result of mult
0347518d 3228 nop.i 0
d5efd131
MF
3229};;
3230
0347518d
MF
3231{ .mfi
3232 nop.m 0
d5efd131 3233 fma.s1 FR_r_Q0 = f1,FR_r_Y0,f0 // q = a*y
0347518d 3234 nop.i 0
d5efd131 3235}
0347518d
MF
3236{ .mfi
3237 nop.m 0
3238 fnma.s1 FR_r_E0 = FR_r_Y0,FR_r_ResH,f1 // e = 1-b*y
3239 nop.i 0
d5efd131
MF
3240};;
3241
0347518d
MF
3242{ .mfi
3243 nop.m 0
d5efd131 3244 fma.s1 FR_r_E2 = FR_r_E0,FR_r_E0,FR_r_E0 // e2 = e+e^2
0347518d 3245 nop.i 0
d5efd131 3246}
0347518d
MF
3247{ .mfi
3248 nop.m 0
d5efd131 3249 fma.s1 FR_r_E1 = FR_r_E0,FR_r_E0,f0 // e1 = e^2
0347518d 3250 nop.i 0
d5efd131
MF
3251};;
3252
0347518d
MF
3253{ .mfi
3254 nop.m 0
d5efd131 3255 fma.s1 FR_r_Y1 = FR_r_Y0,FR_r_E2,FR_r_Y0 // y1 = y+y*e2
0347518d 3256 nop.i 0
d5efd131 3257}
0347518d
MF
3258{ .mfi
3259 nop.m 0
d5efd131 3260 fma.s1 FR_r_E3 = FR_r_E1,FR_r_E1,FR_r_E0 // e3 = e+e1^2
0347518d 3261 nop.i 0
d5efd131
MF
3262};;
3263
0347518d
MF
3264{ .mfi
3265 nop.m 0
d5efd131 3266 fma.s1 FR_r_Y2 = FR_r_Y1,FR_r_E3,FR_r_Y0 // y2 = y+y1*e3
0347518d 3267 nop.i 0
d5efd131 3268}
0347518d
MF
3269{ .mfi
3270 nop.m 0
d5efd131 3271 fnma.s1 FR_r_R0 = FR_r_ResH,FR_r_Q0,f1 // r = a-b*q
0347518d 3272 nop.i 0
d5efd131
MF
3273};;
3274
0347518d
MF
3275{ .mfi
3276 nop.m 0
d5efd131 3277 fnma.s1 FR_r_E4 = FR_r_ResH,FR_r_Y2,f1 // e4 = 1-b*y2
0347518d 3278 nop.i 0
d5efd131 3279}
0347518d
MF
3280{ .mfi
3281 nop.m 0
d5efd131 3282 fma.s1 FR_r_ZH = FR_r_R0,FR_r_Y2,FR_r_Q0 // x = q+r*y2
0347518d 3283 nop.i 0
d5efd131
MF
3284};;
3285
0347518d
MF
3286{ .mfi
3287 nop.m 0
d5efd131 3288 fma.s1 FR_r_Y3 = FR_r_Y2,FR_r_E4,FR_r_Y2 // y3 = y2+y2*e4
0347518d 3289 nop.i 0
d5efd131 3290}
0347518d
MF
3291{ .mfi
3292 nop.m 0
d5efd131 3293 fnma.s1 FR_r_R1 = FR_r_ResH,FR_r_ZH,f1 // r1 = a-b*x
0347518d 3294 nop.i 0
d5efd131
MF
3295};;
3296
0347518d
MF
3297{ .mfi
3298 nop.m 0
d5efd131 3299 fnma.s1 FR_r_R1 = FR_r_ResL,FR_r_ZH,FR_r_R1 // r1=r1-b_lo*X
0347518d 3300 nop.i 0
d5efd131 3301}
0347518d
MF
3302{ .mfi
3303 nop.m 0
d5efd131 3304(p12) fma.s1 FR_r_ZHN = FR_r_ZH,FR_r_NegOne, f0 // Negate for evens
0347518d 3305 nop.i 0
d5efd131
MF
3306};;
3307
3308.pred.rel "mutex",p13,p12
0347518d
MF
3309{ .mfi
3310 nop.m 0
d5efd131 3311(p13) fma.s0 f8 = FR_r_R1,FR_r_Y3,FR_r_ZH // Final result
0347518d 3312 nop.i 0
d5efd131 3313}
0347518d
MF
3314{ .mfb
3315 nop.m 0
d5efd131
MF
3316(p12) fnma.s0 f8 = FR_r_R1,FR_r_Y3,FR_r_ZHN // Final result
3317 br.ret.sptk b0 // Exit for 1 <= |X| < 13 path (negative arguments)//////
3318};;
3319
3320
3321//////////// |X| < 1 path /////////////////////////////////////////////////////
3322//------------------------------------------------------------------------------
3323.align 64
3324tgamma_lt_1:
0347518d 3325{ .mfi
d5efd131
MF
3326 getf.exp GR_p_Exp = FR_p_AbsX // exp of abs X
3327 fma.s1 FR_z_Q0 = f1,FR_z_Y0,f0 // q = a*y
3328 add GR_r_sin_Table2= 0x50, GR_r_sin_Table
0347518d 3329}
d5efd131 3330{ .mfi
0347518d
MF
3331 ldfpd FR_p_0p5, FR_p_1p5 = [GR_c_Table], 16
3332 fnma.s1 FR_z_E0 = FR_z_Y0,f8,f1 // e = 1-b*y
d5efd131 3333 add GR_p_Table2 = 0xB0, GR_p_Table
0347518d 3334};;
d5efd131 3335
0347518d 3336{ .mfi
d5efd131
MF
3337 ldfd FR_p_0p25 = [GR_c_Table]
3338 fcvt.xf FR_r_XNS = FR_r_IXNS // Convert int repr to float
0347518d 3339 shr.u GR_p_X_Sgnd = GR_p_X_Sgnd, 60
d5efd131
MF
3340 // Obtain only 4 bits of significand
3341}
0347518d 3342{ .mfi
d5efd131
MF
3343 nop.m 0
3344 nop.f 0
3345 add GR_p_Bias = 0xffff, r0 // Set bias
0347518d 3346};;
d5efd131 3347
0347518d
MF
3348{ .mfi
3349 ldfpd FR_r_A2H, FR_r_A2L = [GR_r_sin_Table], 16
d5efd131
MF
3350 nop.f 0
3351 shl GR_p_XN = GR_p_Exp, 4
3352 // Shift exp to 4 bits left to set place for significand
0347518d 3353}
d5efd131
MF
3354{ .mlx
3355 ldfe FR_r_A6 = [GR_r_sin_Table2], 16
3356 movl GR_p_0p75 = 0xfffec // 0.75
0347518d 3357};;
d5efd131 3358
0347518d
MF
3359{ .mfi
3360 ldfpd FR_r_A1H, FR_r_A1L = [GR_r_sin_Table], 16
d5efd131 3361 nop.f 0
0347518d 3362 or GR_p_XN = GR_p_XN, GR_p_X_Sgnd
d5efd131 3363 // Combine exp with 4 high bits of significand
0347518d 3364}
d5efd131 3365{ .mfi
0347518d 3366 ldfe FR_r_A5 = [GR_r_sin_Table2], 16
d5efd131
MF
3367 nop.f 0
3368 sub GR_p_Exp = GR_p_Exp, GR_p_Bias // Unbiased exp
0347518d 3369};;
d5efd131 3370
0347518d
MF
3371{ .mmi
3372 ldfe FR_r_A9 = [GR_r_sin_Table], 16
3373 ldfe FR_r_A4 = [GR_r_sin_Table2], 16
d5efd131 3374 cmp.gtu.unc p10, p11 = GR_p_0p75, GR_p_XN // sgnd(x) < 0.75
0347518d 3375};;
d5efd131 3376
0347518d
MF
3377{ .mfi
3378 ldfe FR_r_A8 = [GR_r_sin_Table], 16
d5efd131
MF
3379 fma.s1 FR_z_E2 = FR_z_E0,FR_z_E0,FR_z_E0 // e2 = e+e^2
3380(p10) cmp.gt.unc p9, p10 = -2, GR_p_Exp // x < 0.25
0347518d 3381}
d5efd131 3382{ .mfi
0347518d 3383 ldfe FR_r_A3 = [GR_r_sin_Table2], 16
d5efd131
MF
3384 fma.s1 FR_z_E1 = FR_z_E0,FR_z_E0,f0 // e1 = e^2
3385(p11) add GR_p_Offset = 168, r0 // [0.75;1] interval
0347518d 3386};;
d5efd131 3387
0347518d 3388{ .mmi
d5efd131 3389(p10) add GR_p_Offset = 147, r0 // [0.25;0.75] interval
0347518d 3390 ldfe FR_r_A7 = [GR_r_sin_Table], 16
d5efd131 3391(p9) cmp.gt.unc p8, p9 = -3, GR_p_Exp // x < 0.125
0347518d 3392};;
d5efd131
MF
3393
3394.pred.rel "mutex",p9,p8
0347518d 3395{ .mmi
d5efd131
MF
3396(p9) add GR_p_Offset = 126, r0 // [0.125;0.25] interval
3397(p8) add GR_p_Offset = 189, r0 // [0.;0.125] interval
0347518d
MF
3398 nop.i 0
3399};;
d5efd131 3400
0347518d 3401{ .mmf
d5efd131
MF
3402 shladd GR_p_Table = GR_p_Offset, 4, GR_p_Table //Make addresses
3403 shladd GR_p_Table2 = GR_p_Offset, 4, GR_p_Table2
3404 fma.s1 FR_r_XS = FR_r_AbsX , f1, FR_r_XNS // xs = |x|-[x]
0347518d 3405};;
d5efd131
MF
3406
3407.pred.rel "mutex",p8,p11
0347518d
MF
3408{ .mfi
3409 ldfpd FR_p_A5H, FR_p_A5L = [GR_p_Table], 16
d5efd131
MF
3410(p11) fms.s1 FR_p_XR = f1, f1, FR_p_AbsX // r = 1 - |x|
3411 // for [0.75;1] interval
0347518d 3412 nop.i 0
d5efd131 3413}
0347518d
MF
3414{ .mfi
3415 ldfpd FR_p_A2H, FR_p_A2L = [GR_p_Table2], 16
d5efd131
MF
3416(p8) fms.s1 FR_p_XR = FR_p_AbsX, f1, f0 // r = |x|
3417 // for [0.;0.125] interval
0347518d 3418 nop.i 0
d5efd131
MF
3419};;
3420
0347518d
MF
3421{ .mfi
3422 ldfpd FR_p_A4H, FR_p_A4L = [GR_p_Table], 16
d5efd131 3423 fma.s1 FR_z_Y1 = FR_z_Y0,FR_z_E2,FR_z_Y0 // y1 = y+y*e2
0347518d 3424 nop.i 0
d5efd131 3425}
0347518d
MF
3426{ .mfi
3427 ldfpd FR_p_A1H, FR_p_A1L = [GR_p_Table2], 16
d5efd131 3428 fma.s1 FR_z_E3 = FR_z_E1,FR_z_E1,FR_z_E0 // e3 = e+e1^2
0347518d 3429 nop.i 0
d5efd131
MF
3430};;
3431
3432.pred.rel "mutex",p9,p10
0347518d
MF
3433{ .mfi
3434 ldfpd FR_p_A3H, FR_p_A3L = [GR_p_Table], 16
d5efd131
MF
3435(p9) fms.s1 FR_p_XR = FR_p_AbsX, f1, f0 // r = |x|
3436 // for [0.125;0.25] interval
0347518d 3437 nop.i 0
d5efd131 3438}
0347518d
MF
3439{ .mfi
3440 ldfpd FR_p_A0H, FR_p_A0L = [GR_p_Table2], 16
d5efd131
MF
3441(p10) fms.s1 FR_p_XR = FR_p_AbsX, f1, FR_p_0p5 // r = |x| - 0.5
3442 // for [0.25;0.75] interval
0347518d 3443 nop.i 0
d5efd131
MF
3444};;
3445
0347518d
MF
3446{ .mmi
3447 ldfe FR_p_A20 = [GR_p_Table], 16
3448 ldfe FR_p_A12 = [GR_p_Table2], 16
d5efd131
MF
3449 nop.i 0
3450};;
3451
0347518d
MF
3452{ .mfi
3453 ldfe FR_p_A19 = [GR_p_Table], 16
d5efd131 3454 fma.s1 FR_r_XS2 = FR_r_XS, FR_r_XS, f0 // xs^2
0347518d 3455 nop.i 0
d5efd131 3456}
0347518d
MF
3457{ .mfi
3458 ldfe FR_p_A11 = [GR_p_Table2], 16
d5efd131 3459 nop.f 0
0347518d 3460 nop.i 0
d5efd131
MF
3461};;
3462
0347518d
MF
3463{ .mmi
3464 ldfe FR_p_A18 = [GR_p_Table], 16
3465 ldfe FR_p_A10 = [GR_p_Table2], 16
d5efd131
MF
3466 nop.i 0
3467};;
3468
3469.pred.rel "mutex",p12,p13
0347518d
MF
3470{ .mfi
3471 ldfe FR_p_A17 = [GR_p_Table], 16
d5efd131 3472 fma.s1 FR_z_Y2 = FR_z_Y1,FR_z_E3,FR_z_Y0 // y2 = y+y1*e3
0347518d 3473 nop.i 0
d5efd131 3474}
0347518d
MF
3475{ .mfi
3476 ldfe FR_p_A9 = [GR_p_Table2], 16
d5efd131 3477 fnma.s1 FR_z_R0 = f8,FR_z_Q0,f1 // r = a-b*q
0347518d 3478 nop.i 0
d5efd131
MF
3479};;
3480
0347518d
MF
3481{ .mmi
3482 ldfe FR_p_A16 = [GR_p_Table], 16
3483 ldfe FR_p_A8 = [GR_p_Table2], 16
3484 nop.i 0
d5efd131
MF
3485};;
3486
0347518d
MF
3487{ .mmi
3488 ldfe FR_p_A15 = [GR_p_Table], 16
3489 ldfe FR_p_A7 = [GR_p_Table2], 16
d5efd131
MF
3490 nop.i 0
3491};;
3492
0347518d
MF
3493{ .mfi
3494 ldfe FR_p_A14 = [GR_p_Table], 16
d5efd131 3495 fma.s1 FR_r_TH = FR_r_A2H, FR_r_XS2, f0 // neg sin
0347518d 3496 nop.i 0
d5efd131 3497}
0347518d
MF
3498{ .mfi
3499 ldfe FR_p_A6 = [GR_p_Table2], 16
d5efd131 3500 fma.s1 FR_r_TL = FR_r_A2L, FR_r_XS2, f0 // neg sin
0347518d 3501 nop.i 0
d5efd131
MF
3502};;
3503
0347518d
MF
3504{ .mfi
3505 ldfe FR_p_A13 = [GR_p_Table], 16
d5efd131 3506 fms.s1 FR_r_XS2L = FR_r_XS, FR_r_XS, FR_r_XS2 // xs^2 delta
0347518d 3507 nop.i 0
d5efd131
MF
3508};;
3509
0347518d
MF
3510{ .mfi
3511 nop.m 0
d5efd131 3512 fma.s1 FR_p_Temp5H = FR_p_A5H, FR_p_XR, f0 // Low poly
0347518d 3513 nop.i 0
d5efd131 3514}
0347518d
MF
3515{ .mfi
3516 nop.m 0
d5efd131 3517 fma.s1 FR_p_XR2 = FR_p_XR, FR_p_XR, f0 // poly tail
0347518d 3518 nop.i 0
d5efd131
MF
3519};;
3520
0347518d
MF
3521{ .mfi
3522 nop.m 0
d5efd131 3523 fabs FR_r_XS = FR_r_XS // Absolute value of xs
0347518d 3524 nop.i 0
d5efd131 3525}
0347518d
MF
3526{ .mfi
3527 nop.m 0
d5efd131 3528 fma.s1 FR_p_Temp2H = FR_p_A2H, FR_p_XR, f0 // High poly
0347518d 3529 nop.i 0
d5efd131
MF
3530};;
3531
0347518d
MF
3532{ .mfi
3533 nop.m 0
d5efd131 3534 fnma.s1 FR_z_E4 = f8,FR_z_Y2,f1 // e4 = 1-b*y2
0347518d 3535 nop.i 0
d5efd131 3536}
0347518d
MF
3537{ .mfi
3538 nop.m 0
d5efd131 3539 fma.s1 FR_z_ZH = FR_z_R0,FR_z_Y2,FR_z_Q0 // 1/x = q+r*y2
0347518d 3540 nop.i 0
d5efd131
MF
3541};;
3542
0347518d
MF
3543{ .mfi
3544 nop.m 0
d5efd131 3545 fms.s1 FR_r_TT = FR_r_A2H, FR_r_XS2, FR_r_TH // neg sin
0347518d 3546 nop.i 0
d5efd131 3547}
0347518d
MF
3548{ .mfi
3549 nop.m 0
d5efd131 3550 fma.s1 FR_r_ResH = FR_r_TH, f1, FR_r_A1H // neg sin
0347518d 3551 nop.i 0
d5efd131
MF
3552};;
3553
0347518d
MF
3554{ .mfi
3555 nop.m 0
d5efd131 3556 fma.s1 FR_r_TL = FR_r_A2H, FR_r_XS2L, FR_r_TL // neg sin
0347518d 3557 nop.i 0
d5efd131
MF
3558};;
3559
0347518d
MF
3560{ .mfi
3561 nop.m 0
d5efd131 3562 fms.s1 FR_p_Temp5L = FR_p_A5H, FR_p_XR, FR_p_Temp5H // Low poly
0347518d 3563 nop.i 0
d5efd131 3564}
0347518d
MF
3565{ .mfi
3566 nop.m 0
d5efd131 3567 fma.s1 FR_p_Poly5H = FR_p_Temp5H, f1, FR_p_A4H // Low poly
0347518d 3568 nop.i 0
d5efd131
MF
3569};;
3570
0347518d
MF
3571{ .mfi
3572 nop.m 0
d5efd131 3573 fms.s1 FR_p_Temp2L = FR_p_A2H, FR_p_XR, FR_p_Temp2H // High poly
0347518d 3574 nop.i 0
d5efd131 3575}
0347518d
MF
3576{ .mfi
3577 nop.m 0
d5efd131 3578 fma.s1 FR_p_Poly2H = FR_p_Temp2H, f1, FR_p_A1H // High poly
0347518d 3579 nop.i 0
d5efd131
MF
3580};;
3581
0347518d
MF
3582{ .mfi
3583 nop.m 0
d5efd131 3584 fma.s1 FR_p_XR3 = FR_p_XR2, FR_p_XR, f0 // r^3
0347518d 3585 nop.i 0
d5efd131 3586}
0347518d
MF
3587{ .mfi
3588 nop.m 0
d5efd131 3589 fms.s1 FR_p_XR2L = FR_p_XR, FR_p_XR, FR_p_XR2 // r^2 delta
0347518d 3590 nop.i 0
d5efd131
MF
3591};;
3592
0347518d
MF
3593{ .mfi
3594 nop.m 0
d5efd131 3595 fma.s1 FR_p_A18 = FR_p_A19, FR_p_XR, FR_p_A18 // poly tail
0347518d 3596 nop.i 0
d5efd131 3597}
0347518d
MF
3598{ .mfi
3599 nop.m 0
d5efd131 3600 fma.s1 FR_p_A14 = FR_p_A15, FR_p_XR, FR_p_A14 // poly tail
0347518d 3601 nop.i 0
d5efd131
MF
3602};;
3603
0347518d
MF
3604{ .mfi
3605 nop.m 0
d5efd131 3606 fma.s1 FR_p_XR4 = FR_p_XR2, FR_p_XR2, f0 // poly tail
0347518d 3607 nop.i 0
d5efd131 3608}
0347518d
MF
3609{ .mfi
3610 nop.m 0
d5efd131 3611 fma.s1 FR_z_Y3 = FR_z_Y2,FR_z_E4,FR_z_Y2 // y3 = y2+y2*e4
0347518d 3612 nop.i 0
d5efd131
MF
3613};;
3614
0347518d
MF
3615{ .mfi
3616 nop.m 0
d5efd131 3617 fma.s1 FR_p_Temp5L = FR_p_A5L, FR_p_XR, FR_p_Temp5L // Low poly
0347518d 3618 nop.i 0
d5efd131 3619}
0347518d
MF
3620{ .mfi
3621 nop.m 0
d5efd131 3622 fms.s1 FR_p_Poly5L = FR_p_A4H, f1, FR_p_Poly5H // Low poly
0347518d 3623 nop.i 0
d5efd131
MF
3624};;
3625
0347518d
MF
3626{ .mfi
3627 nop.m 0
d5efd131 3628 fma.s1 FR_p_Temp4H = FR_p_Poly5H, FR_p_XR, f0 // Low poly
0347518d 3629 nop.i 0
d5efd131 3630}
0347518d
MF
3631{ .mfi
3632 nop.m 0
d5efd131 3633 fma.s1 FR_p_Temp2L = FR_p_A2L, FR_p_XR, FR_p_Temp2L // High poly
0347518d 3634 nop.i 0
d5efd131
MF
3635};;
3636
0347518d
MF
3637{ .mfi
3638 nop.m 0
d5efd131 3639 fms.s1 FR_p_Poly2L = FR_p_A1H, f1, FR_p_Poly2H // High poly
0347518d 3640 nop.i 0
d5efd131 3641}
0347518d
MF
3642{ .mfi
3643 nop.m 0
d5efd131 3644 fma.s1 FR_p_Temp1H = FR_p_Poly2H, FR_p_XR, f0 // High poly
0347518d 3645 nop.i 0
d5efd131
MF
3646};;
3647
0347518d
MF
3648{ .mfi
3649 nop.m 0
d5efd131 3650 fms.s1 FR_p_XR3L = FR_p_XR2, FR_p_XR, FR_p_XR3 // x^3 delta
0347518d 3651 nop.i 0
d5efd131 3652}
0347518d
MF
3653{ .mfi
3654 nop.m 0
d5efd131 3655 fma.s1 FR_p_A16 = FR_p_A17, FR_p_XR, FR_p_A16 //poly tail
0347518d 3656 nop.i 0
d5efd131
MF
3657};;
3658
0347518d
MF
3659{ .mfi
3660 nop.m 0
d5efd131 3661 fms.s1 FR_r_ResL = FR_r_A1H, f1, FR_r_ResH // neg sin
0347518d 3662 nop.i 0
d5efd131 3663}
0347518d
MF
3664{ .mfi
3665 nop.m 0
d5efd131 3666 fma.s1 FR_r_TL = FR_r_TL, f1, FR_r_TT // neg sin
0347518d 3667 nop.i 0
d5efd131
MF
3668};;
3669
0347518d
MF
3670{ .mfi
3671 nop.m 0
d5efd131 3672 fma.s1 FR_p_Temp5L = FR_p_Temp5L, f1, FR_p_A4L // Low poly
0347518d 3673 nop.i 0
d5efd131 3674}
0347518d
MF
3675{ .mfi
3676 nop.m 0
d5efd131 3677 fma.s1 FR_p_Poly5L = FR_p_Poly5L, f1, FR_p_Temp5H //Low poly
0347518d 3678 nop.i 0
d5efd131
MF
3679};;
3680
0347518d
MF
3681{ .mfi
3682 nop.m 0
d5efd131 3683 fms.s1 FR_p_Temp4L = FR_p_Poly5H, FR_p_XR, FR_p_Temp4H//Low poly
0347518d 3684 nop.i 0
d5efd131 3685}
0347518d
MF
3686{ .mfi
3687 nop.m 0
d5efd131 3688 fma.s1 FR_p_Poly4H = FR_p_Temp4H, f1, FR_p_A3H // Low poly
0347518d 3689 nop.i 0
d5efd131
MF
3690};;
3691
0347518d
MF
3692{ .mfi
3693 nop.m 0
d5efd131 3694 fma.s1 FR_p_Temp2L = FR_p_Temp2L, f1, FR_p_A1L // High poly
0347518d 3695 nop.i 0
d5efd131 3696}
0347518d
MF
3697{ .mfi
3698 nop.m 0
d5efd131 3699 fma.s1 FR_p_Poly2L = FR_p_Poly2L, f1, FR_p_Temp2H // High poly
0347518d 3700 nop.i 0
d5efd131
MF
3701};;
3702
0347518d
MF
3703{ .mfi
3704 nop.m 0
d5efd131 3705 fms.s1 FR_p_Temp1L = FR_p_Poly2H,FR_p_XR,FR_p_Temp1H //High poly
0347518d 3706 nop.i 0
d5efd131 3707}
0347518d
MF
3708{ .mfi
3709 nop.m 0
d5efd131 3710 fma.s1 FR_p_Poly1H = FR_p_Temp1H, f1, FR_p_A0H // High poly
0347518d 3711 nop.i 0
d5efd131
MF
3712};;
3713
0347518d
MF
3714{ .mfi
3715 nop.m 0
d5efd131 3716 fma.s1 FR_p_A12 = FR_p_A13, FR_p_XR, FR_p_A12 // poly tail
0347518d 3717 nop.i 0
d5efd131 3718}
0347518d
MF
3719{ .mfi
3720 nop.m 0
d5efd131 3721 fma.s1 FR_p_XR3L = FR_p_XR2L, FR_p_XR, FR_p_XR3L // x^3 low
0347518d 3722 nop.i 0
d5efd131
MF
3723};;
3724
0347518d
MF
3725{ .mfi
3726 nop.m 0
d5efd131 3727 fma.s1 FR_p_Poly5L = FR_p_Poly5L, f1, FR_p_Temp5L //Low poly
0347518d 3728 nop.i 0
d5efd131 3729}
0347518d
MF
3730{ .mfi
3731 nop.m 0
d5efd131 3732 fma.s1 FR_p_A10 = FR_p_A11, FR_p_XR, FR_p_A10 //poly tail
0347518d 3733 nop.i 0
d5efd131
MF
3734};;
3735
0347518d
MF
3736{ .mfi
3737 nop.m 0
d5efd131 3738 fms.s1 FR_p_Poly4L = FR_p_A3H, f1, FR_p_Poly4H /// Low poly
0347518d 3739 nop.i 0
d5efd131 3740}
0347518d
MF
3741{ .mfi
3742 nop.m 0
d5efd131 3743 fma.s1 FR_p_A6 = FR_p_A7, FR_p_XR, FR_p_A6 // poly tail
0347518d 3744 nop.i 0
d5efd131
MF
3745};;
3746
0347518d
MF
3747{ .mfi
3748 nop.m 0
d5efd131 3749 fma.s1 FR_p_A8 = FR_p_A9, FR_p_XR, FR_p_A8 // poly tail
0347518d 3750 nop.i 0
d5efd131 3751}
0347518d
MF
3752{ .mfi
3753 nop.m 0
d5efd131 3754 fma.s1 FR_p_XR6 = FR_p_XR4, FR_p_XR2, f0 // r^6
0347518d 3755 nop.i 0
d5efd131
MF
3756};;
3757
0347518d
MF
3758{ .mfi
3759 nop.m 0
d5efd131 3760 fma.s1 FR_p_Poly2L = FR_p_Poly2L, f1, FR_p_Temp2L // High poly
0347518d 3761 nop.i 0
d5efd131 3762}
0347518d
MF
3763{ .mfi
3764 nop.m 0
d5efd131 3765 fms.s1 FR_p_Poly1L = FR_p_A0H, f1, FR_p_Poly1H // High poly
0347518d 3766 nop.i 0
d5efd131
MF
3767};;
3768
0347518d
MF
3769{ .mfi
3770 nop.m 0
d5efd131 3771 fma.s1 FR_r_ResL = FR_r_ResL, f1, FR_r_TH // neg sin
0347518d 3772 nop.i 0
d5efd131 3773}
0347518d
MF
3774{ .mfi
3775 nop.m 0
d5efd131 3776 fma.s1 FR_r_TT = FR_r_TL, f1, FR_r_A1L // neg sin
0347518d 3777 nop.i 0
d5efd131
MF
3778};;
3779
0347518d
MF
3780{ .mfi
3781 nop.m 0
d5efd131 3782 fma.s1 FR_p_Temp4L = FR_p_Poly5L,FR_p_XR,FR_p_Temp4L //Low poly
0347518d 3783 nop.i 0
d5efd131 3784}
0347518d
MF
3785{ .mfi
3786 nop.m 0
d5efd131 3787 fma.s1 FR_p_A18 = FR_p_A20, FR_p_XR2, FR_p_A18 // poly tail
0347518d 3788 nop.i 0
d5efd131
MF
3789};;
3790
0347518d
MF
3791{ .mfi
3792 nop.m 0
d5efd131 3793 fma.s1 FR_p_Poly4L = FR_p_Poly4L, f1, FR_p_Temp4H // Low poly
0347518d 3794 nop.i 0
d5efd131 3795}
0347518d
MF
3796{ .mfi
3797 nop.m 0
d5efd131 3798 fma.s1 FR_p_A14 = FR_p_A16, FR_p_XR2, FR_p_A14 // poly tail
0347518d 3799 nop.i 0
d5efd131
MF
3800};;
3801
0347518d
MF
3802{ .mfi
3803 nop.m 0
d5efd131 3804 fma.s1 FR_p_A6 = FR_p_A8, FR_p_XR2, FR_p_A6 // poly tail
0347518d 3805 nop.i 0
d5efd131 3806}
0347518d
MF
3807{ .mfi
3808 nop.m 0
d5efd131 3809 fma.s1 FR_p_A10 = FR_p_A12, FR_p_XR2, FR_p_A10 // poly tail
0347518d 3810 nop.i 0
d5efd131
MF
3811};;
3812
0347518d
MF
3813{ .mfi
3814 nop.m 0
d5efd131 3815 fma.s1 FR_p_Temp1L = FR_p_Poly2L,FR_p_XR,FR_p_Temp1L //High poly
0347518d 3816 nop.i 0
d5efd131 3817}
0347518d
MF
3818{ .mfi
3819 nop.m 0
d5efd131 3820 fma.s1 FR_p_Poly1L = FR_p_Poly1L, f1, FR_p_Temp1H // High poly
0347518d 3821 nop.i 0
d5efd131
MF
3822};;
3823
0347518d
MF
3824{ .mfi
3825 nop.m 0
d5efd131 3826 fma.s1 FR_r_ResL = FR_r_ResL, f1, FR_r_TT // neg sin
0347518d 3827 nop.i 0
d5efd131 3828}
0347518d
MF
3829{ .mfi
3830 nop.m 0
d5efd131 3831 fma.s1 FR_r_TH = FR_r_ResH, FR_r_XS2, f0 // neg sin
0347518d 3832 nop.i 0
d5efd131
MF
3833};;
3834
0347518d
MF
3835{ .mfi
3836 nop.m 0
d5efd131 3837 fma.s1 FR_p_Temp4L = FR_p_Temp4L, f1, FR_p_A3L // Low poly
0347518d 3838 nop.i 0
d5efd131 3839}
0347518d
MF
3840{ .mfi
3841 nop.m 0
d5efd131 3842 fma.s1 FR_p_Poly3H = FR_p_Poly4H, FR_p_XR3, f0 // Low poly
0347518d 3843 nop.i 0
d5efd131
MF
3844};;
3845
0347518d
MF
3846{ .mfi
3847 nop.m 0
d5efd131 3848 fma.s1 FR_p_A14 = FR_p_A18, FR_p_XR4, FR_p_A14 // poly tail
0347518d 3849 nop.i 0
d5efd131 3850}
0347518d
MF
3851{ .mfi
3852 nop.m 0
d5efd131 3853 fma.s1 FR_p_XR8 = FR_p_XR4, FR_p_XR4, f0 // r^8
0347518d 3854 nop.i 0
d5efd131
MF
3855};;
3856
0347518d
MF
3857{ .mfi
3858 nop.m 0
d5efd131 3859 fma.s1 FR_r_TL = FR_r_ResH, FR_r_XS2L, f0 // neg sin
0347518d 3860 nop.i 0
d5efd131 3861}
0347518d
MF
3862{ .mfi
3863 nop.m 0
d5efd131 3864 fnma.s1 FR_z_R1 = f8,FR_z_ZH,f1 // r1 = a-b*x
0347518d 3865 nop.i 0
d5efd131
MF
3866};;
3867
0347518d
MF
3868{ .mfi
3869 nop.m 0
d5efd131 3870 fma.s1 FR_p_Temp1L = FR_p_Temp1L, f1, FR_p_A0L // High poly
0347518d 3871 nop.i 0
d5efd131 3872}
0347518d
MF
3873{ .mfi
3874 nop.m 0
d5efd131 3875 fma.s1 FR_p_A6 = FR_p_A10, FR_p_XR4, FR_p_A6 // poly tail
0347518d 3876 nop.i 0
d5efd131
MF
3877};;
3878
0347518d
MF
3879{ .mfi
3880 nop.m 0
d5efd131 3881 fms.s1 FR_r_TT = FR_r_ResH, FR_r_XS2, FR_r_TH // neg sin
0347518d 3882 nop.i 0
d5efd131 3883}
0347518d
MF
3884{ .mfi
3885 nop.m 0
d5efd131 3886 fma.s1 FR_r_Res3H = FR_r_TH, f1, f1 // neg sin
0347518d 3887 nop.i 0
d5efd131
MF
3888};;
3889
0347518d
MF
3890{ .mfi
3891 nop.m 0
d5efd131 3892 fma.s1 FR_p_Poly4L = FR_p_Poly4L, f1, FR_p_Temp4L // Low poly
0347518d 3893 nop.i 0
d5efd131 3894}
0347518d
MF
3895{ .mfi
3896 nop.m 0
d5efd131 3897 fma.s1 FR_p_Poly3L = FR_p_Poly4H, FR_p_XR3L, f0 // Low poly
0347518d 3898 nop.i 0
d5efd131
MF
3899};;
3900
0347518d
MF
3901{ .mfi
3902 nop.m 0
d5efd131 3903 fma.s1 FR_p_Poly0H = FR_p_Poly3H, f1, FR_p_Poly1H // Result
0347518d 3904 nop.i 0
d5efd131 3905}
0347518d
MF
3906{ .mfi
3907 nop.m 0
d5efd131 3908 fma.s1 FR_r_A7 = FR_r_A8, FR_r_XS2, FR_r_A7 // neg sin
0347518d 3909 nop.i 0
d5efd131
MF
3910};;
3911
0347518d
MF
3912{ .mfi
3913 nop.m 0
d5efd131 3914 fma.s1 FR_r_TL = FR_r_ResL, FR_r_XS2, FR_r_TL // neg sin
0347518d 3915 nop.i 0
d5efd131 3916}
0347518d
MF
3917{ .mfi
3918 nop.m 0
d5efd131 3919 fma.s1 FR_r_XS4 = FR_r_XS2, FR_r_XS2, f0 // xs^4
0347518d 3920 nop.i 0
d5efd131
MF
3921};;
3922
0347518d
MF
3923{ .mfi
3924 nop.m 0
d5efd131 3925 fma.s1 FR_p_Poly1L = FR_p_Poly1L, f1, FR_p_Temp1L // High poly
0347518d 3926 nop.i 0
d5efd131 3927}
0347518d
MF
3928{ .mfi
3929 nop.m 0
d5efd131 3930 fma.s1 FR_p_PolyTail = FR_p_A14, FR_p_XR8, FR_p_A6 // poly tail
0347518d 3931 nop.i 0
d5efd131
MF
3932};;
3933
0347518d
MF
3934{ .mfi
3935 nop.m 0
d5efd131 3936 fms.s1 FR_r_Res3L = f1, f1, FR_r_Res3H // neg sin
0347518d 3937 nop.i 0
d5efd131 3938}
0347518d
MF
3939{ .mfi
3940 nop.m 0
d5efd131 3941 fma.s1 FR_r_ResH = FR_r_Res3H, FR_r_XS, f0 // neg sin
0347518d 3942 nop.i 0
d5efd131
MF
3943};;
3944
0347518d
MF
3945{ .mfi
3946 nop.m 0
d5efd131 3947 fms.s1 FR_p_Temp0L = FR_p_Poly4H,FR_p_XR3,FR_p_Poly3H //Low poly
0347518d 3948 nop.i 0
d5efd131 3949}
0347518d
MF
3950{ .mfi
3951 nop.m 0
d5efd131 3952 fma.s1 FR_p_Poly3L = FR_p_Poly4L,FR_p_XR3,FR_p_Poly3L //Low poly
0347518d 3953 nop.i 0
d5efd131
MF
3954};;
3955
0347518d
MF
3956{ .mfi
3957 nop.m 0
d5efd131 3958 fms.s1 FR_p_Poly0L = FR_p_Poly1H, f1, FR_p_Poly0H // Result
0347518d 3959 nop.i 0
d5efd131 3960}
0347518d
MF
3961{ .mfi
3962 nop.m 0
d5efd131 3963 fma.s1 FR_z_ZL = FR_z_R1,FR_z_Y3, f0 // x_lo = r1*y3
0347518d 3964 nop.i 0
d5efd131
MF
3965};;
3966
0347518d
MF
3967{ .mfi
3968 nop.m 0
d5efd131 3969 fma.s1 FR_r_TL = FR_r_TL, f1, FR_r_TT // neg sin
0347518d 3970 nop.i 0
d5efd131 3971}
0347518d
MF
3972{ .mfi
3973 nop.m 0
d5efd131 3974 fma.s1 FR_r_A3 = FR_r_A4, FR_r_XS2, FR_r_A3 /// neg sin
0347518d 3975 nop.i 0
d5efd131
MF
3976};;
3977
0347518d
MF
3978{ .mfi
3979 nop.m 0
d5efd131 3980 fma.s1 FR_p_Poly1L = FR_p_PolyTail,FR_p_XR6,FR_p_Poly1L // High
0347518d 3981 nop.i 0
d5efd131 3982}
0347518d
MF
3983{ .mfi
3984 nop.m 0
d5efd131 3985 fma.s1 FR_r_A5 = FR_r_A6, FR_r_XS2, FR_r_A5 // neg sin
0347518d 3986 nop.i 0
d5efd131
MF
3987};;
3988
0347518d
MF
3989{ .mfi
3990 nop.m 0
d5efd131 3991 fma.s1 FR_r_Res3L = FR_r_Res3L, f1, FR_r_TH // neg sin
0347518d 3992 nop.i 0
d5efd131 3993}
0347518d
MF
3994{ .mfi
3995 nop.m 0
d5efd131 3996 fms.s1 FR_r_ResL = FR_r_Res3H, FR_r_XS, FR_r_ResH // neg sin
0347518d 3997 nop.i 0
d5efd131
MF
3998};;
3999
0347518d
MF
4000{ .mfi
4001 nop.m 0
d5efd131 4002 fma.s1 FR_p_Poly3L = FR_p_Poly3L, f1, FR_p_Temp0L // Low poly
0347518d 4003 nop.i 0
d5efd131 4004}
0347518d
MF
4005{ .mfi
4006 nop.m 0
d5efd131 4007 fma.s1 FR_r_A7 = FR_r_A9, FR_r_XS4, FR_r_A7 // neg sin
0347518d 4008 nop.i 0
d5efd131
MF
4009};;
4010
0347518d
MF
4011{ .mfi
4012 nop.m 0
d5efd131 4013 fma.s1 FR_p_Poly0L = FR_p_Poly0L, f1, FR_p_Poly3H // result
0347518d 4014 nop.i 0
d5efd131
MF
4015};;
4016
0347518d
MF
4017{ .mfi
4018 nop.m 0
d5efd131 4019(p14) fma.s1 f8 = FR_p_Poly0H, FR_z_ZH, f0 // z*poly
0347518d 4020 nop.i 0
d5efd131 4021}
0347518d
MF
4022{ .mfi
4023 nop.m 0
d5efd131 4024 fma.s1 FR_p_Temp1L = FR_p_Poly0H, FR_z_ZL, f0 // z*poly low
0347518d 4025 nop.i 0
d5efd131
MF
4026};;
4027
0347518d
MF
4028{ .mfi
4029 nop.m 0
d5efd131 4030 fma.s1 FR_r_A3 = FR_r_A5, FR_r_XS4, FR_r_A3 // sin tail
0347518d 4031 nop.i 0
d5efd131 4032}
0347518d
MF
4033{ .mfi
4034 nop.m 0
d5efd131 4035 fma.s1 FR_r_XS7 = FR_r_XS4, FR_r_XS2, f0 // xs^6
0347518d 4036 nop.i 0
d5efd131
MF
4037};;
4038
0347518d
MF
4039{ .mfi
4040 nop.m 0
d5efd131 4041 fma.s1 FR_r_Res3L = FR_r_Res3L, f1, FR_r_TL // sin low
0347518d 4042 nop.i 0
d5efd131 4043}
0347518d
MF
4044{ .mfi
4045 nop.m 0
d5efd131 4046 fma.s1 FR_r_XS8 = FR_r_XS4, FR_r_XS4, f0 // xs^8
0347518d 4047 nop.i 0
d5efd131
MF
4048};;
4049
0347518d
MF
4050{ .mfi
4051 nop.m 0
d5efd131 4052 fma.s1 FR_p_Temp0H = FR_p_Poly3L, f1, FR_p_Poly1L // result
0347518d 4053 nop.i 0
d5efd131
MF
4054};;
4055
0347518d
MF
4056{ .mfi
4057 nop.m 0
d5efd131 4058(p14) fms.s1 FR_p_Temp1H = FR_p_Poly0H, FR_z_ZH, f8 // hi result
0347518d 4059 nop.i 0
d5efd131
MF
4060};;
4061
0347518d
MF
4062{ .mfi
4063 nop.m 0
d5efd131 4064 fma.s1 FR_r_XS7 = FR_r_XS7, FR_r_XS, f0 // xs^7
0347518d 4065 nop.i 0
d5efd131
MF
4066};;
4067
0347518d
MF
4068{ .mfi
4069 nop.m 0
d5efd131 4070 fma.s1 FR_r_ResL = FR_r_Res3L, FR_r_XS, FR_r_ResL // lo result
0347518d 4071 nop.i 0
d5efd131 4072}
0347518d
MF
4073{ .mfi
4074 nop.m 0
d5efd131 4075 fma.s1 FR_r_Tail = FR_r_A7, FR_r_XS8, FR_r_A3 // tail result
0347518d 4076 nop.i 0
d5efd131
MF
4077};;
4078
0347518d
MF
4079{ .mfi
4080 nop.m 0
d5efd131 4081 fma.s1 FR_p_Poly0L = FR_p_Poly0L, f1, FR_p_Temp0H // lo result
0347518d 4082 nop.i 0
d5efd131
MF
4083};;
4084
0347518d
MF
4085{ .mfi
4086 nop.m 0
d5efd131 4087 fma.s1 FR_r_ResL = FR_r_Tail, FR_r_XS7, FR_r_ResL // lo result
0347518d 4088 nop.i 0
d5efd131
MF
4089};;
4090
0347518d
MF
4091{ .mfi
4092 nop.m 0
d5efd131 4093(p14) fma.s1 FR_p_Temp1L = FR_p_Poly0L,FR_z_ZH,FR_p_Temp1L //hi result
0347518d 4094 nop.i 0
d5efd131
MF
4095};;
4096
0347518d
MF
4097{ .mfi
4098 nop.m 0
d5efd131 4099 fma.s1 FR_r_TT = FR_r_ResL, f1, f0 // for low result
0347518d 4100 nop.i 0
d5efd131
MF
4101};;
4102
4103.pred.rel "mutex",p12,p13
0347518d
MF
4104{ .mfi
4105 nop.m 0
d5efd131 4106(p14) fma.s1 FR_p_Temp1L = FR_p_Temp1L, f1, FR_p_Temp1H // for lo res
0347518d 4107 nop.i 0
d5efd131
MF
4108};;
4109
0347518d 4110{ .mfi
d5efd131
MF
4111(p10) cmp.eq p13, p12 = r0, r0 // set p13, clear p12
4112 fma.s1 FR_r_Res1H = FR_r_ResH, f1, FR_r_TT // hi res
0347518d 4113 nop.i 0
d5efd131
MF
4114};;
4115
0347518d 4116{ .mfb
d5efd131
MF
4117(p9) cmp.eq p13, p12 = r0, r0 // set p13, clear p12
4118(p14) fma.s0 f8 = f8, f1, FR_p_Temp1L // Final result
4119(p14) br.ret.spnt b0 // Exit for 0 < |X| < 1 path (positive arguments)///////
4120};;
4121
0347518d 4122{ .mfi
d5efd131
MF
4123(p11) cmp.eq p13, p12 = r0, r0 // set p13, clear p12
4124 fms.s1 FR_r_Res1L = FR_r_ResH, f1, FR_r_Res1H // Low sin result
0347518d 4125 nop.i 0
d5efd131
MF
4126};;
4127
0347518d
MF
4128{ .mfi
4129 nop.m 0
d5efd131 4130 fma.s1 FR_r_Res1L = FR_r_Res1L, f1, FR_r_TT // Low sin result
0347518d 4131 nop.i 0
d5efd131 4132}
0347518d
MF
4133{ .mfi
4134 nop.m 0
d5efd131 4135 fma.s1 FR_r_TL = FR_p_Poly0L,FR_r_Res1H,f0 //Low sin result
0347518d 4136 nop.i 0
d5efd131
MF
4137};;
4138
0347518d
MF
4139{ .mfi
4140 nop.m 0
d5efd131 4141 fma.s1 FR_r_TL = FR_p_Poly0H, FR_r_Res1L, FR_r_TL //Low sin
0347518d 4142 nop.i 0
d5efd131
MF
4143};;
4144
0347518d
MF
4145{ .mfi
4146 nop.m 0
d5efd131 4147 fma.s1 FR_r_ResH = FR_p_Poly0H, FR_r_Res1H, FR_r_TL //High sin
0347518d 4148 nop.i 0
d5efd131
MF
4149};;
4150
0347518d
MF
4151{ .mfi
4152 nop.m 0
d5efd131 4153 fms.s1 FR_r_ResL = FR_p_Poly0H,FR_r_Res1H,FR_r_ResH //Low res
0347518d 4154 nop.i 0
d5efd131
MF
4155};;
4156
0347518d
MF
4157{ .mfi
4158 nop.m 0
d5efd131 4159 frcpa.s1 FR_r_Y0,p0 = f1,FR_r_ResH // y = frcpa(b)
0347518d 4160 nop.i 0
d5efd131
MF
4161};;
4162
0347518d
MF
4163{ .mfi
4164 nop.m 0
d5efd131 4165 fneg FR_r_NegOne = f1 // Construct -1.0
0347518d 4166 nop.i 0
d5efd131 4167}
0347518d
MF
4168{ .mfi
4169 nop.m 0
d5efd131 4170 fma.s1 FR_r_ResL = FR_r_ResL, f1, FR_r_TL // low sin
0347518d 4171 nop.i 0
d5efd131
MF
4172};;
4173
0347518d
MF
4174{ .mfi
4175 nop.m 0
d5efd131 4176 fma.s1 FR_r_Q0 = f1,FR_r_Y0,f0 // q = a*y
0347518d 4177 nop.i 0
d5efd131 4178}
0347518d
MF
4179{ .mfi
4180 nop.m 0
4181 fnma.s1 FR_r_E0 = FR_r_Y0,FR_r_ResH,f1 // e = 1-b*y
4182 nop.i 0
d5efd131
MF
4183};;
4184
0347518d
MF
4185{ .mfi
4186 nop.m 0
d5efd131 4187 fma.s1 FR_r_E2 = FR_r_E0,FR_r_E0,FR_r_E0 // e2 = e+e^2
0347518d 4188 nop.i 0
d5efd131 4189}
0347518d
MF
4190{ .mfi
4191 nop.m 0
d5efd131 4192 fma.s1 FR_r_E1 = FR_r_E0,FR_r_E0,f0 // e1 = e^2
0347518d 4193 nop.i 0
d5efd131
MF
4194};;
4195
0347518d
MF
4196{ .mfi
4197 nop.m 0
d5efd131 4198 fma.s1 FR_r_Y1 = FR_r_Y0,FR_r_E2,FR_r_Y0 // y1 = y+y*e2
0347518d 4199 nop.i 0
d5efd131 4200}
0347518d
MF
4201{ .mfi
4202 nop.m 0
d5efd131 4203 fma.s1 FR_r_E3 = FR_r_E1,FR_r_E1,FR_r_E0 // e3 = e+e1^2
0347518d 4204 nop.i 0
d5efd131
MF
4205};;
4206
0347518d
MF
4207{ .mfi
4208 nop.m 0
d5efd131 4209 fma.s1 FR_r_Y2 = FR_r_Y1,FR_r_E3,FR_r_Y0 // y2 = y+y1*e3
0347518d 4210 nop.i 0
d5efd131 4211}
0347518d
MF
4212{ .mfi
4213 nop.m 0
d5efd131 4214 fnma.s1 FR_r_R0 = FR_r_ResH,FR_r_Q0,f1 // r = a-b*q
0347518d 4215 nop.i 0
d5efd131
MF
4216};;
4217
0347518d
MF
4218{ .mfi
4219 nop.m 0
d5efd131 4220 fnma.s1 FR_r_E4 = FR_r_ResH,FR_r_Y2,f1 // e4 = 1-b*y2
0347518d 4221 nop.i 0
d5efd131 4222}
0347518d
MF
4223{ .mfi
4224 nop.m 0
d5efd131 4225 fma.s1 FR_r_ZH = FR_r_R0,FR_r_Y2,FR_r_Q0 // x = q+r*y2
0347518d 4226 nop.i 0
d5efd131
MF
4227};;
4228
0347518d
MF
4229{ .mfi
4230 nop.m 0
d5efd131 4231 fma.s1 FR_r_Y3 = FR_r_Y2,FR_r_E4,FR_r_Y2 // y3 = y2+y2*e4
0347518d 4232 nop.i 0
d5efd131 4233}
0347518d
MF
4234{ .mfi
4235 nop.m 0
d5efd131 4236 fnma.s1 FR_r_R1 = FR_r_ResH,FR_r_ZH,f1 // r1 = a-b*x
0347518d 4237 nop.i 0
d5efd131
MF
4238};;
4239
0347518d
MF
4240{ .mfi
4241 nop.m 0
d5efd131 4242 fnma.s1 FR_r_R1 = FR_r_ResL,FR_r_ZH,FR_r_R1 // r1=r1 - b_lo*X
0347518d 4243 nop.i 0
d5efd131 4244}
0347518d
MF
4245{ .mfi
4246 nop.m 0
d5efd131 4247 fma.s1 FR_r_ZHN = FR_r_ZH,FR_r_NegOne, f0 // Negate
0347518d 4248 nop.i 0
d5efd131
MF
4249};;
4250
4251.pred.rel "mutex",p13,p12
0347518d
MF
4252{ .mfb
4253 nop.m 0
d5efd131
MF
4254 fnma.s0 f8 = FR_r_R1,FR_r_Y3,FR_r_ZHN // Result for neg
4255 br.ret.sptk b0 // Exit for 0 < |X| < 1 path (negative arguments)//////
4256};;
4257
4258
4259
4260
4261// SPECIALS (x for natval, nan, +/-inf or +/-0) ///////////////////////////////
4262//------------------------------------------------------------------------------
// Dispatch for special arguments in f8. The code below classifies f8 and
// handles, in this order:
//   * +/-denormal (p9): unsigned compare of GR_l_signif_Z with GR_DenOverflow;
//       p10 (less than)  -> tgammal_overflow
//       p11 (otherwise)  -> tgamma_lt_1 (regular small-argument path)
//   * NaTVal / NaN / +Inf (p6): return x + x
//   * +/-0 (p7): f8 = 1/f8 via frcpa, TAG = 256, fall through to
//       tgammal_libm_err
//   * anything else (p8, the complement of the +/-0 test): tgammal_singularity
//       NOTE(review): given the classes named in the header this should be
//       -Inf only - confirm against the code that branches here.
// GR_l_signif_Z and FR_l_AbsX are produced before this label (not visible
// here); presumably the significand and absolute value of x - TODO confirm.
4263.align 32
4264tgammal_spec:
4265{ .mlx
0347518d 4266 nop.m 0
d5efd131
MF
4267 movl GR_DenOverflow = 0x2000000000000001 // threshold for the denormal compare below
4268}
4269{ .mfi
4270 nop.m 0
4271 fclass.m p9,p0 = f8,0xB // p9 = x is a +/-denormal
4272 nop.i 0
4273};;
4274{ .mfi
4275 nop.m 0
4276 fclass.m p6,p0 = f8,0x1E1 // p6 = x is natval, nan or +inf
4277 nop.i 0
4278};;
4279{ .mfi
4280 nop.m 0
4281 fclass.m p7,p8 = f8,0x7 // p7 = x is +/-0, p8 = x is not a zero
4282 nop.i 0
4283}
4284
4285{ .mfi
// .unc form: p10 and p11 are both cleared when p9 is false
0347518d 4286(p9) cmp.ltu.unc p10,p11 = GR_l_signif_Z, GR_DenOverflow
d5efd131 4287(p9) fnorm.s0 f8 = f8 // normalize the denormal argument
0347518d 4288 nop.i 0
d5efd131
MF
4289};;
4290
4291{ .mfb
0347518d 4292 nop.m 0
d5efd131
MF
4293(p9) fcvt.fx.trunc.s1 FR_n_IXN = FR_l_AbsX // Round by truncate
4294(p11) br.cond.sptk tgamma_lt_1 // Return to gamma ('good' denormal)////////////
4295};;
4296
4297{ .mfb
0347518d
MF
4298 nop.m 0
4299 nop.f 0
d5efd131
MF
4300(p10) br.cond.spnt tgammal_overflow // "Bad" denormal - overflow! /////////////
4301};;
4302
// Denormals never reach this point: one of p10/p11 was set and branched away.
4303{ .mfi
4304 nop.m 0
4305 mov FR_X = f8 // keep the original argument for the error handler
4306 nop.i 0
4307}
4308{ .mfb
4309 nop.m 0
4310(p6) fma.s0 f8 = f8,f1,f8 // res = x*1 + x = x + x
4311(p6) br.ret.spnt b0 // Exit for NaN, +INF and NatVals ///////////////////////
4312};;
4313.pred.rel "mutex",p7,p8
4314{ .mfi
4315(p7) mov GR_Parameter_TAG = 256 // error tag passed to the handler ("negative")
// NOTE(review): the original comment here read "Raise V flag". Per the
// Itanium frcpa definition, 1/(+/-0) is a divide-by-zero: the result is
// +/-Inf and the Z (zero-divide) flag is raised, not V.
4316(p7) frcpa.s0 f8,p0 = f1,f8 // f8 = 1/(+/-0): signed Inf, raise Z flag
0347518d 4317 nop.i 0
d5efd131
MF
4318}
4319{ .mfb
4320 nop.m 0
0347518d 4321 nop.f 0
d5efd131
MF
4322(p8) br.cond.spnt tgammal_singularity // Branch for non-zero x (p8), not for +0
4323};;
4324
4325{ .mfb
0347518d
MF
4326 nop.m 0
4327 nop.f 0
d5efd131
MF
4328 br.cond.spnt tgammal_libm_err // Reached for both +0 and -0 (p7) //////////
4329};;
4330
4331
4332
4333
4334// SINGULARITY (x is negative integer or 0) ////////////////////////////////////
4335//------------------------------------------------------------------------------
// Saves the argument in FR_X, sets the error tag to 256, replaces f8 with the
// result of frcpa(0/0) and jumps to the common error handler.
4336.align 32
4337tgammal_singularity:
4338{ .mfi
4339 nop.m 0
4340 mov FR_X = f8 // Original argument, for error handler
4341 mov GR_Parameter_TAG = 256 // error tag passed to the handler ("negative")
4342}
4343{ .mfb
4344 nop.m 0
4345 frcpa.s0 f8,p0 = f0,f0 // 0/0: raise V (invalid) flag; presumably f8 = NaN
4346 br.cond.sptk tgammal_libm_err // Call error handler /////////////////////
4347 // with singularity error /////////////////
4348};;
4349
4350
4351
4352
4353// OVERFLOW (result is too big and cannot be represented by normal value) //////
4354// ( X > 1755.54 and for denormals with abs value less than 0x2000000000000001 )
4355//------------------------------------------------------------------------------
// Builds a huge value in f9 and squares it in status field s0 so that the
// multiply itself produces the infinite result and the I/O flags; the sign of
// the result follows the sign of the argument (p14: x >= 0, p15: x < 0).
// The argument is saved in FR_X and TAG = 255 is passed to tgammal_libm_err.
4356.align 32
4357tgammal_overflow:
4358{ .mfi
// NOTE(review): 0x1FFFE looks like the largest finite register-format
// exponent rather than the Inf encoding itself - confirm against the ISA.
4359 addl r8 = 0x1FFFE, r0 // Exp of INF
4360 fcmp.lt.s1 p15,p14 = f8,f0 // p14 - pos arg, p15 - neg arg
0347518d 4361 nop.i 0
d5efd131
MF
4362};;
4363
4364{ .mfi
4365 setf.exp f9 = r8 // f9 = value with the exponent held in r8
4366 mov FR_X = f8 // For error handler
4367 mov GR_Parameter_TAG = 255 // overflow
4368};;
4369
4370.pred.rel "mutex",p14,p15
4371{ .mfi
0347518d 4372 nop.m 0
d5efd131 4373(p14) fma.s0 f8 = f9,f9,f0 // f9*f9 + 0: set I,O and +INF result
0347518d 4374 nop.i 0
d5efd131
MF
4375}
4376{ .mfb
0347518d 4377 nop.m 0
d5efd131
MF
4378(p15) fnma.s0 f8 = f9,f9,f0 // -(f9*f9) + 0: set I,O and -INF result
4379 br.cond.sptk tgammal_libm_err // Call error handler /////////////////////
4380 // with overflow error ////////////////////
4381};;
4382
4383
4384
4385
4386
4387// UNDERFLOW (x is negative noninteger with big absolute value) ////////////////
4388//------------------------------------------------------------------------------
// Returns a tiny result whose sign is chosen from the parity of trunc(x):
//   bit 0 of trunc(x) clear (even) -> f8 = f9*f9 - f9  (negative)
//   bit 0 of trunc(x) set   (odd)  -> f8 = f9*f9 + f9  (positive)
// where f9 is built with biased exponent 1. The final operation is done in
// status field s0, presumably so that it raises the underflow/inexact flags.
// This path returns directly; it does not go through tgammal_libm_err.
4389.align 32
4390tgammal_underflow:
4391{ .mfi
0347518d 4392 nop.m 0
d5efd131 4393 fcvt.fx.trunc.s1 FR_u_IXN = f8 // Convert arg to int repres. in FR
0347518d 4394 nop.i 0
d5efd131
MF
4395};;
4396
4397{ .mmi
4398 getf.sig GR_u_XN = FR_u_IXN // integer value of trunc(x) into a GR
4399 mov r11 = 0x00001 // biased exponent 1 for the tiny value
4400 nop.i 0
4401};;
4402
4403{ .mfi
4404 setf.exp f9 = r11 // f9 = tiny positive value with exponent field 1
4405 nop.f 0
4406 nop.i 0
4407};;
4408
4409{ .mfi
4410 nop.m 0
4411 nop.f 0
4412 tbit.z p6,p7 = GR_u_XN,0 // p6: bit 0 clear (even), p7: bit 0 set (odd)
4413};;
4414
4415.pred.rel "mutex",p6,p7
4416{ .mfi
4417 nop.m 0
4418(p6) fms.s0 f8 = f9,f9,f9 // even trunc(x): negative tiny result
4419 nop.i 0
4420}
4421{ .mfb
4422 nop.m 0
4423(p7) fma.s0 f8 = f9,f9,f9 // odd trunc(x): positive tiny result
4424 br.ret.sptk b0 // Exit for underflow path //////////////////////////////
4425};;
4426
4427
4428GLOBAL_LIBM_END(tgammal)
4429
4430
4431
4432
4433////////////////// Tgammal error handler ///////////////////////////////////////
4434//------------------------------------------------------------------------------
// Common exit for the error paths above. Expects FR_X (original argument),
// FR_RESULT (value computed by the error path) and GR_Parameter_TAG to be set
// by the caller of this label.
//
// Allocates a 64-byte frame and stores three values in it (offsets relative
// to the new sp):
//   sp+16 : FR_X       (parameter 1, address in GR_Parameter_X)
//   sp+32 : FR_Y       (parameter 2, address in GR_Parameter_Y)
//   sp+48 : FR_RESULT  (parameter 3, address in GR_Parameter_RESULT)
// then calls __libm_error_support, reloads f8 from sp+48, restores sp, b0,
// gp and ar.pfs, and returns.
// NOTE(review): FR_Y is not written by any error path visible here;
// presumably it is a don't-care for this one-argument function - confirm.
4435LOCAL_LIBM_ENTRY(__libm_error_region)
4436tgammal_libm_err:
4437.prologue
4438{ .mfi
4439 add GR_Parameter_Y=-32,sp // Parameter 2 address (old sp - 32 = new sp + 32)
4440 nop.f 0
4441.save ar.pfs,GR_SAVE_PFS
4442 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
4443}
4444{ .mfi
4445.fframe 64
4446 add sp=-64,sp // Create new stack
4447 nop.f 0
4448 mov GR_SAVE_GP=gp // Save gp
4449};;
4450{ .mmi
4451 stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack, then advance to sp+48
4452 add GR_Parameter_X = 16,sp // Parameter 1 address
4453.save b0, GR_SAVE_B0
4454 mov GR_SAVE_B0=b0 // Save b0
4455};;
4456.body
4457{ .mib
4458 stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
4459 add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
4460 nop.b 0
4461}
4462{ .mib
4463 stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
4464 add GR_Parameter_Y = -16,GR_Parameter_Y // Back to Parameter 2 address (sp+32)
4465 br.call.sptk b0=__libm_error_support# // Call error handling function
4466};;
4467{ .mmi
4468 nop.m 999
4469 nop.m 999
4470 add GR_Parameter_RESULT = 48,sp // Recompute Parameter 3 address after the call
4471};;
4472{ .mmi
4473 ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
4474.restore sp
4475 add sp = 64,sp // Restore stack pointer
4476 mov b0 = GR_SAVE_B0 // Restore return address
4477};;
4478{ .mib
4479 mov gp = GR_SAVE_GP // Restore gp
4480 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
4481 br.ret.sptk b0 // Return
4482};;
4483
4484LOCAL_LIBM_END(__libm_error_region#)
4485
4486.type __libm_error_support#,@function
4487.global __libm_error_support#