]>
Commit | Line | Data |
---|---|---|
d5efd131 MF |
1 | .file "acoshl.s" |
2 | ||
3 | ||
4 | // Copyright (c) 2000 - 2005, Intel Corporation | |
5 | // All rights reserved. | |
6 | // | |
7 | // Contributed 2000 by the Intel Numerics Group, Intel Corporation | |
8 | // | |
9 | // Redistribution and use in source and binary forms, with or without | |
10 | // modification, are permitted provided that the following conditions are | |
11 | // met: | |
12 | // | |
13 | // * Redistributions of source code must retain the above copyright | |
14 | // notice, this list of conditions and the following disclaimer. | |
15 | // | |
16 | // * Redistributions in binary form must reproduce the above copyright | |
17 | // notice, this list of conditions and the following disclaimer in the | |
18 | // documentation and/or other materials provided with the distribution. | |
19 | // | |
20 | // * The name of Intel Corporation may not be used to endorse or promote | |
21 | // products derived from this software without specific prior written | |
22 | // permission. | |
23 | ||
0347518d MF |
24 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
25 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
d5efd131 | 26 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
0347518d | 27 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS |
d5efd131 | 28 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
0347518d MF |
29 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
30 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
31 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
d5efd131 | 32 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING |
0347518d MF |
33 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
34 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
35 | // | |
d5efd131 | 36 | // Intel Corporation is the author of this code, and requests that all |
0347518d | 37 | // problem reports or change requests be submitted to it directly at |
d5efd131 MF |
38 | // http://www.intel.com/software/products/opensource/libraries/num.htm. |
39 | // | |
40 | //********************************************************************* | |
41 | // | |
0347518d | 42 | // History: |
d5efd131 MF |
43 | // 10/01/01 Initial version |
44 | // 10/10/01 Performance improved | |
45 | // 12/11/01 Changed huges_logp to not be global | |
46 | // 01/02/02 Corrected .restore syntax | |
47 | // 05/20/02 Cleaned up namespace and sf0 syntax | |
48 | // 08/14/02 Changed mli templates to mlx | |
49 | // 02/06/03 Reorganized data tables | |
50 | // 03/31/05 Reformatted delimiters between data tables | |
51 | // | |
52 | //********************************************************************* | |
53 | // | |
54 | // API | |
55 | //============================================================== | |
56 | // long double acoshl(long double); | |
57 | // | |
58 | // Overview of operation | |
59 | //============================================================== | |
0347518d | 60 | // |
d5efd131 MF |
61 | // There are 6 paths: |
62 | // 1. x = 1 | |
63 | // Return acoshl(x) = 0; | |
64 | // | |
65 | // 2. x < 1 | |
66 | // Return acoshl(x) = Nan (Domain error, error handler call with tag 135); | |
67 | // | |
68 | // 3. x = [S,Q]Nan or +INF | |
69 | // Return acoshl(x) = x + x; | |
0347518d | 70 | // |
d5efd131 | 71 | // 4. 'Near 1': 1 < x < 1+1/8 |
0347518d | 72 | // Return acoshl(x) = sqrtl(2*y)*(1-P(y)/Q(y)), |
d5efd131 MF |
73 | // where y = x - 1, P(y)/Q(y) - rational approximation | |
74 | // | |
75 | // 5. 'Huges': x > 0.5*2^64 | |
76 | // Return acoshl(x) = (logl(2*x-1)); | |
0347518d | 77 | // |
d5efd131 MF |
78 | // 6. 'Main path': 1+1/8 < x < 0.5*2^64 |
79 | // b_hi + b_lo = x + sqrt(x^2 - 1); | |
80 | // acoshl(x) = logl_special(b_hi, b_lo); | |
0347518d MF |
81 | // |
82 | // Algorithm description | |
d5efd131 MF |
83 | //============================================================== |
84 | // | |
85 | // I. Near 1 path algorithm | |
86 | // ************************************************************** | |
0347518d | 87 | // The formula is acoshl(x) = sqrtl(2*y)*(1-P(y)/Q(y)), |
d5efd131 MF |
88 | // where y = x - 1, P(y)/Q(y) - rational approximation | |
89 | // | |
90 | // 1) y = x - 1, y2 = 2 * y | |
91 | // | |
92 | // 2) Compute in parallel sqrtl(2*y) and P(y)/Q(y) | |
93 | // a) sqrtl computation method described below (main path algorithm, item 2)) | |
0347518d | 94 | // As result we obtain (gg+gl) - multiprecision result |
d5efd131 MF |
95 | // as pair of double extended values |
96 | // b) P(y) and Q(y) calculated without any extra precision manipulations | |
97 | // c) P/Q division: | |
98 | // y = frcpa(Q) initial approximation of 1/Q | |
99 | // z = P*y initial approximation of P/Q | |
0347518d | 100 | // |
d5efd131 MF |
101 | // e = 1 - Q*y |
102 | // e2 = e + e^2 | |
103 | // e1 = e^2 | |
104 | // y1 = y + y*e2 = y + y*(e+e^2) | |
105 | // | |
106 | // e3 = e + e1^2 | |
107 | // y2 = y + y1*e3 = y + y*(e+e^2+..+e^6) | |
108 | // | |
109 | // r = P - Q*z | |
110 | // e4 = 1 - Q*y2 | |
111 | // xx = z + r*y2 high part of P/Q | |
112 | // | |
113 | // y3 = y2 + y2*e4 | |
114 | // r1 = P - Q*xx | |
115 | // xl = r1*y3 low part of P/Q | |
116 | // | |
117 | // 3) res = sqrt(2*y) - sqrt(2*y)*(P(y)/Q(y)) = | |
118 | // = (gg+gl) - (gg + gl)*(xx+xl); | |
119 | // | |
120 | // a) hh = gg*xx; hl = gg*xl; lh = gl*xx; ll = gl*xl; | |
121 | // b) res = ((((gl + ll) + lh) + hl) + hh) + gg; | |
122 | // (exactly in this order) | |
123 | // | |
0347518d | 124 | // II. Main path algorithm |
d5efd131 MF |
125 | // ( thanks to Peter Markstein for the idea of sqrt(x^2+1) computation! ) |
126 | // ********************************************************************** | |
127 | // | |
128 | // There are 3 parts of x+sqrt(x^2-1) computation: | |
129 | // | |
130 | // 1) m2 = (m2_hi+m2_lo) = x^2-1 obtaining | |
131 | // ------------------------------------ | |
132 | // m2_hi = x2_hi - 1, where x2_hi = x * x; | |
0347518d MF |
133 | // m2_lo = x2_lo + p1_lo, where |
134 | // x2_lo = FMS(x*x-x2_hi), | |
d5efd131 MF |
135 | // p1_lo = x2_hi - (1 + m2_hi); |
136 | // | |
137 | // 2) g = (g_hi+g_lo) = sqrt(m2) = sqrt(m2_hi+m2_lo) | |
138 | // ---------------------------------------------- | |
139 | // r = invsqrt(m2_hi) (8-bit reciprocal square root approximation); | |
140 | // g = m2_hi * r (first 8 bit-approximation of sqrt); | |
0347518d | 141 | // |
d5efd131 MF |
142 | // h = 0.5 * r; |
143 | // e = 0.5 - g * h; | |
144 | // g = g * e + g (second 16 bit-approximation of sqrt); | |
0347518d | 145 | // |
d5efd131 MF |
146 | // h = h * e + h; |
147 | // e = 0.5 - g * h; | |
148 | // g = g * e + g (third 32 bit-approximation of sqrt); | |
149 | // | |
150 | // h = h * e + h; | |
151 | // e = 0.5 - g * h; | |
152 | // g_hi = g * e + g (fourth 64 bit-approximation of sqrt); | |
0347518d | 153 | // |
d5efd131 MF |
154 | // Remainder computation: |
155 | // h = h * e + h; | |
156 | // d = (m2_hi - g_hi * g_hi) + m2_lo; | |
157 | // g_lo = d * h; | |
158 | // | |
159 | // 3) b = (b_hi + b_lo) = x + g, where g = (g_hi + g_lo) = sqrt(x^2-1) | |
160 | // ------------------------------------------------------------------- | |
161 | // b_hi = (g_hi + x) + gl; | |
162 | // b_lo = (x - b_hi) + g_hi + gl; | |
0347518d | 163 | // |
d5efd131 MF |
164 | // Now we pass b presented as sum b_hi + b_lo to special version |
165 | // of logl function which accepts a pair of arguments as | |
0347518d MF |
166 | // multiprecision value. |
167 | // | |
d5efd131 MF |
168 | // Special log algorithm overview |
169 | // ================================ | |
170 | // Here we use a table lookup method. The basic idea is that in | |
0347518d | 171 | // order to compute logl(Arg) for an argument Arg in [1,2), |
d5efd131 MF |
172 | // we construct a value G such that G*Arg is close to 1 and that |
173 | // logl(1/G) is obtainable easily from a table of values calculated | |
174 | // beforehand. Thus | |
175 | // | |
176 | // logl(Arg) = logl(1/G) + logl(1 + (G*Arg - 1)) | |
177 | // | |
178 | // Because |G*Arg - 1| is small, the second term on the right hand | |
179 | // side can be approximated by a short polynomial. We elaborate | |
180 | // this method in four steps. | |
181 | // | |
182 | // Step 0: Initialization | |
183 | // | |
184 | // We need to calculate logl( X ). Obtain N, S_hi such that | |
185 | // | |
186 | // X = 2^N * ( S_hi + S_lo ) exactly | |
187 | // | |
188 | // where S_hi in [1,2) and S_lo is a correction to S_hi in the sense | |
189 | // that |S_lo| <= ulp(S_hi). | |
190 | // | |
191 | // For the special version of logl: S_lo = b_lo | |
192 | // !-----------------------------------------------! | |
193 | // | |
194 | // Step 1: Argument Reduction | |
195 | // | |
196 | // Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate | |
197 | // | |
198 | // G := G_1 * G_2 * G_3 | |
199 | // r := (G * S_hi - 1) + G * S_lo | |
200 | // | |
0347518d | 201 | // These G_j's have the property that the product is exactly |
d5efd131 MF |
202 | // representable and that |r| < 2^(-12) as a result. |
203 | // | |
204 | // Step 2: Approximation | |
205 | // | |
206 | // logl(1 + r) is approximated by a short polynomial poly(r). | |
207 | // | |
208 | // Step 3: Reconstruction | |
209 | // | |
210 | // Finally, logl( X ) is given by | |
211 | // | |
212 | // logl( X ) = logl( 2^N * (S_hi + S_lo) ) | |
213 | // ~=~ N*logl(2) + logl(1/G) + logl(1 + r) | |
214 | // ~=~ N*logl(2) + logl(1/G) + poly(r). | |
215 | // | |
216 | // For detailed description see logl or log1pl function, regular path. | |
217 | // | |
218 | // Registers used | |
219 | //============================================================== | |
0347518d | 220 | // Floating Point registers used: |
d5efd131 MF |
221 | // f8, input |
222 | // f32 -> f95 (64 registers) | |
223 | ||
0347518d | 224 | // General registers used: |
d5efd131 MF |
225 | // r32 -> r67 (36 registers) |
226 | ||
227 | // Predicate registers used: | |
228 | // p7 -> p11 | |
229 | // p7 for 'NaNs, Inf' path | |
230 | // p8 for 'near 1' path | |
231 | // p9 for 'huges' path | |
0347518d | 232 | // p10 for x = 1 |
d5efd131 MF |
233 | // p11 for x < 1 |
234 | // | |
235 | //********************************************************************* | |
236 | // IEEE Special Conditions: | |
237 | // | |
238 | // acoshl(+inf) = +inf | |
0347518d MF |
239 | // acoshl(-inf) = QNaN |
240 | // acoshl(1) = 0 | |
d5efd131 MF |
241 | // acoshl(x<1) = QNaN |
242 | // acoshl(SNaN) = QNaN | |
243 | // acoshl(QNaN) = QNaN | |
244 | // | |
245 | ||
246 | // Data tables | |
247 | //============================================================== | |
0347518d | 248 | |
d5efd131 MF |
249 | RODATA |
250 | .align 64 | |
251 | ||
6f65e668 | 252 | // Near 1 path rational approximation coefficients |
d5efd131 | 253 | LOCAL_OBJECT_START(Poly_P) |
0347518d MF |
254 | data8 0xB0978143F695D40F, 0x3FF1 // .84205539791447100108478906277453574946e-4 |
255 | data8 0xB9800D841A8CAD29, 0x3FF6 // .28305085180397409672905983082168721069e-2 | |
256 | data8 0xC889F455758C1725, 0x3FF9 // .24479844297887530847660233111267222945e-1 | |
257 | data8 0x9BE1DFF006F45F12, 0x3FFB // .76114415657565879842941751209926938306e-1 | |
258 | data8 0x9E34AF4D372861E0, 0x3FFB // .77248925727776366270605984806795850504e-1 | |
259 | data8 0xF3DC502AEE14C4AE, 0x3FA6 // .3077953476682583606615438814166025592e-26 | |
d5efd131 MF |
260 | LOCAL_OBJECT_END(Poly_P) |
261 | ||
262 | // Near 1 path denominator coefficients, stored Q5 first ... Q0 last (the order acoshl loads them) | |
263 | LOCAL_OBJECT_START(Poly_Q) | |
0347518d MF |
264 | data8 0xF76E3FD3C7680357, 0x3FF1 // .11798413344703621030038719253730708525e-3 |
265 | data8 0xD107D2E7273263AE, 0x3FF7 // .63791065024872525660782716786703188820e-2 | |
266 | data8 0xB609BE5CDE206AEF, 0x3FFB // .88885771950814004376363335821980079985e-1 | |
267 | data8 0xF7DEACAC28067C8A, 0x3FFD // .48412074662702495416825113623936037072302 | |
268 | data8 0x8F9BE5890CEC7E38, 0x3FFF // 1.1219450873557867470217771071068369729526 | |
269 | data8 0xED4F06F3D2BC92D1, 0x3FFE // .92698710873331639524734537734804056798748 | |
d5efd131 MF |
270 | LOCAL_OBJECT_END(Poly_Q) |
271 | ||
0347518d | 272 | // log2_hi, log2_lo, then logl polynomial coefficients Q4, Q3, Q2, Q1 (the order acoshl loads them)
d5efd131 | 273 | LOCAL_OBJECT_START(Constants_Q) |
0347518d | 274 | data4 0x00000000,0xB1721800,0x00003FFE,0x00000000 |
d5efd131 MF |
275 | data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000 |
276 | data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000 | |
277 | data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000 | |
278 | data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000 | |
0347518d | 279 | data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000 |
d5efd131 MF |
280 | LOCAL_OBJECT_END(Constants_Q) |
281 | ||
282 | // Z1 - 16 bit fixed | |
283 | LOCAL_OBJECT_START(Constants_Z_1) | |
284 | data4 0x00008000 | |
285 | data4 0x00007879 | |
286 | data4 0x000071C8 | |
287 | data4 0x00006BCB | |
288 | data4 0x00006667 | |
289 | data4 0x00006187 | |
290 | data4 0x00005D18 | |
291 | data4 0x0000590C | |
292 | data4 0x00005556 | |
293 | data4 0x000051EC | |
294 | data4 0x00004EC5 | |
295 | data4 0x00004BDB | |
296 | data4 0x00004925 | |
297 | data4 0x0000469F | |
298 | data4 0x00004445 | |
299 | data4 0x00004211 | |
300 | LOCAL_OBJECT_END(Constants_Z_1) | |
301 | ||
302 | // G1 and H1 - IEEE single and h1 - IEEE double | |
303 | LOCAL_OBJECT_START(Constants_G_H_h1) | |
304 | data4 0x3F800000,0x00000000 | |
305 | data8 0x0000000000000000 | |
306 | data4 0x3F70F0F0,0x3D785196 | |
307 | data8 0x3DA163A6617D741C | |
308 | data4 0x3F638E38,0x3DF13843 | |
309 | data8 0x3E2C55E6CBD3D5BB | |
310 | data4 0x3F579430,0x3E2FF9A0 | |
311 | data8 0xBE3EB0BFD86EA5E7 | |
312 | data4 0x3F4CCCC8,0x3E647FD6 | |
313 | data8 0x3E2E6A8C86B12760 | |
314 | data4 0x3F430C30,0x3E8B3AE7 | |
315 | data8 0x3E47574C5C0739BA | |
316 | data4 0x3F3A2E88,0x3EA30C68 | |
317 | data8 0x3E20E30F13E8AF2F | |
318 | data4 0x3F321640,0x3EB9CEC8 | |
319 | data8 0xBE42885BF2C630BD | |
320 | data4 0x3F2AAAA8,0x3ECF9927 | |
321 | data8 0x3E497F3497E577C6 | |
322 | data4 0x3F23D708,0x3EE47FC5 | |
323 | data8 0x3E3E6A6EA6B0A5AB | |
324 | data4 0x3F1D89D8,0x3EF8947D | |
325 | data8 0xBDF43E3CD328D9BE | |
326 | data4 0x3F17B420,0x3F05F3A1 | |
327 | data8 0x3E4094C30ADB090A | |
328 | data4 0x3F124920,0x3F0F4303 | |
329 | data8 0xBE28FBB2FC1FE510 | |
330 | data4 0x3F0D3DC8,0x3F183EBF | |
331 | data8 0x3E3A789510FDE3FA | |
332 | data4 0x3F088888,0x3F20EC80 | |
333 | data8 0x3E508CE57CC8C98F | |
334 | data4 0x3F042108,0x3F29516A | |
335 | data8 0xBE534874A223106C | |
336 | LOCAL_OBJECT_END(Constants_G_H_h1) | |
337 | ||
338 | // Z2 - 16 bit fixed | |
339 | LOCAL_OBJECT_START(Constants_Z_2) | |
340 | data4 0x00008000 | |
341 | data4 0x00007F81 | |
342 | data4 0x00007F02 | |
343 | data4 0x00007E85 | |
344 | data4 0x00007E08 | |
345 | data4 0x00007D8D | |
346 | data4 0x00007D12 | |
347 | data4 0x00007C98 | |
348 | data4 0x00007C20 | |
349 | data4 0x00007BA8 | |
350 | data4 0x00007B31 | |
351 | data4 0x00007ABB | |
352 | data4 0x00007A45 | |
353 | data4 0x000079D1 | |
354 | data4 0x0000795D | |
355 | data4 0x000078EB | |
356 | LOCAL_OBJECT_END(Constants_Z_2) | |
357 | ||
358 | // G2 and H2 - IEEE single and h2 - IEEE double | |
359 | LOCAL_OBJECT_START(Constants_G_H_h2) | |
360 | data4 0x3F800000,0x00000000 | |
361 | data8 0x0000000000000000 | |
362 | data4 0x3F7F00F8,0x3B7F875D | |
363 | data8 0x3DB5A11622C42273 | |
364 | data4 0x3F7E03F8,0x3BFF015B | |
365 | data8 0x3DE620CF21F86ED3 | |
366 | data4 0x3F7D08E0,0x3C3EE393 | |
367 | data8 0xBDAFA07E484F34ED | |
368 | data4 0x3F7C0FC0,0x3C7E0586 | |
369 | data8 0xBDFE07F03860BCF6 | |
370 | data4 0x3F7B1880,0x3C9E75D2 | |
371 | data8 0x3DEA370FA78093D6 | |
372 | data4 0x3F7A2328,0x3CBDC97A | |
373 | data8 0x3DFF579172A753D0 | |
374 | data4 0x3F792FB0,0x3CDCFE47 | |
375 | data8 0x3DFEBE6CA7EF896B | |
376 | data4 0x3F783E08,0x3CFC15D0 | |
377 | data8 0x3E0CF156409ECB43 | |
378 | data4 0x3F774E38,0x3D0D874D | |
379 | data8 0xBE0B6F97FFEF71DF | |
380 | data4 0x3F766038,0x3D1CF49B | |
381 | data8 0xBE0804835D59EEE8 | |
382 | data4 0x3F757400,0x3D2C531D | |
383 | data8 0x3E1F91E9A9192A74 | |
384 | data4 0x3F748988,0x3D3BA322 | |
385 | data8 0xBE139A06BF72A8CD | |
386 | data4 0x3F73A0D0,0x3D4AE46F | |
387 | data8 0x3E1D9202F8FBA6CF | |
388 | data4 0x3F72B9D0,0x3D5A1756 | |
389 | data8 0xBE1DCCC4BA796223 | |
390 | data4 0x3F71D488,0x3D693B9D | |
391 | data8 0xBE049391B6B7C239 | |
392 | LOCAL_OBJECT_END(Constants_G_H_h2) | |
393 | ||
0347518d | 394 | // G3 and H3 - IEEE single and h3 - IEEE double |
d5efd131 MF |
395 | LOCAL_OBJECT_START(Constants_G_H_h3) |
396 | data4 0x3F7FFC00,0x38800100 | |
397 | data8 0x3D355595562224CD | |
398 | data4 0x3F7FF400,0x39400480 | |
399 | data8 0x3D8200A206136FF6 | |
400 | data4 0x3F7FEC00,0x39A00640 | |
401 | data8 0x3DA4D68DE8DE9AF0 | |
402 | data4 0x3F7FE400,0x39E00C41 | |
403 | data8 0xBD8B4291B10238DC | |
404 | data4 0x3F7FDC00,0x3A100A21 | |
405 | data8 0xBD89CCB83B1952CA | |
406 | data4 0x3F7FD400,0x3A300F22 | |
407 | data8 0xBDB107071DC46826 | |
408 | data4 0x3F7FCC08,0x3A4FF51C | |
409 | data8 0x3DB6FCB9F43307DB | |
410 | data4 0x3F7FC408,0x3A6FFC1D | |
411 | data8 0xBD9B7C4762DC7872 | |
412 | data4 0x3F7FBC10,0x3A87F20B | |
413 | data8 0xBDC3725E3F89154A | |
414 | data4 0x3F7FB410,0x3A97F68B | |
415 | data8 0xBD93519D62B9D392 | |
416 | data4 0x3F7FAC18,0x3AA7EB86 | |
417 | data8 0x3DC184410F21BD9D | |
418 | data4 0x3F7FA420,0x3AB7E101 | |
419 | data8 0xBDA64B952245E0A6 | |
420 | data4 0x3F7F9C20,0x3AC7E701 | |
421 | data8 0x3DB4B0ECAABB34B8 | |
422 | data4 0x3F7F9428,0x3AD7DD7B | |
423 | data8 0x3D9923376DC40A7E | |
424 | data4 0x3F7F8C30,0x3AE7D474 | |
425 | data8 0x3DC6E17B4F2083D3 | |
426 | data4 0x3F7F8438,0x3AF7CBED | |
427 | data8 0x3DAE314B811D4394 | |
428 | data4 0x3F7F7C40,0x3B03E1F3 | |
429 | data8 0xBDD46F21B08F2DB1 | |
430 | data4 0x3F7F7448,0x3B0BDE2F | |
431 | data8 0xBDDC30A46D34522B | |
432 | data4 0x3F7F6C50,0x3B13DAAA | |
433 | data8 0x3DCB0070B1F473DB | |
434 | data4 0x3F7F6458,0x3B1BD766 | |
435 | data8 0xBDD65DDC6AD282FD | |
436 | data4 0x3F7F5C68,0x3B23CC5C | |
437 | data8 0xBDCDAB83F153761A | |
438 | data4 0x3F7F5470,0x3B2BC997 | |
439 | data8 0xBDDADA40341D0F8F | |
440 | data4 0x3F7F4C78,0x3B33C711 | |
441 | data8 0x3DCD1BD7EBC394E8 | |
442 | data4 0x3F7F4488,0x3B3BBCC6 | |
443 | data8 0xBDC3532B52E3E695 | |
444 | data4 0x3F7F3C90,0x3B43BAC0 | |
445 | data8 0xBDA3961EE846B3DE | |
446 | data4 0x3F7F34A0,0x3B4BB0F4 | |
447 | data8 0xBDDADF06785778D4 | |
448 | data4 0x3F7F2CA8,0x3B53AF6D | |
449 | data8 0x3DCC3ED1E55CE212 | |
450 | data4 0x3F7F24B8,0x3B5BA620 | |
451 | data8 0xBDBA31039E382C15 | |
452 | data4 0x3F7F1CC8,0x3B639D12 | |
453 | data8 0x3D635A0B5C5AF197 | |
454 | data4 0x3F7F14D8,0x3B6B9444 | |
455 | data8 0xBDDCCB1971D34EFC | |
456 | data4 0x3F7F0CE0,0x3B7393BC | |
457 | data8 0x3DC7450252CD7ADA | |
458 | data4 0x3F7F04F0,0x3B7B8B6D | |
459 | data8 0xBDB68F177D7F2A42 | |
460 | LOCAL_OBJECT_END(Constants_G_H_h3) | |
461 | ||
462 | // Assembly macros | |
463 | //============================================================== | |
464 | ||
465 | // Floating Point Registers | |
466 | ||
467 | FR_Arg = f8 | |
468 | FR_Res = f8 | |
469 | ||
470 | ||
471 | FR_PP0 = f32 | |
472 | FR_PP1 = f33 | |
473 | FR_PP2 = f34 | |
474 | FR_PP3 = f35 | |
475 | FR_PP4 = f36 | |
476 | FR_PP5 = f37 | |
477 | FR_QQ0 = f38 | |
478 | FR_QQ1 = f39 | |
479 | FR_QQ2 = f40 | |
480 | FR_QQ3 = f41 | |
481 | FR_QQ4 = f42 | |
482 | FR_QQ5 = f43 | |
483 | ||
0347518d MF |
484 | FR_Q1 = f44 |
485 | FR_Q2 = f45 | |
486 | FR_Q3 = f46 | |
487 | FR_Q4 = f47 | |
d5efd131 MF |
488 | |
489 | FR_Half = f48 | |
490 | FR_Two = f49 | |
491 | ||
0347518d MF |
492 | FR_log2_hi = f50 |
493 | FR_log2_lo = f51 | |
d5efd131 MF |
494 | |
495 | ||
496 | FR_X2 = f52 | |
497 | FR_M2 = f53 | |
498 | FR_M2L = f54 | |
499 | FR_Rcp = f55 | |
500 | FR_GG = f56 | |
501 | FR_HH = f57 | |
502 | FR_EE = f58 | |
503 | FR_DD = f59 | |
504 | FR_GL = f60 | |
505 | FR_Tmp = f61 | |
506 | ||
507 | ||
508 | FR_XM1 = f62 | |
509 | FR_2XM1 = f63 | |
510 | FR_XM12 = f64 | |
511 | ||
512 | ||
513 | ||
514 | // Special logl registers | |
0347518d MF |
515 | FR_XLog_Hi = f65 |
516 | FR_XLog_Lo = f66 | |
d5efd131 | 517 | |
0347518d | 518 | FR_Y_hi = f67 |
d5efd131 MF |
519 | FR_Y_lo = f68 |
520 | ||
0347518d MF |
521 | FR_S_hi = f69 |
522 | FR_S_lo = f70 | |
d5efd131 MF |
523 | |
524 | FR_poly_lo = f71 | |
525 | FR_poly_hi = f72 | |
526 | ||
527 | FR_G = f73 | |
528 | FR_H = f74 | |
529 | FR_h = f75 | |
530 | ||
531 | FR_G2 = f76 | |
532 | FR_H2 = f77 | |
0347518d | 533 | FR_h2 = f78 |
d5efd131 | 534 | |
0347518d MF |
535 | FR_r = f79 |
536 | FR_rsq = f80 | |
537 | FR_rcub = f81 | |
d5efd131 | 538 | |
0347518d | 539 | FR_float_N = f82 |
d5efd131 | 540 | |
0347518d MF |
541 | FR_G3 = f83 |
542 | FR_H3 = f84 | |
543 | FR_h3 = f85 | |
d5efd131 | 544 | |
0347518d | 545 | FR_2_to_minus_N = f86 |
d5efd131 MF |
546 | |
547 | ||
548 | // Near 1 registers | |
549 | FR_PP = f65 | |
550 | FR_QQ = f66 | |
551 | ||
552 | ||
553 | FR_PV6 = f69 | |
554 | FR_PV4 = f70 | |
555 | FR_PV3 = f71 | |
556 | FR_PV2 = f72 | |
557 | ||
558 | FR_QV6 = f73 | |
559 | FR_QV4 = f74 | |
560 | FR_QV3 = f75 | |
561 | FR_QV2 = f76 | |
562 | ||
563 | FR_Y0 = f77 | |
0347518d | 564 | FR_Q0 = f78 |
d5efd131 MF |
565 | FR_E0 = f79 |
566 | FR_E2 = f80 | |
567 | FR_E1 = f81 | |
568 | FR_Y1 = f82 | |
569 | FR_E3 = f83 | |
570 | FR_Y2 = f84 | |
571 | FR_R0 = f85 | |
572 | FR_E4 = f86 | |
573 | FR_Y3 = f87 | |
574 | FR_R1 = f88 | |
575 | FR_X_Hi = f89 | |
576 | FR_X_lo = f90 | |
577 | ||
578 | FR_HH = f91 | |
579 | FR_LL = f92 | |
580 | FR_HL = f93 | |
581 | FR_LH = f94 | |
582 | ||
583 | ||
584 | ||
585 | // Error handler registers | |
586 | FR_Arg_X = f95 | |
587 | FR_Arg_Y = f0 | |
588 | ||
589 | ||
590 | // General Purpose Registers | |
591 | ||
592 | // General prolog registers | |
593 | GR_PFS = r32 | |
594 | GR_OneP125 = r33 | |
595 | GR_TwoP63 = r34 | |
596 | GR_Arg = r35 | |
597 | GR_Half = r36 | |
598 | ||
599 | // Near 1 path registers | |
600 | GR_Poly_P = r37 | |
601 | GR_Poly_Q = r38 | |
602 | ||
603 | // Special logl registers | |
0347518d MF |
604 | GR_Index1 = r39 |
605 | GR_Index2 = r40 | |
606 | GR_signif = r41 | |
607 | GR_X_0 = r42 | |
608 | GR_X_1 = r43 | |
609 | GR_X_2 = r44 | |
d5efd131 | 610 | GR_minus_N = r45 |
0347518d MF |
611 | GR_Z_1 = r46 |
612 | GR_Z_2 = r47 | |
613 | GR_N = r48 | |
614 | GR_Bias = r49 | |
615 | GR_M = r50 | |
616 | GR_Index3 = r51 | |
617 | GR_exp_2tom80 = r52 | |
618 | GR_exp_mask = r53 | |
619 | GR_exp_2tom7 = r54 | |
620 | GR_ad_ln10 = r55 | |
d5efd131 MF |
621 | GR_ad_tbl_1 = r56 |
622 | GR_ad_tbl_2 = r57 | |
623 | GR_ad_tbl_3 = r58 | |
624 | GR_ad_q = r59 | |
625 | GR_ad_z_1 = r60 | |
626 | GR_ad_z_2 = r61 | |
627 | GR_ad_z_3 = r62 | |
628 | ||
629 | // | |
630 | // Added for unwind support | |
631 | // | |
632 | GR_SAVE_PFS = r32 | |
633 | GR_SAVE_B0 = r33 | |
634 | GR_SAVE_GP = r34 | |
635 | ||
636 | GR_Parameter_X = r64 | |
637 | GR_Parameter_Y = r65 | |
638 | GR_Parameter_RESULT = r66 | |
639 | GR_Parameter_TAG = r67 | |
640 | ||
641 | ||
642 | ||
643 | .section .text | |
644 | GLOBAL_LIBM_ENTRY(acoshl) | |
645 | ||
646 | { .mfi | |
647 | alloc GR_PFS = ar.pfs,0,32,4,0 // Local frame allocation | |
648 | fcmp.lt.s1 p11, p0 = FR_Arg, f1 // if arg is less than 1 | |
649 | mov GR_Half = 0xfffe // 0.5's exp | |
650 | } | |
651 | { .mfi | |
652 | addl GR_Poly_Q = @ltoff(Poly_Q), gp // Address of Q-coeff table | |
653 | fma.s1 FR_X2 = FR_Arg, FR_Arg, f0 // Obtain x^2 | |
654 | addl GR_Poly_P = @ltoff(Poly_P), gp // Address of P-coeff table | |
0347518d | 655 | };; |
d5efd131 | 656 | |
0347518d | 657 | { .mfi |
6f65e668 | 658 | getf.d GR_Arg = FR_Arg // get argument as double (int64) |
d5efd131 MF |
659 | fma.s0 FR_Two = f1, f1, f1 // construct 2.0 |
660 | addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp // logl tables | |
661 | } | |
0347518d MF |
662 | { .mlx |
663 | nop.m 0 | |
d5efd131 | 664 | movl GR_TwoP63 = 0x43E8000000000000 // 0.5*2^63 (huge arguments) |
0347518d | 665 | };; |
d5efd131 | 666 | |
0347518d | 667 | { .mfi |
d5efd131 MF |
668 | ld8 GR_Poly_P = [GR_Poly_P] // get actual P-coeff table address |
669 | fcmp.eq.s1 p10, p0 = FR_Arg, f1 // if arg == 1 (return 0) | |
670 | nop.i 0 | |
671 | } | |
0347518d | 672 | { .mlx |
d5efd131 MF |
673 | ld8 GR_Poly_Q = [GR_Poly_Q] // get actual Q-coeff table address |
674 | movl GR_OneP125 = 0x3FF2000000000000 // 1.125 (near 1 path bound) | |
675 | };; | |
676 | ||
0347518d | 677 | { .mfi |
d5efd131 MF |
678 | ld8 GR_ad_z_1 = [GR_ad_z_1] // Get pointer to Constants_Z_1 |
679 | fclass.m p7,p0 = FR_Arg, 0xe3 // if arg NaN inf | |
680 | cmp.le p9, p0 = GR_TwoP63, GR_Arg // if arg > 0.5*2^63 ('huges') | |
681 | } | |
682 | { .mfb | |
683 | cmp.ge p8, p0 = GR_OneP125, GR_Arg // if arg<1.125 -near 1 path | |
684 | fms.s1 FR_XM1 = FR_Arg, f1, f1 // X0 = X-1 (for near 1 path) | |
685 | (p11) br.cond.spnt acoshl_lt_pone // error branch (less than 1) | |
0347518d | 686 | };; |
d5efd131 | 687 | |
0347518d | 688 | { .mmi |
d5efd131 MF |
689 | setf.exp FR_Half = GR_Half // construct 0.5 |
690 | (p9) setf.s FR_XLog_Lo = r0 // Low of logl arg=0 (Huges path) | |
691 | mov GR_exp_mask = 0x1FFFF // Create exponent mask | |
0347518d | 692 | };; |
d5efd131 | 693 | |
0347518d | 694 | { .mmf |
d5efd131 MF |
695 | (p8) ldfe FR_PP5 = [GR_Poly_P],16 // Load P5 |
696 | (p8) ldfe FR_QQ5 = [GR_Poly_Q],16 // Load Q5 | |
697 | fms.s1 FR_M2 = FR_X2, f1, f1 // m2 = x^2 - 1 | |
698 | };; | |
699 | ||
0347518d | 700 | { .mfi |
d5efd131 | 701 | (p8) ldfe FR_QQ4 = [GR_Poly_Q],16 // Load Q4 |
0347518d | 702 | fms.s1 FR_M2L = FR_Arg, FR_Arg, FR_X2 // low part of |
d5efd131 MF |
703 | // m2 = fma(X*X - m2) |
704 | add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1 | |
705 | } | |
706 | { .mfb | |
0347518d | 707 | (p8) ldfe FR_PP4 = [GR_Poly_P],16 // Load P4 |
d5efd131 MF |
708 | (p7) fma.s0 FR_Res = FR_Arg,f1,FR_Arg // r = a + a (Nan, Inf) |
709 | (p7) br.ret.spnt b0 // return (Nan, Inf) | |
0347518d | 710 | };; |
d5efd131 MF |
711 | |
712 | { .mfi | |
713 | (p8) ldfe FR_PP3 = [GR_Poly_P],16 // Load P3 | |
714 | nop.f 0 | |
715 | add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_P | |
716 | } | |
717 | { .mfb | |
718 | (p8) ldfe FR_QQ3 = [GR_Poly_Q],16 // Load Q3 | |
719 | (p9) fms.s1 FR_XLog_Hi = FR_Two, FR_Arg, f1 // Hi of log arg = 2*X-1 | |
720 | (p9) br.cond.spnt huges_logl // special version of log | |
721 | } | |
0347518d | 722 | ;; |
d5efd131 | 723 | |
0347518d | 724 | { .mfi |
d5efd131 MF |
725 | (p8) ldfe FR_PP2 = [GR_Poly_P],16 // Load P2 |
726 | (p8) fma.s1 FR_2XM1 = FR_Two, FR_XM1, f0 // 2X0 = 2 * X0 | |
727 | add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2 | |
728 | } | |
729 | { .mfb | |
730 | (p8) ldfe FR_QQ2 = [GR_Poly_Q],16 // Load Q2 | |
731 | (p10) fma.s0 FR_Res = f0,f1,f0 // r = 0 (arg = 1) | |
0347518d MF |
732 | (p10) br.ret.spnt b0 // return (arg = 1) |
733 | };; | |
d5efd131 | 734 | |
0347518d | 735 | { .mmi |
d5efd131 MF |
736 | (p8) ldfe FR_PP1 = [GR_Poly_P],16 // Load P1 |
737 | (p8) ldfe FR_QQ1 = [GR_Poly_Q],16 // Load Q1 | |
738 | add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2 | |
739 | } | |
740 | ;; | |
741 | ||
0347518d MF |
742 | { .mfi |
743 | (p8) ldfe FR_PP0 = [GR_Poly_P] // Load P0 | |
d5efd131 MF |
744 | fma.s1 FR_Tmp = f1, f1, FR_M2 // Tmp = 1 + m2 |
745 | add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3 | |
746 | } | |
747 | { .mfb | |
748 | (p8) ldfe FR_QQ0 = [GR_Poly_Q] | |
749 | nop.f 0 | |
750 | (p8) br.cond.spnt near_1 // near 1 path | |
0347518d MF |
751 | };; |
752 | { .mfi | |
d5efd131 MF |
753 | ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi |
754 | nop.f 0 | |
755 | mov GR_Bias = 0x0FFFF // Create exponent bias | |
756 | };; | |
0347518d | 757 | { .mfi |
d5efd131 MF |
758 | nop.m 0 |
759 | frsqrta.s1 FR_Rcp, p0 = FR_M2 // Rcp = 1/m2 reciprocal appr. | |
760 | nop.i 0 | |
0347518d | 761 | };; |
d5efd131 MF |
762 | |
763 | { .mfi | |
764 | ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo | |
765 | fms.s1 FR_Tmp = FR_X2, f1, FR_Tmp // Tmp = x^2 - Tmp | |
766 | nop.i 0 | |
767 | };; | |
768 | ||
769 | { .mfi | |
770 | ldfe FR_Q4 = [GR_ad_q],16 // Load Q4 | |
771 | fma.s1 FR_GG = FR_Rcp, FR_M2, f0 // g = Rcp * m2 | |
772 | // 8 bit Newton Raphson iteration | |
773 | nop.i 0 | |
774 | } | |
775 | { .mfi | |
0347518d | 776 | nop.m 0 |
d5efd131 MF |
777 | fma.s1 FR_HH = FR_Half, FR_Rcp, f0 // h = 0.5 * Rcp |
778 | nop.i 0 | |
779 | };; | |
780 | { .mfi | |
781 | ldfe FR_Q3 = [GR_ad_q],16 // Load Q3 | |
782 | fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h | |
783 | nop.i 0 | |
784 | } | |
785 | { .mfi | |
0347518d | 786 | nop.m 0 |
d5efd131 MF |
787 | fma.s1 FR_M2L = FR_Tmp, f1, FR_M2L // low part of m2 = Tmp+m2l |
788 | nop.i 0 | |
789 | };; | |
790 | ||
791 | { .mfi | |
792 | ldfe FR_Q2 = [GR_ad_q],16 // Load Q2 | |
0347518d | 793 | fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g |
d5efd131 MF |
794 | // 16 bit Newton Raphson iteration |
795 | nop.i 0 | |
796 | } | |
797 | { .mfi | |
798 | nop.m 0 | |
799 | fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h | |
800 | nop.i 0 | |
801 | };; | |
802 | ||
803 | { .mfi | |
804 | ldfe FR_Q1 = [GR_ad_q] // Load Q1 | |
805 | fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h | |
806 | nop.i 0 | |
807 | };; | |
808 | { .mfi | |
809 | nop.m 0 | |
0347518d | 810 | fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g |
d5efd131 MF |
811 | // 32 bit Newton Raphson iteration |
812 | nop.i 0 | |
813 | } | |
814 | { .mfi | |
815 | nop.m 0 | |
816 | fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h | |
817 | nop.i 0 | |
818 | };; | |
819 | ||
820 | { .mfi | |
821 | nop.m 0 | |
822 | fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h | |
823 | nop.i 0 | |
824 | };; | |
825 | ||
826 | { .mfi | |
827 | nop.m 0 | |
0347518d | 828 | fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g |
d5efd131 MF |
829 | // 64 bit Newton Raphson iteration |
830 | nop.i 0 | |
831 | } | |
832 | { .mfi | |
833 | nop.m 0 | |
834 | fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h | |
835 | nop.i 0 | |
836 | };; | |
837 | ||
838 | { .mfi | |
839 | nop.m 0 | |
840 | fnma.s1 FR_DD = FR_GG, FR_GG, FR_M2 // Remainder d = g * g - p2 | |
841 | nop.i 0 | |
842 | } | |
843 | { .mfi | |
844 | nop.m 0 | |
845 | fma.s1 FR_XLog_Hi = FR_Arg, f1, FR_GG // bh = z + gh | |
846 | nop.i 0 | |
847 | };; | |
848 | ||
849 | { .mfi | |
850 | nop.m 0 | |
851 | fma.s1 FR_DD = FR_DD, f1, FR_M2L // add p2l: d = d + p2l | |
852 | nop.i 0 | |
853 | };; | |
854 | ||
855 | { .mfi | |
856 | getf.sig GR_signif = FR_XLog_Hi // Get significand of x+1 | |
857 | nop.f 0 | |
858 | mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7 | |
859 | };; | |
860 | ||
861 | { .mfi | |
862 | nop.m 0 | |
863 | fma.s1 FR_GL = FR_DD, FR_HH, f0 // gl = d * h | |
864 | extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif | |
865 | } | |
866 | { .mfi | |
867 | nop.m 0 | |
868 | fma.s1 FR_XLog_Hi = FR_DD, FR_HH, FR_XLog_Hi // bh = bh + gl | |
869 | nop.i 0 | |
870 | };; | |
871 | ||
872 | ||
873 | ||
874 | { .mmi | |
875 | shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1 | |
876 | shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1 | |
877 | extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of signif. | |
878 | };; | |
879 | ||
880 | { .mmi | |
881 | ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1 | |
882 | nop.m 0 | |
883 | nop.i 0 | |
884 | };; | |
885 | ||
886 | { .mmi | |
887 | ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1 | |
888 | nop.m 0 | |
889 | nop.i 0 | |
890 | };; | |
891 | ||
892 | { .mfi | |
893 | nop.m 0 | |
894 | fms.s1 FR_XLog_Lo = FR_Arg, f1, FR_XLog_Hi // bl = x - bh | |
895 | pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1 | |
896 | };; | |
897 | ||
898 | // WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL! | |
899 | // "DEAD" ZONE! | |
900 | ||
901 | { .mfi | |
902 | nop.m 0 | |
903 | nop.f 0 | |
904 | nop.i 0 | |
905 | };; | |
906 | ||
907 | { .mfi | |
908 | nop.m 0 | |
909 | fmerge.se FR_S_hi = f1,FR_XLog_Hi // Form |x+1| | |
910 | nop.i 0 | |
911 | };; | |
912 | ||
913 | ||
914 | { .mmi | |
915 | getf.exp GR_N = FR_XLog_Hi // Get N = exponent of x+1 | |
916 | ldfd FR_h = [GR_ad_tbl_1] // Load h_1 | |
917 | nop.i 0 | |
918 | };; | |
919 | ||
920 | { .mfi | |
921 | nop.m 0 | |
922 | nop.f 0 | |
0347518d | 923 | extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1 |
d5efd131 MF |
924 | };; |
925 | ||
926 | { .mfi | |
927 | shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2 | |
928 | fma.s1 FR_XLog_Lo = FR_XLog_Lo, f1, FR_GG // bl = bl + gg | |
929 | mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80 | |
930 | } | |
931 | { .mfi | |
932 | shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2 | |
933 | nop.f 0 | |
934 | sub GR_N = GR_N, GR_Bias // sub bias from exp | |
935 | };; | |
936 | ||
937 | { .mmi | |
938 | ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2 | |
939 | ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2 | |
940 | sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N) | |
941 | };; | |
942 | ||
943 | { .mmi | |
944 | ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2 | |
945 | nop.m 0 | |
946 | nop.i 0 | |
947 | };; | |
948 | ||
949 | { .mmi | |
950 | setf.sig FR_float_N = GR_N // Put integer N into rightmost sign | |
951 | setf.exp FR_2_to_minus_N = GR_minus_N // Form 2^(-N) | |
952 | pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2 | |
953 | };; | |
954 | ||
0347518d | 955 | // WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!) |
d5efd131 MF |
956 | // BECAUSE OF POSSIBLE 10 CLOCKS STALL! |
957 | // (Just nops added - nothing to do here) | |
958 | ||
959 | { .mfi | |
960 | nop.m 0 | |
961 | fma.s1 FR_XLog_Lo = FR_XLog_Lo, f1, FR_GL // bl = bl + gl | |
962 | nop.i 0 | |
963 | };; | |
964 | { .mfi | |
965 | nop.m 0 | |
966 | nop.f 0 | |
967 | nop.i 0 | |
968 | };; | |
969 | { .mfi | |
970 | nop.m 0 | |
971 | nop.f 0 | |
972 | nop.i 0 | |
973 | };; | |
974 | ||
975 | { .mfi | |
976 | nop.m 0 | |
977 | nop.f 0 | |
978 | extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2 | |
979 | };; | |
980 | ||
981 | { .mfi | |
982 | shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3 | |
983 | nop.f 0 | |
984 | nop.i 0 | |
985 | };; | |
986 | ||
987 | { .mfi | |
988 | ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3 | |
989 | nop.f 0 | |
990 | nop.i 0 | |
991 | };; | |
992 | ||
993 | { .mfi | |
994 | ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3 | |
995 | fcvt.xf FR_float_N = FR_float_N | |
996 | nop.i 0 | |
997 | };; | |
998 | ||
999 | { .mfi | |
1000 | nop.m 0 | |
1001 | fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2 | |
1002 | nop.i 0 | |
1003 | } | |
1004 | { .mfi | |
1005 | nop.m 0 | |
1006 | fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2 | |
1007 | nop.i 0 | |
1008 | };; | |
1009 | ||
1010 | { .mfi | |
1011 | nop.m 0 | |
1012 | fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2 | |
1013 | nop.i 0 | |
1014 | } | |
1015 | { .mfi | |
1016 | nop.m 0 | |
1017 | fma.s1 FR_S_lo = FR_XLog_Lo, FR_2_to_minus_N, f0 //S_lo=S_lo*2^(-N) | |
1018 | nop.i 0 | |
1019 | };; | |
1020 | ||
1021 | { .mfi | |
1022 | nop.m 0 | |
1023 | fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3 | |
1024 | nop.i 0 | |
1025 | } | |
1026 | { .mfi | |
1027 | nop.m 0 | |
1028 | fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3 | |
1029 | nop.i 0 | |
1030 | };; | |
1031 | ||
1032 | { .mfi | |
1033 | nop.m 0 | |
1034 | fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3 | |
1035 | nop.i 0 | |
1036 | };; | |
1037 | ||
1038 | { .mfi | |
1039 | nop.m 0 | |
1040 | fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1 | |
1041 | nop.i 0 | |
1042 | } | |
1043 | { .mfi | |
1044 | nop.m 0 | |
1045 | fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H | |
1046 | nop.i 0 | |
1047 | };; | |
1048 | ||
1049 | { .mfi | |
1050 | nop.m 0 | |
1051 | fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h=N*log2_lo+h | |
1052 | nop.i 0 | |
1053 | } | |
1054 | { .mfi | |
1055 | nop.m 0 | |
1056 | fma.s1 FR_r = FR_G, FR_S_lo, FR_r // r=G*S_lo+(G*S_hi-1) | |
1057 | nop.i 0 | |
1058 | };; | |
1059 | ||
1060 | { .mfi | |
1061 | nop.m 0 | |
1062 | fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3 | |
1063 | nop.i 0 | |
1064 | } | |
1065 | { .mfi | |
1066 | nop.m 0 | |
1067 | fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r | |
1068 | nop.i 0 | |
1069 | };; | |
1070 | ||
1071 | { .mfi | |
1072 | nop.m 0 | |
1073 | fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2 | |
1074 | nop.i 0 | |
1075 | } | |
1076 | { .mfi | |
1077 | nop.m 0 | |
1078 | fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3 | |
1079 | nop.i 0 | |
1080 | };; | |
1081 | ||
1082 | { .mfi | |
1083 | nop.m 0 | |
1084 | fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r | |
1085 | nop.i 0 | |
1086 | };; | |
1087 | ||
1088 | { .mfi | |
1089 | nop.m 0 | |
1090 | fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h | |
1091 | nop.i 0 | |
1092 | };; | |
1093 | ||
1094 | { .mfi | |
1095 | nop.m 0 | |
0347518d | 1096 | fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo |
d5efd131 MF |
1097 | // Y_lo=poly_hi+poly_lo |
1098 | nop.i 0 | |
1099 | };; | |
1100 | ||
1101 | { .mfb | |
1102 | nop.m 0 | |
1103 | fadd.s0 FR_Res = FR_Y_lo,FR_Y_hi // Result=Y_lo+Y_hi | |
1104 | br.ret.sptk b0 // Common exit for 2^-7 < x < inf | |
1105 | };; | |
1106 | ||
1107 | ||
// huges_logl: table-driven evaluation of FR_Res from FR_XLog_Hi.
// The label and constant names suggest this is log(FR_XLog_Hi) for large
// arguments; FR_XLog_Hi is set before this section -- confirm against the
// dispatch code.
// Steps visible in this section:
//   1. Split FR_XLog_Hi into its exponent N (bias 0xFFFF subtracted) and
//      S_hi, which keeps the significand but takes sign/exponent from 1.0.
//   2. Three chained lookups (Z_1/G_1, Z_2/G_2, G_3), indexed by leading
//      significand bits and by pmpyshr2.u products, give
//      G = G_1*G_2*G_3, H = H_1+H_2+H_3 and h = h_1+h_2+h_3.
//   3. r = G*S_hi - 1, Y_hi = N*log2_hi + H, h = N*log2_lo + h.
//   4. Result = (Q1*r^2 + r) + (((Q4*r + Q3)*r + Q2)*r^3 + h) + Y_hi.
// GR_ad_z_1 is not set here; it is used as the base of the Z_1 table and
// every other table pointer below is derived from it.
// NOTE(review): GR_exp_2tom7, GR_exp_mask, GR_exp_2tom80 and FR_2_to_minus_N
// are written in this section but not read again before the br.ret.
1108 | huges_logl: | |
1109 | { .mmi | |
1110 | getf.sig GR_signif = FR_XLog_Hi // Get significand of FR_XLog_Hi | |
1111 | mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7 | |
1112 | nop.i 0 | |
1113 | };; | |
1114 | ||
1115 | { .mfi | |
1116 | add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1 | |
1117 | nop.f 0 | |
1118 | add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_P | |
1119 | } | |
1120 | { .mfi | |
1121 | add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2 | |
1122 | nop.f 0 | |
1123 | add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2 | |
1124 | };; | |
1125 | ||
1126 | { .mfi | |
1127 | add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3 | |
1128 | nop.f 0 | |
1129 | extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif | |
1130 | };; | |
1131 | ||
1132 | { .mfi | |
1133 | shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1 | |
1134 | nop.f 0 | |
1135 | extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of signif. | |
1136 | };; | |
1137 | ||
1138 | { .mfi | |
1139 | ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1 | |
1140 | nop.f 0 | |
1141 | mov GR_exp_mask = 0x1FFFF // Create exponent mask | |
1142 | } | |
1143 | { .mfi | |
1144 | shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1 | |
1145 | nop.f 0 | |
1146 | mov GR_Bias = 0x0FFFF // Create exponent bias | |
1147 | };; | |
1148 | ||
1149 | { .mfi | |
1150 | ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1 | |
1151 | fmerge.se FR_S_hi = f1,FR_XLog_Hi // S_hi = signif of XLog_Hi, sign/exp of 1.0 | |
1152 | nop.i 0 | |
1153 | };; | |
1154 | ||
1155 | { .mmi | |
1156 | getf.exp GR_N = FR_XLog_Hi // Get N = sign and biased exponent of XLog_Hi | |
1157 | ldfd FR_h = [GR_ad_tbl_1] // Load h_1 | |
1158 | nop.i 0 | |
1159 | };; | |
1160 | ||
1161 | { .mfi | |
1162 | ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi | |
1163 | nop.f 0 | |
1164 | pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1 | |
1165 | };; | |
1166 | ||
1167 | { .mmi | |
1168 | ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo | |
0347518d | 1169 | sub GR_N = GR_N, GR_Bias |
d5efd131 MF |
1170 | mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80 |
1171 | };; | |
1172 | ||
1173 | { .mfi | |
1174 | ldfe FR_Q4 = [GR_ad_q],16 // Load Q4 | |
1175 | nop.f 0 | |
1176 | sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N) | |
1177 | };; | |
1178 | ||
1179 | { .mmf | |
1180 | ldfe FR_Q3 = [GR_ad_q],16 // Load Q3 | |
1181 | setf.sig FR_float_N = GR_N // Put integer N into the significand field | |
1182 | nop.f 0 | |
1183 | };; | |
1184 | ||
1185 | { .mmi | |
1186 | ldfe FR_Q2 = [GR_ad_q],16 // Load Q2 | |
1187 | nop.m 0 | |
0347518d | 1188 | extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1 |
d5efd131 MF |
1189 | };; |
1190 | ||
1191 | { .mmi | |
1192 | ldfe FR_Q1 = [GR_ad_q] // Load Q1 | |
1193 | shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2 | |
1194 | nop.i 0 | |
1195 | };; | |
1196 | ||
1197 | { .mmi | |
1198 | ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2 | |
1199 | shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2 | |
1200 | nop.i 0 | |
1201 | };; | |
1202 | ||
1203 | { .mmi | |
1204 | ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2 | |
1205 | nop.m 0 | |
1206 | nop.i 0 | |
1207 | };; | |
1208 | ||
1209 | { .mmf | |
1210 | ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2 | |
1211 | setf.exp FR_2_to_minus_N = GR_minus_N // Form 2^(-N) | |
1212 | nop.f 0 | |
1213 | };; | |
1214 | ||
1215 | { .mfi | |
1216 | nop.m 0 | |
1217 | nop.f 0 | |
1218 | pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1*Z_2 | |
1219 | };; | |
1220 | ||
0347518d | 1221 | // WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!) |
d5efd131 MF |
1222 | // BECAUSE OF POSSIBLE 10 CLOCKS STALL! |
1223 | // (Just nops added - nothing to do here) | |
1224 | ||
1225 | { .mfi | |
1226 | nop.m 0 | |
1227 | nop.f 0 | |
1228 | nop.i 0 | |
1229 | };; | |
1230 | ||
1231 | { .mfi | |
1232 | nop.m 0 | |
1233 | nop.f 0 | |
1234 | nop.i 0 | |
1235 | };; | |
1236 | ||
1237 | { .mfi | |
1238 | nop.m 0 | |
1239 | nop.f 0 | |
1240 | nop.i 0 | |
1241 | };; | |
1242 | ||
1243 | { .mfi | |
1244 | nop.m 0 | |
1245 | nop.f 0 | |
1246 | extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2 | |
1247 | };; | |
1248 | ||
1249 | { .mfi | |
1250 | shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3 | |
1251 | fcvt.xf FR_float_N = FR_float_N | |
1252 | nop.i 0 | |
1253 | };; | |
1254 | ||
1255 | { .mfi | |
1256 | ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3 | |
1257 | nop.f 0 | |
1258 | nop.i 0 | |
1259 | };; | |
1260 | ||
1261 | { .mfi | |
1262 | ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3 | |
1263 | fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2 | |
1264 | nop.i 0 | |
1265 | } | |
1266 | { .mfi | |
1267 | nop.m 0 | |
1268 | fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2 | |
1269 | nop.i 0 | |
1270 | };; | |
1271 | ||
1272 | { .mmf | |
1273 | nop.m 0 | |
1274 | nop.m 0 | |
1275 | fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2 | |
1276 | };; | |
1277 | ||
1278 | { .mfi | |
1279 | nop.m 0 | |
1280 | fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2)*G_3 | |
1281 | nop.i 0 | |
1282 | } | |
1283 | { .mfi | |
1284 | nop.m 0 | |
1285 | fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2)+H_3 | |
1286 | nop.i 0 | |
1287 | };; | |
1288 | ||
1289 | { .mfi | |
1290 | nop.m 0 | |
1291 | fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3 | |
1292 | nop.i 0 | |
1293 | };; | |
1294 | ||
1295 | { .mfi | |
1296 | nop.m 0 | |
1297 | fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1 | |
1298 | nop.i 0 | |
1299 | } | |
1300 | { .mfi | |
1301 | nop.m 0 | |
1302 | fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H | |
1303 | nop.i 0 | |
1304 | };; | |
1305 | ||
1306 | { .mfi | |
1307 | nop.m 0 | |
1308 | fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h = N*log2_lo+h | |
1309 | nop.i 0 | |
1310 | };; | |
1311 | ||
1312 | { .mfi | |
1313 | nop.m 0 | |
1314 | fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3 | |
1315 | nop.i 0 | |
1316 | } | |
1317 | { .mfi | |
1318 | nop.m 0 | |
1319 | fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r | |
1320 | nop.i 0 | |
1321 | };; | |
1322 | ||
1323 | { .mfi | |
1324 | nop.m 0 | |
1325 | fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2 | |
1326 | nop.i 0 | |
1327 | } | |
1328 | { .mfi | |
1329 | nop.m 0 | |
1330 | fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3 | |
1331 | nop.i 0 | |
1332 | };; | |
1333 | ||
1334 | { .mfi | |
1335 | nop.m 0 | |
1336 | fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r | |
1337 | nop.i 0 | |
1338 | };; | |
1339 | ||
1340 | { .mfi | |
1341 | nop.m 0 | |
1342 | fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h | |
1343 | nop.i 0 | |
1344 | };; | |
1345 | { .mfi | |
1346 | nop.m 0 | |
0347518d | 1347 | fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo // Y_lo=poly_hi+poly_lo |
d5efd131 MF |
1348 | nop.i 0 |
1349 | };; | |
1350 | { .mfb | |
1351 | nop.m 0 | |
1352 | fadd.s0 FR_Res = FR_Y_lo,FR_Y_hi // Result=Y_lo+Y_hi | |
1353 | br.ret.sptk b0 // Common exit | |
1354 | };; | |
1355 | ||
1356 | ||
1357 | // NEAR ONE INTERVAL | |
// Three computations are interleaved below and tagged in the line comments:
//   &SQRT&  gg + gl ~ sqrt(FR_2XM1): frsqrta seed, then refinement of
//           g (~sqrt) and h (~0.5/sqrt); gl = d*h is the low-order part.
//           In the &SQRT& comments "x" denotes FR_2XM1.
//   $POLY$  pp and qq: two degree-5 polynomials in FR_XM1 with coefficients
//           FR_PP0..FR_PP5 and FR_QQ0..FR_QQ5.
//   #DIV#   x_hi + x_lo ~ pp/qq: frcpa seed refined in several steps.
// The pieces are combined (each step rounded) as
//   result = gg + (gl - gl*x_lo - gl*x_hi - gg*x_lo - gg*x_hi)
// which is algebraically (gg + gl) * (1 - (x_hi + x_lo)).
// FR_XM1, FR_2XM1, FR_Half and the coefficients are set before this section
// (the names suggest x-1 and 2*(x-1) -- confirm against the code above).
1358 | near_1: | |
0347518d MF |
1359 | { .mfi |
1360 | nop.m 0 | |
d5efd131 | 1361 | frsqrta.s1 FR_Rcp, p0 = FR_2XM1 // Rcp ~ 1/sqrt(x) approximation &SQRT& |
0347518d | 1362 | nop.i 0 |
d5efd131 MF |
1363 | };; |
1364 | ||
0347518d MF |
1365 | { .mfi |
1366 | nop.m 0 | |
d5efd131 | 1367 | fma.s1 FR_PV6 = FR_PP5, FR_XM1, FR_PP4 // pv6 = P5*xm1+P4 $POLY$ |
0347518d | 1368 | nop.i 0 |
d5efd131 MF |
1369 | } |
1370 | { .mfi | |
0347518d | 1371 | nop.m 0 |
d5efd131 | 1372 | fma.s1 FR_QV6 = FR_QQ5, FR_XM1, FR_QQ4 // qv6 = Q5*xm1+Q4 $POLY$ |
0347518d | 1373 | nop.i 0 |
d5efd131 MF |
1374 | };; |
1375 | ||
0347518d MF |
1376 | { .mfi |
1377 | nop.m 0 | |
d5efd131 | 1378 | fma.s1 FR_PV4 = FR_PP3, FR_XM1, FR_PP2 // pv4 = P3*xm1+P2 $POLY$ |
0347518d | 1379 | nop.i 0 |
d5efd131 MF |
1380 | } |
1381 | { .mfi | |
0347518d | 1382 | nop.m 0 |
d5efd131 | 1383 | fma.s1 FR_QV4 = FR_QQ3, FR_XM1, FR_QQ2 // qv4 = Q3*xm1+Q2 $POLY$ |
0347518d | 1384 | nop.i 0 |
d5efd131 MF |
1385 | };; |
1386 | ||
0347518d MF |
1387 | { .mfi |
1388 | nop.m 0 | |
d5efd131 | 1389 | fma.s1 FR_XM12 = FR_XM1, FR_XM1, f0 // xm1^2 = xm1 * xm1 $POLY$ |
0347518d | 1390 | nop.i 0 |
d5efd131 MF |
1391 | };; |
1392 | ||
0347518d MF |
1393 | { .mfi |
1394 | nop.m 0 | |
d5efd131 | 1395 | fma.s1 FR_PV2 = FR_PP1, FR_XM1, FR_PP0 // pv2 = P1*xm1+P0 $POLY$ |
0347518d | 1396 | nop.i 0 |
d5efd131 MF |
1397 | } |
1398 | { .mfi | |
0347518d | 1399 | nop.m 0 |
d5efd131 | 1400 | fma.s1 FR_QV2 = FR_QQ1, FR_XM1, FR_QQ0 // qv2 = Q1*xm1+Q0 $POLY$ |
0347518d | 1401 | nop.i 0 |
d5efd131 MF |
1402 | };; |
1403 | ||
0347518d MF |
1404 | { .mfi |
1405 | nop.m 0 | |
1406 | fma.s1 FR_GG = FR_Rcp, FR_2XM1, f0 // g = Rcp * x &SQRT& | |
1407 | nop.i 0 | |
d5efd131 MF |
1408 | } |
1409 | { .mfi | |
0347518d | 1410 | nop.m 0 |
d5efd131 | 1411 | fma.s1 FR_HH = FR_Half, FR_Rcp, f0 // h = 0.5 * Rcp &SQRT& |
0347518d | 1412 | nop.i 0 |
d5efd131 MF |
1413 | };; |
1414 | ||
1415 | ||
0347518d MF |
1416 | { .mfi |
1417 | nop.m 0 | |
d5efd131 | 1418 | fma.s1 FR_PV3 = FR_XM12, FR_PV6, FR_PV4//pv3=pv6*xm1^2+pv4 $POLY$ |
0347518d | 1419 | nop.i 0 |
d5efd131 MF |
1420 | } |
1421 | { .mfi | |
0347518d | 1422 | nop.m 0 |
d5efd131 | 1423 | fma.s1 FR_QV3 = FR_XM12, FR_QV6, FR_QV4//qv3=qv6*xm1^2+qv4 $POLY$ |
0347518d | 1424 | nop.i 0 |
d5efd131 MF |
1425 | };; |
1426 | ||
1427 | ||
0347518d MF |
1428 | { .mfi |
1429 | nop.m 0 | |
d5efd131 | 1430 | fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h &SQRT& |
0347518d | 1431 | nop.i 0 |
d5efd131 MF |
1432 | };; |
1433 | ||
0347518d MF |
1434 | { .mfi |
1435 | nop.m 0 | |
d5efd131 | 1436 | fma.s1 FR_PP = FR_XM12, FR_PV3, FR_PV2 //pp=pv3*xm1^2+pv2 $POLY$ |
0347518d | 1437 | nop.i 0 |
d5efd131 MF |
1438 | } |
1439 | { .mfi | |
0347518d | 1440 | nop.m 0 |
d5efd131 | 1441 | fma.s1 FR_QQ = FR_XM12, FR_QV3, FR_QV2 //qq=qv3*xm1^2+qv2 $POLY$ |
0347518d | 1442 | nop.i 0 |
d5efd131 MF |
1443 | };; |
1444 | ||
1445 | { .mfi | |
0347518d | 1446 | nop.m 0 |
d5efd131 | 1447 | fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g &SQRT& |
0347518d | 1448 | nop.i 0 |
d5efd131 MF |
1449 | } |
1450 | { .mfi | |
0347518d | 1451 | nop.m 0 |
d5efd131 | 1452 | fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h &SQRT& |
0347518d | 1453 | nop.i 0 |
d5efd131 MF |
1454 | };; |
1455 | ||
1456 | { .mfi | |
0347518d | 1457 | nop.m 0 |
d5efd131 | 1458 | frcpa.s1 FR_Y0,p0 = f1,FR_QQ // y = frcpa(b) #DIV# |
0347518d | 1459 | nop.i 0 |
d5efd131 MF |
1460 | } |
1461 | { .mfi | |
0347518d | 1462 | nop.m 0 |
d5efd131 | 1463 | fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g*h &SQRT& |
0347518d | 1464 | nop.i 0 |
d5efd131 MF |
1465 | };; |
1466 | ||
1467 | { .mfi | |
0347518d | 1468 | nop.m 0 |
d5efd131 | 1469 | fma.s1 FR_Q0 = FR_PP,FR_Y0,f0 // q = a*y #DIV# |
0347518d | 1470 | nop.i 0 |
d5efd131 MF |
1471 | } |
1472 | { .mfi | |
0347518d | 1473 | nop.m 0 |
d5efd131 | 1474 | fnma.s1 FR_E0 = FR_Y0,FR_QQ,f1 // e = 1 - b*y #DIV# |
0347518d | 1475 | nop.i 0 |
d5efd131 MF |
1476 | };; |
1477 | ||
1478 | { .mfi | |
0347518d MF |
1479 | nop.m 0 |
1480 | fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g &SQRT& | |
1481 | nop.i 0 | |
d5efd131 MF |
1482 | } |
1483 | { .mfi | |
0347518d | 1484 | nop.m 0 |
d5efd131 | 1485 | fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h &SQRT& |
0347518d | 1486 | nop.i 0 |
d5efd131 MF |
1487 | };; |
1488 | ||
1489 | { .mfi | |
0347518d | 1490 | nop.m 0 |
d5efd131 | 1491 | fma.s1 FR_E2 = FR_E0,FR_E0,FR_E0 // e2 = e+e^2 #DIV# |
0347518d | 1492 | nop.i 0 |
d5efd131 MF |
1493 | } |
1494 | { .mfi | |
0347518d | 1495 | nop.m 0 |
d5efd131 | 1496 | fma.s1 FR_E1 = FR_E0,FR_E0,f0 // e1 = e^2 #DIV# |
0347518d | 1497 | nop.i 0 |
d5efd131 MF |
1498 | };; |
1499 | ||
1500 | { .mfi | |
0347518d | 1501 | nop.m 0 |
d5efd131 | 1502 | fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h &SQRT& |
0347518d | 1503 | nop.i 0 |
d5efd131 MF |
1504 | } |
1505 | { .mfi | |
0347518d | 1506 | nop.m 0 |
d5efd131 | 1507 | fnma.s1 FR_DD = FR_GG, FR_GG, FR_2XM1 // d = x - g * g &SQRT& |
0347518d | 1508 | nop.i 0 |
d5efd131 MF |
1509 | };; |
1510 | ||
1511 | { .mfi | |
0347518d | 1512 | nop.m 0 |
d5efd131 | 1513 | fma.s1 FR_Y1 = FR_Y0,FR_E2,FR_Y0 // y1 = y+y*e2 #DIV# |
0347518d | 1514 | nop.i 0 |
d5efd131 MF |
1515 | } |
1516 | { .mfi | |
0347518d | 1517 | nop.m 0 |
d5efd131 | 1518 | fma.s1 FR_E3 = FR_E1,FR_E1,FR_E0 // e3 = e+e1^2 #DIV# |
0347518d | 1519 | nop.i 0 |
d5efd131 MF |
1520 | };; |
1521 | ||
1522 | { .mfi | |
0347518d | 1523 | nop.m 0 |
d5efd131 | 1524 | fma.s1 FR_GG = FR_DD, FR_HH, FR_GG // g = d * h + g &SQRT& |
0347518d | 1525 | nop.i 0 |
d5efd131 MF |
1526 | } |
1527 | { .mfi | |
0347518d | 1528 | nop.m 0 |
d5efd131 | 1529 | fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h &SQRT& |
0347518d | 1530 | nop.i 0 |
d5efd131 MF |
1531 | };; |
1532 | ||
1533 | { .mfi | |
0347518d | 1534 | nop.m 0 |
d5efd131 | 1535 | fma.s1 FR_Y2 = FR_Y1,FR_E3,FR_Y0 // y2 = y+y1*e3 #DIV# |
0347518d | 1536 | nop.i 0 |
d5efd131 MF |
1537 | } |
1538 | { .mfi | |
0347518d | 1539 | nop.m 0 |
d5efd131 | 1540 | fnma.s1 FR_R0 = FR_QQ,FR_Q0,FR_PP // r = a-b*q #DIV# |
0347518d | 1541 | nop.i 0 |
d5efd131 MF |
1542 | };; |
1543 | ||
1544 | { .mfi | |
0347518d MF |
1545 | nop.m 0 |
1546 | fnma.s1 FR_DD = FR_GG, FR_GG, FR_2XM1 // d = x - g * g &SQRT& | |
1547 | nop.i 0 | |
d5efd131 MF |
1548 | };; |
1549 | ||
1550 | { .mfi | |
0347518d | 1551 | nop.m 0 |
d5efd131 | 1552 | fnma.s1 FR_E4 = FR_QQ,FR_Y2,f1 // e4 = 1-b*y2 #DIV# |
0347518d | 1553 | nop.i 0 |
d5efd131 MF |
1554 | } |
1555 | { .mfi | |
0347518d | 1556 | nop.m 0 |
d5efd131 | 1557 | fma.s1 FR_X_Hi = FR_R0,FR_Y2,FR_Q0 // x = q+r*y2 #DIV# |
0347518d | 1558 | nop.i 0 |
d5efd131 MF |
1559 | };; |
1560 | ||
1561 | { .mfi | |
0347518d | 1562 | nop.m 0 |
d5efd131 | 1563 | fma.s1 FR_GL = FR_DD, FR_HH, f0 // gl = d * h &SQRT& |
0347518d | 1564 | nop.i 0 |
d5efd131 MF |
1565 | };; |
1566 | ||
1567 | { .mfi | |
0347518d | 1568 | nop.m 0 |
d5efd131 | 1569 | fma.s1 FR_Y3 = FR_Y2,FR_E4,FR_Y2 // y3 = y2+y2*e4 #DIV# |
0347518d | 1570 | nop.i 0 |
d5efd131 MF |
1571 | } |
1572 | { .mfi | |
0347518d | 1573 | nop.m 0 |
d5efd131 | 1574 | fnma.s1 FR_R1 = FR_QQ,FR_X_Hi,FR_PP // r1 = a-b*x #DIV# |
0347518d | 1575 | nop.i 0 |
d5efd131 MF |
1576 | };; |
1577 | ||
1578 | { .mfi | |
0347518d | 1579 | nop.m 0 |
d5efd131 | 1580 | fma.s1 FR_HH = FR_GG, FR_X_Hi, f0 // hh = gg * x_hi |
0347518d | 1581 | nop.i 0 |
d5efd131 MF |
1582 | } |
1583 | { .mfi | |
0347518d | 1584 | nop.m 0 |
d5efd131 | 1585 | fma.s1 FR_LH = FR_GL, FR_X_Hi, f0 // lh = gl * x_hi |
0347518d | 1586 | nop.i 0 |
d5efd131 MF |
1587 | };; |
1588 | ||
1589 | { .mfi | |
0347518d | 1590 | nop.m 0 |
d5efd131 | 1591 | fma.s1 FR_X_lo = FR_R1,FR_Y3,f0 // x_lo = r1*y3 #DIV# |
0347518d | 1592 | nop.i 0 |
d5efd131 MF |
1593 | };; |
1594 | ||
1595 | { .mfi | |
0347518d | 1596 | nop.m 0 |
d5efd131 | 1597 | fma.s1 FR_LL = FR_GL, FR_X_lo, f0 // ll = gl*x_lo |
0347518d | 1598 | nop.i 0 |
d5efd131 MF |
1599 | } |
1600 | { .mfi | |
0347518d | 1601 | nop.m 0 |
d5efd131 | 1602 | fma.s1 FR_HL = FR_GG, FR_X_lo, f0 // hl = gg * x_lo |
0347518d | 1603 | nop.i 0 |
d5efd131 MF |
1604 | };; |
1605 | ||
1606 | { .mfi | |
0347518d | 1607 | nop.m 0 |
d5efd131 | 1608 | fms.s1 FR_Res = FR_GL, f1, FR_LL // res = gl - ll |
0347518d | 1609 | nop.i 0 |
d5efd131 MF |
1610 | };; |
1611 | ||
1612 | { .mfi | |
0347518d | 1613 | nop.m 0 |
d5efd131 | 1614 | fms.s1 FR_Res = FR_Res, f1, FR_LH // res = res - lh |
0347518d | 1615 | nop.i 0 |
d5efd131 MF |
1616 | };; |
1617 | ||
1618 | { .mfi | |
0347518d | 1619 | nop.m 0 |
d5efd131 | 1620 | fms.s1 FR_Res = FR_Res, f1, FR_HL // res = res - hl |
0347518d | 1621 | nop.i 0 |
d5efd131 MF |
1622 | };; |
1623 | ||
1624 | { .mfi | |
0347518d | 1625 | nop.m 0 |
d5efd131 | 1626 | fms.s1 FR_Res = FR_Res, f1, FR_HH // res = res - hh |
0347518d | 1627 | nop.i 0 |
d5efd131 MF |
1628 | };; |
1629 | ||
1630 | { .mfb | |
0347518d | 1631 | nop.m 0 |
d5efd131 MF |
1632 | fma.s0 FR_Res = FR_Res, f1, FR_GG // result = res + gg |
1633 | br.ret.sptk b0 // Exit for near 1 path | |
1634 | };; | |
1635 | // NEAR ONE INTERVAL END | |
1636 | ||
1637 | ||
1638 | ||
1639 | ||
// acoshl_lt_pone: error stub taken when x < 1.0 (per the branch comment
// below). It copies FR_Arg into FR_Arg_X, produces the result with frcpa on
// 0/0 (QNaN, invalid raised), sets GR_Parameter_TAG = 135 and
// branches to __libm_error_region, which does not return here.
1640 | acoshl_lt_pone: | |
1641 | { .mfi | |
0347518d | 1642 | nop.m 0 |
d5efd131 | 1643 | fmerge.s FR_Arg_X = FR_Arg, FR_Arg |
0347518d | 1644 | nop.i 0 |
d5efd131 MF |
1645 | };; |
1646 | { .mfb | |
1647 | mov GR_Parameter_TAG = 135 | |
1648 | frcpa.s0 FR_Res,p0 = f0,f0 // get QNaN,and raise invalid | |
1649 | br.cond.sptk __libm_error_region // exit if x < 1.0 | |
1650 | };; | |
1651 | ||
1652 | GLOBAL_LIBM_END(acoshl) | |
1653 | ||
1654 | ||
1655 | ||
// __libm_error_region: shared error exit, entered by a branch (not a call)
// with FR_Arg_X, FR_Res and GR_Parameter_TAG already set.
// It opens a 64-byte frame, spills FR_Arg_Y, FR_Arg_X and FR_Res into it,
// calls __libm_error_support, reloads the value stored at sp+48 into f8,
// restores sp/b0/gp/ar.pfs and returns through the saved b0.
// Frame layout relative to the new sp (derived from the adds below):
//   sp+16  FR_Arg_X   (GR_Parameter_X)
//   sp+32  FR_Arg_Y   (GR_Parameter_Y at the time of the call)
//   sp+48  FR_Res     (GR_Parameter_RESULT)
// NOTE(review): FR_Arg_Y is not written by the visible acoshl error stub;
// how __libm_error_support consumes the GR_Parameter_* registers is not
// visible in this section -- confirm against its definition.
1656 | LOCAL_LIBM_ENTRY(__libm_error_region) | |
1657 | .prologue | |
1658 | { .mfi | |
1659 | add GR_Parameter_Y = -32,sp // Parameter 2 value | |
1660 | nop.f 0 | |
1661 | .save ar.pfs,GR_SAVE_PFS | |
1662 | mov GR_SAVE_PFS = ar.pfs // Save ar.pfs | |
1663 | } | |
1664 | { .mfi | |
1665 | .fframe 64 | |
1666 | add sp = -64,sp // Create new stack | |
1667 | nop.f 0 | |
1668 | mov GR_SAVE_GP = gp // Save gp | |
1669 | };; | |
1670 | ||
1671 | { .mmi | |
1672 | stfe [GR_Parameter_Y] = FR_Arg_Y,16 // Parameter 2 to stack | |
1673 | add GR_Parameter_X = 16,sp // Parameter 1 address | |
1674 | .save b0,GR_SAVE_B0 | |
1675 | mov GR_SAVE_B0 = b0 // Save b0 | |
1676 | };; | |
1677 | ||
1678 | .body | |
1679 | { .mib | |
1680 | stfe [GR_Parameter_X] = FR_Arg_X // Parameter 1 to stack | |
1681 | add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address | |
0347518d | 1682 | nop.b 0 |
d5efd131 MF |
1683 | } |
1684 | { .mib | |
1685 | stfe [GR_Parameter_Y] = FR_Res // Parameter 3 to stack | |
1686 | add GR_Parameter_Y = -16,GR_Parameter_Y | |
1687 | br.call.sptk b0 = __libm_error_support# // Error handling function | |
1688 | };; | |
1689 | ||
1690 | { .mmi | |
1691 | nop.m 0 | |
1692 | nop.m 0 | |
1693 | add GR_Parameter_RESULT = 48,sp | |
1694 | };; | |
1695 | ||
1696 | { .mmi | |
1697 | ldfe f8 = [GR_Parameter_RESULT] // Get return res | |
1698 | .restore sp | |
1699 | add sp = 64,sp // Restore stack pointer | |
1700 | mov b0 = GR_SAVE_B0 // Restore return address | |
1701 | };; | |
1702 | ||
1703 | { .mib | |
1704 | mov gp = GR_SAVE_GP // Restore gp | |
1705 | mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs | |
1706 | br.ret.sptk b0 // Return | |
1707 | };; | |
1708 | ||
1709 | LOCAL_LIBM_END(__libm_error_region#) | |
1710 | ||
1711 | .type __libm_error_support#,@function | |
1712 | .global __libm_error_support# | |
1713 | ||
1714 | ||
1715 | ||
1716 |