]>
Commit | Line | Data |
---|---|---|
0ecb606c JJ |
1 | .file "logl.s" |
2 | ||
3 | ||
4 | // Copyright (c) 2000 - 2003, Intel Corporation | |
5 | // All rights reserved. | |
6 | // | |
7 | // Contributed 2000 by the Intel Numerics Group, Intel Corporation | |
8 | // | |
9 | // Redistribution and use in source and binary forms, with or without | |
10 | // modification, are permitted provided that the following conditions are | |
11 | // met: | |
12 | // | |
13 | // * Redistributions of source code must retain the above copyright | |
14 | // notice, this list of conditions and the following disclaimer. | |
15 | // | |
16 | // * Redistributions in binary form must reproduce the above copyright | |
17 | // notice, this list of conditions and the following disclaimer in the | |
18 | // documentation and/or other materials provided with the distribution. | |
19 | // | |
20 | // * The name of Intel Corporation may not be used to endorse or promote | |
21 | // products derived from this software without specific prior written | |
22 | // permission. | |
23 | ||
24 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
25 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
26 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
27 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS | |
28 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
29 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
30 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
31 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
32 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING | |
33 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
34 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
35 | // | |
36 | // Intel Corporation is the author of this code, and requests that all | |
37 | // problem reports or change requests be submitted to it directly at | |
38 | // http://www.intel.com/software/products/opensource/libraries/num.htm. | |
39 | // | |
40 | //********************************************************************* | |
41 | // | |
42 | // History: | |
43 | // 05/21/01 Extracted logl and log10l from log1pl.s file, and optimized | |
44 | // all paths. | |
45 | // 06/20/01 Fixed error tag for x=-inf. | |
46 | // 05/20/02 Cleaned up namespace and sf0 syntax | |
47 | // 02/10/03 Reordered header: .section, .global, .proc, .align; | |
48 | // used data8 for long double table values | |
49 | // | |
50 | //********************************************************************* | |
51 | // | |
52 | //********************************************************************* | |
53 | // | |
54 | // Function: Combined logl(x) and log10l(x) where | |
55 | // logl(x) = ln(x), for double-extended precision x values | |
56 | // log10l(x) = log (x), for double-extended precision x values | |
57 | // 10 | |
58 | // | |
59 | //********************************************************************* | |
60 | // | |
61 | // Resources Used: | |
62 | // | |
63 | // Floating-Point Registers: f8 (Input and Return Value) | |
64 | // f34-f76 | |
65 | // | |
66 | // General Purpose Registers: | |
67 | // r32-r56 | |
68 | // r53-r56 (Used to pass arguments to error handling routine) | |
69 | // | |
70 | // Predicate Registers: p6-p14 | |
71 | // | |
72 | //********************************************************************* | |
73 | // | |
74 | // IEEE Special Conditions: | |
75 | // | |
76 | // Denormal fault raised on denormal inputs | |
77 | // Overflow exceptions cannot occur | |
78 | // Underflow exceptions raised when appropriate for log1p | |
79 | // (Error Handling Routine called for underflow) | |
80 | // Inexact raised when appropriate by algorithm | |
81 | // | |
82 | // logl(inf) = inf | |
83 | // logl(-inf) = QNaN | |
84 | // logl(+/-0) = -inf | |
85 | // logl(SNaN) = QNaN | |
86 | // logl(QNaN) = QNaN | |
87 | // logl(EM_special Values) = QNaN | |
88 | // log10l(inf) = inf | |
89 | // log10l(-inf) = QNaN | |
90 | // log10l(+/-0) = -inf | |
91 | // log10l(SNaN) = QNaN | |
92 | // log10l(QNaN) = QNaN | |
93 | // log10l(EM_special Values) = QNaN | |
94 | // | |
95 | //********************************************************************* | |
96 | // | |
97 | // Overview | |
98 | // | |
99 | // The method consists of two cases. | |
100 | // | |
101 | // If |X-1| < 2^(-7) use case log_near1; | |
102 | // else use case log_regular; | |
103 | // | |
104 | // Case log_near1: | |
105 | // | |
106 | // logl( 1 + X ) can be approximated by a simple polynomial | |
107 | // in W = X-1. This polynomial resembles the truncated Taylor | |
108 | // series W - W^/2 + W^3/3 - ... | |
109 | // | |
110 | // Case log_regular: | |
111 | // | |
112 | // Here we use a table lookup method. The basic idea is that in | |
113 | // order to compute logl(Arg) for an argument Arg in [1,2), we | |
114 | // construct a value G such that G*Arg is close to 1 and that | |
115 | // logl(1/G) is obtainable easily from a table of values calculated | |
116 | // beforehand. Thus | |
117 | // | |
118 | // logl(Arg) = logl(1/G) + logl(G*Arg) | |
119 | // = logl(1/G) + logl(1 + (G*Arg - 1)) | |
120 | // | |
121 | // Because |G*Arg - 1| is small, the second term on the right hand | |
122 | // side can be approximated by a short polynomial. We elaborate | |
123 | // this method in four steps. | |
124 | // | |
125 | // Step 0: Initialization | |
126 | // | |
127 | // We need to calculate logl( X ). Obtain N, S_hi such that | |
128 | // | |
129 | // X = 2^N * S_hi exactly | |
130 | // | |
131 | // where S_hi in [1,2) | |
132 | // | |
133 | // Step 1: Argument Reduction | |
134 | // | |
135 | // Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate | |
136 | // | |
137 | // G := G_1 * G_2 * G_3 | |
138 | // r := (G * S_hi - 1) | |
139 | // | |
140 | // These G_j's have the property that the product is exactly | |
141 | // representable and that |r| < 2^(-12) as a result. | |
142 | // | |
143 | // Step 2: Approximation | |
144 | // | |
145 | // | |
146 | // logl(1 + r) is approximated by a short polynomial poly(r). | |
147 | // | |
148 | // Step 3: Reconstruction | |
149 | // | |
150 | // | |
151 | // Finally, logl( X ) is given by | |
152 | // | |
153 | // logl( X ) = logl( 2^N * S_hi ) | |
154 | // ~=~ N*logl(2) + logl(1/G) + logl(1 + r) | |
155 | // ~=~ N*logl(2) + logl(1/G) + poly(r). | |
156 | // | |
157 | // **** Algorithm **** | |
158 | // | |
159 | // Case log_near1: | |
160 | // | |
161 | // Here we compute a simple polynomial. To exploit parallelism, we split | |
162 | // the polynomial into two portions. | |
163 | // | |
164 | // W := X - 1 | |
165 | // Wsq := W * W | |
166 | // W4 := Wsq*Wsq | |
167 | // W6 := W4*Wsq | |
168 | // Y_hi := W + Wsq*(P_1 + W*(P_2 + W*(P_3 + W*P_4)) | |
169 | // Y_lo := W6*(P_5 + W*(P_6 + W*(P_7 + W*P_8))) | |
170 | // | |
171 | // Case log_regular: | |
172 | // | |
173 | // We present the algorithm in four steps. | |
174 | // | |
175 | // Step 0. Initialization | |
176 | // ---------------------- | |
177 | // | |
178 | // Z := X | |
179 | // N := unbaised exponent of Z | |
180 | // S_hi := 2^(-N) * Z | |
181 | // | |
182 | // Step 1. Argument Reduction | |
183 | // -------------------------- | |
184 | // | |
185 | // Let | |
186 | // | |
187 | // Z = 2^N * S_hi = 2^N * 1.d_1 d_2 d_3 ... d_63 | |
188 | // | |
189 | // We obtain G_1, G_2, G_3 by the following steps. | |
190 | // | |
191 | // | |
192 | // Define X_0 := 1.d_1 d_2 ... d_14. This is extracted | |
193 | // from S_hi. | |
194 | // | |
195 | // Define A_1 := 1.d_1 d_2 d_3 d_4. This is X_0 truncated | |
196 | // to lsb = 2^(-4). | |
197 | // | |
198 | // Define index_1 := [ d_1 d_2 d_3 d_4 ]. | |
199 | // | |
200 | // Fetch Z_1 := (1/A_1) rounded UP in fixed point with | |
201 | // fixed point lsb = 2^(-15). | |
202 | // Z_1 looks like z_0.z_1 z_2 ... z_15 | |
203 | // Note that the fetching is done using index_1. | |
204 | // A_1 is actually not needed in the implementation | |
205 | // and is used here only to explain how is the value | |
206 | // Z_1 defined. | |
207 | // | |
208 | // Fetch G_1 := (1/A_1) truncated to 21 sig. bits. | |
209 | // floating pt. Again, fetching is done using index_1. A_1 | |
210 | // explains how G_1 is defined. | |
211 | // | |
212 | // Calculate X_1 := X_0 * Z_1 truncated to lsb = 2^(-14) | |
213 | // = 1.0 0 0 0 d_5 ... d_14 | |
214 | // This is accomplised by integer multiplication. | |
215 | // It is proved that X_1 indeed always begin | |
216 | // with 1.0000 in fixed point. | |
217 | // | |
218 | // | |
219 | // Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1 | |
220 | // truncated to lsb = 2^(-8). Similar to A_1, | |
221 | // A_2 is not needed in actual implementation. It | |
222 | // helps explain how some of the values are defined. | |
223 | // | |
224 | // Define index_2 := [ d_5 d_6 d_7 d_8 ]. | |
225 | // | |
226 | // Fetch Z_2 := (1/A_2) rounded UP in fixed point with | |
227 | // fixed point lsb = 2^(-15). Fetch done using index_2. | |
228 | // Z_2 looks like z_0.z_1 z_2 ... z_15 | |
229 | // | |
230 | // Fetch G_2 := (1/A_2) truncated to 21 sig. bits. | |
231 | // floating pt. | |
232 | // | |
233 | // Calculate X_2 := X_1 * Z_2 truncated to lsb = 2^(-14) | |
234 | // = 1.0 0 0 0 0 0 0 0 d_9 d_10 ... d_14 | |
235 | // This is accomplised by integer multiplication. | |
236 | // It is proved that X_2 indeed always begin | |
237 | // with 1.00000000 in fixed point. | |
238 | // | |
239 | // | |
240 | // Define A_3 := 1.0 0 0 0 0 0 0 0 d_9 d_10 d_11 d_12 d_13 1. | |
241 | // This is 2^(-14) + X_2 truncated to lsb = 2^(-13). | |
242 | // | |
243 | // Define index_3 := [ d_9 d_10 d_11 d_12 d_13 ]. | |
244 | // | |
245 | // Fetch G_3 := (1/A_3) truncated to 21 sig. bits. | |
246 | // floating pt. Fetch is done using index_3. | |
247 | // | |
248 | // Compute G := G_1 * G_2 * G_3. | |
249 | // | |
250 | // This is done exactly since each of G_j only has 21 sig. bits. | |
251 | // | |
252 | // Compute | |
253 | // | |
254 | // r := (G*S_hi - 1) | |
255 | // | |
256 | // | |
257 | // Step 2. Approximation | |
258 | // --------------------- | |
259 | // | |
260 | // This step computes an approximation to logl( 1 + r ) where r is the | |
261 | // reduced argument just obtained. It is proved that |r| <= 1.9*2^(-13); | |
262 | // thus logl(1+r) can be approximated by a short polynomial: | |
263 | // | |
264 | // logl(1+r) ~=~ poly = r + Q1 r^2 + ... + Q4 r^5 | |
265 | // | |
266 | // | |
267 | // Step 3. Reconstruction | |
268 | // ---------------------- | |
269 | // | |
270 | // This step computes the desired result of logl(X): | |
271 | // | |
272 | // logl(X) = logl( 2^N * S_hi ) | |
273 | // = N*logl(2) + logl( S_hi ) | |
274 | // = N*logl(2) + logl(1/G) + | |
275 | // logl(1 + G*S_hi - 1 ) | |
276 | // | |
277 | // logl(2), logl(1/G_j) are stored as pairs of (single,double) numbers: | |
278 | // log2_hi, log2_lo, log1byGj_hi, log1byGj_lo. The high parts are | |
279 | // single-precision numbers and the low parts are double precision | |
280 | // numbers. These have the property that | |
281 | // | |
282 | // N*log2_hi + SUM ( log1byGj_hi ) | |
283 | // | |
284 | // is computable exactly in double-extended precision (64 sig. bits). | |
285 | // Finally | |
286 | // | |
287 | // Y_hi := N*log2_hi + SUM ( log1byGj_hi ) | |
288 | // Y_lo := poly_hi + [ poly_lo + | |
289 | // ( SUM ( log1byGj_lo ) + N*log2_lo ) ] | |
290 | // | |
291 | ||
292 | RODATA | |
293 | .align 64 | |
294 | ||
295 | // ************* DO NOT CHANGE THE ORDER OF THESE TABLES ************* | |
296 | ||
297 | // P_8, P_7, P_6, P_5, P_4, P_3, P_2, and P_1 | |
298 | ||
299 | LOCAL_OBJECT_START(Constants_P) | |
300 | data8 0xE3936754EFD62B15,0x00003FFB | |
301 | data8 0x8003B271A5E56381,0x0000BFFC | |
302 | data8 0x9249248C73282DB0,0x00003FFC | |
303 | data8 0xAAAAAA9F47305052,0x0000BFFC | |
304 | data8 0xCCCCCCCCCCD17FC9,0x00003FFC | |
305 | data8 0x8000000000067ED5,0x0000BFFD | |
306 | data8 0xAAAAAAAAAAAAAAAA,0x00003FFD | |
307 | data8 0xFFFFFFFFFFFFFFFE,0x0000BFFD | |
308 | LOCAL_OBJECT_END(Constants_P) | |
309 | ||
310 | // log2_hi, log2_lo, Q_4, Q_3, Q_2, and Q_1 | |
311 | ||
312 | LOCAL_OBJECT_START(Constants_Q) | |
313 | data8 0xB172180000000000,0x00003FFE | |
314 | data8 0x82E308654361C4C6,0x0000BFE2 | |
315 | data8 0xCCCCCAF2328833CB,0x00003FFC | |
316 | data8 0x80000077A9D4BAFB,0x0000BFFD | |
317 | data8 0xAAAAAAAAAAABE3D2,0x00003FFD | |
318 | data8 0xFFFFFFFFFFFFDAB7,0x0000BFFD | |
319 | LOCAL_OBJECT_END(Constants_Q) | |
320 | ||
321 | // 1/ln10_hi, 1/ln10_lo | |
322 | ||
323 | LOCAL_OBJECT_START(Constants_1_by_LN10) | |
324 | data8 0xDE5BD8A937287195,0x00003FFD | |
325 | data8 0xD56EAABEACCF70C8,0x00003FBB | |
326 | LOCAL_OBJECT_END(Constants_1_by_LN10) | |
327 | ||
328 | ||
329 | // Z1 - 16 bit fixed | |
330 | ||
331 | LOCAL_OBJECT_START(Constants_Z_1) | |
332 | data4 0x00008000 | |
333 | data4 0x00007879 | |
334 | data4 0x000071C8 | |
335 | data4 0x00006BCB | |
336 | data4 0x00006667 | |
337 | data4 0x00006187 | |
338 | data4 0x00005D18 | |
339 | data4 0x0000590C | |
340 | data4 0x00005556 | |
341 | data4 0x000051EC | |
342 | data4 0x00004EC5 | |
343 | data4 0x00004BDB | |
344 | data4 0x00004925 | |
345 | data4 0x0000469F | |
346 | data4 0x00004445 | |
347 | data4 0x00004211 | |
348 | LOCAL_OBJECT_END(Constants_Z_1) | |
349 | ||
350 | // G1 and H1 - IEEE single and h1 - IEEE double | |
351 | ||
352 | LOCAL_OBJECT_START(Constants_G_H_h1) | |
353 | data4 0x3F800000,0x00000000 | |
354 | data8 0x0000000000000000 | |
355 | data4 0x3F70F0F0,0x3D785196 | |
356 | data8 0x3DA163A6617D741C | |
357 | data4 0x3F638E38,0x3DF13843 | |
358 | data8 0x3E2C55E6CBD3D5BB | |
359 | data4 0x3F579430,0x3E2FF9A0 | |
360 | data8 0xBE3EB0BFD86EA5E7 | |
361 | data4 0x3F4CCCC8,0x3E647FD6 | |
362 | data8 0x3E2E6A8C86B12760 | |
363 | data4 0x3F430C30,0x3E8B3AE7 | |
364 | data8 0x3E47574C5C0739BA | |
365 | data4 0x3F3A2E88,0x3EA30C68 | |
366 | data8 0x3E20E30F13E8AF2F | |
367 | data4 0x3F321640,0x3EB9CEC8 | |
368 | data8 0xBE42885BF2C630BD | |
369 | data4 0x3F2AAAA8,0x3ECF9927 | |
370 | data8 0x3E497F3497E577C6 | |
371 | data4 0x3F23D708,0x3EE47FC5 | |
372 | data8 0x3E3E6A6EA6B0A5AB | |
373 | data4 0x3F1D89D8,0x3EF8947D | |
374 | data8 0xBDF43E3CD328D9BE | |
375 | data4 0x3F17B420,0x3F05F3A1 | |
376 | data8 0x3E4094C30ADB090A | |
377 | data4 0x3F124920,0x3F0F4303 | |
378 | data8 0xBE28FBB2FC1FE510 | |
379 | data4 0x3F0D3DC8,0x3F183EBF | |
380 | data8 0x3E3A789510FDE3FA | |
381 | data4 0x3F088888,0x3F20EC80 | |
382 | data8 0x3E508CE57CC8C98F | |
383 | data4 0x3F042108,0x3F29516A | |
384 | data8 0xBE534874A223106C | |
385 | LOCAL_OBJECT_END(Constants_G_H_h1) | |
386 | ||
387 | // Z2 - 16 bit fixed | |
388 | ||
389 | LOCAL_OBJECT_START(Constants_Z_2) | |
390 | data4 0x00008000 | |
391 | data4 0x00007F81 | |
392 | data4 0x00007F02 | |
393 | data4 0x00007E85 | |
394 | data4 0x00007E08 | |
395 | data4 0x00007D8D | |
396 | data4 0x00007D12 | |
397 | data4 0x00007C98 | |
398 | data4 0x00007C20 | |
399 | data4 0x00007BA8 | |
400 | data4 0x00007B31 | |
401 | data4 0x00007ABB | |
402 | data4 0x00007A45 | |
403 | data4 0x000079D1 | |
404 | data4 0x0000795D | |
405 | data4 0x000078EB | |
406 | LOCAL_OBJECT_END(Constants_Z_2) | |
407 | ||
408 | // G2 and H2 - IEEE single and h2 - IEEE double | |
409 | ||
410 | LOCAL_OBJECT_START(Constants_G_H_h2) | |
411 | data4 0x3F800000,0x00000000 | |
412 | data8 0x0000000000000000 | |
413 | data4 0x3F7F00F8,0x3B7F875D | |
414 | data8 0x3DB5A11622C42273 | |
415 | data4 0x3F7E03F8,0x3BFF015B | |
416 | data8 0x3DE620CF21F86ED3 | |
417 | data4 0x3F7D08E0,0x3C3EE393 | |
418 | data8 0xBDAFA07E484F34ED | |
419 | data4 0x3F7C0FC0,0x3C7E0586 | |
420 | data8 0xBDFE07F03860BCF6 | |
421 | data4 0x3F7B1880,0x3C9E75D2 | |
422 | data8 0x3DEA370FA78093D6 | |
423 | data4 0x3F7A2328,0x3CBDC97A | |
424 | data8 0x3DFF579172A753D0 | |
425 | data4 0x3F792FB0,0x3CDCFE47 | |
426 | data8 0x3DFEBE6CA7EF896B | |
427 | data4 0x3F783E08,0x3CFC15D0 | |
428 | data8 0x3E0CF156409ECB43 | |
429 | data4 0x3F774E38,0x3D0D874D | |
430 | data8 0xBE0B6F97FFEF71DF | |
431 | data4 0x3F766038,0x3D1CF49B | |
432 | data8 0xBE0804835D59EEE8 | |
433 | data4 0x3F757400,0x3D2C531D | |
434 | data8 0x3E1F91E9A9192A74 | |
435 | data4 0x3F748988,0x3D3BA322 | |
436 | data8 0xBE139A06BF72A8CD | |
437 | data4 0x3F73A0D0,0x3D4AE46F | |
438 | data8 0x3E1D9202F8FBA6CF | |
439 | data4 0x3F72B9D0,0x3D5A1756 | |
440 | data8 0xBE1DCCC4BA796223 | |
441 | data4 0x3F71D488,0x3D693B9D | |
442 | data8 0xBE049391B6B7C239 | |
443 | LOCAL_OBJECT_END(Constants_G_H_h2) | |
444 | ||
445 | // G3 and H3 - IEEE single and h3 - IEEE double | |
446 | ||
447 | LOCAL_OBJECT_START(Constants_G_H_h3) | |
448 | data4 0x3F7FFC00,0x38800100 | |
449 | data8 0x3D355595562224CD | |
450 | data4 0x3F7FF400,0x39400480 | |
451 | data8 0x3D8200A206136FF6 | |
452 | data4 0x3F7FEC00,0x39A00640 | |
453 | data8 0x3DA4D68DE8DE9AF0 | |
454 | data4 0x3F7FE400,0x39E00C41 | |
455 | data8 0xBD8B4291B10238DC | |
456 | data4 0x3F7FDC00,0x3A100A21 | |
457 | data8 0xBD89CCB83B1952CA | |
458 | data4 0x3F7FD400,0x3A300F22 | |
459 | data8 0xBDB107071DC46826 | |
460 | data4 0x3F7FCC08,0x3A4FF51C | |
461 | data8 0x3DB6FCB9F43307DB | |
462 | data4 0x3F7FC408,0x3A6FFC1D | |
463 | data8 0xBD9B7C4762DC7872 | |
464 | data4 0x3F7FBC10,0x3A87F20B | |
465 | data8 0xBDC3725E3F89154A | |
466 | data4 0x3F7FB410,0x3A97F68B | |
467 | data8 0xBD93519D62B9D392 | |
468 | data4 0x3F7FAC18,0x3AA7EB86 | |
469 | data8 0x3DC184410F21BD9D | |
470 | data4 0x3F7FA420,0x3AB7E101 | |
471 | data8 0xBDA64B952245E0A6 | |
472 | data4 0x3F7F9C20,0x3AC7E701 | |
473 | data8 0x3DB4B0ECAABB34B8 | |
474 | data4 0x3F7F9428,0x3AD7DD7B | |
475 | data8 0x3D9923376DC40A7E | |
476 | data4 0x3F7F8C30,0x3AE7D474 | |
477 | data8 0x3DC6E17B4F2083D3 | |
478 | data4 0x3F7F8438,0x3AF7CBED | |
479 | data8 0x3DAE314B811D4394 | |
480 | data4 0x3F7F7C40,0x3B03E1F3 | |
481 | data8 0xBDD46F21B08F2DB1 | |
482 | data4 0x3F7F7448,0x3B0BDE2F | |
483 | data8 0xBDDC30A46D34522B | |
484 | data4 0x3F7F6C50,0x3B13DAAA | |
485 | data8 0x3DCB0070B1F473DB | |
486 | data4 0x3F7F6458,0x3B1BD766 | |
487 | data8 0xBDD65DDC6AD282FD | |
488 | data4 0x3F7F5C68,0x3B23CC5C | |
489 | data8 0xBDCDAB83F153761A | |
490 | data4 0x3F7F5470,0x3B2BC997 | |
491 | data8 0xBDDADA40341D0F8F | |
492 | data4 0x3F7F4C78,0x3B33C711 | |
493 | data8 0x3DCD1BD7EBC394E8 | |
494 | data4 0x3F7F4488,0x3B3BBCC6 | |
495 | data8 0xBDC3532B52E3E695 | |
496 | data4 0x3F7F3C90,0x3B43BAC0 | |
497 | data8 0xBDA3961EE846B3DE | |
498 | data4 0x3F7F34A0,0x3B4BB0F4 | |
499 | data8 0xBDDADF06785778D4 | |
500 | data4 0x3F7F2CA8,0x3B53AF6D | |
501 | data8 0x3DCC3ED1E55CE212 | |
502 | data4 0x3F7F24B8,0x3B5BA620 | |
503 | data8 0xBDBA31039E382C15 | |
504 | data4 0x3F7F1CC8,0x3B639D12 | |
505 | data8 0x3D635A0B5C5AF197 | |
506 | data4 0x3F7F14D8,0x3B6B9444 | |
507 | data8 0xBDDCCB1971D34EFC | |
508 | data4 0x3F7F0CE0,0x3B7393BC | |
509 | data8 0x3DC7450252CD7ADA | |
510 | data4 0x3F7F04F0,0x3B7B8B6D | |
511 | data8 0xBDB68F177D7F2A42 | |
512 | LOCAL_OBJECT_END(Constants_G_H_h3) | |
513 | ||
514 | ||
515 | // Floating Point Registers | |
516 | ||
517 | FR_Input_X = f8 | |
518 | ||
519 | FR_Y_hi = f34 | |
520 | FR_Y_lo = f35 | |
521 | ||
522 | FR_Scale = f36 | |
523 | FR_X_Prime = f37 | |
524 | FR_S_hi = f38 | |
525 | FR_W = f39 | |
526 | FR_G = f40 | |
527 | ||
528 | FR_H = f41 | |
529 | FR_wsq = f42 | |
530 | FR_w4 = f43 | |
531 | FR_h = f44 | |
532 | FR_w6 = f45 | |
533 | ||
534 | FR_G2 = f46 | |
535 | FR_H2 = f47 | |
536 | FR_poly_lo = f48 | |
537 | FR_P8 = f49 | |
538 | FR_poly_hi = f50 | |
539 | ||
540 | FR_P7 = f51 | |
541 | FR_h2 = f52 | |
542 | FR_rsq = f53 | |
543 | FR_P6 = f54 | |
544 | FR_r = f55 | |
545 | ||
546 | FR_log2_hi = f56 | |
547 | FR_log2_lo = f57 | |
548 | FR_p87 = f58 | |
549 | FR_p876 = f58 | |
550 | FR_p8765 = f58 | |
551 | FR_float_N = f59 | |
552 | FR_Q4 = f60 | |
553 | ||
554 | FR_p43 = f61 | |
555 | FR_p432 = f61 | |
556 | FR_p4321 = f61 | |
557 | FR_P4 = f62 | |
558 | FR_G3 = f63 | |
559 | FR_H3 = f64 | |
560 | FR_h3 = f65 | |
561 | ||
562 | FR_Q3 = f66 | |
563 | FR_P3 = f67 | |
564 | FR_Q2 = f68 | |
565 | FR_P2 = f69 | |
566 | FR_1LN10_hi = f70 | |
567 | ||
568 | FR_Q1 = f71 | |
569 | FR_P1 = f72 | |
570 | FR_1LN10_lo = f73 | |
571 | FR_P5 = f74 | |
572 | FR_rcub = f75 | |
573 | ||
574 | FR_Output_X_tmp = f76 | |
575 | ||
576 | FR_X = f8 | |
577 | FR_Y = f0 | |
578 | FR_RESULT = f76 | |
579 | ||
580 | ||
581 | // General Purpose Registers | |
582 | ||
583 | GR_ad_p = r33 | |
584 | GR_Index1 = r34 | |
585 | GR_Index2 = r35 | |
586 | GR_signif = r36 | |
587 | GR_X_0 = r37 | |
588 | GR_X_1 = r38 | |
589 | GR_X_2 = r39 | |
590 | GR_Z_1 = r40 | |
591 | GR_Z_2 = r41 | |
592 | GR_N = r42 | |
593 | GR_Bias = r43 | |
594 | GR_M = r44 | |
595 | GR_Index3 = r45 | |
596 | GR_ad_p2 = r46 | |
597 | GR_exp_mask = r47 | |
598 | GR_exp_2tom7 = r48 | |
599 | GR_ad_ln10 = r49 | |
600 | GR_ad_tbl_1 = r50 | |
601 | GR_ad_tbl_2 = r51 | |
602 | GR_ad_tbl_3 = r52 | |
603 | GR_ad_q = r53 | |
604 | GR_ad_z_1 = r54 | |
605 | GR_ad_z_2 = r55 | |
606 | GR_ad_z_3 = r56 | |
607 | ||
608 | // | |
609 | // Added for unwind support | |
610 | // | |
611 | ||
612 | GR_SAVE_PFS = r50 | |
613 | GR_SAVE_B0 = r51 | |
614 | GR_SAVE_GP = r52 | |
615 | GR_Parameter_X = r53 | |
616 | GR_Parameter_Y = r54 | |
617 | GR_Parameter_RESULT = r55 | |
618 | GR_Parameter_TAG = r56 | |
619 | ||
620 | .section .text | |
621 | ||
622 | GLOBAL_IEEE754_ENTRY(logl) | |
623 | { .mfi | |
624 | alloc r32 = ar.pfs,0,21,4,0 | |
625 | fclass.m p6, p0 = FR_Input_X, 0x1E3 // Test for natval, nan, inf | |
626 | cmp.eq p7, p14 = r0, r0 // Set p7 if logl | |
627 | } | |
628 | { .mfb | |
629 | addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp | |
630 | fnorm.s1 FR_X_Prime = FR_Input_X // Normalize x | |
631 | br.cond.sptk LOGL_BEGIN | |
632 | } | |
633 | ;; | |
634 | ||
635 | GLOBAL_IEEE754_END(logl) | |
636 | ||
637 | ||
638 | GLOBAL_IEEE754_ENTRY(log10l) | |
639 | { .mfi | |
640 | alloc r32 = ar.pfs,0,21,4,0 | |
641 | fclass.m p6, p0 = FR_Input_X, 0x1E3 // Test for natval, nan, inf | |
642 | cmp.ne p7, p14 = r0, r0 // Set p14 if log10l | |
643 | } | |
644 | { .mfb | |
645 | addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp | |
646 | fnorm.s1 FR_X_Prime = FR_Input_X // Normalize x | |
647 | nop.b 999 | |
648 | } | |
649 | ;; | |
650 | ||
651 | ||
652 | // Common code for logl and log10 | |
653 | LOGL_BEGIN: | |
654 | { .mfi | |
655 | ld8 GR_ad_z_1 = [GR_ad_z_1] // Get pointer to Constants_Z_1 | |
656 | fclass.m p10, p0 = FR_Input_X, 0x0b // Test for denormal | |
657 | mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7 | |
658 | } | |
659 | ;; | |
660 | ||
661 | { .mfb | |
662 | getf.sig GR_signif = FR_Input_X // Get significand of x | |
663 | fcmp.eq.s1 p9, p0 = FR_Input_X, f1 // Test for x=1.0 | |
664 | (p6) br.cond.spnt LOGL_64_special // Branch for nan, inf, natval | |
665 | } | |
666 | ;; | |
667 | ||
668 | { .mfi | |
669 | add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1 | |
670 | fcmp.lt.s1 p13, p0 = FR_Input_X, f0 // Test for x<0 | |
671 | add GR_ad_p = -0x100, GR_ad_z_1 // Point to Constants_P | |
672 | } | |
673 | { .mib | |
674 | add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2 | |
675 | add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2 | |
676 | (p10) br.cond.spnt LOGL_64_denormal // Branch for denormal | |
677 | } | |
678 | ;; | |
679 | ||
680 | LOGL_64_COMMON: | |
681 | { .mfi | |
682 | add GR_ad_q = 0x080, GR_ad_p // Point to Constants_Q | |
683 | fcmp.eq.s1 p8, p0 = FR_Input_X, f0 // Test for x=0 | |
684 | extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif | |
685 | } | |
686 | { .mfb | |
687 | add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3 | |
688 | (p9) fma.s0 f8 = FR_Input_X, f0, f0 // If x=1, return +0.0 | |
689 | (p9) br.ret.spnt b0 // Exit if x=1 | |
690 | } | |
691 | ;; | |
692 | ||
693 | { .mfi | |
694 | shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1 | |
695 | fclass.nm p10, p0 = FR_Input_X, 0x1FF // Test for unsupported | |
696 | extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of significand | |
697 | } | |
698 | { .mfi | |
699 | ldfe FR_P8 = [GR_ad_p],16 // Load P_8 for near1 path | |
700 | fsub.s1 FR_W = FR_X_Prime, f1 // W = x - 1 | |
701 | add GR_ad_ln10 = 0x060, GR_ad_q // Point to Constants_1_by_LN10 | |
702 | } | |
703 | ;; | |
704 | ||
705 | { .mfi | |
706 | ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1 | |
707 | nop.f 999 | |
708 | mov GR_exp_mask = 0x1FFFF // Create exponent mask | |
709 | } | |
710 | { .mib | |
711 | shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1 | |
712 | mov GR_Bias = 0x0FFFF // Create exponent bias | |
713 | (p13) br.cond.spnt LOGL_64_negative // Branch if x<0 | |
714 | } | |
715 | ;; | |
716 | ||
717 | { .mfb | |
718 | ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1 | |
719 | fmerge.se FR_S_hi = f1,FR_X_Prime // Form |x| | |
720 | (p8) br.cond.spnt LOGL_64_zero // Branch if x=0 | |
721 | } | |
722 | ;; | |
723 | ||
724 | { .mmb | |
725 | getf.exp GR_N = FR_X_Prime // Get N = exponent of x | |
726 | ldfd FR_h = [GR_ad_tbl_1] // Load h_1 | |
727 | (p10) br.cond.spnt LOGL_64_unsupported // Branch for unsupported type | |
728 | } | |
729 | ;; | |
730 | ||
731 | { .mfi | |
732 | ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi | |
733 | fcmp.eq.s0 p8, p0 = FR_Input_X, f0 // Dummy op to flag denormals | |
734 | pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1 | |
735 | } | |
736 | ;; | |
737 | ||
738 | // | |
739 | // For performance, don't use result of pmpyshr2.u for 4 cycles. | |
740 | // | |
741 | { .mmi | |
742 | ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo | |
743 | (p14) ldfe FR_1LN10_hi = [GR_ad_ln10],16 // If log10l, load 1/ln10_hi | |
744 | sub GR_N = GR_N, GR_Bias | |
745 | } | |
746 | ;; | |
747 | ||
748 | { .mmi | |
749 | ldfe FR_Q4 = [GR_ad_q],16 // Load Q4 | |
750 | (p14) ldfe FR_1LN10_lo = [GR_ad_ln10] // If log10l, load 1/ln10_lo | |
751 | nop.i 999 | |
752 | } | |
753 | ;; | |
754 | ||
755 | { .mmi | |
756 | ldfe FR_Q3 = [GR_ad_q],16 // Load Q3 | |
757 | setf.sig FR_float_N = GR_N // Put integer N into rightmost significand | |
758 | nop.i 999 | |
759 | } | |
760 | ;; | |
761 | ||
762 | { .mmi | |
763 | getf.exp GR_M = FR_W // Get signexp of w = x - 1 | |
764 | ldfe FR_Q2 = [GR_ad_q],16 // Load Q2 | |
765 | extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1 | |
766 | } | |
767 | ;; | |
768 | ||
769 | { .mmi | |
770 | ldfe FR_Q1 = [GR_ad_q] // Load Q1 | |
771 | shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2 | |
772 | add GR_ad_p2 = 0x30,GR_ad_p // Point to P_4 | |
773 | } | |
774 | ;; | |
775 | ||
776 | { .mmi | |
777 | ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2 | |
778 | shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2 | |
779 | and GR_M = GR_exp_mask, GR_M // Get exponent of w = x - 1 | |
780 | } | |
781 | ;; | |
782 | ||
783 | { .mmi | |
784 | ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2 | |
785 | cmp.lt p8, p9 = GR_M, GR_exp_2tom7 // Test |x-1| < 2^-7 | |
786 | nop.i 999 | |
787 | } | |
788 | ;; | |
789 | ||
790 | // Paths are merged. | |
791 | // p8 is for the near1 path: |x-1| < 2^-7 | |
792 | // p9 is for regular path: |x-1| >= 2^-7 | |
793 | ||
794 | { .mmi | |
795 | ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2 | |
796 | nop.m 999 | |
797 | nop.i 999 | |
798 | } | |
799 | ;; | |
800 | ||
801 | { .mmi | |
802 | (p8) ldfe FR_P7 = [GR_ad_p],16 // Load P_7 for near1 path | |
803 | (p8) ldfe FR_P4 = [GR_ad_p2],16 // Load P_4 for near1 path | |
804 | (p9) pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2 | |
805 | } | |
806 | ;; | |
807 | ||
808 | // | |
809 | // For performance, don't use result of pmpyshr2.u for 4 cycles. | |
810 | // | |
811 | { .mmi | |
812 | (p8) ldfe FR_P6 = [GR_ad_p],16 // Load P_6 for near1 path | |
813 | (p8) ldfe FR_P3 = [GR_ad_p2],16 // Load P_3 for near1 path | |
814 | nop.i 999 | |
815 | } | |
816 | ;; | |
817 | ||
818 | { .mmf | |
819 | (p8) ldfe FR_P5 = [GR_ad_p],16 // Load P_5 for near1 path | |
820 | (p8) ldfe FR_P2 = [GR_ad_p2],16 // Load P_2 for near1 path | |
821 | (p8) fmpy.s1 FR_wsq = FR_W, FR_W // wsq = w * w for near1 path | |
822 | } | |
823 | ;; | |
824 | ||
825 | { .mmi | |
826 | (p8) ldfe FR_P1 = [GR_ad_p2],16 ;; // Load P_1 for near1 path | |
827 | nop.m 999 | |
828 | (p9) extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2 | |
829 | } | |
830 | ;; | |
831 | ||
832 | { .mfi | |
833 | (p9) shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3 | |
834 | (p9) fcvt.xf FR_float_N = FR_float_N | |
835 | nop.i 999 | |
836 | } | |
837 | ;; | |
838 | ||
839 | { .mfi | |
840 | (p9) ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3 | |
841 | nop.f 999 | |
842 | nop.i 999 | |
843 | } | |
844 | ;; | |
845 | ||
846 | { .mfi | |
847 | (p9) ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3 | |
848 | (p9) fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2 | |
849 | nop.i 999 | |
850 | } | |
851 | { .mfi | |
852 | nop.m 999 | |
853 | (p9) fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2 | |
854 | nop.i 999 | |
855 | } | |
856 | ;; | |
857 | ||
858 | { .mmf | |
859 | nop.m 999 | |
860 | nop.m 999 | |
861 | (p9) fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2 | |
862 | } | |
863 | ;; | |
864 | ||
865 | { .mfi | |
866 | nop.m 999 | |
867 | (p8) fmpy.s1 FR_w4 = FR_wsq, FR_wsq // w4 = w^4 for near1 path | |
868 | nop.i 999 | |
869 | } | |
870 | { .mfi | |
871 | nop.m 999 | |
872 | (p8) fma.s1 FR_p87 = FR_W, FR_P8, FR_P7 // p87 = w * P8 + P7 | |
873 | nop.i 999 | |
874 | } | |
875 | ;; | |
876 | ||
877 | { .mfi | |
878 | nop.m 999 | |
879 | (p8) fma.s1 FR_p43 = FR_W, FR_P4, FR_P3 // p43 = w * P4 + P3 | |
880 | nop.i 999 | |
881 | } | |
882 | ;; | |
883 | ||
884 | { .mfi | |
885 | nop.m 999 | |
886 | (p9) fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3 | |
887 | nop.i 999 | |
888 | } | |
889 | { .mfi | |
890 | nop.m 999 | |
891 | (p9) fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3 | |
892 | nop.i 999 | |
893 | } | |
894 | ;; | |
895 | ||
896 | { .mfi | |
897 | nop.m 999 | |
898 | (p9) fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3 | |
899 | nop.i 999 | |
900 | } | |
901 | { .mfi | |
902 | nop.m 999 | |
903 | (p8) fmpy.s1 FR_w6 = FR_w4, FR_wsq // w6 = w^6 for near1 path | |
904 | nop.i 999 | |
905 | } | |
906 | ;; | |
907 | ||
908 | { .mfi | |
909 | nop.m 999 | |
910 | (p8) fma.s1 FR_p432 = FR_W, FR_p43, FR_P2 // p432 = w * p43 + P2 | |
911 | nop.i 999 | |
912 | } | |
913 | { .mfi | |
914 | nop.m 999 | |
915 | (p8) fma.s1 FR_p876 = FR_W, FR_p87, FR_P6 // p876 = w * p87 + P6 | |
916 | nop.i 999 | |
917 | } | |
918 | ;; | |
919 | ||
920 | { .mfi | |
921 | nop.m 999 | |
922 | (p9) fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1 | |
923 | nop.i 999 | |
924 | } | |
925 | { .mfi | |
926 | nop.m 999 | |
927 | (p9) fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi = N * log2_hi + H | |
928 | nop.i 999 | |
929 | } | |
930 | ;; | |
931 | ||
932 | { .mfi | |
933 | nop.m 999 | |
934 | (p9) fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h = N * log2_lo + h | |
935 | nop.i 999 | |
936 | } | |
937 | ;; | |
938 | ||
939 | { .mfi | |
940 | nop.m 999 | |
941 | (p8) fma.s1 FR_p4321 = FR_W, FR_p432, FR_P1 // p4321 = w * p432 + P1 | |
942 | nop.i 999 | |
943 | } | |
944 | { .mfi | |
945 | nop.m 999 | |
946 | (p8) fma.s1 FR_p8765 = FR_W, FR_p876, FR_P5 // p8765 = w * p876 + P5 | |
947 | nop.i 999 | |
948 | } | |
949 | ;; | |
950 | ||
951 | { .mfi | |
952 | nop.m 999 | |
953 | (p9) fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3 | |
954 | nop.i 999 | |
955 | } | |
956 | { .mfi | |
957 | nop.m 999 | |
958 | (p9) fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r | |
959 | nop.i 999 | |
960 | } | |
961 | ;; | |
962 | ||
963 | { .mfi | |
964 | nop.m 999 | |
965 | (p8) fma.s1 FR_Y_lo = FR_wsq, FR_p4321, f0 // Y_lo = wsq * p4321 | |
966 | nop.i 999 | |
967 | } | |
968 | { .mfi | |
969 | nop.m 999 | |
970 | (p8) fma.s1 FR_Y_hi = FR_W, f1, f0 // Y_hi = w for near1 path | |
971 | nop.i 999 | |
972 | } | |
973 | ;; | |
974 | ||
975 | { .mfi | |
976 | nop.m 999 | |
977 | (p9) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo = poly_lo * r + Q2 | |
978 | nop.i 999 | |
979 | } | |
980 | { .mfi | |
981 | nop.m 999 | |
982 | (p9) fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3 | |
983 | nop.i 999 | |
984 | } | |
985 | ;; | |
986 | ||
987 | { .mfi | |
988 | nop.m 999 | |
989 | (p8) fma.s1 FR_Y_lo = FR_w6, FR_p8765,FR_Y_lo // Y_lo = w6 * p8765 + w2 * p4321 | |
990 | nop.i 999 | |
991 | } | |
992 | ;; | |
993 | ||
994 | { .mfi | |
995 | nop.m 999 | |
996 | (p9) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1 * rsq + r | |
997 | nop.i 999 | |
998 | } | |
999 | ;; | |
1000 | ||
1001 | { .mfi | |
1002 | nop.m 999 | |
1003 | (p9) fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h // poly_lo = poly_lo*r^3 + h | |
1004 | nop.i 999 | |
1005 | } | |
1006 | ;; | |
1007 | ||
1008 | { .mfi | |
1009 | nop.m 999 | |
1010 | (p9) fadd.s1 FR_Y_lo = FR_poly_hi, FR_poly_lo // Y_lo = poly_hi + poly_lo | |
1011 | nop.i 999 | |
1012 | } | |
1013 | ;; | |
1014 | ||
1015 | // Remainder of code is common for near1 and regular paths | |
1016 | { .mfi | |
1017 | nop.m 999 | |
1018 | (p7) fadd.s0 f8 = FR_Y_lo,FR_Y_hi // If logl, result=Y_lo+Y_hi | |
1019 | nop.i 999 | |
1020 | } | |
1021 | { .mfi | |
1022 | nop.m 999 | |
1023 | (p14) fmpy.s1 FR_Output_X_tmp = FR_Y_lo,FR_1LN10_hi | |
1024 | nop.i 999 | |
1025 | } | |
1026 | ;; | |
1027 | ||
1028 | { .mfi | |
1029 | nop.m 999 | |
1030 | (p14) fma.s1 FR_Output_X_tmp = FR_Y_hi,FR_1LN10_lo,FR_Output_X_tmp | |
1031 | nop.i 999 | |
1032 | } | |
1033 | ;; | |
1034 | ||
1035 | { .mfb | |
1036 | nop.m 999 | |
1037 | (p14) fma.s0 f8 = FR_Y_hi,FR_1LN10_hi,FR_Output_X_tmp | |
1038 | br.ret.sptk b0 // Common exit for 0 < x < inf | |
1039 | } | |
1040 | ;; | |
1041 | ||
1042 | ||
1043 | // Here if x=+-0 | |
1044 | LOGL_64_zero: | |
1045 | // | |
1046 | // If x=+-0 raise divide by zero and return -inf | |
1047 | // | |
1048 | { .mfi | |
1049 | (p7) mov GR_Parameter_TAG = 0 | |
1050 | fsub.s1 FR_Output_X_tmp = f0, f1 | |
1051 | nop.i 999 | |
1052 | } | |
1053 | ;; | |
1054 | ||
1055 | { .mfb | |
1056 | (p14) mov GR_Parameter_TAG = 6 | |
1057 | frcpa.s0 FR_Output_X_tmp, p8 = FR_Output_X_tmp, f0 | |
1058 | br.cond.sptk __libm_error_region | |
1059 | } | |
1060 | ;; | |
1061 | ||
1062 | LOGL_64_special: | |
1063 | { .mfi | |
1064 | nop.m 999 | |
1065 | fclass.m.unc p8, p0 = FR_Input_X, 0x1E1 // Test for natval, nan, +inf | |
1066 | nop.i 999 | |
1067 | } | |
1068 | ;; | |
1069 | ||
1070 | // | |
1071 | // For SNaN raise invalid and return QNaN. | |
1072 | // For QNaN raise invalid and return QNaN. | |
1073 | // For +Inf return +Inf. | |
1074 | // | |
1075 | { .mfb | |
1076 | nop.m 999 | |
1077 | (p8) fmpy.s0 f8 = FR_Input_X, f1 | |
1078 | (p8) br.ret.sptk b0 // Return for natval, nan, +inf | |
1079 | } | |
1080 | ;; | |
1081 | ||
1082 | // | |
1083 | // For -Inf raise invalid and return QNaN. | |
1084 | // | |
1085 | { .mmi | |
1086 | (p7) mov GR_Parameter_TAG = 1 | |
1087 | nop.m 999 | |
1088 | nop.i 999 | |
1089 | } | |
1090 | ;; | |
1091 | ||
1092 | { .mfb | |
1093 | (p14) mov GR_Parameter_TAG = 7 | |
1094 | fmpy.s0 FR_Output_X_tmp = FR_Input_X, f0 | |
1095 | br.cond.sptk __libm_error_region | |
1096 | } | |
1097 | ;; | |
1098 | ||
1099 | // Here if x denormal or unnormal | |
1100 | LOGL_64_denormal: | |
1101 | { .mmi | |
1102 | getf.sig GR_signif = FR_X_Prime // Get significand of normalized input | |
1103 | nop.m 999 | |
1104 | nop.i 999 | |
1105 | } | |
1106 | ;; | |
1107 | ||
1108 | { .mmb | |
1109 | getf.exp GR_N = FR_X_Prime // Get exponent of normalized input | |
1110 | nop.m 999 | |
1111 | br.cond.sptk LOGL_64_COMMON // Branch back to common code | |
1112 | } | |
1113 | ;; | |
1114 | ||
1115 | LOGL_64_unsupported: | |
1116 | // | |
1117 | // Return generated NaN or other value. | |
1118 | // | |
1119 | { .mfb | |
1120 | nop.m 999 | |
1121 | fmpy.s0 f8 = FR_Input_X, f0 | |
1122 | br.ret.sptk b0 | |
1123 | } | |
1124 | ;; | |
1125 | ||
1126 | // Here if -inf < x < 0 | |
1127 | LOGL_64_negative: | |
1128 | // | |
1129 | // Deal with x < 0 in a special way - raise | |
1130 | // invalid and produce QNaN indefinite. | |
1131 | // | |
1132 | { .mfi | |
1133 | (p7) mov GR_Parameter_TAG = 1 | |
1134 | frcpa.s0 FR_Output_X_tmp, p8 = f0, f0 | |
1135 | nop.i 999 | |
1136 | } | |
1137 | ;; | |
1138 | ||
1139 | { .mib | |
1140 | (p14) mov GR_Parameter_TAG = 7 | |
1141 | nop.i 999 | |
1142 | br.cond.sptk __libm_error_region | |
1143 | } | |
1144 | ;; | |
1145 | ||
1146 | ||
1147 | GLOBAL_IEEE754_END(log10l) | |
1148 | ||
1149 | LOCAL_LIBM_ENTRY(__libm_error_region) | |
1150 | .prologue | |
1151 | { .mfi | |
1152 | add GR_Parameter_Y=-32,sp // Parameter 2 value | |
1153 | nop.f 0 | |
1154 | .save ar.pfs,GR_SAVE_PFS | |
1155 | mov GR_SAVE_PFS=ar.pfs // Save ar.pfs | |
1156 | } | |
1157 | { .mfi | |
1158 | .fframe 64 | |
1159 | add sp=-64,sp // Create new stack | |
1160 | nop.f 0 | |
1161 | mov GR_SAVE_GP=gp // Save gp | |
1162 | };; | |
1163 | { .mmi | |
1164 | stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack | |
1165 | add GR_Parameter_X = 16,sp // Parameter 1 address | |
1166 | .save b0, GR_SAVE_B0 | |
1167 | mov GR_SAVE_B0=b0 // Save b0 | |
1168 | };; | |
1169 | .body | |
1170 | { .mib | |
1171 | stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack | |
1172 | add GR_Parameter_RESULT = 0,GR_Parameter_Y | |
1173 | nop.b 0 // Parameter 3 address | |
1174 | } | |
1175 | { .mib | |
1176 | stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack | |
1177 | add GR_Parameter_Y = -16,GR_Parameter_Y | |
1178 | br.call.sptk b0=__libm_error_support# // Call error handling function | |
1179 | };; | |
1180 | { .mmi | |
1181 | nop.m 999 | |
1182 | nop.m 999 | |
1183 | add GR_Parameter_RESULT = 48,sp | |
1184 | };; | |
1185 | { .mmi | |
1186 | ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack | |
1187 | .restore sp | |
1188 | add sp = 64,sp // Restore stack pointer | |
1189 | mov b0 = GR_SAVE_B0 // Restore return address | |
1190 | };; | |
1191 | { .mib | |
1192 | mov gp = GR_SAVE_GP // Restore gp | |
1193 | mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs | |
1194 | br.ret.sptk b0 // Return | |
1195 | };; | |
1196 | ||
1197 | LOCAL_LIBM_END(__libm_error_region#) | |
1198 | ||
1199 | .type __libm_error_support#,@function | |
1200 | .global __libm_error_support# |