]>
Commit | Line | Data |
---|---|---|
d5efd131 MF |
1 | .file "asinh.s" |
2 | ||
3 | ||
4 | // Copyright (c) 2000 - 2005, Intel Corporation | |
5 | // All rights reserved. | |
6 | // | |
d5efd131 MF |
7 | // |
8 | // Redistribution and use in source and binary forms, with or without | |
9 | // modification, are permitted provided that the following conditions are | |
10 | // met: | |
11 | // | |
12 | // * Redistributions of source code must retain the above copyright | |
13 | // notice, this list of conditions and the following disclaimer. | |
14 | // | |
15 | // * Redistributions in binary form must reproduce the above copyright | |
16 | // notice, this list of conditions and the following disclaimer in the | |
17 | // documentation and/or other materials provided with the distribution. | |
18 | // | |
19 | // * The name of Intel Corporation may not be used to endorse or promote | |
20 | // products derived from this software without specific prior written | |
21 | // permission. | |
22 | ||
23 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
24 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
25 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
26 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS | |
27 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
28 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
29 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
30 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
31 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING | |
32 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
33 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
34 | // | |
35 | // Intel Corporation is the author of this code, and requests that all | |
36 | // problem reports or change requests be submitted to it directly at | |
37 | // http://www.intel.com/software/products/opensource/libraries/num.htm. | |
38 | // | |
39 | // ============================================================== | |
40 | // History | |
41 | // ============================================================== | |
42 | // 04/02/01 Initial version | |
43 | // 04/19/01 Improved speed of the paths #1,2,3,4,5 | |
44 | // 10/18/01 Improved accuracy | |
45 | // 05/20/02 Cleaned up namespace and sf0 syntax | |
46 | // 02/06/03 Reordered header: .section, .global, .proc, .align | |
47 | // 05/21/03 Improved performance, fixed to handle unorms | |
48 | // 03/31/05 Reformatted delimiters between data tables | |
49 | // | |
50 | // API | |
51 | // ============================================================== | |
52 | // double asinh(double) | |
53 | // | |
54 | // Overview of operation | |
55 | // ============================================================== | |
56 | // | |
57 | // There are 7 paths: | |
58 | // 1. x = 0.0 | |
59 | // Return asinh(x) = 0.0 | |
60 | // | |
61 | // 2. 0.0 < |x| < 2^(-3) | |
62 | // Return asinh(x) = POL13(x), | |
63 | // where POL13(x) = (((((C13*x^2 + C11)*x^2 + C9)*x^2 + C7)*x^2 + C5)*x^2 + C3)*x^3 + x | |
64 | // | |
65 | // 3. 2^(-3) <= |x| < 2^63 | |
66 | // Return asinh(x) = sign(x)*(log(|x| + sqrt(x^2 + 1.0))) | |
67 | // To compute x + sqrt(x^2 + 1.0), a modified Newton-Raphson method is used | |
68 | // (3 iterations) | |
69 | // Algorithm description for log function see below. | |
70 | // | |
71 | // 4. 2^63 <= |x| < +INF | |
72 | // Return asinh(x) = sign(x)*log(2*|x|) | |
73 | // Algorithm description for log function see below. | |
74 | // | |
75 | // 5. x = INF | |
76 | // Return asinh(x) = INF | |
77 | // | |
78 | // 6. x = [S,Q]NaN | |
79 | // Return asinh(x) = QNaN | |
80 | // | |
81 | // 7. x = denormal | |
82 | // Return asinh(x) = x correctly rounded | |
83 | // | |
84 | //============================================================== | |
85 | // Algorithm Description for log(x) function | |
86 | // Below we are using the fact that inequality x - 1.0 > 2^(-6) is always | |
87 | // true for this asinh implementation | |
88 | // | |
89 | // Consider x = 2^N 1.f1 f2 f3 f4...f63 | |
90 | // Log(x) = log(frcpa(x) x/frcpa(x)) | |
91 | // = log(1/frcpa(x)) + log(frcpa(x) x) | |
92 | // = -log(frcpa(x)) + log(frcpa(x) x) | |
93 | // | |
94 | // frcpa(x) = 2^-N frcpa(1.f1 f2 ... f63) | |
95 | // | |
96 | // -log(frcpa(x)) = -log(C) | |
97 | // = -log(2^-N) - log(frcpa(1.f1 f2 ... f63)) | |
98 | // | |
99 | // -log(frcpa(x)) = -log(C) | |
100 | // = +Nlog2 - log(frcpa(1.f1 f2 ... f63)) | |
101 | // | |
102 | // -log(frcpa(x)) = -log(C) | |
103 | // = +Nlog2 + log(1/frcpa(1.f1 f2 ... f63)) | |
104 | // | |
105 | // Log(x) = log(1/frcpa(x)) + log(frcpa(x) x) | |
106 | // | |
107 | // Log(x) = +Nlog2 + log(1./frcpa(1.f1 f2 ... f63)) + log(frcpa(x) x) | |
108 | // Log(x) = +Nlog2 - log(frcpa(1.f1 f2 ... f63)) + log(frcpa(x) x) | |
109 | // Log(x) = +Nlog2 + T + log(frcpa(x) x) | |
110 | // | |
111 | // Log(x) = +Nlog2 + T + log(C x) | |
112 | // | |
113 | // Cx = 1 + r | |
114 | // | |
115 | // Log(x) = +Nlog2 + T + log(1+r) | |
116 | // Log(x) = +Nlog2 + T + Series( r - r^2/2 + r^3/3 - r^4/4 ....) | |
117 | // | |
118 | // 1.f1 f2 ... f8 has 256 entries. | |
119 | // They are 1 + k/2^8, k = 0 ... 255 | |
120 | // These 256 values are the table entries. | |
121 | // | |
122 | // Implementation | |
123 | //============================================================== | |
124 | // C = frcpa(x) | |
125 | // r = C * x - 1 | |
126 | // | |
127 | // Form rseries = r + P1*r^2 + P2*r^3 + P3*r^4 + P4*r^5 + P5*r^6 | |
128 | // | |
129 | // x = f * 2^n where f is 1.f_1f_2f_3....f_63 | |
130 | // Nfloat = float(n) where n is the true unbiased exponent | |
131 | // pre-index = f_1f_2....f_8 | |
132 | // index = pre_index * 16 | |
133 | // get the dxt table entry at index + offset = T | |
134 | // | |
135 | // result = (T + Nfloat * log(2)) + rseries | |
136 | // | |
137 | // The T table is calculated as follows | |
138 | // Form x_k = 1 + k/2^8 where k goes from 0... 255 | |
139 | // y_k = frcpa(x_k) | |
140 | // log(1/y_k) in quad and round to double-extended | |
141 | // | |
142 | // | |
143 | // Registers used | |
144 | //============================================================== | |
145 | // Floating Point registers used: | |
146 | // f8, input | |
147 | // f9 -> f15, f32 -> f68 | |
148 | ||
149 | // General registers used: | |
150 | // r14 -> r27 | |
151 | ||
152 | // Predicate registers used: | |
153 | // p6 -> p14 | |
154 | ||
155 | // p6 to filter out case when x = [Q,S]NaN or INF or zero | |
156 | // p7 to filter out case when x < 0.0 | |
157 | // p8 to select path #2 | |
158 | // p9 used in the frcpa from path #3 | |
159 | // p11 to filter out case when x >= 0 | |
160 | // p12 to filter out case when x = unorm | |
161 | // p13 to select path #4 | |
162 | // Assembly macros | |
163 | //============================================================== | |
164 | log_GR_exp_17_ones = r14 /* set to 0x1ffff; ANDed with the getf.exp value of x to keep its low 17 bits */ | |
165 | log_GR_signexp_f8 = r15 /* not referenced in the visible part of asinh; presumably sign+exponent of f8 -- TODO confirm */ | |
166 | log_table_address2 = r16 /* NR_table_address + 0x40; post-incremented while loading NR1/NR2, C13, C11 */ | |
167 | log_GR_exp_16_ones = r17 /* set to 0xffff, the exponent bias */ | |
168 | log_GR_exp_f8 = r18 | |
169 | log_GR_true_exp_f8 = r19 | |
170 | log_GR_significand_f8 = r20 | |
171 | log_GR_index = r21 | |
172 | log_GR_comp2 = r22 /* set to 0x1003e; masked exponent >= this selects path 4 (|x| >= 2^63) */ | |
173 | asinh_GR_f8 = r23 /* NOTE(review): the same alias is assigned again (r25) two lines below -- confirm which binding is intended */ | |
174 | asinh_GR_comp = r24 /* set to 0xfffc; masked exponent below this selects path 2 (|x| < 2^-3) */ | |
175 | asinh_GR_f8 = r25 /* receives getf.exp of x, later masked with log_GR_exp_17_ones */ | |
176 | log_table_address3 = r26 | |
177 | NR_table_address = r27 /* address of log_table_1 (loaded through @ltoff/gp); post-incremented while loading P5..P1 and log2 */ | |
178 | ||
179 | //============================================================== | |
180 | log_y = f9 /* y = x^2 + 1 */ | |
181 | NR1 = f10 /* first value of the ldfpd pair read via log_table_address2; used as the 0.5 factor in 0.5*z */ | |
182 | NR2 = f11 /* second value of the same ldfpd pair; used as the 3 in 3-(y*z)*z */ | |
183 | log_y_rs = f12 /* z = frsqrta(y), starting approximation of 1/sqrt(y); reused as a temporary in the second iteration */ | |
184 | log_y_rs_iter = f13 /* Newton-Raphson temporary: y*z, then 3-(y*z)*z, then the refined z */ | |
185 | log_y_rs_iter1 = f14 /* Newton-Raphson temporary: 0.5*z */ | |
186 | fNormX = f15 /* fnorm of the input x */ | |
187 | asinh_w_sq = f32 /* x^2 */ | |
188 | log_C13 = f33 /* loaded with ldfe via log_table_address2 */ | |
189 | log_C11 = f34 /* loaded with ldfe via log_table_address2 */ | |
190 | log_P3 = f35 /* loaded from log_table_1 */ | |
191 | log_P2 = f36 /* loaded from log_table_1 */ | |
192 | log_P1 = f37 /* loaded from log_table_1 */ | |
193 | log_P5 = f38 /* loaded from log_table_1 */ | |
194 | log_P4 = f39 /* loaded from log_table_1 */ | |
195 | log_C3 = f40 | |
196 | log_C5 = f41 | |
197 | log_C7 = f42 | |
198 | log2 = f43 /* loaded with ldfe from the last entry of log_table_1 */ | |
199 | asinh_f8 = f44 /* fNormX when x >= 0, -fNormX when x < 0 */ | |
200 | log_C = f45 | |
201 | log_arg = f46 /* initialised to x + x (2*x) */ | |
202 | log_C9 = f47 | |
203 | asinh_w_four = f48 | |
204 | log_int_Nfloat = f49 | |
205 | log_r = f50 | |
206 | log_rsq = f51 | |
207 | log_rp_p4 = f52 | |
208 | log_rp_p32 = f53 | |
209 | log_rcube = f54 | |
210 | log_rp_p10 = f55 | |
211 | log_rp_p2 = f56 | |
212 | log_Nfloat = f57 | |
213 | log_T = f58 | |
214 | log_r2P_r = f59 | |
215 | log_T_plus_Nlog2 = f60 | |
216 | asinh_w_3 = f61 | |
217 | asinh_w_5 = f62 | |
218 | asinh_w_cube = f63 | |
219 | asinh_w_7 = f64 | |
220 | log_arg_early = f65 | |
221 | asinh_w_9 = f66 | |
222 | asinh_w_13 = f67 | |
223 | asinh_w_seven = f68 | |
224 | ||
225 | // Data tables | |
226 | //============================================================== | |
227 | ||
228 | RODATA | |
229 | .align 16 | |
230 | ||
231 | LOCAL_OBJECT_START(log_table_1) /* coefficients of rseries = r + P1*r^2 + ... + P5*r^6, followed by log(2); read in 16-byte steps */ | |
232 | data8 0xBFC5555DA7212371 // P5, coefficient of r^6 | |
233 | data8 0x3FC999A19EEF5826 // P4, coefficient of r^5 | |
234 | data8 0xBFCFFFFFFFFEF009 // P3, coefficient of r^4 | |
235 | data8 0x3FD555555554ECB2 // P2, coefficient of r^3 | |
236 | data8 0xBFE0000000000000 // P1 = -0.5, coefficient of r^2 | |
237 | data8 0x0000000000000000 // pad, fills out the 16-byte slot that holds P1 | |
238 | data8 0xb17217f7d1cf79ac, 0x00003ffe // log2, i.e. log(2), read with ldfe | |
239 | LOCAL_OBJECT_END(log_table_1) | |
240 | ||
241 | LOCAL_OBJECT_START(log_table_2) /* two double constants, then the POL13 coefficients C13..C3 as 16-byte entries */ | |
242 | data8 0x3FE0000000000000 // 0.5 (presumably the value loaded into NR1 -- confirm table adjacency) | |
243 | data8 0x4008000000000000 // 3.0 (presumably the value loaded into NR2 -- confirm table adjacency) | |
244 | // | |
245 | data8 0x8824BE4D74BC4F00, 0x00003FF9 // C13 | |
246 | data8 0xB725A2CD9556CC57, 0x0000BFF9 // C11 | |
247 | data8 0xF8E339127FBFF49D, 0x00003FF9 // C9 | |
248 | data8 0xB6DB6D7DCE17CB78, 0x0000BFFA // C7 | |
249 | data8 0x999999998802CCEF, 0x00003FFB // C5 | |
250 | data8 0xAAAAAAAAAAA8DC40, 0x0000BFFC // C3 | |
251 | LOCAL_OBJECT_END(log_table_2) | |
252 | ||
253 | ||
254 | LOCAL_OBJECT_START(log_table_3) /* T table: 256 entries of 16 bytes each, T[k] = log(1/frcpa(1 + k/2^8)), k = 0..255 */ | |
255 | data8 0x80200aaeac44ef38 , 0x00003ff6 // log(1/frcpa(1+ 0/2^8)) | |
256 | // | |
257 | data8 0xc09090a2c35aa070 , 0x00003ff7 // log(1/frcpa(1+ 1/2^8)) | |
258 | data8 0xa0c94fcb41977c75 , 0x00003ff8 // log(1/frcpa(1+ 2/2^8)) | |
259 | data8 0xe18b9c263af83301 , 0x00003ff8 // log(1/frcpa(1+ 3/2^8)) | |
260 | data8 0x8d35c8d6399c30ea , 0x00003ff9 // log(1/frcpa(1+ 4/2^8)) | |
261 | data8 0xadd4d2ecd601cbb8 , 0x00003ff9 // log(1/frcpa(1+ 5/2^8)) | |
262 | // | |
263 | data8 0xce95403a192f9f01 , 0x00003ff9 // log(1/frcpa(1+ 6/2^8)) | |
264 | data8 0xeb59392cbcc01096 , 0x00003ff9 // log(1/frcpa(1+ 7/2^8)) | |
265 | data8 0x862c7d0cefd54c5d , 0x00003ffa // log(1/frcpa(1+ 8/2^8)) | |
266 | data8 0x94aa63c65e70d499 , 0x00003ffa // log(1/frcpa(1+ 9/2^8)) | |
267 | data8 0xa54a696d4b62b382 , 0x00003ffa // log(1/frcpa(1+ 10/2^8)) | |
268 | // | |
269 | data8 0xb3e4a796a5dac208 , 0x00003ffa // log(1/frcpa(1+ 11/2^8)) | |
270 | data8 0xc28c45b1878340a9 , 0x00003ffa // log(1/frcpa(1+ 12/2^8)) | |
271 | data8 0xd35c55f39d7a6235 , 0x00003ffa // log(1/frcpa(1+ 13/2^8)) | |
272 | data8 0xe220f037b954f1f5 , 0x00003ffa // log(1/frcpa(1+ 14/2^8)) | |
273 | data8 0xf0f3389b036834f3 , 0x00003ffa // log(1/frcpa(1+ 15/2^8)) | |
274 | // | |
275 | data8 0xffd3488d5c980465 , 0x00003ffa // log(1/frcpa(1+ 16/2^8)) | |
276 | data8 0x87609ce2ed300490 , 0x00003ffb // log(1/frcpa(1+ 17/2^8)) | |
277 | data8 0x8ede9321e8c85927 , 0x00003ffb // log(1/frcpa(1+ 18/2^8)) | |
278 | data8 0x96639427f2f8e2f4 , 0x00003ffb // log(1/frcpa(1+ 19/2^8)) | |
279 | data8 0x9defad3e8f73217b , 0x00003ffb // log(1/frcpa(1+ 20/2^8)) | |
280 | // | |
281 | data8 0xa582ebd50097029c , 0x00003ffb // log(1/frcpa(1+ 21/2^8)) | |
282 | data8 0xac06dbe75ab80fee , 0x00003ffb // log(1/frcpa(1+ 22/2^8)) | |
283 | data8 0xb3a78449b2d3ccca , 0x00003ffb // log(1/frcpa(1+ 23/2^8)) | |
284 | data8 0xbb4f79635ab46bb2 , 0x00003ffb // log(1/frcpa(1+ 24/2^8)) | |
285 | data8 0xc2fec93a83523f3f , 0x00003ffb // log(1/frcpa(1+ 25/2^8)) | |
286 | // | |
287 | data8 0xc99af2eaca4c4571 , 0x00003ffb // log(1/frcpa(1+ 26/2^8)) | |
288 | data8 0xd1581106472fa653 , 0x00003ffb // log(1/frcpa(1+ 27/2^8)) | |
289 | data8 0xd8002560d4355f2e , 0x00003ffb // log(1/frcpa(1+ 28/2^8)) | |
290 | data8 0xdfcb43b4fe508632 , 0x00003ffb // log(1/frcpa(1+ 29/2^8)) | |
291 | data8 0xe67f6dff709d4119 , 0x00003ffb // log(1/frcpa(1+ 30/2^8)) | |
292 | // | |
293 | data8 0xed393b1c22351280 , 0x00003ffb // log(1/frcpa(1+ 31/2^8)) | |
294 | data8 0xf5192bff087bcc35 , 0x00003ffb // log(1/frcpa(1+ 32/2^8)) | |
295 | data8 0xfbdf4ff6dfef2fa3 , 0x00003ffb // log(1/frcpa(1+ 33/2^8)) | |
296 | data8 0x81559a97f92f9cc7 , 0x00003ffc // log(1/frcpa(1+ 34/2^8)) | |
297 | data8 0x84be72bce90266e8 , 0x00003ffc // log(1/frcpa(1+ 35/2^8)) | |
298 | // | |
299 | data8 0x88bc74113f23def2 , 0x00003ffc // log(1/frcpa(1+ 36/2^8)) | |
300 | data8 0x8c2ba3edf6799d11 , 0x00003ffc // log(1/frcpa(1+ 37/2^8)) | |
301 | data8 0x8f9dc92f92ea08b1 , 0x00003ffc // log(1/frcpa(1+ 38/2^8)) | |
302 | data8 0x9312e8f36efab5a7 , 0x00003ffc // log(1/frcpa(1+ 39/2^8)) | |
303 | data8 0x968b08643409ceb6 , 0x00003ffc // log(1/frcpa(1+ 40/2^8)) | |
304 | // | |
305 | data8 0x9a062cba08a1708c , 0x00003ffc // log(1/frcpa(1+ 41/2^8)) | |
306 | data8 0x9d845b3abf95485c , 0x00003ffc // log(1/frcpa(1+ 42/2^8)) | |
307 | data8 0xa06fd841bc001bb4 , 0x00003ffc // log(1/frcpa(1+ 43/2^8)) | |
308 | data8 0xa3f3a74652fbe0db , 0x00003ffc // log(1/frcpa(1+ 44/2^8)) | |
309 | data8 0xa77a8fb2336f20f5 , 0x00003ffc // log(1/frcpa(1+ 45/2^8)) | |
310 | // | |
311 | data8 0xab0497015d28b0a0 , 0x00003ffc // log(1/frcpa(1+ 46/2^8)) | |
312 | data8 0xae91c2be6ba6a615 , 0x00003ffc // log(1/frcpa(1+ 47/2^8)) | |
313 | data8 0xb189d1b99aebb20b , 0x00003ffc // log(1/frcpa(1+ 48/2^8)) | |
314 | data8 0xb51cced5de9c1b2c , 0x00003ffc // log(1/frcpa(1+ 49/2^8)) | |
315 | data8 0xb819bee9e720d42f , 0x00003ffc // log(1/frcpa(1+ 50/2^8)) | |
316 | // | |
317 | data8 0xbbb2a0947b093a5d , 0x00003ffc // log(1/frcpa(1+ 51/2^8)) | |
318 | data8 0xbf4ec1505811684a , 0x00003ffc // log(1/frcpa(1+ 52/2^8)) | |
319 | data8 0xc2535bacfa8975ff , 0x00003ffc // log(1/frcpa(1+ 53/2^8)) | |
320 | data8 0xc55a3eafad187eb8 , 0x00003ffc // log(1/frcpa(1+ 54/2^8)) | |
321 | data8 0xc8ff2484b2c0da74 , 0x00003ffc // log(1/frcpa(1+ 55/2^8)) | |
322 | // | |
323 | data8 0xcc0b1a008d53ab76 , 0x00003ffc // log(1/frcpa(1+ 56/2^8)) | |
324 | data8 0xcfb6203844b3209b , 0x00003ffc // log(1/frcpa(1+ 57/2^8)) | |
325 | data8 0xd2c73949a47a19f5 , 0x00003ffc // log(1/frcpa(1+ 58/2^8)) | |
326 | data8 0xd5daae18b49d6695 , 0x00003ffc // log(1/frcpa(1+ 59/2^8)) | |
327 | data8 0xd8f08248cf7e8019 , 0x00003ffc // log(1/frcpa(1+ 60/2^8)) | |
328 | // | |
329 | data8 0xdca7749f1b3e540e , 0x00003ffc // log(1/frcpa(1+ 61/2^8)) | |
330 | data8 0xdfc28e033aaaf7c7 , 0x00003ffc // log(1/frcpa(1+ 62/2^8)) | |
331 | data8 0xe2e012a5f91d2f55 , 0x00003ffc // log(1/frcpa(1+ 63/2^8)) | |
332 | data8 0xe600064ed9e292a8 , 0x00003ffc // log(1/frcpa(1+ 64/2^8)) | |
333 | data8 0xe9226cce42b39f60 , 0x00003ffc // log(1/frcpa(1+ 65/2^8)) | |
334 | // | |
335 | data8 0xec4749fd97a28360 , 0x00003ffc // log(1/frcpa(1+ 66/2^8)) | |
336 | data8 0xef6ea1bf57780495 , 0x00003ffc // log(1/frcpa(1+ 67/2^8)) | |
337 | data8 0xf29877ff38809091 , 0x00003ffc // log(1/frcpa(1+ 68/2^8)) | |
338 | data8 0xf5c4d0b245cb89be , 0x00003ffc // log(1/frcpa(1+ 69/2^8)) | |
339 | data8 0xf8f3afd6fcdef3aa , 0x00003ffc // log(1/frcpa(1+ 70/2^8)) | |
340 | // | |
341 | data8 0xfc2519756be1abc7 , 0x00003ffc // log(1/frcpa(1+ 71/2^8)) | |
342 | data8 0xff59119f503e6832 , 0x00003ffc // log(1/frcpa(1+ 72/2^8)) | |
343 | data8 0x8147ce381ae0e146 , 0x00003ffd // log(1/frcpa(1+ 73/2^8)) | |
344 | data8 0x82e45f06cb1ad0f2 , 0x00003ffd // log(1/frcpa(1+ 74/2^8)) | |
345 | data8 0x842f5c7c573cbaa2 , 0x00003ffd // log(1/frcpa(1+ 75/2^8)) | |
346 | // | |
347 | data8 0x85ce471968c8893a , 0x00003ffd // log(1/frcpa(1+ 76/2^8)) | |
348 | data8 0x876e8305bc04066d , 0x00003ffd // log(1/frcpa(1+ 77/2^8)) | |
349 | data8 0x891012678031fbb3 , 0x00003ffd // log(1/frcpa(1+ 78/2^8)) | |
350 | data8 0x8a5f1493d766a05f , 0x00003ffd // log(1/frcpa(1+ 79/2^8)) | |
351 | data8 0x8c030c778c56fa00 , 0x00003ffd // log(1/frcpa(1+ 80/2^8)) | |
352 | // | |
353 | data8 0x8da85df17e31d9ae , 0x00003ffd // log(1/frcpa(1+ 81/2^8)) | |
354 | data8 0x8efa663e7921687e , 0x00003ffd // log(1/frcpa(1+ 82/2^8)) | |
355 | data8 0x90a22b6875c6a1f8 , 0x00003ffd // log(1/frcpa(1+ 83/2^8)) | |
356 | data8 0x91f62cc8f5d24837 , 0x00003ffd // log(1/frcpa(1+ 84/2^8)) | |
357 | data8 0x93a06cfc3857d980 , 0x00003ffd // log(1/frcpa(1+ 85/2^8)) | |
358 | // | |
359 | data8 0x94f66d5e6fd01ced , 0x00003ffd // log(1/frcpa(1+ 86/2^8)) | |
360 | data8 0x96a330156e6772f2 , 0x00003ffd // log(1/frcpa(1+ 87/2^8)) | |
361 | data8 0x97fb3582754ea25b , 0x00003ffd // log(1/frcpa(1+ 88/2^8)) | |
362 | data8 0x99aa8259aad1bbf2 , 0x00003ffd // log(1/frcpa(1+ 89/2^8)) | |
363 | data8 0x9b0492f6227ae4a8 , 0x00003ffd // log(1/frcpa(1+ 90/2^8)) | |
364 | // | |
365 | data8 0x9c5f8e199bf3a7a5 , 0x00003ffd // log(1/frcpa(1+ 91/2^8)) | |
366 | data8 0x9e1293b9998c1daa , 0x00003ffd // log(1/frcpa(1+ 92/2^8)) | |
367 | data8 0x9f6fa31e0b41f308 , 0x00003ffd // log(1/frcpa(1+ 93/2^8)) | |
368 | data8 0xa0cda11eaf46390e , 0x00003ffd // log(1/frcpa(1+ 94/2^8)) | |
369 | data8 0xa22c8f029cfa45aa , 0x00003ffd // log(1/frcpa(1+ 95/2^8)) | |
370 | // | |
371 | data8 0xa3e48badb7856b34 , 0x00003ffd // log(1/frcpa(1+ 96/2^8)) | |
372 | data8 0xa5459a0aa95849f9 , 0x00003ffd // log(1/frcpa(1+ 97/2^8)) | |
373 | data8 0xa6a79c84480cfebd , 0x00003ffd // log(1/frcpa(1+ 98/2^8)) | |
374 | data8 0xa80a946d0fcb3eb2 , 0x00003ffd // log(1/frcpa(1+ 99/2^8)) | |
375 | data8 0xa96e831a3ea7b314 , 0x00003ffd // log(1/frcpa(1+100/2^8)) | |
376 | // | |
377 | data8 0xaad369e3dc544e3b , 0x00003ffd // log(1/frcpa(1+101/2^8)) | |
378 | data8 0xac92e9588952c815 , 0x00003ffd // log(1/frcpa(1+102/2^8)) | |
379 | data8 0xadfa035aa1ed8fdc , 0x00003ffd // log(1/frcpa(1+103/2^8)) | |
380 | data8 0xaf6219eae1ad6e34 , 0x00003ffd // log(1/frcpa(1+104/2^8)) | |
381 | data8 0xb0cb2e6d8160f753 , 0x00003ffd // log(1/frcpa(1+105/2^8)) | |
382 | // | |
383 | data8 0xb2354249ad950f72 , 0x00003ffd // log(1/frcpa(1+106/2^8)) | |
384 | data8 0xb3a056e98ef4a3b4 , 0x00003ffd // log(1/frcpa(1+107/2^8)) | |
385 | data8 0xb50c6dba52c6292a , 0x00003ffd // log(1/frcpa(1+108/2^8)) | |
386 | data8 0xb679882c33876165 , 0x00003ffd // log(1/frcpa(1+109/2^8)) | |
387 | data8 0xb78c07429785cedc , 0x00003ffd // log(1/frcpa(1+110/2^8)) | |
388 | // | |
389 | data8 0xb8faeb8dc4a77d24 , 0x00003ffd // log(1/frcpa(1+111/2^8)) | |
390 | data8 0xba6ad77eb36ae0d6 , 0x00003ffd // log(1/frcpa(1+112/2^8)) | |
391 | data8 0xbbdbcc915e9bee50 , 0x00003ffd // log(1/frcpa(1+113/2^8)) | |
392 | data8 0xbd4dcc44f8cf12ef , 0x00003ffd // log(1/frcpa(1+114/2^8)) | |
393 | data8 0xbec0d81bf5b531fa , 0x00003ffd // log(1/frcpa(1+115/2^8)) | |
394 | // | |
395 | data8 0xc034f19c139186f4 , 0x00003ffd // log(1/frcpa(1+116/2^8)) | |
396 | data8 0xc14cb69f7c5e55ab , 0x00003ffd // log(1/frcpa(1+117/2^8)) | |
397 | data8 0xc2c2abbb6e5fd56f , 0x00003ffd // log(1/frcpa(1+118/2^8)) | |
398 | data8 0xc439b2c193e6771e , 0x00003ffd // log(1/frcpa(1+119/2^8)) | |
399 | data8 0xc553acb9d5c67733 , 0x00003ffd // log(1/frcpa(1+120/2^8)) | |
400 | // | |
401 | data8 0xc6cc96e441272441 , 0x00003ffd // log(1/frcpa(1+121/2^8)) | |
402 | data8 0xc8469753eca88c30 , 0x00003ffd // log(1/frcpa(1+122/2^8)) | |
403 | data8 0xc962cf3ce072b05c , 0x00003ffd // log(1/frcpa(1+123/2^8)) | |
404 | data8 0xcadeba8771f694aa , 0x00003ffd // log(1/frcpa(1+124/2^8)) | |
405 | data8 0xcc5bc08d1f72da94 , 0x00003ffd // log(1/frcpa(1+125/2^8)) | |
406 | // | |
407 | data8 0xcd7a3f99ea035c29 , 0x00003ffd // log(1/frcpa(1+126/2^8)) | |
408 | data8 0xcef93860c8a53c35 , 0x00003ffd // log(1/frcpa(1+127/2^8)) | |
409 | data8 0xd0192f68a7ed23df , 0x00003ffd // log(1/frcpa(1+128/2^8)) | |
410 | data8 0xd19a201127d3c645 , 0x00003ffd // log(1/frcpa(1+129/2^8)) | |
411 | data8 0xd2bb92f4061c172c , 0x00003ffd // log(1/frcpa(1+130/2^8)) | |
412 | // | |
413 | data8 0xd43e80b2ee8cc8fc , 0x00003ffd // log(1/frcpa(1+131/2^8)) | |
414 | data8 0xd56173601fc4ade4 , 0x00003ffd // log(1/frcpa(1+132/2^8)) | |
415 | data8 0xd6e6637efb54086f , 0x00003ffd // log(1/frcpa(1+133/2^8)) | |
416 | data8 0xd80ad9f58f3c8193 , 0x00003ffd // log(1/frcpa(1+134/2^8)) | |
417 | data8 0xd991d1d31aca41f8 , 0x00003ffd // log(1/frcpa(1+135/2^8)) | |
418 | // | |
419 | data8 0xdab7d02231484a93 , 0x00003ffd // log(1/frcpa(1+136/2^8)) | |
420 | data8 0xdc40d532cde49a54 , 0x00003ffd // log(1/frcpa(1+137/2^8)) | |
421 | data8 0xdd685f79ed8b265e , 0x00003ffd // log(1/frcpa(1+138/2^8)) | |
422 | data8 0xde9094bbc0e17b1d , 0x00003ffd // log(1/frcpa(1+139/2^8)) | |
423 | data8 0xe01c91b78440c425 , 0x00003ffd // log(1/frcpa(1+140/2^8)) | |
424 | // | |
425 | data8 0xe14658f26997e729 , 0x00003ffd // log(1/frcpa(1+141/2^8)) | |
426 | data8 0xe270cdc2391e0d23 , 0x00003ffd // log(1/frcpa(1+142/2^8)) | |
427 | data8 0xe3ffce3a2aa64922 , 0x00003ffd // log(1/frcpa(1+143/2^8)) | |
428 | data8 0xe52bdb274ed82887 , 0x00003ffd // log(1/frcpa(1+144/2^8)) | |
429 | data8 0xe6589852e75d7df6 , 0x00003ffd // log(1/frcpa(1+145/2^8)) | |
430 | // | |
431 | data8 0xe786068c79937a7d , 0x00003ffd // log(1/frcpa(1+146/2^8)) | |
432 | data8 0xe91903adad100911 , 0x00003ffd // log(1/frcpa(1+147/2^8)) | |
433 | data8 0xea481236f7d35bb0 , 0x00003ffd // log(1/frcpa(1+148/2^8)) | |
434 | data8 0xeb77d48c692e6b14 , 0x00003ffd // log(1/frcpa(1+149/2^8)) | |
435 | data8 0xeca84b83d7297b87 , 0x00003ffd // log(1/frcpa(1+150/2^8)) | |
436 | // | |
437 | data8 0xedd977f4962aa158 , 0x00003ffd // log(1/frcpa(1+151/2^8)) | |
438 | data8 0xef7179a22f257754 , 0x00003ffd // log(1/frcpa(1+152/2^8)) | |
439 | data8 0xf0a450d139366ca7 , 0x00003ffd // log(1/frcpa(1+153/2^8)) | |
440 | data8 0xf1d7e0524ff9ffdb , 0x00003ffd // log(1/frcpa(1+154/2^8)) | |
441 | data8 0xf30c29036a8b6cae , 0x00003ffd // log(1/frcpa(1+155/2^8)) | |
442 | // | |
443 | data8 0xf4412bc411ea8d92 , 0x00003ffd // log(1/frcpa(1+156/2^8)) | |
444 | data8 0xf576e97564c8619d , 0x00003ffd // log(1/frcpa(1+157/2^8)) | |
445 | data8 0xf6ad62fa1b5f172f , 0x00003ffd // log(1/frcpa(1+158/2^8)) | |
446 | data8 0xf7e499368b55c542 , 0x00003ffd // log(1/frcpa(1+159/2^8)) | |
447 | data8 0xf91c8d10abaffe22 , 0x00003ffd // log(1/frcpa(1+160/2^8)) | |
448 | // | |
449 | data8 0xfa553f7018c966f3 , 0x00003ffd // log(1/frcpa(1+161/2^8)) | |
450 | data8 0xfb8eb13e185d802c , 0x00003ffd // log(1/frcpa(1+162/2^8)) | |
451 | data8 0xfcc8e3659d9bcbed , 0x00003ffd // log(1/frcpa(1+163/2^8)) | |
452 | data8 0xfe03d6d34d487fd2 , 0x00003ffd // log(1/frcpa(1+164/2^8)) | |
453 | data8 0xff3f8c7581e9f0ae , 0x00003ffd // log(1/frcpa(1+165/2^8)) | |
454 | // | |
455 | data8 0x803e029e280173ae , 0x00003ffe // log(1/frcpa(1+166/2^8)) | |
456 | data8 0x80dca10cc52d0757 , 0x00003ffe // log(1/frcpa(1+167/2^8)) | |
457 | data8 0x817ba200632755a1 , 0x00003ffe // log(1/frcpa(1+168/2^8)) | |
458 | data8 0x821b05f3b01d6774 , 0x00003ffe // log(1/frcpa(1+169/2^8)) | |
459 | data8 0x82bacd623ff19d06 , 0x00003ffe // log(1/frcpa(1+170/2^8)) | |
460 | // | |
461 | data8 0x835af8c88e7a8f47 , 0x00003ffe // log(1/frcpa(1+171/2^8)) | |
462 | data8 0x83c5f8299e2b4091 , 0x00003ffe // log(1/frcpa(1+172/2^8)) | |
463 | data8 0x8466cb43f3d87300 , 0x00003ffe // log(1/frcpa(1+173/2^8)) | |
464 | data8 0x850803a67c80ca4b , 0x00003ffe // log(1/frcpa(1+174/2^8)) | |
465 | data8 0x85a9a1d11a23b461 , 0x00003ffe // log(1/frcpa(1+175/2^8)) | |
466 | // | |
467 | data8 0x864ba644a18e6e05 , 0x00003ffe // log(1/frcpa(1+176/2^8)) | |
468 | data8 0x86ee1182dcc432f7 , 0x00003ffe // log(1/frcpa(1+177/2^8)) | |
469 | data8 0x875a925d7e48c316 , 0x00003ffe // log(1/frcpa(1+178/2^8)) | |
470 | data8 0x87fdaa109d23aef7 , 0x00003ffe // log(1/frcpa(1+179/2^8)) | |
471 | data8 0x88a129ed4becfaf2 , 0x00003ffe // log(1/frcpa(1+180/2^8)) | |
472 | // | |
473 | data8 0x89451278ecd7f9cf , 0x00003ffe // log(1/frcpa(1+181/2^8)) | |
474 | data8 0x89b29295f8432617 , 0x00003ffe // log(1/frcpa(1+182/2^8)) | |
475 | data8 0x8a572ac5a5496882 , 0x00003ffe // log(1/frcpa(1+183/2^8)) | |
476 | data8 0x8afc2d0ce3b2dadf , 0x00003ffe // log(1/frcpa(1+184/2^8)) | |
477 | data8 0x8b6a69c608cfd3af , 0x00003ffe // log(1/frcpa(1+185/2^8)) | |
478 | // | |
479 | data8 0x8c101e106e899a83 , 0x00003ffe // log(1/frcpa(1+186/2^8)) | |
480 | data8 0x8cb63de258f9d626 , 0x00003ffe // log(1/frcpa(1+187/2^8)) | |
481 | data8 0x8d2539c5bd19e2b1 , 0x00003ffe // log(1/frcpa(1+188/2^8)) | |
482 | data8 0x8dcc0e064b29e6f1 , 0x00003ffe // log(1/frcpa(1+189/2^8)) | |
483 | data8 0x8e734f45d88357ae , 0x00003ffe // log(1/frcpa(1+190/2^8)) | |
484 | // | |
485 | data8 0x8ee30cef034a20db , 0x00003ffe // log(1/frcpa(1+191/2^8)) | |
486 | data8 0x8f8b0515686d1d06 , 0x00003ffe // log(1/frcpa(1+192/2^8)) | |
487 | data8 0x90336bba039bf32f , 0x00003ffe // log(1/frcpa(1+193/2^8)) | |
488 | data8 0x90a3edd23d1c9d58 , 0x00003ffe // log(1/frcpa(1+194/2^8)) | |
489 | data8 0x914d0de2f5d61b32 , 0x00003ffe // log(1/frcpa(1+195/2^8)) | |
490 | // | |
491 | data8 0x91be0c20d28173b5 , 0x00003ffe // log(1/frcpa(1+196/2^8)) | |
492 | data8 0x9267e737c06cd34a , 0x00003ffe // log(1/frcpa(1+197/2^8)) | |
493 | data8 0x92d962ae6abb1237 , 0x00003ffe // log(1/frcpa(1+198/2^8)) | |
494 | data8 0x9383fa6afbe2074c , 0x00003ffe // log(1/frcpa(1+199/2^8)) | |
495 | data8 0x942f0421651c1c4e , 0x00003ffe // log(1/frcpa(1+200/2^8)) | |
496 | // | |
497 | data8 0x94a14a3845bb985e , 0x00003ffe // log(1/frcpa(1+201/2^8)) | |
498 | data8 0x954d133857f861e7 , 0x00003ffe // log(1/frcpa(1+202/2^8)) | |
499 | data8 0x95bfd96468e604c4 , 0x00003ffe // log(1/frcpa(1+203/2^8)) | |
500 | data8 0x9632d31cafafa858 , 0x00003ffe // log(1/frcpa(1+204/2^8)) | |
501 | data8 0x96dfaabd86fa1647 , 0x00003ffe // log(1/frcpa(1+205/2^8)) | |
502 | // | |
503 | data8 0x9753261fcbb2a594 , 0x00003ffe // log(1/frcpa(1+206/2^8)) | |
504 | data8 0x9800c11b426b996d , 0x00003ffe // log(1/frcpa(1+207/2^8)) | |
505 | data8 0x9874bf4d45ae663c , 0x00003ffe // log(1/frcpa(1+208/2^8)) | |
506 | data8 0x99231f5ee9a74f79 , 0x00003ffe // log(1/frcpa(1+209/2^8)) | |
507 | data8 0x9997a18a56bcad28 , 0x00003ffe // log(1/frcpa(1+210/2^8)) | |
508 | // | |
509 | data8 0x9a46c873a3267e79 , 0x00003ffe // log(1/frcpa(1+211/2^8)) | |
510 | data8 0x9abbcfc621eb6cb6 , 0x00003ffe // log(1/frcpa(1+212/2^8)) | |
511 | data8 0x9b310cb0d354c990 , 0x00003ffe // log(1/frcpa(1+213/2^8)) | |
512 | data8 0x9be14cf9e1b3515c , 0x00003ffe // log(1/frcpa(1+214/2^8)) | |
513 | data8 0x9c5710b8cbb73a43 , 0x00003ffe // log(1/frcpa(1+215/2^8)) | |
514 | // | |
515 | data8 0x9ccd0abd301f399c , 0x00003ffe // log(1/frcpa(1+216/2^8)) | |
516 | data8 0x9d7e67f3bdce8888 , 0x00003ffe // log(1/frcpa(1+217/2^8)) | |
517 | data8 0x9df4ea81a99daa01 , 0x00003ffe // log(1/frcpa(1+218/2^8)) | |
518 | data8 0x9e6ba405a54514ba , 0x00003ffe // log(1/frcpa(1+219/2^8)) | |
519 | data8 0x9f1e21c8c7bb62b3 , 0x00003ffe // log(1/frcpa(1+220/2^8)) | |
520 | // | |
521 | data8 0x9f956593f6b6355c , 0x00003ffe // log(1/frcpa(1+221/2^8)) | |
522 | data8 0xa00ce1092e5498c3 , 0x00003ffe // log(1/frcpa(1+222/2^8)) | |
523 | data8 0xa0c08309c4b912c1 , 0x00003ffe // log(1/frcpa(1+223/2^8)) | |
524 | data8 0xa1388a8c6faa2afa , 0x00003ffe // log(1/frcpa(1+224/2^8)) | |
525 | data8 0xa1b0ca7095b5f985 , 0x00003ffe // log(1/frcpa(1+225/2^8)) | |
526 | // | |
527 | data8 0xa22942eb47534a00 , 0x00003ffe // log(1/frcpa(1+226/2^8)) | |
528 | data8 0xa2de62326449d0a3 , 0x00003ffe // log(1/frcpa(1+227/2^8)) | |
529 | data8 0xa357690f88bfe345 , 0x00003ffe // log(1/frcpa(1+228/2^8)) | |
530 | data8 0xa3d0a93f45169a4b , 0x00003ffe // log(1/frcpa(1+229/2^8)) | |
531 | data8 0xa44a22f7ffe65f30 , 0x00003ffe // log(1/frcpa(1+230/2^8)) | |
532 | // | |
533 | data8 0xa500c5e5b4c1aa36 , 0x00003ffe // log(1/frcpa(1+231/2^8)) | |
534 | data8 0xa57ad064eb2ebbc2 , 0x00003ffe // log(1/frcpa(1+232/2^8)) | |
535 | data8 0xa5f5152dedf4384e , 0x00003ffe // log(1/frcpa(1+233/2^8)) | |
536 | data8 0xa66f9478856233ec , 0x00003ffe // log(1/frcpa(1+234/2^8)) | |
537 | data8 0xa6ea4e7cca02c32e , 0x00003ffe // log(1/frcpa(1+235/2^8)) | |
538 | // | |
539 | data8 0xa765437325341ccf , 0x00003ffe // log(1/frcpa(1+236/2^8)) | |
540 | data8 0xa81e21e6c75b4020 , 0x00003ffe // log(1/frcpa(1+237/2^8)) | |
541 | data8 0xa899ab333fe2b9ca , 0x00003ffe // log(1/frcpa(1+238/2^8)) | |
542 | data8 0xa9157039c51ebe71 , 0x00003ffe // log(1/frcpa(1+239/2^8)) | |
543 | data8 0xa991713433c2b999 , 0x00003ffe // log(1/frcpa(1+240/2^8)) | |
544 | // | |
545 | data8 0xaa0dae5cbcc048b3 , 0x00003ffe // log(1/frcpa(1+241/2^8)) | |
546 | data8 0xaa8a27ede5eb13ad , 0x00003ffe // log(1/frcpa(1+242/2^8)) | |
547 | data8 0xab06de228a9e3499 , 0x00003ffe // log(1/frcpa(1+243/2^8)) | |
548 | data8 0xab83d135dc633301 , 0x00003ffe // log(1/frcpa(1+244/2^8)) | |
549 | data8 0xac3fb076adc7fe7a , 0x00003ffe // log(1/frcpa(1+245/2^8)) | |
550 | // | |
551 | data8 0xacbd3cbbe47988f1 , 0x00003ffe // log(1/frcpa(1+246/2^8)) | |
552 | data8 0xad3b06b1a5dc57c3 , 0x00003ffe // log(1/frcpa(1+247/2^8)) | |
553 | data8 0xadb90e94af887717 , 0x00003ffe // log(1/frcpa(1+248/2^8)) | |
554 | data8 0xae3754a218f7c816 , 0x00003ffe // log(1/frcpa(1+249/2^8)) | |
555 | data8 0xaeb5d9175437afa2 , 0x00003ffe // log(1/frcpa(1+250/2^8)) | |
556 | // | |
557 | data8 0xaf349c322e9c7cee , 0x00003ffe // log(1/frcpa(1+251/2^8)) | |
558 | data8 0xafb39e30d1768d1c , 0x00003ffe // log(1/frcpa(1+252/2^8)) | |
559 | data8 0xb032df51c2c93116 , 0x00003ffe // log(1/frcpa(1+253/2^8)) | |
560 | data8 0xb0b25fd3e6035ad9 , 0x00003ffe // log(1/frcpa(1+254/2^8)) | |
561 | data8 0xb1321ff67cba178c , 0x00003ffe // log(1/frcpa(1+255/2^8)) | |
562 | LOCAL_OBJECT_END(log_table_3) | |
563 | ||
564 | ||
565 | .section .text | |
// asinh(x), double precision.  The argument is read from f8 and the result
// is returned in f8.
//
// Dispatch (p7 is set when x < 0, p11 otherwise):
//   x = NaN, inf or zero (p6)   : return x*1 + x.
//   x unnormal (p12)            : ASINH_UNORM recomputes the exponent from
//                                 fNormX and rejoins ASINH_COMMON, unless
//                                 fNormX still tests as class 0x0b; then
//                                 x + x^2 (x<0) or x - x^2 (x>0) is returned.
//   path 2, 0 < |x| < 2^-3      : odd polynomial in x, coefficients C3..C13.
//   path 3, 2^-3 <= |x| < 2^63  : three Newton-Raphson steps on
//                                 z = 1/sqrt(x^2+1), then
//                                 log(sqrt(x^2+1) + |x|) with the sign of x.
//   path 4, |x| >= 2^63         : log taken of log_arg = x*1 + x, with the
//                                 sign of x.
// Path selection compares the biased exponent of x (BIAS = 0xffff) with
// asinh_GR_comp = 0xfffc (BIAS-3) and log_GR_comp2 = 0x1003e (BIAS+63).
// The log in paths 3 and 4 is assembled as N*log2 + T + poly(r), where
// C = frcpa(arg), r = C*arg - 1, N is the unbiased exponent of arg and T is
// a 16-byte table entry selected by 8 significand bits starting at bit 55.
566 | GLOBAL_LIBM_ENTRY(asinh) | |
567 | ||
568 | { .mfi | |
569 | getf.exp asinh_GR_f8 = f8 // Sign+exponent of x; must recompute later if x unorm | |
570 | fclass.m p12,p0 = f8, 0x0b // Test x unorm | |
571 | mov log_GR_exp_17_ones = 0x1ffff | |
572 | } | |
573 | { .mfi | |
574 | addl NR_table_address = @ltoff(log_table_1), gp | |
575 | fma.s1 log_y = f8, f8, f1 // y = x^2 + 1 | |
576 | mov asinh_GR_comp = 0xfffc | |
577 | } | |
578 | ;; | |
579 | ||
580 | { .mfi | |
581 | mov log_GR_exp_16_ones = 0xffff // Exponent BIAS | |
582 | fclass.m p6,p0 = f8, 0xe7 // p6 = x is NaN, inf or zero | |
583 | mov log_GR_comp2 = 0x1003e | |
584 | } | |
585 | { .mfi | |
586 | ld8 NR_table_address = [NR_table_address] | |
587 | fma.s1 asinh_w_sq = f8,f8,f0 // x^2 | |
588 | nop.i 0 | |
589 | } | |
590 | ;; | |
591 | ||
592 | { .mfi | |
593 | nop.m 0 | |
594 | fcmp.lt.s1 p7,p11 = f8,f0 // p7 if x<0, p11 otherwise | |
595 | nop.i 0 | |
596 | } | |
597 | { .mfb | |
598 | nop.m 0 | |
599 | fnorm.s1 fNormX = f8 // Normalize x | |
600 | (p12) br.cond.spnt ASINH_UNORM // Branch if x=unorm | |
601 | } | |
602 | ;; | |
603 | ||
604 | ASINH_COMMON: | |
605 | // Return here if x=unorm and not denorm | |
// log_arg = x*1 + x is only consumed by path 4; path 3 overwrites it.
606 | { .mfi | |
607 | // Second table pointer: 0x40 past NR_table_address (NR1,NR2, then C13..C3) | |
608 | adds log_table_address2 = 0x40, NR_table_address | |
609 | fma.s1 log_arg = f8,f1,f8 | |
610 | nop.i 0 | |
611 | } | |
612 | { .mfb | |
613 | nop.m 0 | |
614 | (p6) fma.d.s0 f8 = f8,f1,f8 // x*1 + x: quietize nan result if x=nan | |
615 | (p6) br.ret.spnt b0 // Exit for x=nan and inf and zero | |
616 | } | |
617 | ;; | |
618 | ||
619 | { .mfi | |
620 | ldfpd NR1,NR2 = [log_table_address2],16 | |
621 | frsqrta.s1 log_y_rs,p0 = log_y // z=1/sqrt(y), starting value for the iterations | |
622 | nop.i 0 | |
623 | } | |
624 | ;; | |
625 | ||
// Keep only the low 17 bits of the getf.exp result (mask is 0x1ffff).
626 | { .mfi | |
627 | ldfe log_C13 = [log_table_address2],16 | |
628 | nop.f 0 | |
629 | and asinh_GR_f8 = asinh_GR_f8,log_GR_exp_17_ones | |
630 | } | |
631 | ;; | |
632 | ||
633 | { .mib | |
634 | ldfe log_C11 = [log_table_address2],16 | |
635 | cmp.le p13,p0 = log_GR_comp2,asinh_GR_f8 | |
636 | (p13) br.cond.spnt LOG_COMMON1 // Branch if path 4, |x| >= 2^63 | |
637 | } | |
638 | ;; | |
639 | ||
640 | { .mfi | |
641 | nop.m 0 | |
642 | fma.s1 log_y_rs_iter = log_y_rs,log_y,f0 // y*z | |
643 | nop.i 0 | |
644 | } | |
645 | ;; | |
646 | ||
// asinh_f8 = |x|: fNormX when p11, negated fNormX when p7 (x<0).
647 | .pred.rel "mutex",p7,p11 | |
648 | { .mfi | |
649 | nop.m 0 | |
650 | (p11) mov asinh_f8 = fNormX | |
651 | nop.i 0 | |
652 | } | |
653 | { .mfb | |
654 | cmp.gt p8,p0 = asinh_GR_comp,asinh_GR_f8 | |
655 | (p7) fnma.s1 asinh_f8 = fNormX,f1,f0 | |
656 | (p8) br.cond.spnt ASINH_NEAR_ZERO // Branch if path 2, 0 < |x| < 2^-3 | |
657 | } | |
658 | ;; | |
659 | ||
660 | // Here if main path, 2^-3 <= |x| < 2^63 | |
661 | ///////////////////////////////// The first iteration ///////////////////////// | |
662 | { .mfi | |
663 | ldfpd log_P5,log_P4 = [NR_table_address],16 | |
664 | fnma.s1 log_y_rs_iter = log_y_rs_iter,log_y_rs,NR2 // 3-(y*z)*z | |
665 | nop.i 0 | |
666 | } | |
667 | { .mfi | |
668 | nop.m 0 | |
669 | fma.s1 log_y_rs_iter1 = log_y_rs,NR1,f0 // 0.5*z | |
670 | nop.i 0 | |
671 | } | |
672 | ;; | |
673 | ||
674 | { .mfi | |
675 | ldfpd log_P3,log_P2 = [NR_table_address],16 | |
676 | // (0.5*z)*(3-(y*z)*z) | |
677 | fma.s1 log_y_rs_iter = log_y_rs_iter1,log_y_rs_iter,f0 | |
678 | nop.i 0 | |
679 | } | |
680 | ;; | |
681 | ||
682 | /////////////////////////// The second iteration ///////////////////////////// | |
683 | { .mfi | |
684 | ldfd log_P1 = [NR_table_address],16 | |
685 | fma.s1 log_y_rs = log_y_rs_iter,log_y,f0 | |
686 | nop.i 0 | |
687 | } | |
688 | ;; | |
689 | ||
690 | { .mfi | |
691 | nop.m 0 | |
692 | fnma.s1 log_y_rs = log_y_rs,log_y_rs_iter,NR2 | |
693 | nop.i 0 | |
694 | } | |
695 | { .mfi | |
696 | nop.m 0 | |
697 | fma.s1 log_y_rs_iter1 = log_y_rs_iter,NR1,f0 | |
698 | nop.i 0 | |
699 | } | |
700 | ;; | |
701 | ||
702 | { .mfi | |
703 | ldfe log2 = [NR_table_address],16 | |
704 | // (0.5*z)*(3-(y*z)*z) | |
705 | fma.s1 log_y_rs_iter = log_y_rs_iter1,log_y_rs,f0 | |
706 | nop.i 0 | |
707 | } | |
708 | { .mfi | |
709 | nop.m 0 | |
710 | // (0.5*z)*(3-(y*z)*z), same product kept separately for the early log argument | |
711 | fma.s1 log_arg_early = log_y_rs_iter1,log_y_rs,f0 | |
712 | nop.i 0 | |
713 | } | |
714 | ;; | |
715 | ||
716 | ////////////////////////////////// The third iteration //////////////////////// | |
717 | { .mfi | |
718 | nop.m 0 | |
719 | fma.s1 log_y_rs = log_y_rs_iter,log_y,f0 | |
720 | nop.i 0 | |
721 | } | |
722 | { .mfi | |
723 | nop.m 0 | |
724 | fma.s1 log_y_rs_iter1 = log_y_rs_iter,NR1,f0 | |
725 | nop.i 0 | |
726 | } | |
727 | ;; | |
728 | ||
// log_arg_early = (second-iteration z)*y + |x|; it feeds frcpa, getf.exp and
// getf.sig below, while the third-iteration log_arg is used to form r.
729 | { .mfi | |
730 | nop.m 0 | |
731 | fma.s1 log_arg_early = log_arg_early,log_y,asinh_f8 | |
732 | nop.i 0 | |
733 | } | |
734 | ;; | |
735 | ||
736 | { .mfi | |
737 | nop.m 0 | |
738 | fnma.s1 log_y_rs = log_y_rs,log_y_rs_iter,NR2 | |
739 | nop.i 0 | |
740 | } | |
741 | { .mfi | |
742 | nop.m 0 | |
743 | fma.s1 log_y_rs_iter1 = log_y_rs_iter1,log_y,f0 | |
744 | nop.i 0 | |
745 | } | |
746 | ;; | |
747 | ||
748 | { .mfi | |
749 | nop.m 0 | |
750 | frcpa.s1 log_C,p0 = f1,log_arg_early | |
751 | nop.i 0 | |
752 | } | |
753 | ;; | |
754 | ||
755 | { .mfi | |
756 | getf.exp log_GR_signexp_f8 = log_arg_early | |
757 | nop.f 0 | |
758 | nop.i 0 | |
759 | } | |
760 | ;; | |
761 | ||
762 | { .mfi | |
763 | getf.sig log_GR_significand_f8 = log_arg_early | |
764 | // (0.5*z)*(3-(y*z)*z)*y + |x| | |
765 | fma.s1 log_arg = log_y_rs_iter1,log_y_rs,asinh_f8 | |
766 | // Third table pointer: 0x70 past the (already advanced) NR_table_address | |
767 | adds log_table_address3 = 0x70, NR_table_address | |
768 | } | |
769 | ;; | |
770 | ||
771 | ///////////////////////////////// The end NR iterations ///////////////////// | |
772 | { .mfi | |
773 | nop.m 0 | |
774 | nop.f 0 | |
775 | // Drop the sign bit: keep the low 17 bits (exponent) of sign+exponent | |
776 | and log_GR_exp_f8 = log_GR_signexp_f8, log_GR_exp_17_ones | |
777 | } | |
778 | ;; | |
779 | ||
// For x<0 (p7) log2 is negated here so the N*log2 term is built with the
// sign of the final result.
780 | { .mfi | |
781 | // BIAS subtraction: N = exponent - 0xffff | |
782 | sub log_GR_true_exp_f8 = log_GR_exp_f8, log_GR_exp_16_ones | |
783 | (p7) fnma.s1 log2 = log2,f1,f0 | |
784 | nop.i 0 | |
785 | } | |
786 | ;; | |
787 | ||
788 | { .mfi | |
789 | setf.sig log_int_Nfloat = log_GR_true_exp_f8 | |
790 | fms.s1 log_r = log_C,log_arg,f1 // C = frcpa(x); r = C * x - 1 | |
791 | extr.u log_GR_index = log_GR_significand_f8,55,8 //Extract 8 bits from bit 55: table index | |
792 | } | |
793 | ;; | |
794 | ||
795 | { .mmi | |
796 | // table base + index*16 (entries are 16 bytes apart) | |
797 | shladd log_table_address3 = log_GR_index,4,log_table_address3 | |
798 | ;; | |
799 | ldfe log_T = [log_table_address3] | |
800 | nop.i 0 | |
801 | } | |
802 | ;; | |
803 | ||
804 | { .mfi | |
805 | nop.m 0 | |
806 | fma.s1 log_rsq = log_r, log_r, f0 //r^2 | |
807 | nop.i 0 | |
808 | } | |
809 | { .mfi | |
810 | nop.m 0 | |
811 | fma.s1 log_rp_p4 = log_P5, log_r, log_P4 //P5*r + P4 | |
812 | nop.i 0 | |
813 | } | |
814 | ;; | |
815 | ||
816 | { .mfi | |
817 | nop.m 0 | |
818 | fma.s1 log_rp_p32 = log_P3, log_r, log_P2 //P3*r + P2 | |
819 | nop.i 0 | |
820 | } | |
821 | ;; | |
822 | ||
823 | { .mfi | |
824 | nop.m 0 | |
825 | //convert N to the floating-point format | |
826 | fcvt.xf log_Nfloat = log_int_Nfloat | |
827 | nop.i 0 | |
828 | } | |
829 | ;; | |
830 | ||
831 | { .mfi | |
832 | nop.m 0 | |
833 | fma.s1 log_rcube = log_rsq, log_r, f0 //r^3 | |
834 | nop.i 0 | |
835 | } | |
836 | { .mfi | |
837 | nop.m 0 | |
838 | fma.s1 log_rp_p10 = log_rsq, log_P1, log_r //P1*r^2 + r | |
839 | nop.i 0 | |
840 | } | |
841 | ;; | |
842 | ||
843 | { .mfi | |
844 | nop.m 0 | |
845 | //(P5*r + P4)*r^2 + P3*r + P2 | |
846 | fma.s1 log_rp_p2 = log_rp_p4, log_rsq, log_rp_p32 | |
847 | nop.i 0 | |
848 | } | |
849 | ;; | |
850 | ||
851 | .pred.rel "mutex",p7,p11 | |
852 | { .mfi | |
853 | nop.m 0 | |
854 | (p11) fma.s1 log_T_plus_Nlog2 = log_Nfloat,log2,log_T //N*log2 + T if x>0 | |
855 | nop.i 0 | |
856 | } | |
857 | { .mfi | |
858 | nop.m 0 | |
859 | (p7) fms.s1 log_T_plus_Nlog2 = log_Nfloat,log2,log_T //-N*log2 - T if x<0 (log2 register was negated) | |
860 | nop.i 0 | |
861 | } | |
862 | ;; | |
863 | ||
864 | { .mfi | |
865 | nop.m 0 | |
866 | //((P5*r + P4)*r^2 + P3*r + P2)*r^3 + P1*r^2 + r | |
867 | fma.s1 log_r2P_r = log_rp_p2, log_rcube, log_rp_p10 | |
868 | nop.i 0 | |
869 | } | |
870 | ;; | |
871 | ||
872 | { .mfi | |
873 | nop.m 0 | |
874 | // N*log2 + T + ((P5*r + P4)*r^2 + P3*r + P2)*r^3 + P1*r^2 + r | |
875 | (p11) fadd.d.s0 f8 = log_T_plus_Nlog2,log_r2P_r | |
876 | nop.i 0 | |
877 | } | |
878 | { .mfb | |
879 | nop.m 0 | |
880 | // -N*log2 - T - (((P5*r + P4)*r^2 + P3*r + P2)*r^3 + P1*r^2 + r) | |
881 | (p7) fsub.d.s0 f8 = log_T_plus_Nlog2,log_r2P_r | |
882 | br.ret.sptk b0 // Exit main path, path 3: 2^-3 <= |x| < 2^63 | |
883 | } | |
884 | ;; | |
885 | ||
886 | // Here if path 4, |x| >= 2^63 | |
887 | LOG_COMMON1: | |
888 | { .mfi | |
889 | ldfpd log_P5,log_P4 = [NR_table_address],16 | |
890 | nop.f 0 | |
891 | nop.i 0 | |
892 | } | |
893 | ;; | |
894 | ||
895 | { .mfi | |
896 | ldfpd log_P3,log_P2 = [NR_table_address],16 | |
897 | frcpa.s1 log_C,p0 = f1,log_arg | |
898 | nop.i 0 | |
899 | } | |
900 | ;; | |
901 | ||
902 | { .mmi | |
903 | getf.exp log_GR_signexp_f8 = log_arg | |
904 | ldfd log_P1 = [NR_table_address],16 | |
905 | nop.i 0 | |
906 | } | |
907 | ;; | |
908 | ||
909 | { .mmi | |
910 | getf.sig log_GR_significand_f8 = log_arg | |
911 | ldfe log2 = [NR_table_address],16 | |
912 | nop.i 0 | |
913 | } | |
914 | ;; | |
915 | ||
916 | { .mfi | |
917 | adds log_table_address3 = 0x70, NR_table_address | |
918 | nop.f 0 | |
919 | // Drop the sign bit: keep the low 17 bits (exponent) of sign+exponent | |
920 | and log_GR_exp_f8 = log_GR_signexp_f8, log_GR_exp_17_ones | |
921 | } | |
922 | ;; | |
923 | ||
924 | { .mmf | |
925 | nop.m 0 | |
926 | // BIAS subtraction: N = exponent - 0xffff | |
927 | sub log_GR_true_exp_f8 = log_GR_exp_f8, log_GR_exp_16_ones | |
928 | fms.s1 log_r = log_C,log_arg,f1 //C = frcpa(x); r = C * x - 1 | |
929 | } | |
930 | ;; | |
931 | ||
932 | { .mfi | |
933 | setf.sig log_int_Nfloat = log_GR_true_exp_f8 | |
934 | nop.f 0 | |
935 | extr.u log_GR_index = log_GR_significand_f8,55,8 //Extract 8 bits from bit 55: table index | |
936 | } | |
937 | ;; | |
938 | ||
939 | { .mmi | |
940 | // table base + index*16 (entries are 16 bytes apart) | |
941 | shladd log_table_address3 = log_GR_index,4,log_table_address3 | |
942 | ;; | |
943 | ldfe log_T = [log_table_address3] | |
944 | nop.i 0 | |
945 | ||
946 | } | |
947 | ;; | |
948 | ||
949 | { .mfi | |
950 | nop.m 0 | |
951 | fma.s1 log_rsq = log_r, log_r, f0 //r^2 | |
952 | nop.i 0 | |
953 | } | |
954 | { .mfi | |
955 | nop.m 0 | |
956 | fma.s1 log_rp_p4 = log_P5, log_r, log_P4 //P5*r + P4 | |
957 | nop.i 0 | |
958 | } | |
959 | ;; | |
960 | ||
// For x<0 (p7) log2 is negated in the second bundle below.
961 | { .mfi | |
962 | nop.m 0 | |
963 | fma.s1 log_rp_p32 = log_P3, log_r, log_P2 //P3*r + P2 | |
964 | nop.i 0 | |
965 | } | |
966 | { .mfi | |
967 | nop.m 0 | |
968 | (p7) fnma.s1 log2 = log2,f1,f0 | |
969 | nop.i 0 | |
970 | } | |
971 | ;; | |
972 | ||
973 | { .mfi | |
974 | nop.m 0 | |
975 | fma.s1 log_rcube = log_rsq, log_r, f0 //r^3 | |
976 | nop.i 0 | |
977 | } | |
978 | { .mfi | |
979 | nop.m 0 | |
980 | fma.s1 log_rp_p10 = log_rsq, log_P1, log_r //P1*r^2 + r | |
981 | nop.i 0 | |
982 | } | |
983 | ;; | |
984 | ||
985 | { .mfi | |
986 | nop.m 0 | |
987 | //convert N to the floating-point format | |
988 | fcvt.xf log_Nfloat = log_int_Nfloat | |
989 | nop.i 0 | |
990 | } | |
991 | { .mfi | |
992 | nop.m 0 | |
993 | //(P5*r + P4)*r^2 + P3*r + P2 | |
994 | fma.s1 log_rp_p2 = log_rp_p4, log_rsq, log_rp_p32 | |
995 | nop.i 0 | |
996 | } | |
997 | ;; | |
998 | ||
// For x<0 (p7) T is negated too, so the fma below yields -N*log2 - T.
999 | { .mfi | |
1000 | nop.m 0 | |
1001 | (p7) fnma.s1 log_T = log_T,f1,f0 | |
1002 | nop.i 0 | |
1003 | } | |
1004 | ;; | |
1005 | ||
1006 | { .mfi | |
1007 | nop.m 0 | |
1008 | fma.s1 log_T_plus_Nlog2 = log_Nfloat,log2,log_T //N*log2 + T (both operands already negated if x<0) | |
1009 | nop.i 0 | |
1010 | } | |
1011 | { .mfi | |
1012 | nop.m 0 | |
1013 | //((P5*r + P4)*r^2 + P3*r + P2)*r^3 + P1*r^2 + r | |
1014 | fma.s1 log_r2P_r = log_rp_p2, log_rcube, log_rp_p10 | |
1015 | nop.i 0 | |
1016 | } | |
1017 | ;; | |
1018 | ||
1019 | .pred.rel "mutex",p7,p11 | |
1020 | { .mfi | |
1021 | nop.m 0 | |
1022 | // N*log2 + T + ((P5*r + P4)*r^2 + P3*r + P2)*r^3 + P1*r^2 + r | |
1023 | (p11) fadd.d.s0 f8 = log_T_plus_Nlog2,log_r2P_r | |
1024 | nop.i 0 | |
1025 | } | |
1026 | { .mfb | |
1027 | nop.m 0 | |
1028 | // -N*log2 - T - (((P5*r + P4)*r^2 + P3*r + P2)*r^3 + P1*r^2 + r) | |
1029 | (p7) fsub.d.s0 f8 = log_T_plus_Nlog2,log_r2P_r | |
1030 | br.ret.sptk b0 // Exit path 4, |x| >= 2^63 | |
1031 | } | |
1032 | ;; | |
1033 | ||
1034 | // Here is path 2, 0 < |x| < 2^-3 | |
// Result = x + C3*x^3 + C5*x^5 + x^7*(C7 + C9*x^2 + C11*x^4 + C13*x^6);
// C13 and C11 were already loaded on the common path.
1035 | ASINH_NEAR_ZERO: | |
1036 | { .mfi | |
1037 | ldfe log_C9 = [log_table_address2],16 | |
1038 | fma.s1 asinh_w_cube = asinh_w_sq,fNormX,f0 | |
1039 | nop.i 0 | |
1040 | } | |
1041 | ;; | |
1042 | ||
1043 | { .mfi | |
1044 | ldfe log_C7 = [log_table_address2],16 | |
1045 | fma.s1 asinh_w_four = asinh_w_sq,asinh_w_sq,f0 | |
1046 | nop.i 0 | |
1047 | } | |
1048 | ;; | |
1049 | ||
1050 | { .mfi | |
1051 | ldfe log_C5 = [log_table_address2],16 | |
1052 | nop.f 0 | |
1053 | nop.i 0 | |
1054 | } | |
1055 | ;; | |
1056 | ||
1057 | { .mfi | |
1058 | ldfe log_C3 = [log_table_address2],16 | |
1059 | nop.f 0 | |
1060 | nop.i 0 | |
1061 | } | |
1062 | ;; | |
1063 | ||
1064 | { .mfi | |
1065 | nop.m 0 | |
1066 | fma.s1 asinh_w_13 = log_C13,asinh_w_sq,log_C11 | |
1067 | nop.i 0 | |
1068 | } | |
1069 | { .mfi | |
1070 | nop.m 0 | |
1071 | fma.s1 asinh_w_9 = log_C9,asinh_w_sq,log_C7 | |
1072 | nop.i 0 | |
1073 | } | |
1074 | ;; | |
1075 | ||
1076 | { .mfi | |
1077 | nop.m 0 | |
1078 | fma.s1 asinh_w_3 = log_C5,asinh_w_sq,log_C3 | |
1079 | nop.i 0 | |
1080 | } | |
1081 | { .mfi | |
1082 | nop.m 0 | |
1083 | fma.s1 asinh_w_seven = asinh_w_four,asinh_w_cube,f0 | |
1084 | nop.i 0 | |
1085 | } | |
1086 | ;; | |
1087 | ||
1088 | { .mfi | |
1089 | nop.m 0 | |
1090 | fma.s1 asinh_w_7 = asinh_w_13,asinh_w_four,asinh_w_9 | |
1091 | nop.i 0 | |
1092 | } | |
1093 | { .mfi | |
1094 | nop.m 0 | |
1095 | fma.s1 asinh_w_5 = asinh_w_3,asinh_w_cube,fNormX | |
1096 | nop.i 0 | |
1097 | } | |
1098 | ;; | |
1099 | ||
1100 | { .mfb | |
1101 | nop.m 0 | |
1102 | fma.d.s0 f8 = asinh_w_7,asinh_w_seven,asinh_w_5 | |
1103 | br.ret.sptk b0 // Exit path 2 (0.0 <|x| < 2^(-3)) | |
1104 | } | |
1105 | ;; | |
1106 | ||
1107 | ASINH_UNORM: | |
1108 | // Here if x=unorm | |
1109 | { .mfi | |
1110 | getf.exp asinh_GR_f8 = fNormX // Recompute if x unorm | |
1111 | fclass.m p0,p13 = fNormX, 0x0b // Test x denorm; p13 = fNormX is not class 0x0b | |
1112 | nop.i 0 | |
1113 | } | |
1114 | ;; | |
1115 | ||
1116 | { .mfb | |
1117 | nop.m 0 | |
1118 | fcmp.eq.s0 p14,p0 = f8, f0 // Dummy to set denormal flag | |
1119 | (p13) br.cond.sptk ASINH_COMMON // Continue if x unorm and not denorm | |
1120 | } | |
1121 | ;; | |
1122 | ||
1123 | .pred.rel "mutex",p7,p11 | |
1124 | { .mfi | |
1125 | nop.m 0 | |
1126 | (p7) fma.d.s0 f8 = f8,f8,f8 // Result x+x^2 if x=-denorm | |
1127 | nop.i 0 | |
1128 | } | |
1129 | { .mfb | |
1130 | nop.m 0 | |
1131 | (p11) fnma.d.s0 f8 = f8,f8,f8 // Result x-x^2 if x=+denorm | |
1132 | br.ret.spnt b0 // Exit if denorm | |
1133 | } | |
1134 | ;; | |
1135 | ||
1136 | GLOBAL_LIBM_END(asinh) | |
0609ec0a | 1137 | libm_alias_double_other (asinh, asinh) |