]>
Commit | Line | Data |
---|---|---|
0ecb606c JJ |
1 | .file "tanhl.s" |
2 | ||
3 | ||
4 | // Copyright (c) 2001 - 2003, Intel Corporation | |
5 | // All rights reserved. | |
6 | // | |
7 | // Contributed 2001 by the Intel Numerics Group, Intel Corporation | |
8 | // | |
9 | // Redistribution and use in source and binary forms, with or without | |
10 | // modification, are permitted provided that the following conditions are | |
11 | // met: | |
12 | // | |
13 | // * Redistributions of source code must retain the above copyright | |
14 | // notice, this list of conditions and the following disclaimer. | |
15 | // | |
16 | // * Redistributions in binary form must reproduce the above copyright | |
17 | // notice, this list of conditions and the following disclaimer in the | |
18 | // documentation and/or other materials provided with the distribution. | |
19 | // | |
20 | // * The name of Intel Corporation may not be used to endorse or promote | |
21 | // products derived from this software without specific prior written | |
22 | // permission. | |
23 | ||
24 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
25 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
26 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
27 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS | |
28 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
29 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
30 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
31 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
32 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING | |
33 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
34 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
35 | // | |
36 | // Intel Corporation is the author of this code, and requests that all | |
37 | // problem reports or change requests be submitted to it directly at | |
38 | // http://www.intel.com/software/products/opensource/libraries/num.htm. | |
39 | // | |
40 | // History | |
41 | //============================================================== | |
42 | // 11/29/01 Initial version | |
43 | // 05/20/02 Cleaned up namespace and sf0 syntax | |
44 | // 08/14/02 Changed mli templates to mlx | |
45 | // 02/10/03 Reordered header: .section, .global, .proc, .align | |
46 | // | |
47 | // API | |
48 | //============================================================== | |
49 | // long double tanhl(long double) | |
50 | // | |
51 | // Overview of operation | |
52 | //============================================================== | |
53 | // | |
54 | // Algorithm description | |
55 | // --------------------- | |
56 | // | |
57 | // There are 4 paths: | |
58 | // | |
59 | // 1. Special path: x = 0, Inf, NaNs, denormal | |
60 | // Return tanhl(x) = +/-0.0 for zeros | |
61 | // Return tanhl(x) = QNaN for NaNs | |
62 | // Return tanhl(x) = sign(x)*1.0 for Inf | |
63 | // Return tanhl(x) = x + x^2 for - denormals | |
64 | // Return tanhl(x) = x - x^2 for + denormals | |
65 | // | |
66 | // 2. [0;1/8] path: 0.0 < |x| < 1/8 | |
67 | // Return tanhl(x) = x + x^3*A3 + ... + x^15*A15 | |
68 | // | |
69 | // 3. Main path: 1/8 <= |x| < 22.8 | |
70 | // For several ranges of 1/8 <= |x| < 22.8 | |
71 | // Return tanhl(x) = sign(x)*((A0H+A0L) + y*(A1H+A1L) + y^2*(A2H+A2L) + | |
72 | // + y^3*A3 + y^4*A4 + ... + y^25*A25 ) | |
73 | // where y = (|x|/a) - b | |
74 | // | |
75 | // For each range there is particular set of coefficients. | |
76 | // Below is the list of ranges: | |
77 | // 1/8 <= |x| < 1/4 a = 0.125, b = 1.5 | |
78 | // 1/4 <= |x| < 1/2 a = 0.25, b = 1.5 | |
79 | // 1/2 <= |x| < 1.0 a = 0.5, b = 1.5 | |
80 | // 1.0 <= |x| < 2.0 a = 1.0, b = 1.5 | |
81 | // 2.0 <= |x| < 3.25 a = 2.0, b = 1.5 | |
82 | // 3.25 <= |x| < 4.0 a = 2.0, b = 2.0 | |
83 | // 4.0 <= |x| < 6.5 a = 4.0, b = 1.5 | |
84 | // 6.5 <= |x| < 8.0 a = 4.0, b = 2.0 | |
85 | // 8.0 <= |x| < 13.0 a = 8.0, b = 1.5 | |
86 | // 13.0 <= |x| < 16.0 a = 8.0, b = 2.0 | |
87 | // 16.0 <= |x| < 22.8 a = 16.0, b = 1.5 | |
88 | // ( [3.25;4.0], [6.5;8.0], [13.0;16.0] subranges are separated | |
89 | // to resolve monotonicity issues ) | |
90 | // | |
91 | // 4. Saturation path: 22.8 <= |x| < +INF | |
92 | // Return tanhl(x) = sign(x)*(1.0 - tiny_value) | |
93 | // (tiny_value ~ 1e-1233) | |
94 | // | |
95 | // Implementation notes | |
96 | // -------------------- | |
97 | // | |
98 | // 1. Special path: x = 0, INF, NaNs, denormals | |
99 | // | |
100 | // This branch is cut off by one fclass operation. | |
101 | // Then zeros+NaNs, infinities and denormals are processed separately. | |
102 | // For denormals we use a simple fma operation: x + x*x for negative denormals, x - x*x for positive ones | |
103 | // | |
104 | // 2. [0;1/8] path: 0.0 < |x| < 1/8 | |
105 | // | |
106 | // Here we use a simple polynomial computation, where the last step | |
107 | // is performed as x + x^3*A3+... | |
108 | // The rest of the polynomial is factorized using a binary tree technique. | |
109 | // | |
110 | // 3. Main path: 1/8 <= |x| < 22.8 | |
111 | // | |
112 | // Multiprecision arithmetic has to be performed only for the first few | |
113 | // polynomial terms (up to the 3rd degree of y) | |
114 | // Here we use the same parallelisation approach as above: | |
115 | // Split the whole polynomial into a first, "multiprecision" part and a second, | |
116 | // so-called "tail", native-precision part. | |
117 | // | |
118 | // 1) Multiprecision part: | |
119 | // [v1=(A0H+A0L)+y*(A1H+A1L)] + [v2=y^2*((A2H+A2L)+y*A3)] | |
120 | // v1 and v2 terms calculated in parallel | |
121 | // | |
122 | // 2) Tail part: | |
123 | // v3 = y^4 * ( A4 + y*A5 + ... + y^21*A25 ) | |
124 | // v3 is split into 2 even parts (10 coefficients in each one). | |
125 | // These 2 parts are also factorized using a binary tree technique. | |
126 | // | |
127 | // So the costs of the Multiprecision and Tail parts are almost the same | |
128 | // and we have both results ready before the final summation. | |
129 | // | |
130 | // Some tricks were applied to maintain symmetry in directed | |
131 | // rounding modes (to +/-inf). We had to set the result sign | |
132 | // not at the last operation but much earlier, and in | |
133 | // several places. | |
134 | // | |
135 | // 4. Saturation path: 22.8 <= |x| < +INF | |
136 | // | |
137 | // We use formula sign(x)*(1.0 - tiny_value) instead of simple sign(x)*1.0 | |
138 | // just to meet IEEE requirements for different rounding modes in this case. | |
139 | // | |
140 | // Registers used | |
141 | //============================================================== | |
142 | // Floating Point registers used: | |
143 | // f8 - input & output | |
144 | // f32 -> f92 | |
145 | ||
146 | // General registers used: | |
147 | // r2, r3, r32 -> r52 | |
148 | ||
149 | // Predicate registers used: | |
150 | // p0, p6 -> p11, p14, p15 | |
151 | ||
152 | // p6 - arg is zero, denormal or special IEEE | |
153 | // p7 - arg is in [16;32] binary interval | |
154 | // p8 - arg is in one of subranges | |
155 | // [3.25;4.0], [6.5;8.0], [13.0;16.0] | |
156 | // p9 - arg < 1/8 | |
157 | // p10 - arg is NOT in one of subranges | |
158 | // [3.25;4.0], [6.5;8.0], [13.0;16.0] | |
159 | // p11 - arg in saturation domain | |
160 | // p14 - arg is positive | |
161 | // p15 - arg is negative | |
162 | ||
163 | // Assembly macros: symbolic names for general registers r2, r3 and r33-r52 (the register list above also mentions r32, which has no alias in this block) | |
164 | //============================================================== | |
165 | rDataPtr = r2 |
166 | rTailDataPtr = r3 |
167 | ||
168 | rBias = r33 |
169 | rSignBit = r34 |
170 | rInterval = r35 |
171 | ||
172 | rArgExp = r36 |
173 | rArgSig = r37 |
174 | r3p25Offset = r38 |
175 | r2to4 = r39 |
176 | r1p25 = r40 |
177 | rOffset = r41 |
178 | r1p5 = r42 |
179 | rSaturation = r43 |
180 | r1625Sign = r44 |
181 | rTiny = r45 |
182 | rAddr1 = r46 |
183 | rAddr2 = r47 |
184 | rTailAddr1 = r48 |
185 | rTailAddr2 = r49 |
186 | rTailOffset = r50 |
187 | rTailAddOffset = r51 |
188 | rShiftedDataPtr = r52 |
189 | ||
190 | // Floating-point register aliases: f32-f60 are named after coefficients A0H..A25; f61-f92 are (by name) argument powers, partial results, temporaries and constants ===== | |
191 | fA0H = f32 |
192 | fA0L = f33 |
193 | fA1H = f34 |
194 | fA1L = f35 |
195 | fA2H = f36 |
196 | fA2L = f37 |
197 | fA3 = f38 |
198 | fA4 = f39 |
199 | fA5 = f40 |
200 | fA6 = f41 |
201 | fA7 = f42 |
202 | fA8 = f43 |
203 | fA9 = f44 |
204 | fA10 = f45 |
205 | fA11 = f46 |
206 | fA12 = f47 |
207 | fA13 = f48 |
208 | fA14 = f49 |
209 | fA15 = f50 |
210 | fA16 = f51 |
211 | fA17 = f52 |
212 | fA18 = f53 |
213 | fA19 = f54 |
214 | fA20 = f55 |
215 | fA21 = f56 |
216 | fA22 = f57 |
217 | fA23 = f58 |
218 | fA24 = f59 |
219 | fA25 = f60 |
220 | ||
221 | fArgSqr = f61 |
222 | fArgCube = f62 |
223 | fArgFour = f63 |
224 | fArgEight = f64 |
225 | ||
226 | fArgAbsNorm = f65 |
227 | fArgAbsNorm2 = f66 |
228 | fArgAbsNorm2L = f67 |
229 | fArgAbsNorm3 = f68 |
230 | fArgAbsNorm4 = f69 |
231 | fArgAbsNorm11 = f70 |
232 | ||
233 | fRes = f71 |
234 | fResH = f72 |
235 | fResL = f73 |
236 | fRes1H = f74 |
237 | fRes1L = f75 |
238 | fRes1Hd = f76 |
239 | fRes2H = f77 |
240 | fRes2L = f78 |
241 | fRes3H = f79 |
242 | fRes3L = f80 |
243 | fRes4 = f81 |
244 | ||
245 | fTT = f82 |
246 | fTH = f83 |
247 | fTL = f84 |
248 | fTT2 = f85 |
249 | fTH2 = f86 |
250 | fTL2 = f87 |
251 | ||
252 | f1p5 = f88 |
253 | f2p0 = f89 |
254 | fTiny = f90 |
255 | fSignumX = f91 |
256 | fArgAbsNorm4X = f92 |
257 | ||
258 | // Data tables | |
259 | //============================================================== | |
260 | RODATA |
261 | ||
262 | .align 16 |
263 | LOCAL_OBJECT_START(tanhl_data) |
264 | ||
265 | ////////// Main tables: two-word entries are a 64-bit significand followed by a sign/exponent word; H/L entries are single 64-bit words holding the high and low parts of one coefficient /////////// | |
266 | _0p125_to_0p25_data: // exp = 2^-3 |
267 | // Polynomial coefficients for tanh(x), 1/8 <= |x| < 1/4, in y = |x|/0.125 - 1.5. Entry order: A3, A2H, A2L, A1H, A1L, A0H, A0L, then A25 down to A14 (A13..A4 are in _0p125_to_0p25_data_tail). | |
268 | data8 0x93D27D6AE7E835F8, 0x0000BFF4 //A3 = -5.6389704216278164626050408239e-04 |
269 | data8 0xBF66E8668A78A8BC //A2H = -2.7963640930198357253955165902e-03 |
270 | data8 0xBBD5384EFD0E7A54 //A2L = -1.7974001252014762983581666453e-20 |
271 | data8 0x3FBEE69E31DB6156 //A1H = 1.2070645062647619716322822114e-01 |
272 | data8 0x3C43A0B4E24A3DCA //A1L = 2.1280460108882061756490131241e-18 |
273 | data8 0x3FC7B8FF903BF776 //A0H = 1.8533319990813951205765874874e-01 |
274 | data8 0x3C593F1A61986FD4 //A0L = 5.4744612262799573374268254539e-18 |
275 | data8 0xDB9E6735560AAE5A, 0x0000BFA3 //A25 = -3.4649731131719154051239475238e-28 |
276 | data8 0xF0DDE953E4327704, 0x00003FA4 //A24 = 7.6004173864565644629900702857e-28 |
277 | data8 0x8532AED11DEC5612, 0x00003FAB //A23 = 5.3798235684551098715428515761e-26 |
278 | data8 0xAEF72A34D88B0038, 0x0000BFAD //A22 = -2.8267199091484508912273222600e-25 |
279 | data8 0x9645EF1DCB759DDD, 0x0000BFB2 //A21 = -7.7689413112830095709522203109e-24 |
280 | data8 0xA5D12364E121F70F, 0x00003FB5 //A20 = 6.8580281614531622113161030550e-23 |
281 | data8 0x9CF166EA815AC705, 0x00003FB9 //A19 = 1.0385615003184753213024737634e-21 |
282 | data8 0x852B1D0252498752, 0x0000BFBD //A18 = -1.4099753997949827217635356478e-20 |
283 | data8 0x9270F5716D25EC9F, 0x0000BFC0 //A17 = -1.2404055949090177751123473821e-19 |
284 | data8 0xC216A9C4EEBDDDCA, 0x00003FC4 //A16 = 2.6303900460415782677749729120e-18 |
285 | data8 0xDCE944D89FF592F2, 0x00003FC6 //A15 = 1.1975620514752377092265425941e-17 |
286 | data8 0x83C8DDF213711381, 0x0000BFCC //A14 = -4.5721980583985311263109531319e-16 |
287 | LOCAL_OBJECT_END(tanhl_data) |
288 | ||
289 | LOCAL_OBJECT_START(_0p25_to_0p5_data) |
290 | // Polynomial coefficients for tanh(x), 1/4 <= |x| < 1/2, in y = |x|/0.25 - 1.5. Entry order: A3, A2H, A2L, A1H, A1L, A0H, A0L, then A25 down to A14 (A13..A4 are in _0p25_to_0p5_data_tail). | |
291 | data8 0xB6E27B747C47C8AD, 0x0000BFF6 //A3 = -2.7905990032063258105302045572e-03 |
292 | data8 0xBF93FD54E226F8F7 //A2H = -1.9521070769536099515084615064e-02 |
293 | data8 0xBC491BC884F6F18A //A2L = -2.7222721075104525371410300625e-18 |
294 | data8 0x3FCBE3FBB015A591 //A1H = 2.1789499376181400980279079249e-01 |
295 | data8 0x3C76AFC2D1AE35F7 //A1L = 1.9677459707672596091076696742e-17 |
296 | data8 0x3FD6EF53DE8C8FAF //A0H = 3.5835739835078589399230963863e-01 |
297 | data8 0x3C8E2A1C14355F9D //A0L = 5.2327050592919416045278607775e-17 |
298 | data8 0xF56D363AAE3BAD53, 0x00003FBB //A25 = 6.4963882412697389947564301120e-21 |
299 | data8 0xAD6348526CEEB897, 0x0000BFBD //A24 = -1.8358149767147407353343152624e-20 |
300 | data8 0x85D96A988565FD65, 0x0000BFC1 //A23 = -2.2674950494950919052759556703e-19 |
301 | data8 0xD52CAF6B1E4D9717, 0x00003FC3 //A22 = 1.4445269502644677106995571101e-18 |
302 | data8 0xBD7E1BE5CBEF7A01, 0x00003FC5 //A21 = 5.1362075721080004718090799595e-18 |
303 | data8 0xAE84A9B12ADD6948, 0x0000BFC9 //A20 = -7.5685210830925426342786733068e-17 |
304 | data8 0xEAC2D5FCF80E250C, 0x00003FC6 //A19 = 1.2726423522879522181100392135e-17 |
305 | data8 0xE0D2A8AC8C2EDB95, 0x00003FCE //A18 = 3.1200443098733419749016380203e-15 |
306 | data8 0xB22F0AB7B417F78E, 0x0000BFD0 //A17 = -9.8911854977385933809488291835e-15 |
307 | data8 0xE25A627BAEFFA7A4, 0x0000BFD3 //A16 = -1.0052095388666003876301743498e-13 |
308 | data8 0xC90F32EC4A17F908, 0x00003FD6 //A15 = 7.1430637679768183097897337145e-13 |
309 | data8 0x905F6F124AF956B1, 0x00003FD8 //A14 = 2.0516607231389483452611375485e-12 |
310 | LOCAL_OBJECT_END(_0p25_to_0p5_data) |
311 | ||
312 | LOCAL_OBJECT_START(_0p5_to_1_data) |
313 | // Polynomial coefficients for tanh(x), 1/2 <= |x| < 1, in y = |x|/0.5 - 1.5. Entry order: A3, A2H, A2L, A1H, A1L, A0H, A0L, then A25 down to A14 (A13..A4 are in _0p5_to_1_data_tail). | |
314 | data8 0xAB402BE491EE72A7, 0x00003FF7 //A3 = 5.2261556931080934657023772945e-03 |
315 | data8 0xBFB8403D3DDA87BE //A2H = -9.4730212784752659826992271519e-02 |
316 | data8 0xBC6FF7BC2AB71A8B //A2L = -1.3863786398568460929625760740e-17 |
317 | data8 0x3FD3173B1EFA6EF4 //A1H = 2.9829290414066567116435635398e-01 |
318 | data8 0x3C881E4DCABDE840 //A1L = 4.1838710466827119847963316219e-17 |
319 | data8 0x3FE45323E552F228 //A0H = 6.3514895238728730220145735075e-01 |
320 | data8 0x3C739D5832BF7BCF //A0L = 1.7012977006567066423682445459e-17 |
321 | data8 0xF153980BECD8AE12, 0x00003FD0 //A25 = 1.3396313991261493342597057700e-14 |
322 | data8 0xEC9ACCD245368129, 0x0000BFD3 //A24 = -1.0507358886349528807350792383e-13 |
323 | data8 0x8AE6498CA36D2D1A, 0x00003FD4 //A23 = 1.2336759149738309660361813001e-13 |
324 | data8 0x8DF02FBF5AC70E64, 0x00003FD7 //A22 = 1.0085317723615282268326194551e-12 |
325 | data8 0x9E15C7125DA204EE, 0x0000BFD9 //A21 = -4.4930478919612724261941857560e-12 |
326 | data8 0xA62C6F39BDDCEC1C, 0x00003FD7 //A20 = 1.1807342457875095150035780314e-12 |
327 | data8 0xDFD8D65D30F80F52, 0x00003FDC //A19 = 5.0896919887121116317817665996e-11 |
328 | data8 0xB795AFFD458F743E, 0x0000BFDE //A18 = -1.6696932710534097241291327756e-10 |
329 | data8 0xFEF30234CB01EC89, 0x0000BFDD //A17 = -1.1593749714588103589483091370e-10 |
330 | data8 0xA2F638356E13761E, 0x00003FE2 //A16 = 2.3714062288761887457674853605e-09 |
331 | data8 0xC429CC0D031E4FD5, 0x0000BFE3 //A15 = -5.7091025466377379046489586383e-09 |
332 | data8 0xC78363FF929EFF62, 0x0000BFE4 //A14 = -1.1613199289622686725595739572e-08 |
333 | LOCAL_OBJECT_END(_0p5_to_1_data) |
334 | ||
335 | LOCAL_OBJECT_START(_1_to_2_data) |
336 | // Polynomial coefficients for tanh(x), 1 <= |x| < 2.0, in y = |x|/1.0 - 1.5. Entry order: A3, A2H, A2L, A1H, A1L, A0H, A0L, then A25 down to A14 (lower-order tail coefficients are in _1_to_2_data_tail). | |
337 | data8 0xB3D8FB48A548D99A, 0x00003FFB //A3 = 8.7816203264683800892441646129e-02 |
338 | data8 0xBFC4EFBD8FB38E3B //A2H = -1.6356629864377389416141284073e-01 |
339 | data8 0xBC77687FD8087B23 //A2L = -2.0303377679446772162287121190e-17 |
340 | data8 0x3FC72165282C6F72 //A1H = 1.8070663892364852154415189034e-01 |
341 | data8 0x3C64E01F7A76D777 //A1L = 9.0532964466719018524360408402e-18 |
342 | data8 0x3FECF6F9786DF577 //A0H = 9.0514825364486639625027919465e-01 |
343 | data8 0x3C8834EDCE71A65B //A0L = 4.1992023813070331863928976191e-17 |
344 | data8 0xC3EEEB3EFA688094, 0x00003FE2 //A25 = 2.8512044383274095705865793485e-09 |
345 | data8 0x88461973672AEB12, 0x0000BFE1 //A24 = -9.9152258079470849685057375343e-10 |
346 | data8 0xFC2AF9950DC5027E, 0x0000BFE4 //A23 = -1.4678101918123116001692289670e-08 |
347 | data8 0x9C80CA742F89B7B5, 0x00003FE6 //A22 = 3.6438714992394138274843759814e-08 |
348 | data8 0xA0B3D7FAA606260A, 0x0000BFE6 //A21 = -3.7416469848124568887944709492e-08 |
349 | data8 0xDA5858432FBD9D9D, 0x0000BFE6 //A20 = -5.0837429421503142141842414978e-08 |
350 | data8 0xB0244D1E1AE9C1B0, 0x00003FE9 //A19 = 3.2808967255272595749004827841e-07 |
351 | data8 0xC8D3109ACF740738, 0x0000BFEA //A18 = -7.4812945767507614821609020680e-07 |
352 | data8 0xBB0F3440EEA55BBF, 0x00003FEA //A17 = 6.9685053481643125932497676583e-07 |
353 | data8 0xC13A8B08D8576C19, 0x00003FEB //A16 = 1.4396658837712390333960587173e-06 |
354 | data8 0xFF3A1163CC5522A1, 0x0000BFED //A15 = -7.6063522055104010298762276148e-06 |
355 | data8 0x8672AF27EB0823B7, 0x00003FEF //A14 = 1.6027448793338500004496520337e-05 |
356 | LOCAL_OBJECT_END(_1_to_2_data) |
357 | ||
358 | LOCAL_OBJECT_START(_2_to_3p25_data) |
359 | // Polynomial coefficients for tanh(x), 2 <= |x| < 3.25, in y = |x|/2.0 - 1.5. Entry order: A3, A2H, A2L, A1H, A1L, A0H, A0L, then A25 down to A14 (A13..A4 are not part of this table). | |
360 | data8 0xD45657BEC559E366, 0x00003FFA //A3 = 5.1840155367548909799883161889e-02 |
361 | data8 0xBFA41B109CA6AB81 //A2H = -3.9268988726084870510835145296e-02 |
362 | data8 0xBC2C3D708A4E56C5 //A2L = -7.6544669252238280132415018518e-19 |
363 | data8 0x3F9434A517BBC5F4 //A1H = 1.9732074330880380874653212686e-02 |
364 | data8 0x3C3ED62DD9585229 //A1L = 1.6716574468135097509707871438e-18 |
365 | data8 0x3FEFD77D111A0AFF //A0H = 9.9505475368673035330147058630e-01 |
366 | data8 0x3C9C415E151C6CA5 //A0L = 9.8030409604070051319822874013e-17 |
367 | data8 0xB1596391D4534D52, 0x00003FEC //A25 = 2.6427086526487251988631279067e-06 |
368 | data8 0xC4DC44E243D1AF5F, 0x00003FEF //A24 = 2.3467591534149209236830008333e-05 |
369 | data8 0xAED5786023982BB8, 0x00003FF0 //A23 = 4.1683642395739762658623742687e-05 |
370 | data8 0xCF39926C9FBC6A10, 0x00003FF0 //A22 = 4.9406263949321793291856681624e-05 |
371 | data8 0xA255A72359928142, 0x00003FF0 //A21 = 3.8703580278108400672236161973e-05 |
372 | data8 0xA2E573B9FC332C0D, 0x00003FED //A20 = 4.8546879618263642155709302480e-06 |
373 | data8 0x82C7BD01830ACA93, 0x00003FF0 //A19 = 3.1180436075031301077175550468e-05 |
374 | data8 0xB38AF4C76E96444B, 0x0000BFF0 //A18 = -4.2806338675404452784440167120e-05 |
375 | data8 0xEC08FF0FB194464C, 0x00003FF0 //A17 = 5.6275163156181928637744511210e-05 |
376 | data8 0xB850825D9E235135, 0x0000BFF0 //A16 = -4.3943998628289568813056822585e-05 |
377 | data8 0xF98436E838763687, 0x0000BFEF //A15 = -2.9744680263523220185672219686e-05 |
378 | data8 0xE1851A2D00737A5D, 0x00003FF2 //A14 = 2.1507256570895163202182573369e-04 |
379 | LOCAL_OBJECT_END(_2_to_3p25_data) |
380 | ||
381 | LOCAL_OBJECT_START(_4_to_6p5_data) |
382 | // Polynomial coefficients for tanh(x), 4 <= |x| < 6.5, in y = |x|/4.0 - 1.5. Entry order: A3, A2H, A2L, A1H, A1L, A0H, A0L, then A25 down to A14 (A13..A4 are not part of this table). | |
383 | data8 0x896FDBD321A0BE58, 0x00003FF5 //A3 = 1.0485606995331904734870550114e-03 |
384 | data8 0xBF39C522B95A37D6 //A2H = -3.9321992640217512306882730044e-04 |
385 | data8 0xBBA9B3EC39A45338 //A2L = -2.7213922673282819034134988241e-21 |
386 | data8 0x3F19C5377A48B5AD //A1H = 9.8306189621330793766869338146e-05 |
387 | data8 0x3BCAFCB1D08A891C //A1L = 1.1429476443042275163117526657e-20 |
388 | data8 0x3FEFFFE63ABE253B //A0H = 9.9998771165079547440512897083e-01 |
389 | data8 0x3C9BB74C4EE0D16F //A0L = 9.6159219890436197391279544561e-17 |
390 | data8 0x8D86121D469AFA7E, 0x0000BFEF //A25 = -1.6870941388985743600323604423e-05 |
391 | data8 0x9D3656A36593C5C4, 0x00003FEF //A24 = 1.8741161763079973068909254398e-05 |
392 | data8 0xDCD772D5BF9ADB96, 0x00003FF0 //A23 = 5.2652739523018349983563695656e-05 |
393 | data8 0xFF79ADCF0DCBCC2D, 0x00003FF1 //A22 = 1.2182012003034659966028035977e-04 |
394 | data8 0x84D24E394DEFD0D2, 0x00003FF1 //A21 = 6.3334229517535065590380468696e-05 |
395 | data8 0xA66B56BFD2782544, 0x00003FF1 //A20 = 7.9354902476954571736114945842e-05 |
396 | data8 0xFB15771FBF3155FE, 0x0000BFEE //A19 = -1.4965763624796745134798717707e-05 |
397 | data8 0xC774790126BE54C3, 0x00003FEF //A18 = 2.3776885435831770523136610539e-05 |
398 | data8 0x825A13DACB8C68CD, 0x00003FEF //A17 = 1.5539153272890695426189818556e-05 |
399 | data8 0xCFF96E6810AACE27, 0x0000BFF1 //A16 = -9.9169893703251156059893890295e-05 |
400 | data8 0x8A85D2061B865024, 0x00003FF3 //A15 = 2.6421115104625621420758344535e-04 |
401 | data8 0x922EC6F3CFE0496E, 0x0000BFF4 //A14 = -5.5764283474946207558456581668e-04 |
402 | LOCAL_OBJECT_END(_4_to_6p5_data) |
403 | ||
404 | LOCAL_OBJECT_START(_8_to_13_data) |
405 | // Polynomial coefficients for tanh(x), 8 <= |x| < 13, in y = |x|/8.0 - 1.5. Entry order: A3, A2H, A2L, A1H, A1L, A0H, A0L, then A25 down to A14 (A13..A4 are not part of this table). | |
406 | data8 0xDD6050A898303460, 0x00003FE6 //A3 = 5.1543170295688189081352133793e-08 |
407 | data8 0xBE44C1078FDBADC0 //A2H = -9.6643444318955652627581125180e-09 |
408 | data8 0xBAF95FCAA6DBBA6F //A2L = -1.3118146684038113473094275420e-24 |
409 | data8 0x3E14C1078FE26748 //A1H = 1.2080430540780827633746315479e-09 |
410 | data8 0x3A88168082F37D95 //A1L = 9.7290246966246404028418245094e-27 |
411 | data8 0x3FEFFFFFFFF59F7C //A0H = 9.9999999992449728480892190419e-01 |
412 | data8 0x3C7C068EBC5C2EEB //A0L = 2.4308346546749583521003998922e-17 |
413 | data8 0x9DC155C77A6C46E5, 0x00003FF2 //A25 = 1.5044709695520252096006763473e-04 |
414 | data8 0xF2F9E09CA47F46E9, 0x00003FF3 //A24 = 4.6344010077547944693833282056e-04 |
415 | data8 0xCBFD67E704734BC8, 0x00003FF4 //A23 = 7.7815958662026429864083620142e-04 |
416 | data8 0xC18DC821CD67E621, 0x00003FF4 //A22 = 7.3834928521190855055818897104e-04 |
417 | data8 0x8AF72BCAB05A296E, 0x00003FF4 //A21 = 5.3011135848666430331904214879e-04 |
418 | data8 0xC2E73BE9B9AB4007, 0x00003FF2 //A20 = 1.8587423129049905806822275188e-04 |
419 | data8 0xE7E8C2058E2FF9F7, 0x00003FF1 //A19 = 1.1058292891321512917337425414e-04 |
420 | data8 0xC46309F52E429F97, 0x0000BFF0 //A18 = -4.6822278664829811025251866877e-05 |
421 | data8 0x81966C1E007E9BEB, 0x00003FF1 //A17 = 6.1792176836716291200611553354e-05 |
422 | data8 0x8CEDC4BEFCAB9A7E, 0x0000BFF1 //A16 = -6.7200080564674449915571760779e-05 |
423 | data8 0x8B64E9FA53210018, 0x00003FF1 //A15 = 6.6468331917938095774361868182e-05 |
424 | data8 0x82DEDAA539A3A3F1, 0x0000BFF1 //A14 = -6.2403928644276709411156885292e-05 |
425 | LOCAL_OBJECT_END(_8_to_13_data) |
426 | ||
427 | LOCAL_OBJECT_START(_16_to_22p8_data) |
428 | // Polynomial coefficients for tanh(x), 16 <= |x| < 22.88, in y = |x|/16.0 - 1.5. Entry order: A3, A2H, A2L, A1H, A1L, A0H, A0L, then A25 down to A14 (A13..A4 are not part of this table). NOTE(review): the file header gives this upper bound as 22.8 - confirm which value the code uses. | |
429 | data8 0x992C00F33DDE804D, 0x00003FCE //A3 = 2.1256869805798788337547274131e-15 |
430 | data8 0x3C8D42EA28102760 //A2H = 5.0760412270332007485198379096e-17 |
431 | data8 0x391A747B43B072DD //A2L = 1.2737621993898125881520341053e-33 |
432 | data8 0x3C309BC5C3CB4D5F //A1H = 9.0034785192019775952205276560e-19 |
433 | data8 0x38A8EF3B5C9DCE71 //A1L = 9.3793162715476168397242934494e-36 |
434 | data8 0x3FF0000000000000 //A0H = 1.0000000000000000000000000000e+00 |
435 | data8 0x3BACC66AFD5CA22A //A0L = 3.0466790472070565954180861749e-21 |
436 | data8 0xF020FB351C2F37CB, 0x00003FF1 //A25 = 1.1450235038836625246604146870e-04 |
437 | data8 0xBE80596C51302A7B, 0x00003FF4 //A24 = 7.2670503421185030764546828414e-04 |
438 | data8 0x91343CF8577E0131, 0x00003FF6 //A23 = 2.2156380512949603402001207105e-03 |
439 | data8 0x8D029A8679641286, 0x00003FF7 //A22 = 4.3032888906494613055765544559e-03 |
440 | data8 0xC3713F64D8DC4BAB, 0x00003FF7 //A21 = 5.9644279041951657632420721490e-03 |
441 | data8 0xCD678C455A5D06C2, 0x00003FF7 //A20 = 6.2684473911812928601693994403e-03 |
442 | data8 0xA9E1C825BDCEEBCC, 0x00003FF7 //A19 = 5.1843859941826642445235686826e-03 |
443 | data8 0xE29C919AD93F6EB9, 0x00003FF6 //A18 = 3.4578185539872939928152204329e-03 |
444 | data8 0xF7E615A75994A607, 0x00003FF5 //A17 = 1.8913175041916131006881986311e-03 |
445 | data8 0xE102EFE0F7F2B2AD, 0x00003FF4 //A16 = 8.5835064987089641065525269712e-04 |
446 | data8 0xAAD62946DEE96996, 0x00003FF3 //A15 = 3.2584489313998677644253007210e-04 |
447 | data8 0xDA2470DE110B293E, 0x00003FF1 //A14 = 1.0401837693241806604296821650e-04 |
448 | LOCAL_OBJECT_END(_16_to_22p8_data) |
449 | ||
450 | LOCAL_OBJECT_START(_3p25_to_4_data) |
451 | // Polynomial coefficients for tanh(x), 3.25 <= |x| < 4, in y = |x|/2.0 - 2.0. Entry order: A3, A2H, A2L, A1H, A1L, A0H, A0L, then A25 down to A14 (A13..A4 are not part of this table). | |
452 | data8 0xE9E07240432926E6, 0x00003FF7 //A3 = 7.1373517862636557382403555215e-03 |
453 | data8 0xBF75F495227AF306 //A2H = -5.3602052282115727338540622782e-03 |
454 | data8 0xBBBE92D355A6B716 //A2L = -6.4741983326810209847018826624e-21 |
455 | data8 0x3F65F85AD510B690 //A1H = 2.6819013660517934671823070403e-03 |
456 | data8 0x3C159A0B73E6EC01 //A1L = 2.9275813076637328121849573333e-19 |
457 | data8 0x3FEFFA81708A0B42 //A0H = 9.9932929973906703402519724477e-01 |
458 | data8 0x3C66857246C19DC6 //A0L = 9.7670460995685717424398031188e-18 |
459 | data8 0xE6B6B8365B1E4D6C, 0x00003FE3 //A25 = 6.7146538162212081470554423396e-09 |
460 | data8 0xE0453CEEF483A510, 0x00003FE2 //A24 = 3.2635647369924061614015292015e-09 |
461 | data8 0x9C7D83B56E92CF1A, 0x00003FE5 //A23 = 1.8217867585545497089756353348e-08 |
462 | data8 0xA94635C48ABA9EB4, 0x0000BFE4 //A22 = -9.8530586070049930796756799547e-09 |
463 | data8 0xB1B0C14443067646, 0x00003FE5 //A21 = 2.0685890807654992387562340307e-08 |
464 | data8 0x9C6E549781E293C3, 0x00003FDE //A20 = 1.4227314592865135171341122138e-10 |
465 | data8 0xB0CBFCE7C80F57A7, 0x0000BFE7 //A19 = -8.2327438416004542109809245219e-08 |
466 | data8 0xB151AB3876E896E1, 0x00003FE9 //A18 = 3.3028241036175815328309577940e-07 |
467 | data8 0xFCF3A5C1A5CB7EEE, 0x0000BFEA //A17 = -9.4231869277542043001280640966e-07 |
468 | data8 0x96A9016C7C95BEDA, 0x00003FEC //A16 = 2.2450115975007100522962781833e-06 |
469 | data8 0x9B9B0A3901DEC05B, 0x0000BFED //A15 = -4.6374089937147736266514566049e-06 |
470 | data8 0x8987DF26A6789CCF, 0x00003FEE //A14 = 8.1974714257536543772040700977e-06 |
471 | LOCAL_OBJECT_END(_3p25_to_4_data) |
472 | ||
473 | LOCAL_OBJECT_START(_6p5_to_8_data) |
474 | // Polynomial coefficients for tanh(x), 6.5 <= |x| < 8.0, in y = |x|/4.0 - 2.0. Entry order: A3, A2H, A2L, A1H, A1L, A0H, A0L, then A25 down to A14 (A13..A4 are not part of this table). | |
475 | data8 0xA11C8A63815E5657, 0x00003FEF //A3 = 1.9205985861286093001394561449e-05 |
476 | data8 0xBEDE355AD6CB61D8 //A2H = -7.2022479400070228499307345427e-06 |
477 | data8 0xBB8E6B50B8468A63 //A2L = -8.0518953122203408718779840543e-22 |
478 | data8 0x3EBE355B48DCF330 //A1H = 1.8005623902549165889479948488e-06 |
479 | data8 0x3B5837550FFA98DA //A1L = 8.0124491698609178046195694087e-23 |
480 | data8 0x3FEFFFFF872A91F8 //A0H = 9.9999977492967584424832239165e-01 |
481 | data8 0x3C8A43B839B4EB63 //A0L = 4.5561696441306660142461355317e-17 |
482 | data8 0xB5BC1948966B8826, 0x0000BFE6 //A25 = -4.2313421330480692560677276010e-08 |
483 | data8 0x91D0BE367389BDFC, 0x0000BFE8 //A24 = -1.3580117599617083801153887619e-07 |
484 | data8 0xFFD950AF282AB36C, 0x0000BFE8 //A23 = -2.3827784451962439125197203287e-07 |
485 | data8 0x959B1770EBB8903A, 0x0000BFE9 //A22 = -2.7866256690165347051403663794e-07 |
486 | data8 0xCC78060D1C0CFF3C, 0x0000BFE8 //A21 = -1.9042644867126442102188429523e-07 |
487 | data8 0xF8919BAF2E87F31D, 0x0000BFE8 //A20 = -2.3149771783868910586746973299e-07 |
488 | data8 0xC5B6AC942A3F2440, 0x00003FE8 //A19 = 1.8413511183396213757149263639e-07 |
489 | data8 0xABF1A4703056450A, 0x0000BFEA //A18 = -6.4054099983863829656292958643e-07 |
490 | data8 0xBB543D8BDB670453, 0x00003FEB //A17 = 1.3957102903892251890348444989e-06 |
491 | data8 0xC9D6F37700C1D092, 0x0000BFEC //A16 = -3.0076451968978522605262647414e-06 |
492 | data8 0xCA6EF4BB64E49EC8, 0x00003FED //A15 = 6.0329860989478473738709576062e-06 |
493 | data8 0xBE25D0FD069D0A93, 0x0000BFEE //A14 = -1.1333687314965721384777951065e-05 |
494 | LOCAL_OBJECT_END(_6p5_to_8_data) |
495 | ||
496 | LOCAL_OBJECT_START(_13_to_16_data) |
497 | // Polynomial coefficients for tanh(x), 13 <= |x| < 16, in y = |x|/8.0 - 2.0. Entry order: A3, A2H, A2L, A1H, A1L, A0H, A0L, then A25 down to A14 (A13..A4 are not part of this table). | |
498 | data8 0x98176FD2075BDBD5, 0x00003FDB //A3 = 1.7290807363028159200235264756e-11 |
499 | data8 0xBD8C8464F76162D1 //A2H = -3.2420263805679445515400340441e-12 |
500 | data8 0xBA2D56B508E0F1FD //A2L = -1.8515322669984580704502445180e-28 |
501 | data8 0x3D5C8464F761639C //A1H = 4.0525329757100331782338488690e-13 |
502 | data8 0x3A0A09D9E328E620 //A1L = 4.1081479300866418212862258651e-29 |
503 | data8 0x3FEFFFFFFFFFFF1B //A0H = 9.9999999999997457589273608392e-01 |
504 | data8 0x3C9B9B089E9BFD89 //A0L = 9.5776165728054091471814161399e-17 |
505 | data8 0xC5395B9EC765BDB7, 0x00003FE6 //A25 = 4.5919803498257974411526879804e-08 |
506 | data8 0x9A0F1FCB1DC24C3A, 0x00003FE8 //A24 = 1.4347869798460288751020493795e-07 |
507 | data8 0x8AA5C3459FAD0B28, 0x00003FE9 //A23 = 2.5825111356333853968900510087e-07 |
508 | data8 0x9578B747988CFF9D, 0x00003FE9 //A22 = 2.7841245127068220034870119246e-07 |
509 | data8 0x810DF1A589D9CAF1, 0x00003FE9 //A21 = 2.4038267971021370956311255310e-07 |
510 | data8 0x8A00D77B9416EB75, 0x00003FE8 //A20 = 1.2852557749068320312899366352e-07 |
511 | data8 0xB2436C4A1849C498, 0x00003FE7 //A19 = 8.3010350873515703893886683374e-08 |
512 | data8 0xEA6405B18356600B, 0x00003FE3 //A18 = 6.8216675390299296071261114202e-09 |
513 | data8 0xF7606C022194B7E8, 0x00003FE5 //A17 = 2.8798432098264655723769995993e-08 |
514 | data8 0xAF4B0C453FCAF34E, 0x0000BFE5 //A16 = -2.0406809167824936143455638336e-08 |
515 | data8 0xC324C1F10D5FA7CC, 0x00003FE5 //A15 = 2.2717703170390130238356558599e-08 |
516 | data8 0xB34A2E3A4D3B9C31, 0x0000BFE5 //A14 = -2.0872076027950789618606920471e-08 |
517 | LOCAL_OBJECT_END(_13_to_16_data) |
518 | ||
519 | ||
520 | //////// "Tail" tables ////////// | |
521 | LOCAL_OBJECT_START(_0p125_to_0p25_data_tail) |
522 | // Polynomial coefficients for tanh(x), 1/8 <= |x| < 1/4, in y = |x|/0.125 - 1.5. Tail entries A13 down to A4, each a 64-bit significand followed by a sign/exponent word. | |
523 | data8 0x9D7D206E97ADC83A, 0x0000BFCC //A13 = -5.4639895428711257047470806445e-16 |
524 | data8 0xA8972B666A845810, 0x00003FD3 //A12 = 7.4869224589947988668562043110e-14 |
525 | data8 0x9A5B31511C9F4698, 0x0000BFD4 //A11 = -1.3709586467430093373657009487e-13 |
526 | data8 0xCBB8047BCB274982, 0x0000BFDA //A10 = -1.1580074124926108509393610532e-11 |
527 | data8 0xF95EB849E5F9247C, 0x00003FDC //A9 = 5.6700173336564916962945623180e-11 |
528 | data8 0xE7893404C6A53386, 0x00003FE1 //A8 = 1.6846457582993065168777704528e-09 |
529 | data8 0xF2E5C7E2B5F55ECC, 0x0000BFE4 //A7 = -1.4138500046802141367543484859e-08 |
530 | data8 0xF43906FF53A002C0, 0x0000BFE8 //A6 = -2.2745017243678613107034288816e-07 |
531 | data8 0xC6175D5E47D1D259, 0x00003FEC //A5 = 2.9517899220726077077586632607e-06 |
532 | data8 0xE7C2AE92CB36769B, 0x00003FEF //A4 = 2.7628001723157068127646694830e-05 |
533 | LOCAL_OBJECT_END(_0p125_to_0p25_data_tail) |
534 | ||
535 | LOCAL_OBJECT_START(_0p25_to_0p5_data_tail) |
536 | // Polynomial coefficients for tanh(x), 1/4 <= |x| < 1/2, in y = |x|/0.25 - 1.5. Tail entries A13 down to A4, each a 64-bit significand followed by a sign/exponent word. | |
537 | data8 0x9E2972C008B9965E, 0x0000BFDC //A13 = -3.5961854154738002253192260213e-11 |
538 | data8 0xC3EABA3D219BEA8A, 0x00003FDB //A12 = 2.2273173303628274478819473067e-11 |
539 | data8 0xC50FB68D960D5CD9, 0x00003FE1 //A11 = 1.4338102430978399800743148719e-09 |
540 | data8 0xB3BB92499EF2D583, 0x0000BFE3 //A10 = -5.2309100551458044083112632491e-09 |
541 | data8 0xBD915BE632F1D04E, 0x0000BFE6 //A9 = -4.4137194873936112573773943707e-08 |
542 | data8 0xBC48C813FA819141, 0x00003FE9 //A8 = 3.5070684356359066908197915734e-07 |
543 | data8 0xD3E34EA031AC611B, 0x00003FEA //A7 = 7.8934400708919584259192272835e-07 |
544 | data8 0x8EAC489D859541CD, 0x0000BFEF //A6 = -1.7007944944124693133572815137e-05 |
545 | data8 0x98D4D7E5D1508B8A, 0x00003FEF //A5 = 1.8218924920302265989878708948e-05 |
546 | data8 0xAC262F3F8CF49C02, 0x00003FF4 //A4 = 6.5669692402266433496312492412e-04 |
547 | LOCAL_OBJECT_END(_0p25_to_0p5_data_tail) |
548 | ||
549 | LOCAL_OBJECT_START(_0p5_to_1_data_tail) | |
550 | // Polynomial coefficients for the tanh(x), 1/2 <= |x| < 1: tail terms A13..A4, one 16-byte slot each (entry order is assumed fixed by the strided tail loads in tanhl) | |
551 | data8 0xDF67FB36FFA2A538, 0x00003FE7 //A13 = 1.0403160796697495720021114635e-07 | |
552 | data8 0xB7FB80FB5AFA63A4, 0x0000BFE8 //A12 = -1.7134699677764282023124981753e-07 | |
553 | data8 0xC87625A0BA7D6C5F, 0x0000BFEA //A11 = -7.4677732458471897291461679095e-07 | |
554 | data8 0x90DA375DD9AF6D79, 0x00003FED //A10 = 4.3169381418023765618186668159e-06 | |
555 | data8 0x82DFB03317B17316, 0x0000BFED //A9 = -3.9003426534601562552753368105e-06 | |
556 | data8 0xAA582FD4F3438BB4, 0x0000BFF0 //A8 = -4.0613288845040776435400454867e-05 | |
557 | data8 0xB1532D8CF763B21C, 0x00003FF2 //A7 = 1.6911021594787399557528570601e-04 | |
558 | data8 0x82E12AEF7CAB76C6, 0x0000BFEF //A6 = -1.5602059530458172761585925044e-05 | |
559 | data8 0x83256E3D0FBA5C93, 0x0000BFF6 //A5 = -2.0011324059500451791903108104e-03 | |
560 | data8 0xCC4AB2EC0965499B, 0x00003FF7 //A4 = 6.2344907419841579664122448353e-03 | |
561 | LOCAL_OBJECT_END(_0p5_to_1_data_tail) | |
562 | ||
563 | LOCAL_OBJECT_START(_1_to_2_data_tail) | |
564 | // Polynomial coefficients for the tanh(x), 1 <= |x| < 2.0: tail terms A13..A4, one 16-byte slot each (entry order is assumed fixed by the strided tail loads in tanhl) | |
565 | data8 0xCCAEE174EAC17F78, 0x0000BFEE //A13 = -1.2200065117856038355953618829e-05 | |
566 | data8 0xA39DD0981D1A2776, 0x0000BFF0 //A12 = -3.9009204899026604074167603200e-05 | |
567 | data8 0xB7104FA27FAF80D0, 0x00003FF2 //A11 = 1.7458316338540792661905876072e-04 | |
568 | data8 0xB219A7274436A734, 0x0000BFF3 //A10 = -3.3969918595931391572998415468e-04 | |
569 | data8 0xCCD9D03C0C73CECF, 0x00003FF2 //A9 = 1.9536097875337884986025498958e-04 | |
570 | data8 0x85321EA40CFEEBEE, 0x00003FF5 //A8 = 1.0162031558369402750607778300e-03 | |
571 | data8 0x81F272C08C308220, 0x0000BFF7 //A7 = -3.9656696618251138315464862909e-03 | |
572 | data8 0xE8761C6BDEA9ED87, 0x00003FF7 //A6 = 7.0941580558970243020090656343e-03 | |
573 | data8 0xAE4E9F3691F66877, 0x0000BFF6 //A5 = -2.6597155288710984120834711909e-03 | |
574 | data8 0xCC8286B331BD8AAA, 0x0000BFF9 //A4 = -2.4964583478826523250880337777e-02 | |
575 | LOCAL_OBJECT_END(_1_to_2_data_tail) | |
576 | ||
577 | LOCAL_OBJECT_START(_2_to_3p25_data_tail) | |
578 | // Polynomial coefficients for the tanh(x), 2 <= |x| < 3.25: tail terms A13..A4, one 16-byte slot each (entry order is assumed fixed by the strided tail loads in tanhl) | |
579 | data8 0x92E1711A3BD6408B, 0x0000BFF4 //A13 = -5.6030514548041036913731470443e-04 | |
580 | data8 0x8B9BD885FF3E98C5, 0x00003FF5 //A12 = 1.0651304064581604055612602669e-03 | |
581 | data8 0xD041356C7FA26A22, 0x0000BFF5 //A11 = -1.5888574328066952147023520244e-03 | |
582 | data8 0xDFA210BE9BE6B7FD, 0x00003FF5 //A10 = 1.7061849060196387827639060629e-03 | |
583 | data8 0x8ECC3606808028E9, 0x0000BFF4 //A9 = -5.4472999329435778312080340471e-04 | |
584 | data8 0xD5C053B8EEBD10C8, 0x0000BFF6 //A8 = -3.2615856552479930645151033322e-03 | |
585 | data8 0xB7BFD63AC5051539, 0x00003FF8 //A7 = 1.1215171059191957498023766643e-02 | |
586 | data8 0xC367C59D7FA3ADA2, 0x0000BFF9 //A6 = -2.3853193251842394834616848995e-02 | |
587 | data8 0x9FC9FB890BB053CF, 0x00003FFA //A5 = 3.9010984954739386625695104667e-02 | |
588 | data8 0xD01D077B42E7ED76, 0x0000BFFA //A4 = -5.0808934425896607486919526567e-02 | |
589 | LOCAL_OBJECT_END(_2_to_3p25_data_tail) | |
590 | ||
591 | LOCAL_OBJECT_START(_4_to_6p5_data_tail) | |
592 | // Polynomial coefficients for the tanh(x), 4 <= |x| < 6.5: tail terms A13..A4, one 16-byte slot each (entry order is assumed fixed by the strided tail loads in tanhl) | |
593 | data8 0x870CCE8C76C52C7E, 0x00003FF5 //A13 = 1.0303499350193060915603525934e-03 | |
594 | data8 0xE1431E54AD2A738B, 0x0000BFF5 //A12 = -1.7186140560972621669872002486e-03 | |
595 | data8 0xAB20056533E28734, 0x00003FF6 //A11 = 2.6111615345168277554841545330e-03 | |
596 | data8 0xECCB91D64718B9BD, 0x0000BFF6 //A10 = -3.6132079169671860943878776041e-03 | |
597 | data8 0x94771DA3B8C2EB4F, 0x00003FF7 //A9 = 4.5308012699419563988381317896e-03 | |
598 | data8 0xA7497377E4946F2C, 0x0000BFF7 //A8 = -5.1051915941441437592654444804e-03 | |
599 | data8 0xA76B2D6FCA088AE9, 0x00003FF7 //A7 = 5.1092120989582196669504468168e-03 | |
600 | data8 0x928C8961F33C9560, 0x0000BFF7 //A6 = -4.4723196805537430568162704711e-03 | |
601 | data8 0xDBDDDF6CDE9AB9BE, 0x00003FF6 //A5 = 3.3548994514326736175581084349e-03 | |
602 | data8 0x896E211733AD9D40, 0x0000BFF6 //A4 = -2.0970183170010094667442967500e-03 | |
603 | LOCAL_OBJECT_END(_4_to_6p5_data_tail) | |
604 | ||
605 | LOCAL_OBJECT_START(_8_to_13_data_tail) | |
606 | // Polynomial coefficients for the tanh(x), 8 <= |x| < 13: tail terms A13..A4, one 16-byte slot each (entry order is assumed fixed by the strided tail loads in tanhl) | |
607 | data8 0xE50C3476BED020AA, 0x00003FF0 //A13 = 5.4609221347524272615754239857e-05 | |
608 | data8 0xBA16F5F4EDC0EABC, 0x0000BFF0 //A12 = -4.4367239594986428539386662937e-05 | |
609 | data8 0x8B916C2F002C3D91, 0x00003FF0 //A11 = 3.3275617838067362533536610680e-05 | |
610 | data8 0xBFE8031097CB4442, 0x0000BFEF //A10 = -2.2877013297722792747267224605e-05 | |
611 | data8 0xEFE1FFD106B2DA41, 0x00003FEE //A9 = 1.4298129659899553350478452989e-05 | |
612 | data8 0x86EF1FF403A6622E, 0x0000BFEE //A8 = -8.0426979849841642112688693288e-06 | |
613 | data8 0x86EF200FD047306B, 0x00003FED //A7 = 4.0213490418736097707257704218e-06 | |
614 | data8 0xEC22782377882553, 0x0000BFEB //A6 = -1.7593402092805559754997565942e-06 | |
615 | data8 0xB119DA1DB7C47773, 0x00003FEA //A5 = 6.5975257917246601211360847253e-07 | |
616 | data8 0xDD6050A7761D67BB, 0x0000BFE8 //A4 = -2.0617268111985310661707082242e-07 | |
617 | LOCAL_OBJECT_END(_8_to_13_data_tail) | |
618 | ||
619 | LOCAL_OBJECT_START(_16_to_22p8_data_tail) | |
620 | // Polynomial coefficients for the tanh(x), 16 <= |x| < 22.88: tail terms A13..A4, one 16-byte slot each (entry order is assumed fixed by the strided tail loads in tanhl) | |
621 | data8 0xEAF4AF87336E81B1, 0x00003FEF //A13 = 2.8008914392791730186582989654e-05 | |
622 | data8 0xD5B309EA768E2711, 0x00003FED //A12 = 6.3687375204024238267961143128e-06 | |
623 | data8 0xA4048CA537113538, 0x00003FEB //A11 = 1.2220276227448617951538196845e-06 | |
624 | data8 0xD3EC78BB3425377D, 0x00003FE8 //A10 = 1.9736934193679794194181457250e-07 | |
625 | data8 0xE5763CD37440266E, 0x00003FE5 //A9 = 2.6712876934440631473215182284e-08 | |
626 | data8 0xCECA765EEB4A265F, 0x00003FE2 //A8 = 3.0092031912460315516888139627e-09 | |
627 | data8 0x99ABF588DF81A52E, 0x00003FDF //A7 = 2.7952722177649984066847682907e-10 | |
628 | data8 0xB9C78918294A4685, 0x00003FDB //A6 = 2.1120676552098603524020495036e-11 | |
629 | data8 0xB3A3C42AD539D50F, 0x00003FD7 //A5 = 1.2764169243389521270291967366e-12 | |
630 | data8 0x86BC347939478174, 0x00003FD3 //A4 = 5.9834437707863962671883176163e-14 | |
631 | LOCAL_OBJECT_END(_16_to_22p8_data_tail) | |
632 | ||
633 | LOCAL_OBJECT_START(_3p25_to_4_data_tail) | |
634 | // Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4: tail terms A13..A4, one 16-byte slot each (entry order is assumed fixed by the strided tail loads in tanhl) | |
635 | data8 0xBE9A2BE19F21BA1C, 0x0000BFEE //A13 = -1.1360778336288065244475976873e-05 | |
636 | data8 0xF84910F515BDB014, 0x00003FED //A12 = 7.3994819819577018481862729782e-06 | |
637 | data8 0xC4C84FB788AA4007, 0x00003FEF //A11 = 2.3458298013663976251972482656e-05 | |
638 | data8 0x86CC6243C170E5ED, 0x0000BFF2 //A10 = -1.2855374755847770638424932233e-04 | |
639 | data8 0xD3065AC539ABABFF, 0x00003FF3 //A9 = 4.0249790677367806832685138089e-04 | |
640 | data8 0x82C4413795EC381B, 0x0000BFF5 //A8 = -9.9767013652382759950854031514e-04 | |
641 | data8 0x88D588720888899A, 0x00003FF6 //A7 = 2.0879228705174076794011525274e-03 | |
642 | data8 0xF4CA066137741469, 0x0000BFF6 //A6 = -3.7351861548964870836350490741e-03 | |
643 | data8 0xB998746D56E81737, 0x00003FF7 //A5 = 5.6639259807333999973200378964e-03 | |
644 | data8 0xE93FB2F48233275B, 0x0000BFF7 //A4 = -7.1181892208343798194003322900e-03 | |
645 | LOCAL_OBJECT_END(_3p25_to_4_data_tail) | |
646 | ||
647 | LOCAL_OBJECT_START(_6p5_to_8_data_tail) | |
648 | // Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0: tail terms A13..A4, one 16-byte slot each (entry order is assumed fixed by the strided tail loads in tanhl) | |
649 | data8 0xA6881D7D21774BFD, 0x00003FEF //A13 = 1.9852125640303530752913966680e-05 | |
650 | data8 0x875E983AA042E605, 0x0000BFF0 //A12 = -3.2274606306629334402383651599e-05 | |
651 | data8 0xCB19E01E94FC133C, 0x00003FF0 //A11 = 4.8423069963831314927026982707e-05 | |
652 | data8 0x8BA5E8D9E72D56B2, 0x0000BFF1 //A10 = -6.6589395655200734237190902534e-05 | |
653 | data8 0xAE91F647ED4E46B2, 0x00003FF1 //A9 = 8.3241541003842930001632190258e-05 | |
654 | data8 0xC465A7E0B22F884E, 0x0000BFF1 //A8 = -9.3649431639051891449916386619e-05 | |
655 | data8 0xC4666148AA01A4D7, 0x00003FF1 //A7 = 9.3650780646160216748407869111e-05 | |
656 | data8 0xABD9E63D181B0C6C, 0x0000BFF1 //A6 = -8.1945023256769295802996591839e-05 | |
657 | data8 0x80E38B18E509387A, 0x00003FF1 //A5 = 6.1458988764532931141264026311e-05 | |
658 | data8 0xA11C80E20ADA5A64, 0x0000BFF0 //A4 = -3.8411937140983728563216440713e-05 | |
659 | LOCAL_OBJECT_END(_6p5_to_8_data_tail) | |
660 | ||
661 | LOCAL_OBJECT_START(_13_to_16_data_tail) | |
662 | // Polynomial coefficients for the tanh(x), 13 <= |x| < 16: tail terms A13..A4, one 16-byte slot each (entry order is assumed fixed by the strided tail loads in tanhl) | |
663 | data8 0x9D6CCDA4767CA6D9, 0x00003FE5 //A13 = 1.8326683535066775712253572575e-08 | |
664 | data8 0xFFAF154F334BF403, 0x0000BFE4 //A12 = -1.4882762852665077172347508377e-08 | |
665 | data8 0xBFC68FA7C61B6C17, 0x00003FE4 //A11 = 1.1162810813806544919835662888e-08 | |
666 | data8 0x83D8439A6B19A015, 0x0000BFE4 //A10 = -7.6743763372603959795701788561e-09 | |
667 | data8 0xA4CE5BE9DC6A2962, 0x00003FE3 //A9 = 4.7964885012772346158732715382e-09 | |
668 | data8 0xB96826C0697253CA, 0x0000BFE2 //A8 = -2.6980246373950994097953903952e-09 | |
669 | data8 0xB96826CADDC00E35, 0x00003FE1 //A7 = 1.3490123232313844006540534789e-09 | |
670 | data8 0xA23B21F1155DF322, 0x0000BFE0 //A6 = -5.9019289132168830718664922372e-10 | |
671 | data8 0xF358B2E9A50C349C, 0x00003FDE //A5 = 2.2132233424669131155945897524e-10 | |
672 | data8 0x98176FD2074C1D77, 0x0000BFDD //A4 = -6.9163229452106125388824134881e-11 | |
673 | LOCAL_OBJECT_END(_13_to_16_data_tail) | |
674 | ||
675 | LOCAL_OBJECT_START(_0_to_1o8_data) | |
676 | // Polynomial coefficients for the tanh(x), 0.0 <= |x| < 0.125: odd-power terms A15..A3, one 16-byte slot each; presumably the table at tanhl_data+0x11e0 read by the _0_to_1o8 path, which evaluates x + x^3*(A3 + A5*x^2 + ... + A15*x^12) | |
677 | data8 0xBA0EC1879495150B, 0x0000BFF5 // A15 = -1.4195071451378679802688367813e-03 | |
678 | data8 0xEB5A82898D1BCBA4, 0x00003FF6 // A13 = 3.5912102408030526706365632879e-03 | |
679 | data8 0x91370DAFE0B64438, 0x0000BFF8 // A11 = -8.8632234251336964576640807982e-03 | |
680 | data8 0xB327A435358F1200, 0x00003FF9 // A9 = 2.1869488447622383899199238857e-02 | |
681 | data8 0xDD0DD0DD07A0775F, 0x0000BFFA // A7 = -5.3968253967902161405327069187e-02 | |
682 | data8 0x888888888887C299, 0x00003FFC // A5 = 1.3333333333333264660338062012e-01 | |
683 | data8 0xAAAAAAAAAAAAAA98, 0x0000BFFD // A3 = -3.3333333333333333282255458755e-01 | |
684 | LOCAL_OBJECT_END(_0_to_1o8_data) | |
685 | ||
686 | ||
687 | .section .text | |
688 | GLOBAL_LIBM_ENTRY(tanhl) | |
// tanhl entry and main path: the argument is read from f8 and the result is
// written back to f8. The entry bundles classify the argument and branch to
// tanhl_spec, _0_to_1o8 or _saturation; otherwise execution falls through to
// the table-driven polynomial evaluation that ends at "Main path return".
689 | |
690 | { .mfi | |
691 | alloc r32 = ar.pfs, 0, 21, 0, 0 | |
692 | fmerge.se fArgAbsNorm = f1, f8 // significand of x with sign and exponent of 1.0 (1.0 <= value < 2.0) | |
693 | addl rSignBit = 0x20000, r0 // Sign-bit mask for the getf.exp value | |
694 | } | |
695 | { .mlx | |
696 | addl rDataPtr = @ltoff(tanhl_data), gp // Get common data ptr | |
697 | movl r1p5 = 0x3FF8000000000000 // 1.5 in dbl repres. | |
698 | };; | |
699 | |
700 | { .mfi | |
701 | getf.exp rArgExp = f8 // Get arg exponent | |
702 | fclass.m p6,p0 = f8, 0xEF // Filter 0, denormals and specials | |
703 | // 0xEF = @qnan|@snan|@pos|@neg|@zero|@unorm|@inf | |
704 | addl rBias = 0xfffc, r0 // Value to subtract from exp | |
705 | // to get actual interval number | |
706 | } | |
707 | { .mfi | |
708 | ld8 rDataPtr = [rDataPtr] // Get real common data pointer | |
709 | fma.s1 fArgSqr = f8, f8, f0 // x^2 (for [0;1/8] path) | |
710 | addl r2to4 = 0x10000, r0 // biased exponent | |
711 | // for [2;4] binary interval | |
712 | };; | |
713 | |
714 | { .mfi | |
715 | getf.sig rArgSig = f8 // Get arg significand | |
716 | fcmp.lt.s1 p15, p14 = f8, f0 // Is arg negative/positive? | |
717 | addl rSaturation = 0xb70, r0 // First 12 bits of | |
718 | // saturation value signif. | |
719 | } | |
720 | { .mfi | |
721 | setf.d f1p5 = r1p5 // 1.5 construction | |
722 | fma.s1 f2p0 = f1,f1,f1 // 2.0 construction | |
723 | addl r1625Sign = 0xd01, r0 // First 12 bits of | |
724 | // 1.625 value signif. | |
725 | // 1.625 significand used to filter values greater than 3.25, 6.5, 13.0 | |
726 | };; | |
727 | |
728 | { .mfi | |
729 | addl rTailDataPtr = 0xB00, rDataPtr // Pointer to "tail" data | |
730 | fmerge.s fSignumX = f8, f1 // signum(x) | |
731 | andcm rArgExp = rArgExp, rSignBit // Remove sign of exp | |
732 | } | |
733 | { .mfb | |
734 | addl rTiny = 0xf000, r0 // Tiny value for saturation path | |
735 | nop.f 0 | |
736 | (p6) br.cond.spnt tanhl_spec // Branch to zero, denorm & specs | |
737 | };; | |
738 | |
739 | { .mfi | |
740 | sub rInterval = rArgExp, rBias // Get actual interval number | |
741 | nop.f 0 | |
742 | shr.u rArgSig = rArgSig, 52 // Keep only the top 12 bits of the significand | |
743 | } | |
744 | { .mfi | |
745 | adds rShiftedDataPtr = 0x10, rDataPtr // Second ptr to data | |
746 | nop.f 0 | |
747 | cmp.ge p8, p10 = rArgExp, r2to4 // If exp >= 2to4 interval? | |
748 | };; | |
749 | |
750 | { .mfi | |
751 | (p8) cmp.le p8, p10 = r1625Sign, rArgSig // If signd is greater | |
752 | // than 1.625? (arg is at one of binary subranges) | |
753 | nop.f 0 | |
754 | shl rOffset = rInterval, 8 // Make offset from | |
755 | // interval number (0x100 bytes per interval) | |
756 | } | |
757 | { .mfi | |
758 | cmp.gt p9, p0 = 0x0, rInterval // If interval is less than 0 | |
759 | // (means arg is in [0; 1/8]) | |
760 | nop.f 0 | |
761 | cmp.eq p7, p0 = 0x7, rInterval // If arg is in [16;32) interv.? | |
762 | };; | |
763 | |
764 | { .mfi | |
765 | (p8) adds rOffset = 0x400, rOffset // Add additional offset | |
766 | // (arg is at one of binary subranges) | |
767 | fma.s1 fArgCube = fArgSqr, f8, f0 // x^3 (for [0;1/8] path) | |
768 | shl rTailOffset = rInterval, 7 // Make offset to "tail" data | |
769 | // from interval number | |
770 | } | |
771 | { .mib | |
772 | setf.exp fTiny = rTiny // Construct "tiny" value | |
773 | // for saturation path | |
774 | cmp.ltu p11, p0 = 0x7, rInterval // if |arg| >= 32 | |
775 | (p9) br.cond.spnt _0_to_1o8 | |
776 | };; | |
777 | |
778 | { .mfi | |
779 | add rAddr1 = rDataPtr, rOffset // Get address for | |
780 | // interval data | |
781 | nop.f 0 | |
782 | shl rTailAddOffset = rInterval, 5 // Offset to interval | |
783 | // "tail" data | |
784 | } | |
785 | { .mib | |
786 | add rAddr2 = rShiftedDataPtr, rOffset // Get second | |
787 | // address for interval data | |
788 | (p7) cmp.leu p11, p0 = rSaturation, rArgSig // if arg is | |
789 | // in [22.8;32] interval | |
790 | (p11) br.cond.spnt _saturation // Branch to Saturation path | |
791 | };; | |
792 | |
793 | { .mmi | |
794 | ldfe fA3 = [rAddr1], 0x90 // Load A3 | |
795 | ldfpd fA2H, fA2L = [rAddr2], 16 // Load A2High, A2Low | |
796 | add rTailOffset = rTailOffset, rTailAddOffset // "Tail" offset (interval * 0xA0) | |
797 | };; | |
798 | |
799 | { .mmi | |
800 | ldfe fA20 = [rAddr1], 16 // Load A20 | |
801 | ldfpd fA1H, fA1L = [rAddr2], 16 // Load A1High, A1Low | |
802 | (p8) adds rTailOffset = 0x280, rTailOffset // Additional offset | |
803 | // (arg is at one of binary subranges) | |
804 | };; | |
805 | |
806 | { .mmi | |
807 | ldfe fA19 = [rAddr1], 16 // Load A19 | |
808 | ldfpd fA0H, fA0L = [rAddr2], 16 // Load A0High, A0Low | |
809 | add rTailAddr1 = rTailDataPtr, rTailOffset // First tail | |
810 | // data address | |
811 | };; | |
812 | |
813 | .pred.rel "mutex",p8,p10 | |
814 | { .mfi | |
815 | ldfe fA18 = [rAddr1], 16 // Load A18 | |
816 | (p8) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f2p0 // Subtract 2.0 | |
817 | // (arg is at one of binary subranges) | |
818 | adds rTailAddr2 = 0x10, rTailAddr1 // Second tail | |
819 | // data address | |
820 | } | |
821 | { .mfi | |
822 | ldfe fA25 = [rAddr2], 16 // Load A25 | |
823 | (p10) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f1p5 // Subtract 1.5 | |
824 | // from normalized arg | |
825 | nop.i 0 | |
826 | };; | |
827 | |
828 | { .mmi | |
829 | ldfe fA17 = [rAddr1], 16 // Load A17 | |
830 | ldfe fA24 = [rAddr2], 16 // Load A24 | |
831 | nop.i 0 | |
832 | };; | |
833 | |
834 | { .mmi | |
835 | ldfe fA16 = [rAddr1], 16 // Load A16 | |
836 | ldfe fA23 = [rAddr2], 16 // Load A23 | |
837 | nop.i 0 | |
838 | };; | |
839 | |
840 | { .mmi | |
841 | ldfe fA15 = [rAddr1], 16 // Load A15 | |
842 | ldfe fA22 = [rAddr2], 16 // Load A22 | |
843 | nop.i 0 | |
844 | };; | |
845 | |
846 | { .mmi | |
847 | ldfe fA14 = [rAddr1], 16 // Load A14 | |
848 | ldfe fA21 = [rAddr2], 16 // Load A21 | |
849 | nop.i 0 | |
850 | };; | |
851 | |
852 | { .mfi | |
853 | ldfe fA13 = [rTailAddr1], 32 // Load A13 | |
854 | fms.s1 fArgAbsNorm2 = fArgAbsNorm, fArgAbsNorm, f0 // x^2 | |
855 | nop.i 0 | |
856 | } | |
857 | { .mfi | |
858 | ldfe fA12 = [rTailAddr2], 32 // Load A12 | |
859 | nop.f 0 | |
860 | nop.i 0 | |
861 | };; | |
862 | |
863 | { .mfi | |
864 | ldfe fA11 = [rTailAddr1], 32 // Load A11 | |
865 | fma.s1 fRes3H = fA3, fArgAbsNorm, fA2H // A3*x+A2H, high part of A3*x+A2 | |
866 | nop.i 0 | |
867 | } | |
868 | { .mfi | |
869 | ldfe fA10 = [rTailAddr2], 32 // Load A10 | |
870 | fma.s1 fTH = fA3, fArgAbsNorm, f0 // A3*x, used to recover low part of A3*x+A2 | |
871 | nop.i 0 | |
872 | };; | |
873 | |
874 | { .mfi | |
875 | ldfe fA9 = [rTailAddr1], 32 // Load A9 | |
876 | fma.s1 fTT2 = fA1L, fArgAbsNorm, f0 // A1L*x (part of A1*x+A0) | |
877 | nop.i 0 | |
878 | } | |
879 | { .mfi | |
880 | ldfe fA8 = [rTailAddr2], 32 // Load A8 | |
881 | nop.f 0 | |
882 | nop.i 0 | |
883 | };; | |
884 | |
885 | { .mmi | |
886 | ldfe fA7 = [rTailAddr1], 32 // Load A7 | |
887 | ldfe fA6 = [rTailAddr2], 32 // Load A6 | |
888 | nop.i 0 | |
889 | };; | |
890 | |
891 | { .mmi | |
892 | ldfe fA5 = [rTailAddr1], 32 // Load A5 | |
893 | ldfe fA4 = [rTailAddr2], 32 // Load A4 | |
894 | nop.i 0 | |
895 | };; | |
896 | |
897 | { .mfi | |
898 | nop.m 0 | |
899 | fms.s1 fArgAbsNorm2L = fArgAbsNorm, fArgAbsNorm, fArgAbsNorm2 | |
900 | // Low part of x^2 (delta) | |
901 | nop.i 0 | |
902 | } | |
903 | { .mfi | |
904 | nop.m 0 | |
905 | fms.s1 fArgAbsNorm4 = fArgAbsNorm2, fArgAbsNorm2, f0 // x^4 | |
906 | nop.i 0 | |
907 | };; | |
908 | |
909 | { .mfi | |
910 | nop.m 0 | |
911 | fms.s1 fRes3L = fA2H, f1, fRes3H // (A3*x+A2)*x^2 | |
912 | nop.i 0 | |
913 | };; | |
914 | |
915 | { .mfi | |
916 | nop.m 0 | |
917 | fms.s1 fArgAbsNorm3 = fArgAbsNorm2, fArgAbsNorm, f0 // x^3 | |
918 | nop.i 0 | |
919 | } | |
920 | { .mfi | |
921 | nop.m 0 | |
922 | fma.s1 fTH2 = fA1H, fArgAbsNorm, fTT2 // A1*x+A0 | |
923 | nop.i 0 | |
924 | };; | |
925 | |
926 | { .mfi | |
927 | nop.m 0 | |
928 | fma.s1 fA23 = fA24, fArgAbsNorm, fA23 // Polynomial tail | |
929 | nop.i 0 | |
930 | } | |
931 | { .mfi | |
932 | nop.m 0 | |
933 | fma.s1 fA21 = fA22, fArgAbsNorm, fA21 // Polynomial tail | |
934 | nop.i 0 | |
935 | };; | |
936 | |
937 | { .mfi | |
938 | nop.m 0 | |
939 | fma.s1 fA12 = fA13, fArgAbsNorm, fA12 // Polynomial tail | |
940 | nop.i 0 | |
941 | } | |
942 | ;; | |
943 | |
944 | { .mfi | |
945 | nop.m 0 | |
946 | fma.s1 fRes3L = fRes3L, f1, fTH // (A3*x+A2)*x^2 | |
947 | nop.i 0 | |
948 | } | |
949 | { .mfi | |
950 | nop.m 0 | |
951 | fma.s1 fA19 = fA20, fArgAbsNorm, fA19 // Polynomial tail | |
952 | nop.i 0 | |
953 | };; | |
954 | |
955 | { .mfi | |
956 | nop.m 0 | |
957 | fma.s1 fRes1H = fTH2, f1, fA0H // A1*x+A0 | |
958 | nop.i 0 | |
959 | } | |
960 | { .mfi | |
961 | nop.m 0 | |
962 | fms.s1 fTL2 = fA1H, fArgAbsNorm, fTH2 // A1*x+A0 | |
963 | nop.i 0 | |
964 | };; | |
965 | |
966 | { .mfi | |
967 | nop.m 0 | |
968 | fma.s1 fA8 = fA9, fArgAbsNorm, fA8 // Polynomial tail | |
969 | nop.i 0 | |
970 | } | |
971 | { .mfi | |
972 | nop.m 0 | |
973 | fma.s1 fA10 = fA11, fArgAbsNorm, fA10 // Polynomial tail | |
974 | nop.i 0 | |
975 | };; | |
976 | |
977 | { .mfi | |
978 | nop.m 0 | |
979 | fma.s1 fA15 = fA16, fArgAbsNorm, fA15 // Polynomial tail | |
980 | nop.i 0 | |
981 | } | |
982 | { .mfi | |
983 | nop.m 0 | |
984 | fma.s1 fA17 = fA18, fArgAbsNorm, fA17 // Polynomial tail | |
985 | nop.i 0 | |
986 | };; | |
987 | |
988 | { .mfi | |
989 | nop.m 0 | |
990 | fms.s1 fArgAbsNorm11 = fArgAbsNorm4, fArgAbsNorm4, f0 // x^8 | |
991 | nop.i 0 | |
992 | } | |
993 | { .mfi | |
994 | nop.m 0 | |
995 | fma.s1 fA4 = fA5, fArgAbsNorm, fA4 // Polynomial tail | |
996 | nop.i 0 | |
997 | };; | |
998 | |
999 | { .mfi | |
1000 | nop.m 0 | |
1001 | fma.s1 fRes3L = fRes3L, f1, fA2L // (A3*x+A2)*x^2 | |
1002 | nop.i 0 | |
1003 | } | |
1004 | { .mfi | |
1005 | nop.m 0 | |
1006 | fma.s1 fA6 = fA7, fArgAbsNorm, fA6 // Polynomial tail | |
1007 | nop.i 0 | |
1008 | };; | |
1009 | |
1010 | { .mfi | |
1011 | nop.m 0 | |
1012 | fma.s1 fTL2 = fTL2, f1, fTT2 // A1*x+A0 | |
1013 | nop.i 0 | |
1014 | } | |
1015 | { .mfi | |
1016 | nop.m 0 | |
1017 | fms.s1 fRes1L = fA0H, f1, fRes1H // A1*x+A0 | |
1018 | nop.i 0 | |
1019 | };; | |
1020 | |
1021 | { .mfi | |
1022 | nop.m 0 | |
1023 | fma.s1 fA23 = fA25, fArgAbsNorm2, fA23 // Polynomial tail | |
1024 | nop.i 0 | |
1025 | } | |
1026 | { .mfi | |
1027 | nop.m 0 | |
1028 | fma.s1 fA12 = fA14, fArgAbsNorm2, fA12 // Polynomial tail | |
1029 | nop.i 0 | |
1030 | };; | |
1031 | |
1032 | { .mfi | |
1033 | nop.m 0 | |
1034 | fma.s1 fA19 = fA21, fArgAbsNorm2, fA19 // Polynomial tail | |
1035 | nop.i 0 | |
1036 | } | |
1037 | { .mfi | |
1038 | nop.m 0 | |
1039 | fma.s1 fA8 = fA10, fArgAbsNorm2, fA8 // Polynomial tail | |
1040 | nop.i 0 | |
1041 | };; | |
1042 | |
1043 | { .mfi | |
1044 | nop.m 0 | |
1045 | fma.s1 fA15 = fA17, fArgAbsNorm2, fA15 // Polynomial tail | |
1046 | nop.i 0 | |
1047 | } | |
1048 | { .mfi | |
1049 | nop.m 0 | |
1050 | fms.s1 fArgAbsNorm11 = fArgAbsNorm11, fArgAbsNorm3, f0 // x^11 | |
1051 | nop.i 0 | |
1052 | };; | |
1053 | |
1054 | { .mfi | |
1055 | nop.m 0 | |
1056 | fma.s1 fTT = fRes3L, fArgAbsNorm2, f0 // (A3*x+A2)*x^2 | |
1057 | nop.i 0 | |
1058 | } | |
1059 | { .mfi | |
1060 | nop.m 0 | |
1061 | fma.s1 fA4 = fA6, fArgAbsNorm2, fA4 // Polynomial tail | |
1062 | nop.i 0 | |
1063 | };; | |
1064 | |
1065 | { .mfi | |
1066 | nop.m 0 | |
1067 | fma.s1 fRes1L = fRes1L, f1, fTH2 // A1*x+A0 | |
1068 | nop.i 0 | |
1069 | } | |
1070 | { .mfi | |
1071 | nop.m 0 | |
1072 | fms.s1 fArgAbsNorm4X = fArgAbsNorm4, fSignumX, f0 // x^4 * signum | |
1073 | nop.i 0 | |
1074 | };; | |
1075 | |
1076 | { .mfi | |
1077 | nop.m 0 | |
1078 | fma.s1 fA19 = fA23, fArgAbsNorm4, fA19 // Polynomial tail | |
1079 | nop.i 0 | |
1080 | } | |
1081 | { .mfi | |
1082 | nop.m 0 | |
1083 | fma.s1 fA8 = fA12, fArgAbsNorm4, fA8 // Polynomial tail | |
1084 | nop.i 0 | |
1085 | };; | |
1086 | |
1087 | { .mfi | |
1088 | nop.m 0 | |
1089 | fma.s1 fTT = fRes3H, fArgAbsNorm2L, fTT // (A3*x+A2)*x^2 | |
1090 | nop.i 0 | |
1091 | };; | |
1092 | |
1093 | { .mfi | |
1094 | nop.m 0 | |
1095 | fma.s1 fRes1L = fRes1L, f1, fTL2 // A1*x+A0 | |
1096 | nop.i 0 | |
1097 | };; | |
1098 | |
1099 | { .mfi | |
1100 | nop.m 0 | |
1101 | fma.s1 fA15 = fA19, fArgAbsNorm4, fA15 // Polynomial tail | |
1102 | nop.i 0 | |
1103 | } | |
1104 | { .mfi | |
1105 | nop.m 0 | |
1106 | fma.s1 fA4 = fA8, fArgAbsNorm4, fA4 // Polynomial tail | |
1107 | nop.i 0 | |
1108 | };; | |
1109 | |
1110 | { .mfi | |
1111 | nop.m 0 | |
1112 | fma.s1 fRes2H = fRes3H, fArgAbsNorm2, fTT // (A3*x+A2)*x^2 | |
1113 | nop.i 0 | |
1114 | };; | |
1115 | |
1116 | { .mfi | |
1117 | nop.m 0 | |
1118 | fma.s1 fRes1L = fRes1L, f1, fA0L // A1*x+A0 | |
1119 | nop.i 0 | |
1120 | };; | |
1121 | |
1122 | { .mfi | |
1123 | nop.m 0 | |
1124 | fma.s1 fRes4 = fA15, fArgAbsNorm11, fA4 // Result of | |
1125 | // polynomial tail | |
1126 | nop.i 0 | |
1127 | };; | |
1128 | |
1129 | { .mfi | |
1130 | nop.m 0 | |
1131 | fms.s1 fRes2L = fRes3H, fArgAbsNorm2, fRes2H // (A3*x+A2)*x^2 | |
1132 | nop.i 0 | |
1133 | } | |
1134 | { .mfi | |
1135 | nop.m 0 | |
1136 | fma.s1 fResH = fRes2H, f1, fRes1H // High result | |
1137 | nop.i 0 | |
1138 | };; | |
1139 | |
1140 | { .mfi | |
1141 | nop.m 0 | |
1142 | (p14) fma.s1 fRes1L = fRes4, fArgAbsNorm4X, fRes1L // A1*x+A0 low part + tail*x^4 | |
1143 | nop.i 0 | |
1144 | } | |
1145 | { .mfi | |
1146 | nop.m 0 | |
1147 | (p15) fms.s1 fRes1L = fRes4, fArgAbsNorm4X, fRes1L // -(A1*x+A0 low part + tail*x^4) for x < 0 | |
1148 | nop.i 0 | |
1149 | };; | |
1150 | |
1151 | { .mfi | |
1152 | nop.m 0 | |
1153 | fma.s1 fRes2L = fRes2L, f1, fTT // (A3*x+A2)*x^2 | |
1154 | nop.i 0 | |
1155 | } | |
1156 | { .mfi | |
1157 | nop.m 0 | |
1158 | fms.s1 fResL = fRes1H, f1, fResH // Low result | |
1159 | nop.i 0 | |
1160 | };; | |
1161 | |
1162 | { .mfi | |
1163 | nop.m 0 | |
1164 | fma.s0 fRes1L = fRes2L, fSignumX, fRes1L // Low result | |
1165 | // .s0 - for symmetry issue resolving at +/-inf rounding mode | |
1166 | nop.i 0 | |
1167 | } | |
1168 | { .mfi | |
1169 | nop.m 0 | |
1170 | fma.s1 fResL = fResL, f1, fRes2H // Low result | |
1171 | nop.i 0 | |
1172 | };; | |
1173 | |
1174 | { .mfi | |
1175 | nop.m 0 | |
1176 | (p14) fma.s0 fResL = fRes1L, f1, fResL // Low result | |
1177 | // .s0 - for symmetry issue resolving at +/-inf rounding mode | |
1178 | nop.i 0 | |
1179 | } | |
1180 | { .mfi | |
1181 | nop.m 0 | |
1182 | (p15) fms.s0 fResL = fRes1L, f1, fResL // Low result | |
1183 | // .s0 - for symmetry issue resolving at +/-inf rounding mode | |
1184 | nop.i 0 | |
1185 | };; | |
1186 | |
1187 | .pred.rel "mutex",p14,p15 | |
1188 | { .mfi | |
1189 | nop.m 0 | |
1190 | (p14) fma.s0 f8 = fResL, f1, fResH// Add high and low results | |
1191 | nop.i 0 | |
1192 | } | |
1193 | { .mfb | |
1194 | nop.m 0 | |
1195 | (p15) fms.s0 f8 = fResL, f1, fResH // Add high and low results | |
1196 | br.ret.sptk b0 // Main path return | |
1197 | };; | |
1198 | ||
1199 | // saturation path (|x| >= 32, or interval 7 with top significand bits >= rSaturation); p14/p15 come from the entry fcmp.lt //// | |
1200 | _saturation: | |
1201 | |
1202 | .pred.rel "mutex",p14,p15 | |
1203 | { .mfi | |
1204 | nop.m 0 | |
1205 | (p14) fms.s0 f8 = f1, f1, fTiny // Saturation result r = 1-tiny | |
1206 | nop.i 0 | |
1207 | };; | |
1208 | { .mfb | |
1209 | nop.m 0 | |
1210 | (p15) fnma.s0 f8 = f1, f1, fTiny // Saturation result r = tiny-1 | |
1211 | br.ret.sptk b0 // Saturation path return | |
1212 | };; | |
1213 | ||
1214 | ||
1215 | // 0, denormals and special IEEE numbers path (taken when the entry fclass.m 0xEF test sets p6) //////////// | |
1216 | tanhl_spec: | |
1217 | |
1218 | { .mfi | |
1219 | nop.m 0 | |
1220 | fclass.m p6,p0 = f8, 0x23 // To filter infinities | |
1221 | // 0x23 = @pos|@neg|@inf | |
1222 | nop.i 0 | |
1223 | };; | |
1224 | |
1225 | { .mfi | |
1226 | nop.m 0 | |
1227 | fclass.m p7,p0 = f8, 0xC7 // To filter NaNs & Zeros | |
1228 | // 0xC7 = @pos|@neg|@zero|@qnan|@snan | |
1229 | nop.i 0 | |
1230 | };; | |
1231 | |
1232 | { .mfb | |
1233 | nop.m 0 | |
1234 | (p6) fmerge.s f8 = f8, f1 // +/-1 for INF args | |
1235 | (p6) br.ret.spnt b0 // exit for x = INF | |
1236 | };; | |
1237 | |
1238 | { .mfb | |
1239 | nop.m 0 | |
1240 | (p7) fma.s0 f8 = f8, f1, f8 // +/-0 for 0 args | |
1241 | // and NaNs for NaNs | |
1242 | (p7) br.ret.spnt b0 // exit for x = NaN or +/-0 | |
1243 | };; | |
1244 | |
1245 | { .mfi | |
1246 | nop.m 0 | |
1247 | fnorm.s0 f8 = f8 // Normalize arg (only unnormal/denormal args reach here) | |
1248 | nop.i 0 | |
1249 | };; | |
1250 | |
1251 | .pred.rel "mutex",p14,p15 | |
1252 | { .mfi | |
1253 | nop.m 0 | |
1254 | (p14) fnma.s0 f8 = f8, f8, f8 // res = r-r^2 (p14: arg not negative, from entry fcmp.lt) | |
1255 | nop.i 0 | |
1256 | } | |
1257 | { .mfb | |
1258 | nop.m 0 | |
1259 | (p15) fma.s0 f8 = f8, f8, f8 // res = r+r^2 (p15: arg negative) | |
1260 | br.ret.sptk b0 // 0, denormals, IEEE specials return | |
1261 | };; | |
1262 | ||
1263 | ||
1264 | // 0 < |x| < 1/8 path ///////////////////////////////////////////////////////// | |
// Computes f8 = x + x^3*(A3 + A5*x^2 + ... + A15*x^12); fArgSqr (x^2) and
// fArgCube (x^3) were already produced by the entry code before the branch.
1265 | _0_to_1o8: | |
1266 | |
1267 | { .mmi | |
1268 | adds rAddr1 = 0x11e0, rDataPtr // Ptr 1 to coeffs (A15, A11, A7, A3) | |
1269 | adds rAddr2 = 0x11f0, rDataPtr // Ptr 2 to coeffs (A13, A9, A5) | |
1270 | nop.i 0 | |
1271 | };; | |
1272 | |
1273 | { .mmi | |
1274 | ldfe fA15 = [rAddr1], 32 // Load A15 | |
1275 | ldfe fA13 = [rAddr2], 32 // Load A13 | |
1276 | nop.i 0 | |
1277 | };; | |
1278 | |
1279 | { .mmi | |
1280 | ldfe fA11 = [rAddr1], 32 // Load A11 | |
1281 | ldfe fA9 = [rAddr2], 32 // Load A9 | |
1282 | nop.i 0 | |
1283 | };; | |
1284 | |
1285 | { .mmi | |
1286 | ldfe fA7 = [rAddr1], 32 // Load A7 | |
1287 | ldfe fA5 = [rAddr2] // Load A5 | |
1288 | nop.i 0 | |
1289 | };; | |
1290 | |
1291 | { .mfi | |
1292 | ldfe fA3 = [rAddr1] // Load A3 | |
1293 | fma.s1 fA11 = fA13, fArgSqr, fA11 // Polynomial tail | |
1294 | nop.i 0 | |
1295 | } | |
1296 | { .mfi | |
1297 | nop.m 0 | |
1298 | fma.s1 fArgFour = fArgSqr, fArgSqr, f0 // x^4 | |
1299 | nop.i 0 | |
1300 | };; | |
1301 | |
1302 | |
1303 | { .mfi | |
1304 | nop.m 0 | |
1305 | fma.s1 fA3 = fA5, fArgSqr, fA3 // Polynomial tail | |
1306 | nop.i 0 | |
1307 | } | |
1308 | { .mfi | |
1309 | nop.m 0 | |
1310 | fma.s1 fA7 = fA9, fArgSqr, fA7 // Polynomial tail | |
1311 | nop.i 0 | |
1312 | };; | |
1313 | |
1314 | |
1315 | { .mfi | |
1316 | nop.m 0 | |
1317 | fma.s1 fA11 = fA15, fArgFour, fA11 // Polynomial tail | |
1318 | nop.i 0 | |
1319 | };; | |
1320 | |
1321 | { .mfi | |
1322 | nop.m 0 | |
1323 | fma.s1 fA3 = fA7, fArgFour, fA3 // Polynomial tail | |
1324 | nop.i 0 | |
1325 | } | |
1326 | { .mfi | |
1327 | nop.m 0 | |
1328 | fma.s1 fArgEight = fArgFour, fArgFour, f0 // x^8 | |
1329 | nop.i 0 | |
1330 | };; | |
1331 | |
1332 | { .mfi | |
1333 | nop.m 0 | |
1334 | fma.s1 fRes = fA11, fArgEight, fA3 //Polynomial tail result | |
1335 | nop.i 0 | |
1336 | };; | |
1337 | |
1338 | { .mfb | |
1339 | nop.m 0 | |
1340 | fma.s0 f8 = fRes, fArgCube, f8 // (Polynomial tail)*x^3 + x | |
1341 | br.ret.sptk b0 // [0;1/8] interval return | |
1342 | };; | |
1343 | |
1344 | GLOBAL_LIBM_END(tanhl) | |
1345 | ||
1346 | ||
1347 | ||
1348 |