]>
Commit | Line | Data |
---|---|---|
d5efd131 MF |
1 | .file "tanh.s" |
2 | ||
3 | ||
4 | // Copyright (c) 2001 - 2005, Intel Corporation | |
5 | // All rights reserved. | |
6 | // | |
7 | // Contributed 2001 by the Intel Numerics Group, Intel Corporation | |
8 | // | |
9 | // Redistribution and use in source and binary forms, with or without | |
10 | // modification, are permitted provided that the following conditions are | |
11 | // met: | |
12 | // | |
13 | // * Redistributions of source code must retain the above copyright | |
14 | // notice, this list of conditions and the following disclaimer. | |
15 | // | |
16 | // * Redistributions in binary form must reproduce the above copyright | |
17 | // notice, this list of conditions and the following disclaimer in the | |
18 | // documentation and/or other materials provided with the distribution. | |
19 | // | |
20 | // * The name of Intel Corporation may not be used to endorse or promote | |
21 | // products derived from this software without specific prior written | |
22 | // permission. | |
23 | ||
0347518d MF |
24 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
25 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
d5efd131 | 26 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
0347518d | 27 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS |
d5efd131 | 28 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
0347518d MF |
29 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
30 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
31 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
d5efd131 | 32 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING |
0347518d MF |
33 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
34 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
35 | // | |
d5efd131 | 36 | // Intel Corporation is the author of this code, and requests that all |
0347518d | 37 | // problem reports or change requests be submitted to it directly at |
d5efd131 MF |
38 | // http://www.intel.com/software/products/opensource/libraries/num.htm. |
39 | // | |
40 | // History | |
41 | //============================================================================== | |
42 | // 05/30/01 Initial version | |
43 | // 12/04/01 Rewritten version with erf-like algorithm. | |
44 | // Performance improved. | |
45 | // 05/20/02 Cleaned up namespace and sf0 syntax | |
46 | // 08/14/02 Changed mli templates to mlx | |
47 | // 02/10/03 Reordered header: .section, .global, .proc, .align | |
48 | // 03/31/05 Reformatted delimiters between data tables | |
49 | // | |
50 | // API | |
51 | //============================================================================== | |
52 | // double tanh(double) | |
53 | // | |
54 | // Overview of operation | |
55 | //============================================================================== | |
56 | // | |
57 | // Algorithm description | |
58 | // --------------------- | |
59 | // | |
60 | // There are 4 paths: | |
61 | // | |
62 | // 1. Special path: x = 0, Inf, NaNs, denormals | |
63 | // Return tanh(x) = +/-0.0 for zeros | |
64 | // Return tanh(x) = QNaN for NaNs | |
65 | // Return tanh(x) = sign(x)*1.0 for Inf | |
66 | // Return tanh(x) = x + x^2 for - denormals | |
67 | // Return tanh(x) = x - x^2 for + denormals | |
68 | // | |
69 | // 2. Near zero path: 0.0 < |x| < 0.25 | |
70 | // Return tanh(x) = x + x^3*A3 + ... + x^19*A19 | |
71 | // | |
72 | // 3. Main path: 0.25 <= |x| < 19.0625 | |
73 | // For several ranges of 0.25 <= |x| < 19.0625 | |
0347518d | 74 | // Return tanh(x) = sign(x)*(A0 + y*A1 + y^2*A2 + |
d5efd131 MF |
75 | // + y^3*A3 + ... + y^19*A19) |
76 | // where y = (|x|/a) - b | |
0347518d | 77 | // |
d5efd131 MF |
78 | // For each range there is particular set of coefficients. |
79 | // Below is the list of ranges: | |
80 | // 1/4 <= |x| < 1/2 a = 0.25, b = 1.0 | |
81 | // 1/2 <= |x| < 1.0 a = 0.5, b = 1.0 | |
82 | // 1.0 <= |x| < 2.0 a = 1.0, b = 1.0 | |
83 | // 2.0 <= |x| < 3.25 a = 2.0, b = 1.0 | |
84 | // 3.25 <= |x| < 4.0 a = 2.0, b = 2.0 | |
85 | // 4.0 <= |x| < 6.5 a = 4.0, b = 1.0 | |
86 | // 6.5 <= |x| < 8.0 a = 4.0, b = 2.0 | |
87 | // 8.0 <= |x| < 13.0 a = 8.0, b = 1.0 | |
88 | // 13.0 <= |x| < 16.0 a = 8.0, b = 2.0 | |
89 | // 16.0 <= |x| < 19.0625 a = 16.0, b = 1.0 | |
0347518d | 90 | // ( [3.25;4.0], [6.5;8.0], [13.0;16.0] subranges separated |
d5efd131 MF |
91 | // to resolve monotonicity issues ) | |
92 | // | |
0347518d | 93 | // 4. Saturation path: 19.0625 <= |x| < +INF |
d5efd131 MF |
94 | // Return tanh(x) = sign(x)*(1.0 - tiny_value) |
95 | // (tiny_value ~ 2^(-63)) | |
96 | // | |
97 | // Registers used | |
98 | //============================================================================== | |
0347518d | 99 | // Floating Point registers used: |
d5efd131 MF |
100 | // f8 = input, output |
101 | // f32 -> f64 | |
102 | // | |
0347518d | 103 | // General registers used: |
d5efd131 MF |
104 | // r32 -> r51, r2, r3 |
105 | // | |
106 | // Predicate registers used: | |
107 | // p6, p7, p8, p10, p11, p12, p14, p15 | |
108 | // p6 arg is zero, denormal or special IEEE | |
0347518d | 109 | // p8 to filter out case when signd(x) > 1.625 |
d5efd131 | 110 | // p10 to filter out case when |x| < 0.25 |
0347518d | 111 | // p11 to filter out case when signd(x) <= 1.625 |
d5efd131 MF |
112 | // p12 to filter out case when |x| >= 19.0625 |
113 | // p14 set to 1 for positive x | |
114 | // p15 set to 1 for negative x | |
115 | ||
116 | // Assembly macros: symbolic names for the general and FP registers used below | |
117 | //============================================================================== | |
118 | rDataPtr = r2 | |
119 | rDataPtr1 = r3 | |
120 | ||
121 | rBias = r33 | |
122 | rCoeffAddr3 = r34 | |
123 | rThreeAndQ = r35 | |
124 | rCoeffAddr2 = r36 | |
125 | rMask = r37 | |
126 | rArg = r38 | |
127 | rSignBit = r39 | |
128 | rAbsArg = r40 | |
129 | rSaturation = r41 | |
130 | rIndex = r42 | |
131 | rCoeffAddr1 = r43 | |
132 | rCoeffAddr4 = r44 | |
133 | rShiftedArg = r45 | |
134 | rShiftedArgMasked = r46 | |
135 | rBiasedExpOf4 = r47 | |
136 | rShiftedAbsArg = r48 | |
137 | rArgSgnd = r49 | |
138 | r1625Sgnd = r50 | |
139 | rTwo = r51 | |
140 | ||
141 | //============================================================================== | |
142 | fA0 = f32 | |
143 | fA1 = f33 | |
144 | fA2 = f34 | |
145 | fA3 = f35 | |
146 | fA4 = f36 | |
147 | fA5 = f37 | |
148 | fA6 = f38 | |
149 | fA7 = f39 | |
150 | fA8 = f40 | |
151 | fA9 = f41 | |
152 | fA10 = f42 | |
153 | fA11 = f43 | |
154 | fA12 = f44 | |
155 | fA13 = f45 | |
156 | fA14 = f46 | |
157 | fA15 = f47 | |
158 | fA16 = f48 | |
159 | fA17 = f49 | |
160 | fA18 = f50 | |
161 | fA19 = f51 | |
162 | fArgSqr = f52 | |
163 | fArgAbsNorm = f53 | |
164 | fSignumX = f54 | |
165 | fRes = f55 | |
166 | fThreeAndQ = f56 | |
167 | fArgAbs = f57 | |
168 | fTSqr = f58 | |
169 | fTQuadr = f59 | |
170 | fTDeg3 = f60 | |
171 | fTDeg7 = f61 | |
0347518d | 172 | fArgAbsNormSgn = f62 |
d5efd131 MF |
173 | fTQuadrSgn = f63 |
174 | fTwo = f64 | |
175 | ||
176 | // Data tables | |
177 | //============================================================================== | |
178 | RODATA | |
179 | ||
180 | .align 16 | |
181 | ||
182 | LOCAL_OBJECT_START(tanh_data) | |
183 | // CAUTION: The order of these table coefficients shouldn't be changed! | |
184 | ||
185 | // Main path coefficients: | |
186 | // Coefficients ##0..15 ("main" coefficient tables) | |
0347518d | 187 | // Polynomial coefficients for the tanh(x), 0.25 <= |x| < 0.5 |
d5efd131 MF |
188 | data8 0xE9D218BC9A3FB55A, 0x00003FC7 //A19 |
189 | data8 0xC8C0D38687F36EBA, 0x00003FCE //A18 | |
190 | data8 0xA2663E519FAC8A43, 0x0000BFD2 //A17 | |
191 | data8 0xD913F0490674B0DF, 0x00003FD3 //A16 | |
192 | data8 0xF75D84789DE0AE52, 0x00003FD6 //A15 | |
193 | data8 0xACB3C40EEF3A06F0, 0x0000BFD9 //A14 | |
194 | data8 0xEBD7F5DC02CFD5BA, 0x0000BFDB //A13 | |
195 | data8 0x8B52CDF66D709E2A, 0x00003FDF //A12 | |
196 | data8 0x9EC21F28E05C4A3E, 0x00003FE0 //A11 | |
197 | data8 0xC412B44D0176F3ED, 0x0000BFE4 //A10 | |
198 | data8 0x97BF35A34DD1EA4C, 0x0000BFE0 //A9 | |
199 | data8 0xF89F5B39E3A3AA36, 0x00003FE9 //A8 | |
200 | data8 0xF2BA654BCEEBA433, 0x0000BFEA //A7 | |
201 | data8 0x8E1C15876AA589AD, 0x0000BFEF //A6 | |
202 | data8 0x942226246A8C2A86, 0x00003FF1 //A5 | |
203 | data8 0x8F06D9FF7DB47261, 0x00003FF4 //A4 | |
204 | // | |
0347518d | 205 | // Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0 |
d5efd131 MF |
206 | data8 0xC4A7B8FB672A8520, 0x00003FDC //A19 |
207 | data8 0xA20724B847E13499, 0x0000BFE0 //A18 | |
208 | data8 0xE17DB53F02E4D340, 0x00003FE2 //A17 | |
209 | data8 0x90264A1012F4CA6F, 0x0000BFE4 //A16 | |
210 | data8 0xEBEC9F776F0BF415, 0x0000BFE0 //A15 | |
211 | data8 0x89AF912B305B45A4, 0x00003FE7 //A14 | |
212 | data8 0xB4A960B81F5EC36A, 0x0000BFE7 //A13 | |
213 | data8 0x969A4E95B2DA86B5, 0x0000BFEA //A12 | |
214 | data8 0x8A3FC0EC082305CB, 0x00003FEC //A11 | |
215 | data8 0x83D7795BCBE24373, 0x00003FEC //A10 | |
216 | data8 0xDCBF42AEB82932EC, 0x0000BFEF //A9 | |
217 | data8 0x83318E61ECAFD804, 0x00003FF0 //A8 | |
218 | data8 0xEA4DE5746975A914, 0x00003FF2 //A7 | |
219 | data8 0xCE63E8FA6B96480B, 0x0000BFF4 //A6 | |
220 | data8 0xDF017BE0D4FE45D8, 0x0000BFF4 //A5 | |
221 | data8 0xA8A0C6E2226DF3CD, 0x00003FF8 //A4 | |
222 | // | |
0347518d | 223 | // Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0 |
d5efd131 MF |
224 | data8 0x8E89D2EBFDAA160B, 0x00003FE9 //A19 |
225 | data8 0xDD9226310A272046, 0x0000BFEC //A18 | |
226 | data8 0xA038042D28B0D665, 0x00003FEF //A17 | |
227 | data8 0x8C04796F03516306, 0x0000BFF1 //A16 | |
228 | data8 0x9CD6A9CB4E90A2FD, 0x00003FF2 //A15 | |
229 | data8 0xC8980E166F5A84FD, 0x0000BFF2 //A14 | |
230 | data8 0x9ADFE65F56B7BCFD, 0x00003FED //A13 | |
231 | data8 0x8B11FDFB5D0A7B96, 0x00003FF4 //A12 | |
232 | data8 0x8209A125E829CBFA, 0x0000BFF5 //A11 | |
233 | data8 0xCF38AAC17B85BD76, 0x00003FF1 //A10 | |
234 | data8 0xD5C2E248D8AB99AB, 0x00003FF6 //A9 | |
235 | data8 0xE12BE2785727F2D6, 0x0000BFF7 //A8 | |
236 | data8 0x9FC9EF90F87BF1E2, 0x00003FF6 //A7 | |
237 | data8 0x9B02FE0DAF42C08F, 0x00003FF9 //A6 | |
238 | data8 0xBDACE06F531D9491, 0x0000BFFA //A5 | |
239 | data8 0xE3048AD1DB2F648C, 0x00003FF9 //A4 | |
240 | // | |
0347518d | 241 | // Polynomial coefficients for the tanh(x), 2.0 <= |x| < 3.25 |
d5efd131 MF |
242 | data8 0x856EC3B0330A385A, 0x00003FEB //A19 |
243 | data8 0xC641D69DAE2D429C, 0x0000BFF2 //A18 | |
244 | data8 0xC683EB0BE1343FFF, 0x00003FF5 //A17 | |
245 | data8 0xC358954224E4E823, 0x0000BFF7 //A16 | |
246 | data8 0xF813A8D6D396BC5F, 0x00003FF8 //A15 | |
247 | data8 0xE0ECDFED078D37D6, 0x0000BFF9 //A14 | |
248 | data8 0x950E4E619855E316, 0x00003FFA //A13 | |
249 | data8 0x8453B8F93370FB58, 0x0000BFFA //A12 | |
250 | data8 0xFDBA28430AEC95BA, 0x00003FF7 //A11 | |
251 | data8 0x9371AAC1FDB1E664, 0x00003FFA //A10 | |
252 | data8 0xAC972DA97782D88A, 0x0000BFFB //A9 | |
253 | data8 0xE18F47B10B9CE1BC, 0x00003FFB //A8 | |
254 | data8 0xAB7C81230BF13BC6, 0x0000BFFB //A7 | |
255 | data8 0xA6CAAD4A3E31A7D5, 0x0000BFF8 //A6 | |
256 | data8 0x9CABD76D1D5C3878, 0x00003FFC //A5 | |
257 | data8 0x92906D077941CAA9, 0x0000BFFD //A4 | |
258 | // | |
0347518d | 259 | // Polynomial coefficients for the tanh(x), 4.0 <= |x| < 6.5 |
d5efd131 MF |
260 | data8 0x9232D19F71709AC9, 0x0000BFF5 //A19 |
261 | data8 0x819E31323F5DD3F8, 0x00003FF8 //A18 | |
262 | data8 0xDA8E1CDB8D23DC29, 0x0000BFF9 //A17 | |
263 | data8 0xE97C7CD8FC0486D8, 0x00003FFA //A16 | |
264 | data8 0xB0C4AD234D88C9F2, 0x0000BFFB //A15 | |
265 | data8 0xC5989BFB28FDE267, 0x00003FFB //A14 | |
266 | data8 0x9B26520EC4EFEE8E, 0x0000BFFB //A13 | |
267 | data8 0xC4B6F758AD21E574, 0x00003FF9 //A12 | |
268 | data8 0xCC36E3FFA10D2CFF, 0x00003FFA //A11 | |
269 | data8 0x8738696FB06A5CED, 0x0000BFFC //A10 | |
270 | data8 0xD31981825BF39228, 0x00003FFC //A9 | |
271 | data8 0x82C58FB9BEE43992, 0x0000BFFD //A8 | |
272 | data8 0x88D5AAE49164B6F3, 0x00003FFD //A7 | |
273 | data8 0xF4CA0B968AF2DDE2, 0x0000BFFC //A6 | |
274 | data8 0xB99874B482BD17EE, 0x00003FFC //A5 | |
275 | data8 0xE93FB2F99431DC1D, 0x0000BFFB //A4 | |
276 | // | |
0347518d | 277 | // Polynomial coefficients for the tanh(x), 8.0 <= |x| < 13.0 |
d5efd131 MF |
278 | data8 0xAAA9EB7EADA85CEC, 0x00003FF5 //A19 |
279 | data8 0x980C80EE05A6BE78, 0x0000BFF8 //A18 | |
280 | data8 0x818DA9F5396390A5, 0x00003FFA //A17 | |
281 | data8 0x8D8CC21E23D8A6A2, 0x0000BFFB //A16 | |
282 | data8 0xE0EC19E55A886765, 0x00003FFB //A15 | |
283 | data8 0x8C11197A7E6244C5, 0x0000BFFC //A14 | |
284 | data8 0x901D2BF203C2F7F3, 0x00003FFC //A13 | |
285 | data8 0xFEACAEE66EE803E5, 0x0000BFFB //A12 | |
286 | data8 0xC684E4925E318C3F, 0x00003FFB //A11 | |
287 | data8 0x8A9D8A970565F28D, 0x0000BFFB //A10 | |
288 | data8 0xAE34C61DE5CEA4D4, 0x00003FFA //A9 | |
289 | data8 0xC44C5714BD6208A0, 0x0000BFF9 //A8 | |
290 | data8 0xC4612F7D6C8BDB79, 0x00003FF8 //A7 | |
291 | data8 0xABD91DCE40D5EECB, 0x0000BFF7 //A6 | |
292 | data8 0x80E375C1B847B72F, 0x00003FF6 //A5 | |
293 | data8 0xA11C7DD978CF700A, 0x0000BFF4 //A4 | |
294 | // | |
0347518d | 295 | // Polynomial coefficients for the tanh(x), 16.0 <= |x| < 19.0625 |
d5efd131 MF |
296 | data8 0xE29D17C510F86F6B, 0x00003FF3 //A19 |
297 | data8 0x88FE52EB39A3A98C, 0x0000BFF5 //A18 | |
298 | data8 0xA406547E50360693, 0x00003FF5 //A17 | |
299 | data8 0x83E6260B71C6D7DE, 0x0000BFF5 //A16 | |
300 | data8 0xA36AB5B0CBC97B85, 0x00003FF4 //A15 | |
301 | data8 0xA94931E0B7BA6C14, 0x0000BFF3 //A14 | |
302 | data8 0x9A4596DAF350AD63, 0x00003FF2 //A13 | |
303 | data8 0xFE47643F375AECA5, 0x0000BFF0 //A12 | |
304 | data8 0xBF8433C5ABEE63B1, 0x00003FEF //A11 | |
305 | data8 0x83CEE05D7AE90A0A, 0x0000BFEE //A10 | |
306 | data8 0xA4CC45480BCEB02D, 0x00003FEC //A9 | |
307 | data8 0xB967CBDCBC16CB10, 0x0000BFEA //A8 | |
308 | data8 0xB9681B214EDC098D, 0x00003FE8 //A7 | |
309 | data8 0xA23B20D87B80DFA8, 0x0000BFE6 //A6 | |
310 | data8 0xF358B2C46F10CBAF, 0x00003FE3 //A5 | |
311 | data8 0x98176FD06229A385, 0x0000BFE1 //A4 | |
312 | // | |
313 | // Binary subranges | |
0347518d | 314 | // Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4.0 |
d5efd131 MF |
315 | data8 0xEF2EE841288F6706, 0x00003FE9 //A19 |
316 | data8 0xE65D5B74B85F82A6, 0x00003FEB //A18 | |
317 | data8 0xE495FC21E42A79FF, 0x00003FEA //A17 | |
318 | data8 0xF99B267A913CF3E5, 0x00003FEC //A16 | |
319 | data8 0xFE3D700F4A0A0FDE, 0x0000BFEC //A15 | |
320 | data8 0x8F91BB4EE4E4EA52, 0x00003FEE //A14 | |
321 | data8 0xBCA9F41A5C6EF8BA, 0x0000BFEE //A13 | |
322 | data8 0xF93E00884027A9CF, 0x00003FED //A12 | |
323 | data8 0xC4D4036A61BABC2F, 0x00003FEF //A11 | |
324 | data8 0x86CC2AD1AD47C7D5, 0x0000BFF2 //A10 | |
325 | data8 0xD3065DEF4CE9AD32, 0x00003FF3 //A9 | |
326 | data8 0x82C44125F568D54E, 0x0000BFF5 //A8 | |
327 | data8 0x88D588729BAF14CA, 0x00003FF6 //A7 | |
328 | data8 0xF4CA0661307243C7, 0x0000BFF6 //A6 | |
329 | data8 0xB998746D57061F74, 0x00003FF7 //A5 | |
330 | data8 0xE93FB2F482327C19, 0x0000BFF7 //A4 | |
331 | // | |
0347518d | 332 | // Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0 |
d5efd131 MF |
333 | data8 0xEB189B71ADC40BE2, 0x00003FEA //A19 |
334 | data8 0xA60B46F9FF6DC2DF, 0x00003FEA //A18 | |
335 | data8 0xBB061CDD9F368B9D, 0x00003FEC //A17 | |
336 | data8 0x841E08BDF5429991, 0x0000BFEC //A16 | |
337 | data8 0xDD33990B433F25BE, 0x00003FED //A15 | |
338 | data8 0xBA5DE6B870F0A2BB, 0x0000BFEE //A14 | |
339 | data8 0xA71D489AAA6DACF0, 0x00003FEF //A13 | |
340 | data8 0x874CCB2B8F3FBC0E, 0x0000BFF0 //A12 | |
341 | data8 0xCB1D2E9754EA534A, 0x00003FF0 //A11 | |
342 | data8 0x8BA5ABB53BA6ABCF, 0x0000BFF1 //A10 | |
343 | data8 0xAE91FD1C2391A32B, 0x00003FF1 //A9 | |
344 | data8 0xC465A74B798E5761, 0x0000BFF1 //A8 | |
345 | data8 0xC4666152397D15C1, 0x00003FF1 //A7 | |
346 | data8 0xABD9E63CA575B950, 0x0000BFF1 //A6 | |
347 | data8 0x80E38B18E8D0F460, 0x00003FF1 //A5 | |
348 | data8 0xA11C80E20AAFDD3C, 0x0000BFF0 //A4 | |
349 | // | |
0347518d | 350 | // Polynomial coefficients for the tanh(x), 13.0 <= |x| < 16.0 |
d5efd131 MF |
351 | data8 0xBECD0AF7E22E5594, 0x00003FE9 //A19 |
352 | data8 0xE2834E2D68C1128C, 0x00003FEA //A18 | |
353 | data8 0x97B117611B317379, 0x00003FEB //A17 | |
354 | data8 0xEE91A0D39A772F6B, 0x00003FEA //A16 | |
355 | data8 0x92F6EC377DCADA4F, 0x00003FEA //A15 | |
356 | data8 0xD8FCCD6A3277FAB7, 0x00003FE8 //A14 | |
357 | data8 0xC15AB9CB0C3DCFE0, 0x00003FE7 //A13 | |
358 | data8 0xC3C659704A7147CD, 0x00003FE2 //A12 | |
359 | data8 0xFA17F09D27C97912, 0x00003FE4 //A11 | |
360 | data8 0xF664147182B94788, 0x0000BFE3 //A10 | |
361 | data8 0xA6C89FA741464DA1, 0x00003FE3 //A9 | |
362 | data8 0xB90FE464A825EFA8, 0x0000BFE2 //A8 | |
363 | data8 0xB973AE0FD86EC024, 0x00003FE1 //A7 | |
364 | data8 0xA23A087F96846951, 0x0000BFE0 //A6 | |
365 | data8 0xF358D8A7FC012D5D, 0x00003FDE //A5 | |
366 | data8 0x98176E2309B7C73A, 0x0000BFDD //A4 | |
367 | // | |
368 | // Coefficients ##16..19 ("tail" coefficient tables) | |
0347518d | 369 | // Polynomial coefficients for the tanh(x), 0.25 <= |x| < 0.5 |
d5efd131 MF |
370 | data8 0x838F209ABB9BA7B3, 0x0000BFF7 //A3 |
371 | data8 0xEBC0AC78DA4FC500, 0x0000BFF8 //A2 | |
372 | data8 0xF0A4D02960B60E69, 0x00003FFC //A1 | |
373 | data8 0xFACBF534D0E42F8A, 0x00003FFC //A0 | |
374 | // | |
0347518d | 375 | // Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0 |
d5efd131 MF |
376 | data8 0xC0ECBDC0A0D133A6, 0x0000BFF8 //A3 |
377 | data8 0xBA13A076BF8E812F, 0x0000BFFB //A2 | |
378 | data8 0xC954A37D1A1CA070, 0x00003FFD //A1 | |
379 | data8 0xEC9A9EBAB4579B29, 0x00003FFD //A0 | |
380 | // | |
0347518d | 381 | // Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0 |
d5efd131 MF |
382 | data8 0xD42E9175A6EA1397, 0x00003FFB //A3 |
383 | data8 0xA3C361378A55CF56, 0x0000BFFD //A2 | |
384 | data8 0xD706E07CC8622983, 0x00003FFD //A1 | |
385 | data8 0xC2F7D5A8A79CA2AC, 0x00003FFE //A0 | |
386 | // | |
387 | // Polynomial coefficients for the tanh(x), 2.0 <= |x| < 3.25 | |
388 | data8 0xAC7A7F8776817C7E, 0x00003FFD //A3 | |
389 | data8 0x8B7CE95E69FCFE9A, 0x0000BFFD //A2 | |
390 | data8 0x90B161317028D995, 0x00003FFC //A1 | |
391 | data8 0xF6CA82F0DE1E9E9A, 0x00003FFE //A0 | |
392 | // | |
393 | // Polynomial coefficients for the tanh(x), 4.0 <= |x| < 6.5 | |
394 | data8 0xE9E072407BC22DC6, 0x00003FFA //A3 | |
395 | data8 0xAFA4A913D8E6BB4A, 0x0000BFF9 //A2 | |
396 | data8 0xAFC2D6A885BAA875, 0x00003FF7 //A1 | |
397 | data8 0xFFD40B84505A10B2, 0x00003FFE //A0 | |
398 | // | |
399 | // Polynomial coefficients for the tanh(x), 8.0 <= |x| < 13.0 | |
400 | data8 0xA11C8A1FED168CD5, 0x00003FF2 //A3 | |
401 | data8 0xF1AAD6B02063A5F5, 0x0000BFEF //A2 | |
402 | data8 0xF1AADA46AD341C34, 0x00003FEC //A1 | |
403 | data8 0xFFFFFC39548FC34B, 0x00003FFE //A0 | |
404 | // | |
405 | // Polynomial coefficients for the tanh(x), 16.0 <= |x| < 19.0625 | |
406 | data8 0x98176FD1F0950C16, 0x00003FDE //A3 | |
407 | data8 0xE42327BB09C8B2A5, 0x0000BFDA //A2 | |
408 | data8 0xE42327BB0B154F13, 0x00003FD6 //A1 | |
409 | data8 0xFFFFFFFFFFF8DEE7, 0x00003FFE //A0 | |
410 | // | |
411 | // Binary subranges | |
412 | // Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4.0 | |
413 | data8 0xE9E072404329293B, 0x00003FF7 //A3 | |
414 | data8 0xAFA4A913D798300B, 0x0000BFF7 //A2 | |
415 | data8 0xAFC2D6A885B48567, 0x00003FF6 //A1 | |
416 | data8 0xFFD40B84505A10B4, 0x00003FFE //A0 | |
417 | // | |
418 | // Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0 | |
419 | data8 0xA11C8A63815F7A28, 0x00003FEF //A3 | |
420 | data8 0xF1AAD6B65B0EBF53, 0x0000BFED //A2 | |
421 | data8 0xF1AADA46E799831F, 0x00003FEB //A1 | |
422 | data8 0xFFFFFC39548FC348, 0x00003FFE //A0 | |
423 | // | |
424 | // Polynomial coefficients for the tanh(x), 13.0 <= |x| < 16.0 | |
425 | data8 0x98176FE982140A59, 0x00003FDB //A3 | |
426 | data8 0xE42327B9B0D7202F, 0x0000BFD8 //A2 | |
427 | data8 0xE42327BB13076BD6, 0x00003FD5 //A1 | |
428 | data8 0xFFFFFFFFFFF8DEE7, 0x00003FFE //A0 | |
429 | // | |
0347518d | 430 | // Polynomial coefficients for the tanh(x), 0.0 <= |x| < 0.25 |
d5efd131 MF |
431 | // ('tanh_near_zero' path) |
432 | data8 0xBF2BA5D26E479D0C //A9 | |
433 | data8 0x3F4336D96F81EE26 //A8 | |
434 | data8 0xBF8226E34AE197B0 //A5 | |
435 | data8 0x3F9664F488148657 //A4 | |
436 | data8 0xAAAAAAAAAAAAAA99, 0x0000BFFD //A1 | |
437 | data8 0xBF57D91925BB5EE2 //A7 | |
438 | data8 0x3F6D6D36C3D5B7A1 //A6 | |
439 | data8 0xBFABA1BA1BA19D32 //A3 | |
440 | data8 0x3FC1111111111108 //A2 | |
441 | // | |
442 | // 1.0 - 2^(-63) | |
443 | // ('tanh_saturation' path) | |
0347518d | 444 | data8 0xFFFFFFFFFFFFFFFF, 0x00003FFE |
d5efd131 MF |
445 | LOCAL_OBJECT_END(tanh_data) |
446 | ||
447 | // CAUTION: The order of table coefficients shouldn't be changed! | |
448 | ||
449 | ||
450 | .section .text | |
451 | GLOBAL_LIBM_ENTRY(tanh) | |
452 | { .mfi | |
453 | alloc r32 = ar.pfs, 0, 20, 0, 0 | |
454 | fmerge.se fArgAbsNorm = f1, f8 // |x|/a: significand of x, sign/exponent of 1.0 | |
455 | adds rSignBit = 0x1, r0 // Bit for sign removing | |
456 | } | |
457 | { .mfi | |
458 | addl rDataPtr = @ltoff(tanh_data), gp // Data pointer | |
459 | fma.s1 fTwo = f1, f1, f1 // 2.0 construct | |
460 | addl rArgSgnd = 0xfff, r0 // sign + exponent bits (before shift) | |
461 | };; | |
462 | ||
463 | { .mfi | |
0347518d MF |
464 | getf.d rArg = f8 // x in GR |
465 | fclass.m p6,p0 = f8, 0xEF // Filter 0, denormals and specials | |
d5efd131 MF |
466 | // 0xEF = @qnan|@snan|@pos|@neg|@zero|@unorm|@inf |
467 | shl rArgSgnd = rArgSgnd, 52 // mask for sign and exponent | |
468 | } | |
469 | { .mlx | |
470 | ld8 rDataPtr = [rDataPtr] // Real data pointer | |
471 | movl r1625Sgnd = 0xA000000000000 // 1.625 signd | |
472 | // 1.625 significand used to filter values greater than 3.25, 6.5, 13.0 | |
473 | // to enter binary subranges | |
474 | };; | |
475 | ||
476 | { .mfi | |
477 | addl rBias = 0x3FD00, r0 // bias of 0.25 << 8 | |
478 | fma.s1 fArgSqr = f8, f8, f0 // x^2 | |
479 | shl rSignBit = rSignBit, 63 // mask for sign bit | |
480 | } | |
481 | { .mlx | |
482 | addl rMask = 0x7FF00, r0 // Mask for index bits | |
483 | movl rTwo = 0x4000000000000000 // 2.0 | |
484 | };; | |
485 | ||
486 | { .mfi | |
487 | andcm rArgSgnd = rArg, rArgSgnd // Remove sign and exponent, keep significand | |
488 | nop.f 0 | |
489 | shr.u rShiftedArg = rArg, 44 // Select only necessary bits of arg | |
490 | } | |
491 | { .mfb | |
492 | andcm rAbsArg = rArg, rSignBit // Remove sign | |
493 | nop.f 0 | |
494 | (p6) br.cond.spnt _tanh_spec // Branch to zero, denorm & specs | |
495 | };; | |
0347518d | 496 |
d5efd131 MF |
497 | { .mfi |
498 | and rShiftedArgMasked = rShiftedArg, rMask // bias of x << 8 | |
499 | fmerge.s fArgAbs = f1, f8 // |x| | |
0347518d | 500 | shr rShiftedAbsArg = rAbsArg, 44 // Select only necessary |
d5efd131 MF |
501 | // bits of absolute arg |
502 | } | |
503 | { .mfi | |
504 | cmp.gt p8, p11 = rArgSgnd, r1625Sgnd // p8 = 1 if | |
505 | // signd(x) > 1.625 - to filter values greater than 3.25, 6.5, 13.0 | |
506 | nop.f 0 | |
507 | nop.i 0 | |
508 | };; | |
509 | ||
510 | { .mfi | |
511 | sub rIndex = rShiftedArgMasked, rBias // index << 8 | |
0347518d | 512 | nop.f 0 |
d5efd131 MF |
513 | cmp.lt p10, p0 = rShiftedArgMasked, rBias // p10=1 if |x|<0.25 |
514 | } | |
515 | { .mfb | |
516 | (p8) cmp.gt p8, p11 = rAbsArg, rTwo // If arg is greater than 2.0? | |
517 | // (then we should use binary subranges) | |
0347518d | 518 | nop.f 0 |
d5efd131 MF |
519 | (p10) br.cond.spnt tanh_near_zero // branch out if |x| < 0.25 |
520 | };; | |
521 | ||
522 | .pred.rel "mutex",p8,p11 | |
523 | { .mfi | |
0347518d | 524 | (p8) add rIndex = 0x400, rIndex // Make pointer to binary |
d5efd131 MF |
525 | // subranges |
526 | (p11) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f1 // y = |x|/a - 1.0 | |
527 | addl rSaturation = 0x40331, r0 // shifted bits of 19.0625 | |
528 | } | |
529 | { .mfi | |
0347518d | 530 | nop.m 0 |
d5efd131 MF |
531 | (p8) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, fTwo // y = |x|/a - 2.0 |
532 | // this is only for binary subranges [3.25;4], [6.5;8], [13.0;16] | |
0347518d | 533 | nop.i 0 |
d5efd131 MF |
534 | } |
535 | ;; | |
536 | ||
537 | { .mfi | |
538 | add rCoeffAddr1 = rDataPtr, rIndex// coeff. ##0,2,..14 | |
539 | nop.f 0 | |
540 | nop.i 0 | |
541 | };; | |
542 | ||
543 | { .mfi | |
544 | adds rCoeffAddr2 = 16, rCoeffAddr1 // Shifted pointer to coeffs | |
545 | fmerge.s fSignumX = f8, f1 // signum(x) | |
546 | nop.i 0 | |
0347518d | 547 | } |
d5efd131 MF |
548 | { .mfb |
549 | cmp.le p12, p0 = rSaturation, rShiftedAbsArg // |x|>=19.0625? | |
550 | nop.f 0 | |
551 | (p12) br.cond.spnt tanh_saturation // branch out if |x| >= 19.0625 | |
552 | };; | |
553 | ||
554 | {.mfi | |
555 | ldfe fA19 = [rCoeffAddr1], 32 // Load A19 | |
556 | nop.f 0 | |
557 | nop.i 0 | |
558 | } | |
559 | {.mfi | |
560 | ldfe fA18 = [rCoeffAddr2], 32 // Load A18 | |
561 | nop.f 0 | |
562 | adds rCoeffAddr3 = 0xA00, rDataPtr // Pointer to "tail" | |
563 | // coefficients tables | |
564 | };; | |
565 | ||
566 | {.mfi | |
567 | ldfe fA17 = [rCoeffAddr1], 32 // Load A17 | |
568 | nop.f 0 | |
569 | nop.i 0 | |
570 | } | |
571 | {.mfi | |
572 | ldfe fA16 = [rCoeffAddr2], 32 // Load A16 | |
573 | nop.f 0 | |
574 | nop.i 0 | |
575 | };; | |
576 | ||
577 | {.mfi | |
578 | ldfe fA15 = [rCoeffAddr1], 32 // Load A15 | |
579 | fma.s1 fTSqr = fArgAbsNorm, fArgAbsNorm, f0 // y^2 | |
580 | shr.u rIndex = rIndex, 2 // Index for "tail" tables | |
581 | } | |
582 | {.mfi | |
583 | ldfe fA14 = [rCoeffAddr2], 32 // Load A14 | |
584 | nop.f 0 | |
585 | adds rCoeffAddr4 = 16, r0 // Shifted pointer | |
586 | // to "tail" tables | |
587 | };; | |
588 | ||
589 | {.mfi | |
590 | ldfe fA13 = [rCoeffAddr1], 32 // Load A13 | |
591 | nop.f 0 | |
592 | add rCoeffAddr3 = rCoeffAddr3, rIndex // "tail" coeffs to load | |
593 | // ##16..23 | |
594 | } | |
595 | {.mfi | |
596 | ldfe fA12 = [rCoeffAddr2], 32 // Load A12 | |
597 | nop.f 0 | |
0347518d | 598 | cmp.lt p15, p14 = rArg, r0 // Arg positive (p14) |
d5efd131 MF |
599 | // or negative (p15)? |
600 | };; | |
601 | ||
602 | {.mfi | |
603 | ldfe fA11 = [rCoeffAddr1], 32 // Load A11 | |
604 | nop.f 0 | |
0347518d MF |
605 | add rCoeffAddr4 = rCoeffAddr3, rCoeffAddr4 // shifted "tail" |
606 | // coeffs to load | |
d5efd131 MF |
607 | } |
608 | {.mfi | |
609 | ldfe fA10 = [rCoeffAddr2], 32 // Load A10 | |
610 | nop.f 0 | |
611 | nop.i 0 | |
612 | };; | |
613 | ||
614 | {.mfi | |
615 | ldfe fA9 = [rCoeffAddr1], 32 // Load A9 | |
616 | nop.f 0 | |
617 | nop.i 0 | |
618 | } | |
619 | {.mfi | |
620 | ldfe fA8 = [rCoeffAddr2], 32 // Load A8 | |
621 | nop.f 0 | |
622 | nop.i 0 | |
623 | };; | |
624 | ||
625 | {.mfi | |
626 | ldfe fA7 = [rCoeffAddr1], 32 // Load A7 | |
627 | nop.f 0 | |
628 | nop.i 0 | |
629 | } | |
630 | {.mfi | |
631 | ldfe fA6 = [rCoeffAddr2], 32 // Load A6 | |
632 | nop.f 0 | |
633 | nop.i 0 | |
634 | };; | |
635 | ||
636 | {.mfi | |
637 | ldfe fA5 = [rCoeffAddr1], 32 // Load A5 | |
638 | fma.s1 fTDeg3 = fArgAbsNorm, fTSqr, f0 // y^3 | |
639 | nop.i 0 | |
640 | } | |
641 | {.mfi | |
642 | ldfe fA4 = [rCoeffAddr2], 32 // Load A4 | |
643 | fma.s1 fTQuadr = fTSqr, fTSqr, f0 // y^4 | |
644 | nop.i 0 | |
645 | };; | |
646 | ||
647 | // Path #3 Polynomial Pol19(y) computation; y = fArgAbsNorm | |
648 | {.mfi | |
649 | ldfe fA3 = [rCoeffAddr3], 32 // Load A3 | |
650 | fma.s1 fArgAbsNormSgn = fArgAbsNorm, fSignumX, f0 // sign(x)*y | |
651 | nop.i 0 | |
652 | } | |
653 | {.mfi | |
654 | ldfe fA2 = [rCoeffAddr4], 32 // Load A2 | |
655 | nop.f 0 | |
656 | nop.i 0 | |
657 | };; | |
658 | ||
659 | {.mfi | |
660 | ldfe fA1 = [rCoeffAddr3], 32 // Load A1 | |
661 | fma.s1 fRes = fA19, fArgAbsNorm, fA18 // A19*y + A18 | |
662 | nop.i 0 | |
663 | } | |
664 | {.mfi | |
665 | ldfe fA0 = [rCoeffAddr4], 32 // Load A0 | |
666 | nop.f 0 | |
667 | nop.i 0 | |
668 | };; | |
669 | ||
670 | { .mfi | |
671 | nop.m 0 | |
672 | fma.s1 fA17 = fA17, fArgAbsNorm, fA16 // A17*y + A16 | |
673 | nop.i 0 | |
674 | };; | |
675 | ||
676 | { .mfi | |
677 | nop.m 0 | |
678 | fma.s1 fA15 = fA15, fArgAbsNorm, fA14 // A15*y + A14 | |
679 | nop.i 0 | |
680 | };; | |
681 | ||
682 | { .mfi | |
683 | nop.m 0 | |
684 | fma.s1 fTDeg7 = fTDeg3, fTQuadr, f0 // y^7 | |
685 | nop.i 0 | |
686 | } | |
687 | { .mfi | |
688 | nop.m 0 | |
689 | fma.s1 fA13 = fA13, fArgAbsNorm, fA12 // A13*y + A12 | |
690 | nop.i 0 | |
691 | };; | |
692 | ||
693 | { .mfi | |
694 | nop.m 0 | |
695 | fma.s1 fA11 = fA11, fArgAbsNorm, fA10 // A11*y + A10 | |
696 | nop.i 0 | |
697 | };; | |
698 | ||
699 | { .mfi | |
700 | nop.m 0 | |
701 | fma.s1 fA9 = fA9, fArgAbsNorm, fA8 // A9*y + A8 | |
702 | nop.i 0 | |
703 | };; | |
704 | ||
705 | { .mfi | |
706 | nop.m 0 | |
707 | fma.s1 fRes = fRes, fTSqr, fA17 // Polynomial: A19..A16 terms | |
708 | nop.i 0 | |
709 | } | |
710 | { .mfi | |
711 | nop.m 0 | |
712 | fma.s1 fA7 = fA7, fArgAbsNorm, fA6 // A7*y + A6 | |
713 | nop.i 0 | |
714 | };; | |
715 | ||
716 | { .mfi | |
717 | nop.m 0 | |
718 | fma.s1 fA5 = fA5, fArgAbsNorm, f0 // A5*y only: A4 is paired with A3 below | |
719 | nop.i 0 | |
720 | };; | |
721 | ||
722 | { .mfi | |
723 | nop.m 0 | |
0347518d | 724 | fma.s1 fA15 = fA15, fTSqr, fA13 // Polynomial: A15..A12 terms |
d5efd131 MF |
725 | nop.i 0 |
726 | } | |
727 | { .mfi | |
728 | nop.m 0 | |
729 | fma.s1 fA4 = fA4, fArgAbsNorm, fA3 // A4*y + A3 | |
730 | nop.i 0 | |
731 | };; | |
732 | ||
733 | { .mfi | |
734 | nop.m 0 | |
735 | fma.s1 fA2 = fA2, fArgAbsNorm, fA1 // A2*y + A1 | |
736 | nop.i 0 | |
737 | };; | |
738 | ||
739 | { .mfi | |
740 | nop.m 0 | |
741 | fma.s1 fA11 = fA11, fTSqr, fA9 // Polynomial: A11..A8 terms | |
742 | nop.i 0 | |
743 | };; | |
744 | ||
745 | { .mfi | |
0347518d | 746 | nop.m 0 |
d5efd131 MF |
747 | fma.s1 fA7 = fA7, fTSqr, fA5 // Polynomial: A7*y^3 + A6*y^2 + A5*y |
748 | nop.i 0 | |
749 | };; | |
750 | ||
751 | { .mfi | |
0347518d | 752 | nop.m 0 |
d5efd131 MF |
753 | fma.s1 fRes = fRes, fTQuadr, fA15 // Polynomial: A19..A12 terms |
754 | nop.i 0 | |
755 | };; | |
756 | ||
757 | { .mfi | |
0347518d | 758 | nop.m 0 |
d5efd131 MF |
759 | fma.s1 fA4 = fA4, fTSqr, fA2 // Polynomial: A4..A1 terms |
760 | nop.i 0 | |
761 | };; | |
762 | ||
763 | { .mfi | |
764 | nop.m 0 | |
765 | fma.s1 fRes = fRes, fTQuadr, fA11 // Polynomial: A19..A8 terms | |
766 | nop.i 0 | |
767 | };; | |
768 | ||
769 | { .mfi | |
0347518d | 770 | nop.m 0 |
d5efd131 MF |
771 | fma.s1 fA4 = fA7, fTDeg3, fA4 // Polynomial: A7..A1 terms |
772 | nop.i 0 | |
773 | };; | |
774 | ||
775 | { .mfi | |
776 | nop.m 0 | |
777 | fma.s1 fRes = fRes, fTDeg7, fA4 // Polynomial: A19*y^18 + ... + A1 | |
778 | nop.i 0 | |
779 | };; | |
780 | ||
781 | { .mfi | |
782 | nop.m 0 | |
783 | // result for negative argument | |
784 | (p15) fms.d.s0 f8 = fRes, fArgAbsNormSgn, fA0 // -(A0 + y*Polynomial) | |
785 | nop.i 0 | |
786 | } | |
787 | { .mfb | |
788 | nop.m 0 | |
789 | // result for positive argument | |
790 | (p14) fma.d.s0 f8 = fRes, fArgAbsNormSgn, fA0 // A0 + y*Polynomial | |
791 | br.ret.sptk b0 | |
792 | };; | |
793 | ||
794 | ||
795 | // |x| < 0.25 Path ///////////////////////////////////////////////////////////// | |
796 | .align 32 | |
797 | tanh_near_zero: | |
798 | { .mfi | |
799 | adds rCoeffAddr1 = 0xC80, rDataPtr // address of A9 | |
0347518d | 800 | fma.s0 fTSqr = fArgSqr, fArgSqr, f0 // x^4 |
d5efd131 MF |
801 | nop.i 0 |
802 | } | |
803 | { .mfi | |
804 | adds rCoeffAddr2 = 0xCB0, rDataPtr // address of A7 | |
805 | nop.f 0 | |
806 | nop.i 0 | |
807 | };; | |
808 | ||
809 | { .mfi | |
810 | ldfpd fA9, fA8 = [rCoeffAddr1], 16 // Load A9, A8 | |
811 | nop.f 0 | |
812 | nop.i 0 | |
813 | } | |
814 | { .mfi | |
815 | ldfpd fA7, fA6 = [rCoeffAddr2], 16 // Load A7, A6 | |
816 | nop.f 0 | |
817 | nop.i 0 | |
818 | };; | |
819 | ||
820 | { .mfi | |
821 | ldfpd fA5, fA4 = [rCoeffAddr1], 16 // Load A5, A4 | |
822 | nop.f 0 | |
823 | nop.i 0 | |
824 | } | |
825 | { .mfi | |
826 | ldfpd fA3, fA2 = [rCoeffAddr2], 16 // Load A3, A2 | |
827 | nop.f 0 | |
828 | nop.i 0 | |
829 | };; | |
830 | ||
831 | { .mfi | |
832 | ldfe fA1 = [rCoeffAddr1] // Load A1 | |
833 | nop.f 0 | |
834 | nop.i 0 | |
835 | };; | |
836 | ||
837 | { .mfi | |
838 | nop.m 0 | |
839 | fma.s1 fTQuadr = fTSqr, fTSqr, f0 // x^8 | |
840 | nop.i 0 | |
841 | };; | |
842 | ||
843 | { .mfi | |
844 | nop.m 0 | |
845 | fma.s1 fRes = fA9, fArgSqr, fA8 // A9*x^2 + A8 | |
846 | nop.i 0 | |
847 | } | |
848 | { .mfi | |
849 | nop.m 0 | |
850 | fma.s1 fA7 = fA7, fArgSqr, fA6 // A7*x^2 + A6 | |
851 | nop.i 0 | |
852 | };; | |
853 | ||
854 | { .mfi | |
855 | nop.m 0 | |
856 | fma.s1 fA3 = fA3, fArgSqr, fA2 // A3*x^2 + A2 | |
857 | nop.i 0 | |
858 | } | |
859 | { .mfi | |
860 | nop.m 0 | |
861 | fma.s1 fA5 = fA5, fArgSqr, fA4 // A5*x^2 + A4 | |
862 | nop.i 0 | |
863 | };; | |
864 | ||
865 | { .mfi | |
866 | nop.m 0 | |
867 | fma.s1 fA1 = fA1, fArgSqr, f0 // A1*x^2 | |
868 | nop.i 0 | |
869 | } | |
870 | { .mfi | |
871 | nop.m 0 | |
872 | fma.s1 fTQuadrSgn = fTQuadr, f8, f0 // x^8 * x (value is not read afterwards) | |
873 | nop.i 0 | |
874 | };; | |
875 | ||
876 | { .mfi | |
877 | nop.m 0 | |
878 | fma.s1 fRes = fRes, fTSqr, fA7 // Polynomial: A9..A6 terms | |
879 | nop.i 0 | |
880 | };; | |
881 | ||
882 | { .mfi | |
883 | nop.m 0 | |
884 | fma.s1 fA1 = fA3, fTSqr, fA1 // Polynomial: A3*x^6 + A2*x^4 + A1*x^2 | |
885 | nop.i 0 | |
886 | };; | |
887 | ||
888 | { .mfi | |
889 | nop.m 0 | |
890 | fma.s1 fRes = fRes, fTSqr, fA5 // Polynomial: A9..A4 terms | |
891 | nop.i 0 | |
892 | };; | |
893 | ||
894 | { .mfi | |
895 | nop.m 0 | |
896 | fma.s1 fRes = fRes, fTQuadr, fA1 // Polynomial: A9*x^18 + ... + A1*x^2 | |
897 | nop.i 0 | |
898 | };; | |
899 | ||
900 | { .mfb | |
901 | nop.m 0 | |
902 | fma.d.s0 f8 = fRes, f8, f8 // x+x*Polynomial | |
903 | br.ret.sptk b0 // Exit for |x| < 0.25 | |
904 | };; | |
905 | ||
906 | ||
907 | ||
908 | ||
909 | ||
910 | // 19.0625 <= |x| < +inf Saturation path /////////////////////////////////////// | |
911 | .align 32 | |
912 | tanh_saturation: | |
913 | { .mfi | |
914 | adds rDataPtr = 0xCD0, rDataPtr // address of the saturation constant (last table entry) | |
915 | nop.f 0 | |
916 | nop.i 0 | |
917 | };; | |
918 | ||
919 | { .mfi | |
920 | ldfe fA0 = [rDataPtr] // Load A0 = 1.0 - tiny_value | |
921 | nop.f 0 | |
922 | nop.i 0 | |
923 | };; | |
924 | ||
925 | { .mfb | |
926 | nop.m 0 | |
927 | fma.d.s0 f8 = fA0, fSignumX, f0 // sign(x)*(1.0-2^(-63)) | |
928 | br.ret.sptk b0 // Exit for 19.0625 <=|x|< +inf | |
929 | };; | |
930 | ||
931 | ||
932 | ||
933 | ||
0347518d | 934 | |
d5efd131 MF |
935 | // 0, denormals and special IEEE numbers path ///////////////////////////////// |
936 | _tanh_spec: | |
937 | ||
0347518d MF |
938 | { .mfi |
939 | cmp.lt p15, p14 = rArg, r0 // Is arg negative (p15) | |
d5efd131 MF |
940 | // or positive (p14) |
941 | fclass.m p6,p0 = f8, 0x23 // To filter infinities | |
0347518d | 942 | // 0x23 = @pos|@neg|@inf |
d5efd131 MF |
943 | nop.i 0 |
944 | };; | |
945 | ||
0347518d | 946 | { .mfi |
d5efd131 MF |
947 | nop.m 0 |
948 | fclass.m p7,p0 = f8, 0xC7 // To filter NaNs & Zeros | |
949 | // 0xC7 = @pos|@neg|@zero|@qnan|@snan | |
950 | nop.i 0 | |
951 | };; | |
952 | ||
0347518d | 953 | { .mfb |
d5efd131 | 954 | nop.m 0 |
0347518d | 955 | (p6) fmerge.s f8 = f8, f1 // +/-1 for INF args |
d5efd131 MF |
956 | (p6) br.ret.spnt b0 // exit for x = INF |
957 | };; | |
958 | ||
0347518d | 959 | { .mfb |
d5efd131 | 960 | nop.m 0 |
0347518d | 961 | (p7) fma.d.s0 f8 = f8, f1, f8 // +/-0 for 0 args |
d5efd131 MF |
962 | // and NaNs for NaNs |
963 | (p7) br.ret.spnt b0 // exit for x = NaN or +/-0 | |
964 | };; | |
965 | ||
0347518d | 966 | { .mfi |
d5efd131 MF |
967 | nop.m 0 |
968 | fnorm.s0 f8 = f8 // Normalize arg (remaining case: denormal) | |
969 | nop.i 0 | |
970 | };; | |
971 | ||
972 | .pred.rel "mutex",p14,p15 | |
0347518d | 973 | { .mfi |
d5efd131 MF |
974 | nop.m 0 |
975 | (p14) fnma.d.s0 f8 = f8, f8, f8 // res = r-r^2 (positive arg) | |
976 | nop.i 0 | |
977 | } | |
0347518d | 978 | { .mfb |
d5efd131 MF |
979 | nop.m 0 |
980 | (p15) fma.d.s0 f8 = f8, f8, f8 // res = r+r^2 (negative arg) | |
981 | br.ret.sptk b0 // 0, denormals, specials return | |
982 | };; | |
983 | ||
984 | GLOBAL_LIBM_END(tanh) | |
985 | ||
986 |