]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/ia64/fpu/s_tanh.S
ia64: strip trailing whitespace
[thirdparty/glibc.git] / sysdeps / ia64 / fpu / s_tanh.S
CommitLineData
d5efd131
MF
1.file "tanh.s"
2
3
4// Copyright (c) 2001 - 2005, Intel Corporation
5// All rights reserved.
6//
7// Contributed 2001 by the Intel Numerics Group, Intel Corporation
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// * Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// * Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// * The name of Intel Corporation may not be used to endorse or promote
21// products derived from this software without specific prior written
22// permission.
23
0347518d
MF
24// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
d5efd131 26// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
0347518d 27// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
d5efd131 28// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
0347518d
MF
29// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
30// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
31// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
d5efd131 32// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
0347518d
MF
33// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35//
d5efd131 36// Intel Corporation is the author of this code, and requests that all
0347518d 37// problem reports or change requests be submitted to it directly at
d5efd131
MF
38// http://www.intel.com/software/products/opensource/libraries/num.htm.
39//
40// History
41//==============================================================================
42// 05/30/01 Initial version
43// 12/04/01 Rewritten version with erf-like algorithm.
44// Performance improved.
45// 05/20/02 Cleaned up namespace and sf0 syntax
46// 08/14/02 Changed mli templates to mlx
47// 02/10/03 Reordered header: .section, .global, .proc, .align
48// 03/31/05 Reformatted delimiters between data tables
49//
50// API
51//==============================================================================
52// double tanh(double)
53//
54// Overview of operation
55//==============================================================================
56//
57// Algorithm description
58// ---------------------
59//
60// There are 4 paths:
61//
62// 1. Special path: x = 0, Inf, NaNs, denormals
63// Return tanh(x) = +/-0.0 for zeros
64// Return tanh(x) = QNaN for NaNs
65// Return tanh(x) = sign(x)*1.0 for Inf
66// Return tanh(x) = x + x^2 for - denormals
67// Return tanh(x) = x - x^2 for + denormals
68//
69// 2. Near zero path: 0.0 < |x| < 0.25
70// Return tanh(x) = x + x^3*A3 + ... + x^19*A19
71//
72// 3. Main path: 0.25 <= |x| < 19.0625
73// For several ranges of 0.25 <= |x| < 19.0625
0347518d 74// Return tanh(x) = sign(x)*(A0 + y*A1 + y^2*A2 +
d5efd131
MF
75// + y^3*A3 + ... + y^19*A19)
76// where y = (|x|/a) - b
0347518d 77//
d5efd131
MF
78// For each range there is particular set of coefficients.
79// Below is the list of ranges:
80// 1/4 <= |x| < 1/2 a = 0.25, b = 1.0
81// 1/2 <= |x| < 1.0 a = 0.5, b = 1.0
82// 1.0 <= |x| < 2.0 a = 1.0, b = 1.0
83// 2.0 <= |x| < 3.25 a = 2.0, b = 1.0
84// 3.25 <= |x| < 4.0 a = 2.0, b = 2.0
85// 4.0 <= |x| < 6.5 a = 4.0, b = 1.0
86// 6.5 <= |x| < 8.0 a = 4.0, b = 2.0
87// 8.0 <= |x| < 13.0 a = 8.0, b = 1.0
88// 13.0 <= |x| < 16.0 a = 8.0, b = 2.0
89// 16.0 <= |x| < 19.0625 a = 16.0, b = 1.0
0347518d 90// ( [3.25;4.0], [6.5;8.0], [13.0;16.0] subranges separated
d5efd131
MF
91// to resolve monotonicity issues )
92//
0347518d 93// 4. Saturation path: 19.0625 <= |x| < +INF
d5efd131
MF
94// Return tanh(x) = sign(x)*(1.0 - tiny_value)
95// (tiny_value ~ 2^(-63))
96//
97// Registers used
98//==============================================================================
0347518d 99// Floating Point registers used:
d5efd131
MF
100// f8 = input, output
101// f32 -> f64
102//
0347518d 103// General registers used:
d5efd131
MF
104// r32 -> r51, r2, r3
105//
106// Predicate registers used:
107// p6, p7, p8, p10, p11, p12, p14, p15
108// p6 arg is zero, denormal or special IEEE
// (re-set in the special path: arg is +/-Inf)
// p7 arg is zero or NaN (special path only)
0347518d 109// p8 to filter out case when signd(x) > 1.625
d5efd131 110// p10 to filter out case when |x| < 0.25
0347518d 111// p11 to filter out case when signd(x) <= 1.625
d5efd131
MF
112// p12 to filter out case when |x| >= 19.0625
113// p14 set to 1 for positive x
114// p15 set to 1 for negative x
115
116// Assembly macros
117//==============================================================================
118rDataPtr = r2
119rDataPtr1 = r3
120
121rBias = r33
122rCoeffAddr3 = r34
123rThreeAndQ = r35
124rCoeffAddr2 = r36
125rMask = r37
126rArg = r38
127rSignBit = r39
128rAbsArg = r40
129rSaturation = r41
130rIndex = r42
131rCoeffAddr1 = r43
132rCoeffAddr4 = r44
133rShiftedArg = r45
134rShiftedArgMasked = r46
135rBiasedExpOf4 = r47
136rShiftedAbsArg = r48
137rArgSgnd = r49
138r1625Sgnd = r50
139rTwo = r51
140
141//==============================================================================
142fA0 = f32
143fA1 = f33
144fA2 = f34
145fA3 = f35
146fA4 = f36
147fA5 = f37
148fA6 = f38
149fA7 = f39
150fA8 = f40
151fA9 = f41
152fA10 = f42
153fA11 = f43
154fA12 = f44
155fA13 = f45
156fA14 = f46
157fA15 = f47
158fA16 = f48
159fA17 = f49
160fA18 = f50
161fA19 = f51
162fArgSqr = f52
163fArgAbsNorm = f53
164fSignumX = f54
165fRes = f55
166fThreeAndQ = f56
167fArgAbs = f57
168fTSqr = f58
169fTQuadr = f59
170fTDeg3 = f60
171fTDeg7 = f61
0347518d 172fArgAbsNormSgn = f62
d5efd131
MF
173fTQuadrSgn = f63
174fTwo = f64
175
176// Data tables
177//==============================================================================
178RODATA
179
180.align 16
181
182LOCAL_OBJECT_START(tanh_data)
183// CAUTION: The order of these table coefficients shouldn't be changed!
184
185// Main path coefficients:
186// Coefficients ##0..15 ("main" coefficient tables)
0347518d 187// Polynomial coefficients for the tanh(x), 0.25 <= |x| < 0.5
d5efd131
MF
188data8 0xE9D218BC9A3FB55A, 0x00003FC7 //A19
189data8 0xC8C0D38687F36EBA, 0x00003FCE //A18
190data8 0xA2663E519FAC8A43, 0x0000BFD2 //A17
191data8 0xD913F0490674B0DF, 0x00003FD3 //A16
192data8 0xF75D84789DE0AE52, 0x00003FD6 //A15
193data8 0xACB3C40EEF3A06F0, 0x0000BFD9 //A14
194data8 0xEBD7F5DC02CFD5BA, 0x0000BFDB //A13
195data8 0x8B52CDF66D709E2A, 0x00003FDF //A12
196data8 0x9EC21F28E05C4A3E, 0x00003FE0 //A11
197data8 0xC412B44D0176F3ED, 0x0000BFE4 //A10
198data8 0x97BF35A34DD1EA4C, 0x0000BFE0 //A9
199data8 0xF89F5B39E3A3AA36, 0x00003FE9 //A8
200data8 0xF2BA654BCEEBA433, 0x0000BFEA //A7
201data8 0x8E1C15876AA589AD, 0x0000BFEF //A6
202data8 0x942226246A8C2A86, 0x00003FF1 //A5
203data8 0x8F06D9FF7DB47261, 0x00003FF4 //A4
204//
0347518d 205// Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0
d5efd131
MF
206data8 0xC4A7B8FB672A8520, 0x00003FDC //A19
207data8 0xA20724B847E13499, 0x0000BFE0 //A18
208data8 0xE17DB53F02E4D340, 0x00003FE2 //A17
209data8 0x90264A1012F4CA6F, 0x0000BFE4 //A16
210data8 0xEBEC9F776F0BF415, 0x0000BFE0 //A15
211data8 0x89AF912B305B45A4, 0x00003FE7 //A14
212data8 0xB4A960B81F5EC36A, 0x0000BFE7 //A13
213data8 0x969A4E95B2DA86B5, 0x0000BFEA //A12
214data8 0x8A3FC0EC082305CB, 0x00003FEC //A11
215data8 0x83D7795BCBE24373, 0x00003FEC //A10
216data8 0xDCBF42AEB82932EC, 0x0000BFEF //A9
217data8 0x83318E61ECAFD804, 0x00003FF0 //A8
218data8 0xEA4DE5746975A914, 0x00003FF2 //A7
219data8 0xCE63E8FA6B96480B, 0x0000BFF4 //A6
220data8 0xDF017BE0D4FE45D8, 0x0000BFF4 //A5
221data8 0xA8A0C6E2226DF3CD, 0x00003FF8 //A4
222//
0347518d 223// Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0
d5efd131
MF
224data8 0x8E89D2EBFDAA160B, 0x00003FE9 //A19
225data8 0xDD9226310A272046, 0x0000BFEC //A18
226data8 0xA038042D28B0D665, 0x00003FEF //A17
227data8 0x8C04796F03516306, 0x0000BFF1 //A16
228data8 0x9CD6A9CB4E90A2FD, 0x00003FF2 //A15
229data8 0xC8980E166F5A84FD, 0x0000BFF2 //A14
230data8 0x9ADFE65F56B7BCFD, 0x00003FED //A13
231data8 0x8B11FDFB5D0A7B96, 0x00003FF4 //A12
232data8 0x8209A125E829CBFA, 0x0000BFF5 //A11
233data8 0xCF38AAC17B85BD76, 0x00003FF1 //A10
234data8 0xD5C2E248D8AB99AB, 0x00003FF6 //A9
235data8 0xE12BE2785727F2D6, 0x0000BFF7 //A8
236data8 0x9FC9EF90F87BF1E2, 0x00003FF6 //A7
237data8 0x9B02FE0DAF42C08F, 0x00003FF9 //A6
238data8 0xBDACE06F531D9491, 0x0000BFFA //A5
239data8 0xE3048AD1DB2F648C, 0x00003FF9 //A4
240//
0347518d 241// Polynomial coefficients for the tanh(x), 2.0 <= |x| < 3.25
d5efd131
MF
242data8 0x856EC3B0330A385A, 0x00003FEB //A19
243data8 0xC641D69DAE2D429C, 0x0000BFF2 //A18
244data8 0xC683EB0BE1343FFF, 0x00003FF5 //A17
245data8 0xC358954224E4E823, 0x0000BFF7 //A16
246data8 0xF813A8D6D396BC5F, 0x00003FF8 //A15
247data8 0xE0ECDFED078D37D6, 0x0000BFF9 //A14
248data8 0x950E4E619855E316, 0x00003FFA //A13
249data8 0x8453B8F93370FB58, 0x0000BFFA //A12
250data8 0xFDBA28430AEC95BA, 0x00003FF7 //A11
251data8 0x9371AAC1FDB1E664, 0x00003FFA //A10
252data8 0xAC972DA97782D88A, 0x0000BFFB //A9
253data8 0xE18F47B10B9CE1BC, 0x00003FFB //A8
254data8 0xAB7C81230BF13BC6, 0x0000BFFB //A7
255data8 0xA6CAAD4A3E31A7D5, 0x0000BFF8 //A6
256data8 0x9CABD76D1D5C3878, 0x00003FFC //A5
257data8 0x92906D077941CAA9, 0x0000BFFD //A4
258//
0347518d 259// Polynomial coefficients for the tanh(x), 4.0 <= |x| < 6.5
d5efd131
MF
260data8 0x9232D19F71709AC9, 0x0000BFF5 //A19
261data8 0x819E31323F5DD3F8, 0x00003FF8 //A18
262data8 0xDA8E1CDB8D23DC29, 0x0000BFF9 //A17
263data8 0xE97C7CD8FC0486D8, 0x00003FFA //A16
264data8 0xB0C4AD234D88C9F2, 0x0000BFFB //A15
265data8 0xC5989BFB28FDE267, 0x00003FFB //A14
266data8 0x9B26520EC4EFEE8E, 0x0000BFFB //A13
267data8 0xC4B6F758AD21E574, 0x00003FF9 //A12
268data8 0xCC36E3FFA10D2CFF, 0x00003FFA //A11
269data8 0x8738696FB06A5CED, 0x0000BFFC //A10
270data8 0xD31981825BF39228, 0x00003FFC //A9
271data8 0x82C58FB9BEE43992, 0x0000BFFD //A8
272data8 0x88D5AAE49164B6F3, 0x00003FFD //A7
273data8 0xF4CA0B968AF2DDE2, 0x0000BFFC //A6
274data8 0xB99874B482BD17EE, 0x00003FFC //A5
275data8 0xE93FB2F99431DC1D, 0x0000BFFB //A4
276//
0347518d 277// Polynomial coefficients for the tanh(x), 8.0 <= |x| < 13.0
d5efd131
MF
278data8 0xAAA9EB7EADA85CEC, 0x00003FF5 //A19
279data8 0x980C80EE05A6BE78, 0x0000BFF8 //A18
280data8 0x818DA9F5396390A5, 0x00003FFA //A17
281data8 0x8D8CC21E23D8A6A2, 0x0000BFFB //A16
282data8 0xE0EC19E55A886765, 0x00003FFB //A15
283data8 0x8C11197A7E6244C5, 0x0000BFFC //A14
284data8 0x901D2BF203C2F7F3, 0x00003FFC //A13
285data8 0xFEACAEE66EE803E5, 0x0000BFFB //A12
286data8 0xC684E4925E318C3F, 0x00003FFB //A11
287data8 0x8A9D8A970565F28D, 0x0000BFFB //A10
288data8 0xAE34C61DE5CEA4D4, 0x00003FFA //A9
289data8 0xC44C5714BD6208A0, 0x0000BFF9 //A8
290data8 0xC4612F7D6C8BDB79, 0x00003FF8 //A7
291data8 0xABD91DCE40D5EECB, 0x0000BFF7 //A6
292data8 0x80E375C1B847B72F, 0x00003FF6 //A5
293data8 0xA11C7DD978CF700A, 0x0000BFF4 //A4
294//
0347518d 295// Polynomial coefficients for the tanh(x), 16.0 <= |x| < 19.0625
d5efd131
MF
296data8 0xE29D17C510F86F6B, 0x00003FF3 //A19
297data8 0x88FE52EB39A3A98C, 0x0000BFF5 //A18
298data8 0xA406547E50360693, 0x00003FF5 //A17
299data8 0x83E6260B71C6D7DE, 0x0000BFF5 //A16
300data8 0xA36AB5B0CBC97B85, 0x00003FF4 //A15
301data8 0xA94931E0B7BA6C14, 0x0000BFF3 //A14
302data8 0x9A4596DAF350AD63, 0x00003FF2 //A13
303data8 0xFE47643F375AECA5, 0x0000BFF0 //A12
304data8 0xBF8433C5ABEE63B1, 0x00003FEF //A11
305data8 0x83CEE05D7AE90A0A, 0x0000BFEE //A10
306data8 0xA4CC45480BCEB02D, 0x00003FEC //A9
307data8 0xB967CBDCBC16CB10, 0x0000BFEA //A8
308data8 0xB9681B214EDC098D, 0x00003FE8 //A7
309data8 0xA23B20D87B80DFA8, 0x0000BFE6 //A6
310data8 0xF358B2C46F10CBAF, 0x00003FE3 //A5
311data8 0x98176FD06229A385, 0x0000BFE1 //A4
312//
313// Binary subranges
0347518d 314// Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4.0
d5efd131
MF
315data8 0xEF2EE841288F6706, 0x00003FE9 //A19
316data8 0xE65D5B74B85F82A6, 0x00003FEB //A18
317data8 0xE495FC21E42A79FF, 0x00003FEA //A17
318data8 0xF99B267A913CF3E5, 0x00003FEC //A16
319data8 0xFE3D700F4A0A0FDE, 0x0000BFEC //A15
320data8 0x8F91BB4EE4E4EA52, 0x00003FEE //A14
321data8 0xBCA9F41A5C6EF8BA, 0x0000BFEE //A13
322data8 0xF93E00884027A9CF, 0x00003FED //A12
323data8 0xC4D4036A61BABC2F, 0x00003FEF //A11
324data8 0x86CC2AD1AD47C7D5, 0x0000BFF2 //A10
325data8 0xD3065DEF4CE9AD32, 0x00003FF3 //A9
326data8 0x82C44125F568D54E, 0x0000BFF5 //A8
327data8 0x88D588729BAF14CA, 0x00003FF6 //A7
328data8 0xF4CA0661307243C7, 0x0000BFF6 //A6
329data8 0xB998746D57061F74, 0x00003FF7 //A5
330data8 0xE93FB2F482327C19, 0x0000BFF7 //A4
331//
0347518d 332// Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0
d5efd131
MF
333data8 0xEB189B71ADC40BE2, 0x00003FEA //A19
334data8 0xA60B46F9FF6DC2DF, 0x00003FEA //A18
335data8 0xBB061CDD9F368B9D, 0x00003FEC //A17
336data8 0x841E08BDF5429991, 0x0000BFEC //A16
337data8 0xDD33990B433F25BE, 0x00003FED //A15
338data8 0xBA5DE6B870F0A2BB, 0x0000BFEE //A14
339data8 0xA71D489AAA6DACF0, 0x00003FEF //A13
340data8 0x874CCB2B8F3FBC0E, 0x0000BFF0 //A12
341data8 0xCB1D2E9754EA534A, 0x00003FF0 //A11
342data8 0x8BA5ABB53BA6ABCF, 0x0000BFF1 //A10
343data8 0xAE91FD1C2391A32B, 0x00003FF1 //A9
344data8 0xC465A74B798E5761, 0x0000BFF1 //A8
345data8 0xC4666152397D15C1, 0x00003FF1 //A7
346data8 0xABD9E63CA575B950, 0x0000BFF1 //A6
347data8 0x80E38B18E8D0F460, 0x00003FF1 //A5
348data8 0xA11C80E20AAFDD3C, 0x0000BFF0 //A4
349//
0347518d 350// Polynomial coefficients for the tanh(x), 13.0 <= |x| < 16.0
d5efd131
MF
351data8 0xBECD0AF7E22E5594, 0x00003FE9 //A19
352data8 0xE2834E2D68C1128C, 0x00003FEA //A18
353data8 0x97B117611B317379, 0x00003FEB //A17
354data8 0xEE91A0D39A772F6B, 0x00003FEA //A16
355data8 0x92F6EC377DCADA4F, 0x00003FEA //A15
356data8 0xD8FCCD6A3277FAB7, 0x00003FE8 //A14
357data8 0xC15AB9CB0C3DCFE0, 0x00003FE7 //A13
358data8 0xC3C659704A7147CD, 0x00003FE2 //A12
359data8 0xFA17F09D27C97912, 0x00003FE4 //A11
360data8 0xF664147182B94788, 0x0000BFE3 //A10
361data8 0xA6C89FA741464DA1, 0x00003FE3 //A9
362data8 0xB90FE464A825EFA8, 0x0000BFE2 //A8
363data8 0xB973AE0FD86EC024, 0x00003FE1 //A7
364data8 0xA23A087F96846951, 0x0000BFE0 //A6
365data8 0xF358D8A7FC012D5D, 0x00003FDE //A5
366data8 0x98176E2309B7C73A, 0x0000BFDD //A4
367//
368// Coefficients ##16..19 ("tail" coefficient tables)
0347518d 369// Polynomial coefficients for the tanh(x), 0.25 <= |x| < 0.5
d5efd131
MF
370data8 0x838F209ABB9BA7B3, 0x0000BFF7 //A3
371data8 0xEBC0AC78DA4FC500, 0x0000BFF8 //A2
372data8 0xF0A4D02960B60E69, 0x00003FFC //A1
373data8 0xFACBF534D0E42F8A, 0x00003FFC //A0
374//
0347518d 375// Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0
d5efd131
MF
376data8 0xC0ECBDC0A0D133A6, 0x0000BFF8 //A3
377data8 0xBA13A076BF8E812F, 0x0000BFFB //A2
378data8 0xC954A37D1A1CA070, 0x00003FFD //A1
379data8 0xEC9A9EBAB4579B29, 0x00003FFD //A0
380//
0347518d 381// Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0
d5efd131
MF
382data8 0xD42E9175A6EA1397, 0x00003FFB //A3
383data8 0xA3C361378A55CF56, 0x0000BFFD //A2
384data8 0xD706E07CC8622983, 0x00003FFD //A1
385data8 0xC2F7D5A8A79CA2AC, 0x00003FFE //A0
386//
387// Polynomial coefficients for the tanh(x), 2.0 <= |x| < 3.25
388data8 0xAC7A7F8776817C7E, 0x00003FFD //A3
389data8 0x8B7CE95E69FCFE9A, 0x0000BFFD //A2
390data8 0x90B161317028D995, 0x00003FFC //A1
391data8 0xF6CA82F0DE1E9E9A, 0x00003FFE //A0
392//
393// Polynomial coefficients for the tanh(x), 4.0 <= |x| < 6.5
394data8 0xE9E072407BC22DC6, 0x00003FFA //A3
395data8 0xAFA4A913D8E6BB4A, 0x0000BFF9 //A2
396data8 0xAFC2D6A885BAA875, 0x00003FF7 //A1
397data8 0xFFD40B84505A10B2, 0x00003FFE //A0
398//
399// Polynomial coefficients for the tanh(x), 8.0 <= |x| < 13.0
400data8 0xA11C8A1FED168CD5, 0x00003FF2 //A3
401data8 0xF1AAD6B02063A5F5, 0x0000BFEF //A2
402data8 0xF1AADA46AD341C34, 0x00003FEC //A1
403data8 0xFFFFFC39548FC34B, 0x00003FFE //A0
404//
405// Polynomial coefficients for the tanh(x), 16.0 <= |x| < 19.0625
406data8 0x98176FD1F0950C16, 0x00003FDE //A3
407data8 0xE42327BB09C8B2A5, 0x0000BFDA //A2
408data8 0xE42327BB0B154F13, 0x00003FD6 //A1
409data8 0xFFFFFFFFFFF8DEE7, 0x00003FFE //A0
410//
411// Binary subranges
412// Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4.0
413data8 0xE9E072404329293B, 0x00003FF7 //A3
414data8 0xAFA4A913D798300B, 0x0000BFF7 //A2
415data8 0xAFC2D6A885B48567, 0x00003FF6 //A1
416data8 0xFFD40B84505A10B4, 0x00003FFE //A0
417//
418// Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0
419data8 0xA11C8A63815F7A28, 0x00003FEF //A3
420data8 0xF1AAD6B65B0EBF53, 0x0000BFED //A2
421data8 0xF1AADA46E799831F, 0x00003FEB //A1
422data8 0xFFFFFC39548FC348, 0x00003FFE //A0
423//
424// Polynomial coefficients for the tanh(x), 13.0 <= |x| < 16.0
425data8 0x98176FE982140A59, 0x00003FDB //A3
426data8 0xE42327B9B0D7202F, 0x0000BFD8 //A2
427data8 0xE42327BB13076BD6, 0x00003FD5 //A1
428data8 0xFFFFFFFFFFF8DEE7, 0x00003FFE //A0
429//
0347518d 430// Polynomial coefficients for the tanh(x), 0.0 <= |x| < 0.25
d5efd131
MF
431// ('tanh_near_zero' path)
432data8 0xBF2BA5D26E479D0C //A9
433data8 0x3F4336D96F81EE26 //A8
434data8 0xBF8226E34AE197B0 //A5
435data8 0x3F9664F488148657 //A4
436data8 0xAAAAAAAAAAAAAA99, 0x0000BFFD //A1
437data8 0xBF57D91925BB5EE2 //A7
438data8 0x3F6D6D36C3D5B7A1 //A6
439data8 0xBFABA1BA1BA19D32 //A3
440data8 0x3FC1111111111108 //A2
441//
442// 1.0 - 2^(-63)
443// ('tanh_saturation' path)
0347518d 444data8 0xFFFFFFFFFFFFFFFF, 0x00003FFE
d5efd131
MF
445LOCAL_OBJECT_END(tanh_data)
446
447// CAUTION: The order of table coefficients shouldn't be changed!
448
449
450.section .text
// double tanh(double) entry point; argument and result are both in f8.
// This entry code classifies x, branches to _tanh_spec (0, denormals, Inf,
// NaN), tanh_near_zero (|x| < 0.25) or tanh_saturation (|x| >= 19.0625),
// and otherwise falls through to the main path: it selects one of ten
// coefficient tables from the exponent of x, forms y from the significand
// of x, and returns sign(x)*(A0 + y*A1 + ... + y^19*A19).
451GLOBAL_LIBM_ENTRY(tanh)
452{ .mfi
453 alloc r32 = ar.pfs, 0, 20, 0, 0
454 fmerge.se fArgAbsNorm = f1, f8 // normalized x: significand of x with sign and exponent of 1.0
455 adds rSignBit = 0x1, r0 // Bit for sign removing
456}
457{ .mfi
458 addl rDataPtr = @ltoff(tanh_data), gp // Data pointer
459 fma.s1 fTwo = f1, f1, f1 // 2.0 construct
460 addl rArgSgnd = 0xfff, r0 // mask for sign and exponent (before shift)
461};;
462
463{ .mfi
0347518d
MF
464 getf.d rArg = f8 // x in GR
465 fclass.m p6,p0 = f8, 0xEF // Filter 0, denormals and specials
d5efd131
MF
466 // 0xEF = @qnan|@snan|@pos|@neg|@zero|@unorm|@inf
467 shl rArgSgnd = rArgSgnd, 52 // mask for sign and exponent (bits 63..52)
468}
469{ .mlx
470 ld8 rDataPtr = [rDataPtr] // Real data pointer
471 movl r1625Sgnd = 0xA000000000000 // 1.625 signd
472 // 1.625 significand used to filter values greater than 3.25, 6.5, 13.0
473 // to enter binary subranges
474};;
475
476{ .mfi
477 addl rBias = 0x3FD00, r0 // bias of 0.25 << 8
478 fma.s1 fArgSqr = f8, f8, f0 // x^2
479 shl rSignBit = rSignBit, 63 // mask for sign bit
480}
481{ .mlx
482 addl rMask = 0x7FF00, r0 // Mask for index bits
483 movl rTwo = 0x4000000000000000 // 2.0
484};;
485
486{ .mfi
487 andcm rArgSgnd = rArg, rArgSgnd // Remove sign and exponent, keep significand bits
488 nop.f 0
489 shr.u rShiftedArg = rArg, 44 // Select only necessary bits of arg
490}
491{ .mfb
492 andcm rAbsArg = rArg, rSignBit // Remove sign
493 nop.f 0
494(p6) br.cond.spnt _tanh_spec // Branch to zero, denorm & specs
495};;
0347518d 496
d5efd131
MF
497{ .mfi
498 and rShiftedArgMasked = rShiftedArg, rMask // biased exponent of x << 8
499 fmerge.s fArgAbs = f1, f8 // |x|
0347518d 500 shr rShiftedAbsArg = rAbsArg, 44 // Select only necessary
d5efd131
MF
501 // bits of absolute arg
502}
503{ .mfi
504 cmp.gt p8, p11 = rArgSgnd, r1625Sgnd // p8 = 1 if
505 // signd(x) > 1.625 - to filter values greater than 3.25, 6.5, 13.0
506 nop.f 0
507 nop.i 0
508};;
509
510{ .mfi
511 sub rIndex = rShiftedArgMasked, rBias // index << 8
0347518d 512 nop.f 0
d5efd131
MF
513 cmp.lt p10, p0 = rShiftedArgMasked, rBias // p10=1 if |x|<0.25
514}
515{ .mfb
516(p8) cmp.gt p8, p11 = rAbsArg, rTwo // If arg is greater than 2.0?
517 // (then we should use binary subranges)
0347518d 518 nop.f 0
d5efd131
MF
519(p10) br.cond.spnt tanh_near_zero // branch out if |x| < 0.25
520};;
521
522.pred.rel "mutex",p8,p11
523{ .mfi
0347518d 524(p8) add rIndex = 0x400, rIndex // Make pointer to binary
d5efd131
MF
525 // subranges
526(p11) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f1 // y = |x|/a - 1.0
527 addl rSaturation = 0x40331, r0 // shifted bits of 19.0625
528}
529{ .mfi
0347518d 530 nop.m 0
d5efd131
MF
531(p8) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, fTwo // y = |x|/a - 2.0
532 // this is only for binary subranges [3.25;4], [6.5;8], [13.0;16]
0347518d 533 nop.i 0
d5efd131
MF
534}
535;;
536
537{ .mfi
538 add rCoeffAddr1 = rDataPtr, rIndex// coeff. ##0,2,..14
539 nop.f 0
540 nop.i 0
541};;
542
543{ .mfi
544 adds rCoeffAddr2 = 16, rCoeffAddr1 // Shifted pointer to coeffs
545 fmerge.s fSignumX = f8, f1 // signum(x)
546 nop.i 0
0347518d 547}
d5efd131
MF
548{ .mfb
549 cmp.le p12, p0 = rSaturation, rShiftedAbsArg // |x|>=19.0625?
550 nop.f 0
551(p12) br.cond.spnt tanh_saturation // branch out if |x| >= 19.0625
552};;
553
554{.mfi
555 ldfe fA19 = [rCoeffAddr1], 32 // Load A19
556 nop.f 0
557 nop.i 0
558}
559{.mfi
560 ldfe fA18 = [rCoeffAddr2], 32 // Load A18
561 nop.f 0
562 adds rCoeffAddr3 = 0xA00, rDataPtr // Pointer to "tail"
563 // coefficients tables
564};;
565
566{.mfi
567 ldfe fA17 = [rCoeffAddr1], 32 // Load A17
568 nop.f 0
569 nop.i 0
570}
571{.mfi
572 ldfe fA16 = [rCoeffAddr2], 32 // Load A16
573 nop.f 0
574 nop.i 0
575};;
576
577{.mfi
578 ldfe fA15 = [rCoeffAddr1], 32 // Load A15
579 fma.s1 fTSqr = fArgAbsNorm, fArgAbsNorm, f0 // y^2
580 shr.u rIndex = rIndex, 2 // Index for "tail" tables
581}
582{.mfi
583 ldfe fA14 = [rCoeffAddr2], 32 // Load A14
584 nop.f 0
585 adds rCoeffAddr4 = 16, r0 // Offset of the shifted pointer
586 // to "tail" tables
587};;
588
589{.mfi
590 ldfe fA13 = [rCoeffAddr1], 32 // Load A13
591 nop.f 0
592 add rCoeffAddr3 = rCoeffAddr3, rIndex // "tail" coeffs to load
593 // ##16..23
594}
595{.mfi
596 ldfe fA12 = [rCoeffAddr2], 32 // Load A12
597 nop.f 0
0347518d 598 cmp.lt p15, p14 = rArg, r0 // Arg positive (p14)
d5efd131
MF
599 // or negative (p15)?
600};;
601
602{.mfi
603 ldfe fA11 = [rCoeffAddr1], 32 // Load A11
604 nop.f 0
0347518d
MF
605 add rCoeffAddr4 = rCoeffAddr3, rCoeffAddr4 // shifted "tail"
606 // coeffs to load
d5efd131
MF
607}
608{.mfi
609 ldfe fA10 = [rCoeffAddr2], 32 // Load A10
610 nop.f 0
611 nop.i 0
612};;
613
614{.mfi
615 ldfe fA9 = [rCoeffAddr1], 32 // Load A9
616 nop.f 0
617 nop.i 0
618}
619{.mfi
620 ldfe fA8 = [rCoeffAddr2], 32 // Load A8
621 nop.f 0
622 nop.i 0
623};;
624
625{.mfi
626 ldfe fA7 = [rCoeffAddr1], 32 // Load A7
627 nop.f 0
628 nop.i 0
629}
630{.mfi
631 ldfe fA6 = [rCoeffAddr2], 32 // Load A6
632 nop.f 0
633 nop.i 0
634};;
635
636{.mfi
637 ldfe fA5 = [rCoeffAddr1], 32 // Load A5
638 fma.s1 fTDeg3 = fArgAbsNorm, fTSqr, f0 // y^3
639 nop.i 0
640}
641{.mfi
642 ldfe fA4 = [rCoeffAddr2], 32 // Load A4
643 fma.s1 fTQuadr = fTSqr, fTSqr, f0 // y^4
644 nop.i 0
645};;
646
647// Path #3 Polynomial Pol19(y) computation; y = fArgAbsNorm
648{.mfi
649 ldfe fA3 = [rCoeffAddr3], 32 // Load A3
650 fma.s1 fArgAbsNormSgn = fArgAbsNorm, fSignumX, f0 // sign(x)*y
651 nop.i 0
652}
653{.mfi
654 ldfe fA2 = [rCoeffAddr4], 32 // Load A2
655 nop.f 0
656 nop.i 0
657};;
658
659{.mfi
660 ldfe fA1 = [rCoeffAddr3], 32 // Load A1
661 fma.s1 fRes = fA19, fArgAbsNorm, fA18 // Polynomial
662 nop.i 0
663}
664{.mfi
665 ldfe fA0 = [rCoeffAddr4], 32 // Load A0
666 nop.f 0
667 nop.i 0
668};;
669
670{ .mfi
671 nop.m 0
672 fma.s1 fA17 = fA17, fArgAbsNorm, fA16 // Polynomial
673 nop.i 0
674};;
675
676{ .mfi
677 nop.m 0
678 fma.s1 fA15 = fA15, fArgAbsNorm, fA14 // Polynomial
679 nop.i 0
680};;
681
682{ .mfi
683 nop.m 0
684 fma.s1 fTDeg7 = fTDeg3, fTQuadr, f0 // Polynomial: y^7
685 nop.i 0
686}
687{ .mfi
688 nop.m 0
689 fma.s1 fA13 = fA13, fArgAbsNorm, fA12 // Polynomial
690 nop.i 0
691};;
692
693{ .mfi
694 nop.m 0
695 fma.s1 fA11 = fA11, fArgAbsNorm, fA10 // Polynomial
696 nop.i 0
697};;
698
699{ .mfi
700 nop.m 0
701 fma.s1 fA9 = fA9, fArgAbsNorm, fA8 // Polynomial
702 nop.i 0
703};;
704
705{ .mfi
706 nop.m 0
707 fma.s1 fRes = fRes, fTSqr, fA17 // Polynomial
708 nop.i 0
709}
710{ .mfi
711 nop.m 0
712 fma.s1 fA7 = fA7, fArgAbsNorm, fA6 // Polynomial
713 nop.i 0
714};;
715
716{ .mfi
717 nop.m 0
718 fma.s1 fA5 = fA5, fArgAbsNorm, f0 // Polynomial
719 nop.i 0
720};;
721
722{ .mfi
723 nop.m 0
0347518d 724 fma.s1 fA15 = fA15, fTSqr, fA13 // Polynomial
d5efd131
MF
725 nop.i 0
726}
727{ .mfi
728 nop.m 0
729 fma.s1 fA4 = fA4, fArgAbsNorm, fA3 // Polynomial
730 nop.i 0
731};;
732
733{ .mfi
734 nop.m 0
735 fma.s1 fA2 = fA2, fArgAbsNorm, fA1 // Polynomial
736 nop.i 0
737};;
738
739{ .mfi
740 nop.m 0
741 fma.s1 fA11 = fA11, fTSqr, fA9 // Polynomial
742 nop.i 0
743};;
744
745{ .mfi
0347518d 746 nop.m 0
d5efd131
MF
747 fma.s1 fA7 = fA7, fTSqr, fA5 // Polynomial
748 nop.i 0
749};;
750
751{ .mfi
0347518d 752 nop.m 0
d5efd131
MF
753 fma.s1 fRes = fRes, fTQuadr, fA15 // Polynomial
754 nop.i 0
755};;
756
757{ .mfi
0347518d 758 nop.m 0
d5efd131
MF
759 fma.s1 fA4 = fA4, fTSqr, fA2 // Polynomial
760 nop.i 0
761};;
762
763{ .mfi
764 nop.m 0
765 fma.s1 fRes = fRes, fTQuadr, fA11 // Polynomial
766 nop.i 0
767};;
768
769{ .mfi
0347518d 770 nop.m 0
d5efd131
MF
771 fma.s1 fA4 = fA7, fTDeg3, fA4 // Polynomial
772 nop.i 0
773};;
774
775{ .mfi
776 nop.m 0
777 fma.s1 fRes = fRes, fTDeg7, fA4 // Polynomial: A1 + y*A2 + ... + y^18*A19
778 nop.i 0
779};;
780
781{ .mfi
782 nop.m 0
783 // result for negative argument
784(p15) fms.d.s0 f8 = fRes, fArgAbsNormSgn, fA0 // Polynomial
785 nop.i 0
786}
787{ .mfb
788 nop.m 0
789 // result for positive argument
790(p14) fma.d.s0 f8 = fRes, fArgAbsNormSgn, fA0 // Polynomial
791 br.ret.sptk b0
792};;
793
794
795// |x| < 0.25 Path /////////////////////////////////////////////////////////////
// Evaluates tanh(x) = x + x*P(x^2), where
// P(t) = A1*t + A2*t^2 + ... + A9*t^9 and t = x^2 (fArgSqr, computed in the
// entry code). A2..A9 are loaded as doubles in pairs, A1 as an extended value.
796.align 32
797tanh_near_zero:
798{ .mfi
799 adds rCoeffAddr1 = 0xC80, rDataPtr // address of A9
0347518d 800 fma.s0 fTSqr = fArgSqr, fArgSqr, f0 // x^4
d5efd131
MF
801 nop.i 0
802}
803{ .mfi
804 adds rCoeffAddr2 = 0xCB0, rDataPtr // address of A7
805 nop.f 0
806 nop.i 0
807};;
808
809{ .mfi
810 ldfpd fA9, fA8 = [rCoeffAddr1], 16 // Load A9, A8
811 nop.f 0
812 nop.i 0
813}
814{ .mfi
815 ldfpd fA7, fA6 = [rCoeffAddr2], 16 // Load A7, A6
816 nop.f 0
817 nop.i 0
818};;
819
820{ .mfi
821 ldfpd fA5, fA4 = [rCoeffAddr1], 16 // Load A5, A4
822 nop.f 0
823 nop.i 0
824}
825{ .mfi
826 ldfpd fA3, fA2 = [rCoeffAddr2], 16 // Load A3, A2
827 nop.f 0
828 nop.i 0
829};;
830
831{ .mfi
832 ldfe fA1 = [rCoeffAddr1] // Load A1
833 nop.f 0
834 nop.i 0
835};;
836
837{ .mfi
838 nop.m 0
839 fma.s1 fTQuadr = fTSqr, fTSqr, f0 // x^8
840 nop.i 0
841};;
842
843{ .mfi
844 nop.m 0
845 fma.s1 fRes = fA9, fArgSqr, fA8 // Polynomial
846 nop.i 0
847}
848{ .mfi
849 nop.m 0
850 fma.s1 fA7 = fA7, fArgSqr, fA6 // Polynomial
851 nop.i 0
852};;
853
854{ .mfi
855 nop.m 0
856 fma.s1 fA3 = fA3, fArgSqr, fA2 // Polynomial
857 nop.i 0
858}
859{ .mfi
860 nop.m 0
861 fma.s1 fA5 = fA5, fArgSqr, fA4 // Polynomial
862 nop.i 0
863};;
864
865{ .mfi
866 nop.m 0
867 fma.s1 fA1 = fA1, fArgSqr, f0 // Polynomial
868 nop.i 0
869}
870{ .mfi
871 nop.m 0
872 fma.s1 fTQuadrSgn = fTQuadr, f8, f0 // x^8 * x
// NOTE(review): fTQuadrSgn is not read by any later instruction in the
// code shown here - confirm whether this fma is still needed.
873 nop.i 0
874};;
875
876{ .mfi
877 nop.m 0
878 fma.s1 fRes = fRes, fTSqr, fA7 // Polynomial
879 nop.i 0
880};;
881
882{ .mfi
883 nop.m 0
884 fma.s1 fA1 = fA3, fTSqr, fA1 // Polynomial
885 nop.i 0
886};;
887
888{ .mfi
889 nop.m 0
890 fma.s1 fRes = fRes, fTSqr, fA5 // Polynomial
891 nop.i 0
892};;
893
894{ .mfi
895 nop.m 0
896 fma.s1 fRes = fRes, fTQuadr, fA1 // Polynomial: A1*x^2 + ... + A9*x^18
897 nop.i 0
898};;
899
900{ .mfb
901 nop.m 0
902 fma.d.s0 f8 = fRes, f8, f8 // x+x*Polynomial
903 br.ret.sptk b0 // Exit for |x| < 0.25
904};;
905
906
907
908
909
910// 19.0625 <= |x| < +inf Saturation path ///////////////////////////////////////
// Returns sign(x)*(1.0 - 2^(-63)), rounded to double by the final fma.d.
// fSignumX was set by the entry code before the branch to this label.
911.align 32
912tanh_saturation:
913{ .mfi
914 adds rDataPtr = 0xCD0, rDataPtr // address of the 1.0 - 2^(-63) constant
915 nop.f 0
916 nop.i 0
917};;
918
919{ .mfi
920 ldfe fA0 = [rDataPtr] // Load A0 = 1.0 - 2^(-63)
921 nop.f 0
922 nop.i 0
923};;
924
925{ .mfb
926 nop.m 0
927 fma.d.s0 f8 = fA0, fSignumX, f0 // sign(x)*(1.0-2^(-63))
928 br.ret.sptk b0 // Exit for 19.0625 <=|x|< +inf
929};;
930
931
932
933
0347518d 934
d5efd131
MF
935// 0, denormals and special IEEE numbers path /////////////////////////////////
// Reached from the entry code when fclass matched 0xEF.
// p6 is re-set here for infinities (result +/-1.0), p7 for zeros and NaNs
// (result x + x); whatever remains is normalized and returned as
// x - x^2 (positive arg) or x + x^2 (negative arg).
936_tanh_spec:
937

0347518d
MF
938{ .mfi
939 cmp.lt p15, p14 = rArg, r0 // Is arg negative (p15)
d5efd131
MF
940 // or positive (p14)
941 fclass.m p6,p0 = f8, 0x23 // To filter infinities
0347518d 942 // 0x23 = @pos|@neg|@inf
d5efd131
MF
943 nop.i 0
944};;
945

0347518d 946{ .mfi
d5efd131
MF
947 nop.m 0
948 fclass.m p7,p0 = f8, 0xC7 // To filter NaNs & Zeros
949 // 0xC7 = @pos|@neg|@zero|@qnan|@snan
950 nop.i 0
951};;
952

0347518d 953{ .mfb
d5efd131 954 nop.m 0
0347518d 955(p6) fmerge.s f8 = f8, f1 // +/-1 for INF args
d5efd131
MF
956(p6) br.ret.spnt b0 // exit for x = INF
957};;
958

0347518d 959{ .mfb
d5efd131 960 nop.m 0
0347518d 961(p7) fma.d.s0 f8 = f8, f1, f8 // +/-0 for 0 args
d5efd131
MF
962 // and NaNs for NaNs
963(p7) br.ret.spnt b0 // exit for x = NaN or +/-0
964};;
965

0347518d 966{ .mfi
d5efd131
MF
967 nop.m 0
968 fnorm.s0 f8 = f8 // Normalize arg
969 nop.i 0
970};;
971

972.pred.rel "mutex",p14,p15
0347518d 973{ .mfi
d5efd131
MF
974 nop.m 0
975(p14) fnma.d.s0 f8 = f8, f8, f8 // res = r-r^2
976 nop.i 0
977}
0347518d 978{ .mfb
d5efd131
MF
979 nop.m 0
980(p15) fma.d.s0 f8 = f8, f8, f8 // res = r+r^2
981 br.ret.sptk b0 // 0, denormals, specials return
982};;
983
984GLOBAL_LIBM_END(tanh)
985
986