]>
Commit | Line | Data |
---|---|---|
6dea4dd3 | 1 | /* Function atanhf vectorized with SSE4. |
6d7e8eda | 2 | Copyright (C) 2021-2023 Free Software Foundation, Inc. |
6dea4dd3 SP |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with the GNU C Library; if not, see | |
17 | https://www.gnu.org/licenses/. */ | |
18 | ||
19 | /* | |
20 | * ALGORITHM DESCRIPTION: | |
21 | * | |
22 | * Compute atanh(x) as 0.5 * log((1 + x)/(1 - x)) | |
23 | * | |
24 | * Special cases: | |
25 | * | |
26 | * atanh(0) = 0 | |
27 | * atanh(+1) = +INF | |
28 | * atanh(-1) = -INF | |
29 | * atanh(x) = NaN if |x| > 1, or if x is a NaN or INF | |
30 | * | |
31 | */ | |
32 | ||
fe1915d4 NG |
33 | /* Offsets for data table __svml_satanh_data_internal. Ordered |
34 | by use in the function. On cold-starts this might help the | |
35 | prefetcher. Possibly a better idea is to interleave start/end so | |
36 | that the prefetcher is less likely to detect a stream and pull | |
37 | irrelevant lines into cache. */ |
38 | #define sOne 0 | |
39 | #define SgnMask 16 | |
40 | #define sTopMask12 32 | |
41 | #define iBrkValue 48 | |
42 | #define iOffExpoMask 64 | |
43 | #define sPoly 80 | |
44 | #define sLn2 208 | |
45 | #define TinyRange 224 | |
6dea4dd3 SP |
46 | |
47 | #include <sysdep.h> | |
fe1915d4 | 48 | #define ATANHF_DATA(x) ((x)+__svml_satanh_data_internal) |
6dea4dd3 | 49 | |
5aa7f304 | 50 | .section .text.sse4, "ax", @progbits |
6dea4dd3 | 51 | ENTRY(_ZGVbN4v_atanhf_sse4) |
5aa7f304 SP |
52 | movaps %xmm0, %xmm5 |
53 | ||
54 | /* Load constants including One = 1 */ | |
fe1915d4 | 55 | movups ATANHF_DATA(sOne)(%rip), %xmm4 |
5aa7f304 SP |
56 | movaps %xmm5, %xmm3 |
57 | ||
58 | /* Strip off the sign, so treat X as positive until right at the end */ | |
fe1915d4 NG |
59 | movups ATANHF_DATA(SgnMask)(%rip), %xmm1 |
60 | movaps %xmm4, %xmm2 | |
61 | andps %xmm1, %xmm0 | |
5aa7f304 | 62 | movaps %xmm4, %xmm10 |
fe1915d4 | 63 | movups ATANHF_DATA(sTopMask12)(%rip), %xmm11 |
5aa7f304 SP |
64 | movaps %xmm4, %xmm14 |
65 | movaps %xmm11, %xmm9 | |
66 | ||
fe1915d4 | 67 | |
5aa7f304 SP |
68 | /* |
69 | * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces, | |
70 | * the upper part UHi being <= 12 bits long. Then we have | |
71 | * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)). | |
72 | */ | |
fe1915d4 NG |
73 | movaps %xmm0, %xmm6 |
74 | mulps %xmm5, %xmm3 | |
75 | subps %xmm0, %xmm2 | |
76 | addps %xmm0, %xmm6 | |
77 | subps %xmm2, %xmm10 | |
78 | addps %xmm5, %xmm3 | |
79 | subps %xmm0, %xmm10 | |
80 | andps %xmm2, %xmm9 | |
81 | ||
5aa7f304 SP |
82 | |
83 | /* | |
84 | * Check whether |X| < 1, in which case we use the main function. | |
85 | * Otherwise set the rangemask so that the callout will get used. | |
86 | * Note that this will also use the callout for NaNs since not(NaN < 1). | |
87 | */ | |
fe1915d4 NG |
88 | rcpps %xmm9, %xmm7 |
89 | subps %xmm9, %xmm2 | |
90 | andps %xmm11, %xmm7 | |
5aa7f304 | 91 | |
5aa7f304 SP |
92 | |
93 | /* | |
94 | * Split V as well into upper 12 bits and lower part, so that we can get | |
95 | * a preliminary quotient estimate without rounding error. | |
96 | */ | |
fe1915d4 NG |
97 | andps %xmm6, %xmm11 |
98 | mulps %xmm7, %xmm9 | |
99 | addps %xmm2, %xmm10 | |
100 | subps %xmm11, %xmm6 | |
5aa7f304 SP |
101 | |
102 | /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */ | |
fe1915d4 NG |
103 | mulps %xmm7, %xmm11 |
104 | mulps %xmm7, %xmm10 | |
5aa7f304 | 105 | subps %xmm9, %xmm14 |
fe1915d4 | 106 | mulps %xmm6, %xmm7 |
5aa7f304 SP |
107 | subps %xmm10, %xmm14 |
108 | ||
109 | /* Compute D = E + E^2 */ | |
110 | movaps %xmm14, %xmm13 | |
111 | movaps %xmm4, %xmm8 | |
112 | mulps %xmm14, %xmm13 | |
113 | ||
fe1915d4 NG |
114 | /* reduction: compute r,n */ |
115 | movdqu ATANHF_DATA(iBrkValue)(%rip), %xmm9 | |
5aa7f304 SP |
116 | addps %xmm13, %xmm14 |
117 | ||
118 | /* | |
119 | * Compute R * (VHi + VLo) * (1 + E + E^2) | |
120 | * = R * (VHi + VLo) * (1 + D) | |
121 | * = QHi + (QHi * D + QLo + QLo * D) | |
122 | */ | |
fe1915d4 NG |
123 | movaps %xmm14, %xmm2 |
124 | mulps %xmm7, %xmm14 | |
125 | mulps %xmm11, %xmm2 | |
126 | addps %xmm14, %xmm7 | |
127 | movdqu ATANHF_DATA(iOffExpoMask)(%rip), %xmm12 | |
5aa7f304 SP |
128 | movaps %xmm4, %xmm14 |
129 | ||
130 | /* Record the sign for eventual reincorporation. */ | |
fe1915d4 NG |
131 | addps %xmm7, %xmm2 |
132 | ||
5aa7f304 SP |
133 | |
134 | /* | |
135 | * Now finally accumulate the high and low parts of the | |
136 | * argument to log1p, H + L, with a final compensated summation. | |
137 | */ | |
fe1915d4 NG |
138 | movaps %xmm2, %xmm6 |
139 | andnps %xmm5, %xmm1 | |
140 | movaps %xmm4, %xmm7 | |
5aa7f304 | 141 | /* Or the sign bit in with the tiny result to handle atanh(-0) correctly */ |
5aa7f304 SP |
142 | addps %xmm11, %xmm6 |
143 | maxps %xmm6, %xmm7 | |
144 | minps %xmm6, %xmm8 | |
145 | subps %xmm6, %xmm11 | |
146 | movaps %xmm7, %xmm10 | |
5aa7f304 | 147 | addps %xmm8, %xmm10 |
fe1915d4 | 148 | addps %xmm11, %xmm2 |
5aa7f304 SP |
149 | subps %xmm10, %xmm7 |
150 | psubd %xmm9, %xmm10 | |
fe1915d4 | 151 | addps %xmm8, %xmm7 |
5aa7f304 SP |
152 | pand %xmm10, %xmm12 |
153 | psrad $23, %xmm10 | |
154 | cvtdq2ps %xmm10, %xmm13 | |
fe1915d4 | 155 | addps %xmm7, %xmm2 |
5aa7f304 SP |
156 | |
157 | /* final reconstruction */ | |
5aa7f304 SP |
158 | pslld $23, %xmm10 |
159 | paddd %xmm9, %xmm12 | |
160 | psubd %xmm10, %xmm14 | |
161 | ||
162 | /* polynomial evaluation */ | |
163 | subps %xmm4, %xmm12 | |
fe1915d4 NG |
164 | mulps %xmm14, %xmm2 |
165 | movups ATANHF_DATA(sPoly+0)(%rip), %xmm7 | |
166 | addps %xmm12, %xmm2 | |
167 | mulps %xmm2, %xmm7 | |
168 | ||
5aa7f304 SP |
169 | |
170 | /* Finally, halve the result and reincorporate the sign */ | |
fe1915d4 NG |
171 | addps ATANHF_DATA(sPoly+16)(%rip), %xmm7 |
172 | mulps %xmm2, %xmm7 | |
173 | addps ATANHF_DATA(sPoly+32)(%rip), %xmm7 | |
174 | mulps %xmm2, %xmm7 | |
175 | addps ATANHF_DATA(sPoly+48)(%rip), %xmm7 | |
176 | mulps %xmm2, %xmm7 | |
177 | addps ATANHF_DATA(sPoly+64)(%rip), %xmm7 | |
178 | mulps %xmm2, %xmm7 | |
179 | addps ATANHF_DATA(sPoly+80)(%rip), %xmm7 | |
180 | mulps %xmm2, %xmm7 | |
181 | addps ATANHF_DATA(sPoly+96)(%rip), %xmm7 | |
182 | mulps %xmm2, %xmm7 | |
183 | movaps ATANHF_DATA(sPoly+112)(%rip), %xmm6 | |
184 | addps %xmm6, %xmm7 | |
185 | mulps %xmm2, %xmm7 | |
186 | mulps %xmm2, %xmm7 | |
187 | mulps ATANHF_DATA(sLn2)(%rip), %xmm13 | |
188 | /* We can build `sHalf` with `sPoly & sOne`. */ | |
189 | andps %xmm4, %xmm6 | |
190 | orps %xmm1, %xmm3 | |
191 | xorps %xmm6, %xmm1 | |
5aa7f304 | 192 | |
fe1915d4 NG |
193 | addps %xmm2, %xmm7 |
194 | addps %xmm13, %xmm7 | |
195 | mulps %xmm7, %xmm1 | |
5aa7f304 | 196 | |
fe1915d4 NG |
197 | /* Finish check of NaNs. */ |
198 | cmpleps %xmm0, %xmm4 | |
199 | movmskps %xmm4, %edx | |
200 | cmpltps ATANHF_DATA(TinyRange)(%rip), %xmm0 | |
6dea4dd3 | 201 | |
fe1915d4 NG |
202 | andps %xmm0, %xmm3 |
203 | andnps %xmm1, %xmm0 | |
204 | orps %xmm3, %xmm0 | |
205 | ||
206 | testl %edx, %edx | |
207 | /* Go to special inputs processing branch. */ | |
208 | jne L(SPECIAL_VALUES_BRANCH) | |
209 | # LOE rbx rbp r12 r13 r14 r15 xmm0 | |
210 | /* No registers to restore on fast path. */ | |
5aa7f304 | 211 | ret |
6dea4dd3 | 212 | |
6dea4dd3 | 213 | |
fe1915d4 NG |
214 | /* Cold case. edx has 1s where there was a special value that |
215 | needs to be handled by an atanhf call. Optimize for code size |
216 | more so than speed here. */ | |
6dea4dd3 | 217 | L(SPECIAL_VALUES_BRANCH): |
fe1915d4 NG |
218 | # LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm5 |
219 | /* Stack coming in 16-byte aligned. Set 8-byte misaligned so on | |
220 | call entry will be 16-byte aligned. */ | |
221 | subq $56, %rsp | |
222 | cfi_def_cfa_offset(64) | |
223 | movups %xmm0, 24(%rsp) | |
224 | movups %xmm5, 40(%rsp) | |
225 | ||
226 | /* Use rbx/rbp for callee save registers as they get short | |
227 | encoding for many instructions (as compared with r12/r13). */ | |
228 | movq %rbx, (%rsp) | |
229 | cfi_offset(rbx, -64) | |
230 | movq %rbp, 8(%rsp) | |
231 | cfi_offset(rbp, -56) | |
232 | /* edx has 1s where there was a special value that needs to be handled | |
233 | by an atanhf call. */ |
234 | movl %edx, %ebx | |
6dea4dd3 | 235 | L(SPECIAL_VALUES_LOOP): |
fe1915d4 NG |
236 | # LOE rbx rbp r12 r13 r14 r15 |
237 | /* use rbp as index for special value that is saved across calls to | |
238 | atanhf. We technically don't need a callee save register here as offset |
239 | to rsp is always [0, 12] so we can restore rsp by realigning to 64. | |
240 | Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions | |
241 | in the loop. */ | |
242 | xorl %ebp, %ebp | |
243 | bsfl %ebx, %ebp | |
244 | ||
245 | /* Scalar math function call to process special input. */ |
246 | movss 40(%rsp, %rbp, 4), %xmm0 | |
5aa7f304 | 247 | call atanhf@PLT |
fe1915d4 NG |
248 | /* No good way to avoid the store-forwarding fault this will cause on |
249 | return. `lfence` avoids the SF fault but at greater cost as it | |
250 | serializes stack/callee save restoration. */ |
251 | movss %xmm0, 24(%rsp, %rbp, 4) | |
252 | ||
253 | leal -1(%rbx), %eax | |
254 | andl %eax, %ebx | |
255 | jnz L(SPECIAL_VALUES_LOOP) | |
256 | # LOE r12 r13 r14 r15 | |
257 | /* All results have been written to 24(%rsp). */ | |
258 | movups 24(%rsp), %xmm0 | |
259 | movq (%rsp), %rbx | |
260 | cfi_restore(rbx) | |
261 | movq 8(%rsp), %rbp | |
262 | cfi_restore(rbp) | |
263 | addq $56, %rsp | |
264 | cfi_def_cfa_offset(8) | |
265 | ret | |
6dea4dd3 SP |
266 | END(_ZGVbN4v_atanhf_sse4) |
267 | ||
5aa7f304 SP |
268 | .section .rodata, "a" |
269 | .align 16 | |
6dea4dd3 SP |
270 | |
271 | #ifdef __svml_satanh_data_internal_typedef | |
272 | typedef unsigned int VUINT32; | |
fe1915d4 | 273 | typedef struct{ |
5aa7f304 | 274 | __declspec(align(16)) VUINT32 sOne[4][1]; |
fe1915d4 NG |
275 | __declspec(align(16)) VUINT32 SgnMask[4][1]; |
276 | __declspec(align(16)) VUINT32 sTopMask12[4][1]; | |
5aa7f304 SP |
277 | __declspec(align(16)) VUINT32 iBrkValue[4][1]; |
278 | __declspec(align(16)) VUINT32 iOffExpoMask[4][1]; | |
fe1915d4 | 279 | __declspec(align(16)) VUINT32 sPoly[8][4][1]; |
5aa7f304 | 280 | __declspec(align(16)) VUINT32 sLn2[4][1]; |
fe1915d4 | 281 | __declspec(align(16)) VUINT32 TinyRange[4][1]; |
6dea4dd3 SP |
282 | } __svml_satanh_data_internal; |
283 | #endif | |
fe1915d4 | 284 | |
6dea4dd3 | 285 | __svml_satanh_data_internal: |
5aa7f304 SP |
286 | /* sOne = SP 1.0 */ |
287 | .align 16 | |
288 | .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 | |
fe1915d4 NG |
289 | /* SgnMask */ |
290 | .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff | |
291 | /* sTopMask12 */ | |
5aa7f304 | 292 | .align 16 |
fe1915d4 | 293 | .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000 |
5aa7f304 SP |
294 | /* iBrkValue = SP 2/3 */ |
295 | .align 16 | |
296 | .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab | |
fe1915d4 | 297 | /* iOffExpoMask = SP significand mask */
5aa7f304 SP |
298 | .align 16 |
299 | .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff | |
fe1915d4 NG |
300 | |
301 | /* sPoly[] = SP polynomial */ | |
5aa7f304 | 302 | .align 16 |
fe1915d4 NG |
303 | .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */ |
304 | .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */ | |
305 | .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */ | |
306 | .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */ | |
307 | .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */ | |
308 | .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */ | |
309 | .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */ | |
310 | .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */ | |
311 | ||
312 | /* sLn2 = SP ln(2) */ | |
5aa7f304 | 313 | .align 16 |
fe1915d4 | 314 | .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218 |
5aa7f304 SP |
315 | /* TinyRange */ |
316 | .align 16 | |
317 | .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000 | |
5aa7f304 SP |
318 | .align 16 |
319 | .type __svml_satanh_data_internal, @object | |
320 | .size __svml_satanh_data_internal, .-__svml_satanh_data_internal |