/* Function atanhf vectorized with AVX2.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
/*
 * ALGORITHM DESCRIPTION:
 *
 *    Compute atanh(x) as 0.5 * log((1 + x)/(1 - x))
 *
 *    Special cases:
 *
 *    atanh(x) = NaN if |x| > 1, or if x is a NaN or INF
 *
 */
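/* A rough scalar C sketch of the computation below (illustrative
   only, not part of this file's build; atanhf_sketch is a
   hypothetical name).  The vector code follows the same outline but
   replaces the division with a compensated reciprocal refinement:

	#include <math.h>

	static float
	atanhf_sketch (float x)
	{
	  float ax = fabsf (x);	// work on |x|; sign restored at the end
	  float u = 1.0f - ax;	// U = 1 - X
	  float v = 2.0f * ax;	// V = 2 * X
	  // atanh(x) = 0.5 * log((1 + x)/(1 - x)) = 0.5 * log1p(V / U)
	  return copysignf (0.5f * log1pf (v / u), x);
	}
 */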
#include <sysdep.h>

/* Offsets for data table __svml_satanh_data_internal.  Ordered by
   use in the function.  On cold-starts this might help the
   prefetcher.  Possibly a better idea is to interleave start/end so
   that the prefetcher is less likely to detect a stream and pull
   irrelevant lines into cache.  */
#define SgnMask			0
#define sOne			32
#define sTopMask12		64
#define TinyRange		96
#define iBrkValue		128
#define iOffExpoMask		160
#define sPoly			192
#define sLn2			448
#define sHalf			480

#define ATANHF_DATA(x)		((x)+__svml_satanh_data_internal)
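/* For example, ATANHF_DATA(sPoly + 32)(%rip) expands to
   ((sPoly + 32) + __svml_satanh_data_internal)(%rip), i.e. a
   RIP-relative reference 32 bytes into sPoly in the table at the end
   of this file.  */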
	.section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN8v_atanhf_avx2)
	/* Strip off the sign, so treat X as positive until right at the
	   end.  */
	vmovaps	ATANHF_DATA(SgnMask)(%rip), %ymm2
	vandps	%ymm2, %ymm0, %ymm3
	/* Load constants including One = 1.  */
	vmovups	ATANHF_DATA(sOne)(%rip), %ymm5
	vsubps	%ymm3, %ymm5, %ymm1
	vmovups	ATANHF_DATA(sTopMask12)(%rip), %ymm4

	/* Approximate reciprocal R of U = 1 - |X|.  */
	vrcpps	%ymm1, %ymm7
	vsubps	%ymm1, %ymm5, %ymm9
	vandps	%ymm4, %ymm7, %ymm6
	vsubps	%ymm3, %ymm9, %ymm7

	/* No need to split sU when FMA is available.  */
	vfnmadd213ps %ymm5, %ymm6, %ymm1
	/* Keep a copy of the original input; ymm8 is needed for the sign
	   and the special-value path.  */
	vmovaps	%ymm0, %ymm8
	/* Tiny-range result: x + x * x, which rounds to atanh(x) = x for
	   |x| below TinyRange.  */
	vfmadd213ps %ymm0, %ymm0, %ymm0
	vfnmadd231ps %ymm6, %ymm7, %ymm1

	/* Check whether |X| < 1, in which case we use the main function.
	   Otherwise set the rangemask so that the callout will get used.
	   Note that this will also use the callout for NaNs since
	   not(NaN < 1).  */
	vcmpnlt_uqps %ymm5, %ymm3, %ymm14
	vcmplt_oqps ATANHF_DATA(TinyRange)(%rip), %ymm3, %ymm15
	/*
	 * Compute V = 2 * X trivially, and UHi + ULo = 1 - X in two
	 * pieces, the upper part UHi being <= 12 bits long.  Then we have
	 * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
	 */
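	/* Why log1p: (1 + X)/(1 - X) = ((1 - X) + 2X)/(1 - X) = 1 + V/U
	   with V = 2*X and U = 1 - X, so
	   log((1 + X)/(1 - X)) = log1p(V/U).  */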
	vaddps	%ymm3, %ymm3, %ymm3

	/*
	 * Split V as well into upper 12 bits and lower part, so that we can
	 * get a preliminary quotient estimate without rounding error.
	 */
	vandps	%ymm4, %ymm3, %ymm4
	vsubps	%ymm4, %ymm3, %ymm7
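	/* With R and VHi both limited to 12 significant bits by
	   sTopMask12, the product R * VHi needs at most 24 bits and so
	   fits exactly in a single-precision significand; QHi below
	   therefore carries no rounding error.  */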
	/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo.  */
	vmulps	%ymm4, %ymm6, %ymm4

	/* Compute D = E + E^2.  */
	vfmadd213ps %ymm1, %ymm1, %ymm1

	/* Record the sign for eventual reincorporation.  */
	vandnps	%ymm8, %ymm2, %ymm3

	/* Or the sign bit in with the tiny result to handle atanh(-0)
	   correctly.  */
	vorps	%ymm3, %ymm0, %ymm13
	vmulps	%ymm7, %ymm6, %ymm2
	/*
	 * Compute R * (VHi + VLo) * (1 + E + E^2)
	 * = R * (VHi + VLo) * (1 + D)
	 * = QHi + (QHi * D + QLo + QLo * D)
	 */

	/*
	 * If less precision is acceptable the `vmulps %ymm1, %ymm4, %ymm6;
	 * vaddps %ymm1, %ymm6, %ymm1` pair can be replaced with
	 * `vfmadd231ps %ymm1, %ymm4, %ymm4`.
	 */
	vmulps	%ymm1, %ymm4, %ymm6
	vfmadd213ps %ymm2, %ymm2, %ymm1
	vaddps	%ymm1, %ymm6, %ymm1
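	/* In the three instructions above: ymm6 = QHi * D, then
	   ymm1 = QLo * D + QLo (FMA), then
	   ymm1 = QHi * D + (QLo * D + QLo).  QHi itself (ymm4) is folded
	   in by the compensated summation just below.  */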
	/*
	 * Now finally accumulate the high and low parts of the
	 * argument to log1p, H + L, with a final compensated summation.
	 */
	vaddps	%ymm1, %ymm4, %ymm2

	/* reduction: compute r, n.  */
	vmovups	ATANHF_DATA(iBrkValue)(%rip), %ymm9

	/*
	 * Now we feed into the log1p code, using H in place of _VARG1 and
	 * later incorporating L into the reduced argument.
	 * compute 1+x as high, low parts.
	 */
	vmaxps	%ymm2, %ymm5, %ymm0
	vminps	%ymm2, %ymm5, %ymm6

	/* This is needed for rounding (see `vaddps %ymm1, %ymm4, %ymm2`).  */
	vsubps	%ymm2, %ymm4, %ymm2
	vaddps	%ymm6, %ymm0, %ymm4
	vpsubd	%ymm9, %ymm4, %ymm7
	vsubps	%ymm4, %ymm0, %ymm4
	vaddps	%ymm2, %ymm1, %ymm2
	vmovaps	ATANHF_DATA(iOffExpoMask)(%rip), %ymm1

	vandps	%ymm1, %ymm7, %ymm0
	vaddps	%ymm4, %ymm6, %ymm4
	vandnps	%ymm7, %ymm1, %ymm6
	vmovups	ATANHF_DATA(sPoly+0)(%rip), %ymm1
	vpaddd	%ymm9, %ymm0, %ymm0
	vaddps	%ymm4, %ymm2, %ymm4
	vpsubd	%ymm6, %ymm5, %ymm6
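	/* Sketch of the reduction above, operating on the bit pattern of
	   1 + H: subtracting iBrkValue (2/3) in the integer domain and
	   masking with the significand mask splits 1 + H into 2^n * m
	   with m in [2/3, 4/3); n comes from the exponent field
	   (vpsrad $23 below) and r = m - 1 is the reduced polynomial
	   argument.  */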
	/* polynomial evaluation.  */
	vsubps	%ymm5, %ymm0, %ymm2
	vfmadd231ps %ymm4, %ymm6, %ymm2
	vfmadd213ps ATANHF_DATA(sPoly+32)(%rip), %ymm2, %ymm1
	vfmadd213ps ATANHF_DATA(sPoly+64)(%rip), %ymm2, %ymm1
	vfmadd213ps ATANHF_DATA(sPoly+96)(%rip), %ymm2, %ymm1
	vfmadd213ps ATANHF_DATA(sPoly+128)(%rip), %ymm2, %ymm1
	vfmadd213ps ATANHF_DATA(sPoly+160)(%rip), %ymm2, %ymm1
	vfmadd213ps ATANHF_DATA(sPoly+192)(%rip), %ymm2, %ymm1
	vfmadd213ps ATANHF_DATA(sPoly+224)(%rip), %ymm2, %ymm1

	vmulps	%ymm1, %ymm2, %ymm1
	vfmadd213ps %ymm2, %ymm2, %ymm1
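	/* Horner evaluation: after the FMA chain ymm1 holds
	   P7*r^7 + ... + P1*r + P0, and the final mul + FMA form
	   r + r^2 * (P0 + P1*r + ... + P7*r^7) ~= log1p(r) for r in
	   [-1/3, 1/3).  */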
	/* final reconstruction.  */
	vpsrad	$23, %ymm7, %ymm6
	vcvtdq2ps %ymm6, %ymm2
	vfmadd132ps ATANHF_DATA(sLn2)(%rip), %ymm1, %ymm2
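	/* ymm2 = n * ln(2) + log1p(r) = log(1 + H), i.e.
	   log((1 + X)/(1 - X)) up to the final halving.  */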
	/* Finally, halve the result and reincorporate the sign.  */
	vxorps	ATANHF_DATA(sHalf)(%rip), %ymm3, %ymm3
	vmulps	%ymm2, %ymm3, %ymm2
	vmovmskps %ymm14, %edx
	testl	%edx, %edx

	vblendvps %ymm15, %ymm13, %ymm2, %ymm0
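	/* ymm15 flags lanes with |X| below TinyRange; those lanes take
	   the x + x^2 result (ymm13) instead of the log1p path (ymm2).  */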
	/* Go to special inputs processing branch.  */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx rdx r12 r13 r14 r15 ymm0
	/* No registers to restore on fast path.  */
	ret
	/* Cold case.  edx has 1s where there was a special value that
	   needs to be handled by an atanhf call.  Optimize for code size
	   more so than speed here.  */
L(SPECIAL_VALUES_BRANCH):
	# LOE rbx rdx r12 r13 r14 r15 ymm0 ymm8
	/* Use r13 to save/restore the stack.  This allows us to use rbp
	   as callee save register saving code size.  */
	pushq	%r13
	cfi_adjust_cfa_offset(8)
	cfi_offset(r13, -16)
	/* Need to callee save registers to preserve state across atanhf
	   calls.  */
	pushq	%rbx
	cfi_adjust_cfa_offset(8)
	cfi_offset(rbx, -24)
	pushq	%rbp
	cfi_adjust_cfa_offset(8)
	cfi_offset(rbp, -32)
	movq	%rsp, %r13
	cfi_def_cfa_register(r13)

	/* Align stack and make room for 2x ymm vectors.  */
	andq	$-32, %rsp
	subq	$64, %rsp

	/* Save all already computed inputs.  */
	vmovups	%ymm0, (%rsp)
	/* Save original input (ymm8 unchanged up to this point).  */
	vmovups	%ymm8, 32(%rsp)

	vzeroupper
	/* edx has 1s where there was a special value that needs to be
	   handled by an atanhf call.  */
	movl	%edx, %ebx
L(SPECIAL_VALUES_LOOP):
	# LOE rbx rbp r12 r13 r14 r15
	/* Use rbp as index for the special value that is saved across
	   calls to atanhf.  We technically don't need a callee save
	   register here as the offset to rsp is always [0, 28] so we
	   could restore rsp by realigning to 64.  Essentially the
	   tradeoff is 1 extra save/restore vs 2 extra instructions in
	   the loop.  Realigning also costs more code size.  */
	tzcntl	%ebx, %ebp

	/* Scalar math function call to process special input.  */
	vmovss	32(%rsp, %rbp, 4), %xmm0
	call	atanhf@PLT

	/* No good way to avoid the store-forwarding fault this will
	   cause on return.  `lfence` avoids the SF fault but at greater
	   cost as it serializes stack/callee save restoration.  */
	vmovss	%xmm0, (%rsp, %rbp, 4)

	/* Clear the lowest set bit; loop while special lanes remain.  */
	blsrl	%ebx, %ebx
	jnz	L(SPECIAL_VALUES_LOOP)
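	/* Example: if edx = 0b00100010, the first iteration handles
	   lane 1 (tzcnt), blsr clears that bit leaving 0b00100000, the
	   second iteration handles lane 5, and blsr then yields 0 so
	   jnz falls through.  */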
	# LOE r12 r13 r14 r15

	/* All results have been written to (%rsp).  */
	vmovups	(%rsp), %ymm0

	movq	%r13, %rsp
	cfi_def_cfa_register(rsp)
	/* Restore callee save registers.  */
	popq	%rbp
	cfi_adjust_cfa_offset(-8)
	cfi_restore(rbp)
	popq	%rbx
	cfi_adjust_cfa_offset(-8)
	cfi_restore(rbx)
	popq	%r13
	cfi_adjust_cfa_offset(-8)
	cfi_restore(r13)
	ret
END(_ZGVdN8v_atanhf_avx2)
	.section .rodata, "a"
	.align	32

#ifdef __svml_satanh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(32)) VUINT32 SgnMask[8][1];
	__declspec(align(32)) VUINT32 sOne[8][1];
	__declspec(align(32)) VUINT32 sTopMask12[8][1];
	__declspec(align(32)) VUINT32 TinyRange[8][1];
	__declspec(align(32)) VUINT32 iBrkValue[8][1];
	__declspec(align(32)) VUINT32 iOffExpoMask[8][1];
	__declspec(align(32)) VUINT32 sPoly[8][8][1];
	__declspec(align(32)) VUINT32 sLn2[8][1];
	__declspec(align(32)) VUINT32 sHalf[8][1];
} __svml_satanh_data_internal;
#endif

__svml_satanh_data_internal:
	/* SgnMask */
	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
	/* sOne = SP 1.0 */
	.align	32
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	/* sTopMask12 */
	.align	32
	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
	/* TinyRange */
	.align	32
	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
	/* iBrkValue = SP 2/3 */
	.align	32
	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
	/* iOffExpoMask = SP significand mask */
	.align	32
	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
	/* sPoly[] = SP polynomial */
	.align	32
	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed
	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3
	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12
	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37
	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190
	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e
	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94
	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
	/* sLn2 = SP ln(2) */
	.align	32
	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
	/* sHalf */
	.align	32
	.long	0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
	.long	0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
	.type	__svml_satanh_data_internal, @object
	.size	__svml_satanh_data_internal, .-__svml_satanh_data_internal