]>
Commit | Line | Data |
---|---|---|
c0f36fc3 | 1 | /* Function tanhf vectorized with AVX2. |
581c785b | 2 | Copyright (C) 2021-2022 Free Software Foundation, Inc. |
c0f36fc3 SP |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with the GNU C Library; if not, see | |
17 | <https://www.gnu.org/licenses/>. */ | |
18 | ||
19 | /* | |
20 | * ALGORITHM DESCRIPTION: | |
21 | * | |
22 | * NOTE: Since the hyperbolic tangent function is odd | |
23 | * (tanh(x) = -tanh(-x)), below algorithm deals with the absolute | |
24 | * value of the argument |x|: tanh(x) = sign(x) * tanh(|x|) | |
25 | * | |
26 | * We use a table lookup method to compute tanh(|x|). | |
27 | * The basic idea is to split the input range into a number of subintervals | |
28 | * and to approximate tanh(.) with a polynomial on each of them. | |
29 | * | |
30 | * IEEE SPECIAL CONDITIONS: | |
2c632117 | 31 | * x = [+, -]0, r = [+, -]0 |
c0f36fc3 SP |
32 | * x = +Inf, r = +1 |
33 | * x = -Inf, r = -1 | |
34 | * x = QNaN, r = QNaN | |
35 | * x = SNaN, r = QNaN | |
36 | * | |
37 | * | |
38 | * ALGORITHM DETAILS | |
39 | * We handle special values in a callout function, aside from main path | |
40 | * computations. "Special" for this algorithm are: | |
41 | * INF, NAN, |x| > HUGE_THRESHOLD | |
42 | * | |
43 | * | |
44 | * Main path computations are organized as follows: | |
45 | * Actually we split the interval [0, SATURATION_THRESHOLD) | |
46 | * into a number of subintervals. On each subinterval we approximate tanh(.) | |
47 | * with a minimax polynomial of pre-defined degree. Polynomial coefficients | |
48 | * are computed beforehand and stored in table. We also use | |
49 | * | |
50 | * y := |x| + B, | |
51 | * | |
52 | * here B depends on subinterval and is used to make argument | |
53 | * closer to zero. | |
54 | * We also add large fake interval [SATURATION_THRESHOLD, HUGE_THRESHOLD], | |
55 | * where 1.0 + 0.0*y + 0.0*y^2 ... coefficients are stored - just to | |
56 | * preserve main path computation logic but return 1.0 for all arguments. | |
57 | * | |
58 | * Hence reconstruction looks as follows: | |
59 | * we extract proper polynomial and range reduction coefficients | |
60 | * (Pj and B), corresponding to subinterval, to which |x| belongs, | |
61 | * and return | |
62 | * | |
63 | * r := sign(x) * (P0 + P1 * y + ... + Pn * y^n) | |
64 | * | |
65 | * NOTE: we use multiprecision technique to multiply and sum the first | |
66 | * K terms of the polynomial. So Pj, j = 0..K are stored in | |
67 | * table each as a pair of target precision numbers (Pj and PLj) to | |
68 | * achieve wider than target precision. | |
69 | * | |
70 | * | |
71 | */ | |
72 | ||
c0f36fc3 SP |
73 | #include <sysdep.h> |
74 | ||
bcc41f66 NG |
75 | /* tanhf data tables for avx2 and sse4 implementations defined here.
76 | */ | |
77 | #include "svml_s_tanhf_rodata.S" | |
78 | ||
2c632117 | 79 | .section .text.avx2, "ax", @progbits |
c0f36fc3 | 80 | ENTRY(_ZGVdN8v_tanhf_avx2) |
2c632117 | 81 | /* Here huge arguments, INF and NaNs are filtered out to callout. */ |
bcc41f66 NG |
82 | vpand TANHF_DATA(_iExpMantMask)(%rip), %ymm0, %ymm4 |
83 | vpsubd TANHF_DATA(_iMinIdxOfsMask)(%rip), %ymm4, %ymm2 | |
84 | ||
85 | /* Selection of arguments between [0, 0x04280000] into ymm2. */ | |
86 | vpxor %ymm3, %ymm3, %ymm3 | |
87 | vpmaxsd %ymm3, %ymm2, %ymm2 | |
88 | vpminsd TANHF_DATA(_iMaxIdxMask)(%rip), %ymm2, %ymm2 | |
c0f36fc3 | 89 | |
2c632117 SP |
90 | /* |
91 | * small table specific variables * | |
92 | * Constant loading | |
93 | */ | |
bcc41f66 NG |
94 | vpsrld $14, %ymm2, %ymm1 |
95 | ||
96 | /* We are splitting xmm1 into 8 GPRs. This may be faster to do with | |
97 | store/load as we can take advantage of store-forwarding. */ | |
98 | vmovq %xmm1, %r8 | |
99 | /* We have eliminated all negative values for ymm1 so no need to sign | |
100 | extend. */ | |
101 | movl %r8d, %r9d | |
102 | shrq $32, %r8 | |
103 | ||
104 | /* Store base of lookup table in rax. */ | |
105 | leaq TANHF_DATA(_lookupTable)(%rip), %rax | |
106 | ||
107 | /* Instead of using cross-lane permutes on ymm vectors, use vpinsertf128 | |
108 | with memory operand. This helps alleviate bottleneck on p5. */ | |
109 | vmovupd 16(%r9, %rax), %xmm5 | |
110 | ||
111 | vpextrq $1, %xmm1, %rsi | |
112 | movl %esi, %edi | |
113 | shrq $32, %rsi | |
114 | ||
115 | vinsertf128 $1, 16(%rdi, %rax), %ymm5, %ymm5 | |
116 | ||
117 | vextracti128 $1, %ymm1, %xmm2 | |
118 | vmovq %xmm2, %rdx | |
119 | movl %edx, %ecx | |
120 | shrq $32, %rdx | |
121 | ||
122 | vmovupd (%rcx, %rax), %xmm6 | |
123 | ||
124 | vpextrq $1, %xmm2, %r10 | |
125 | movl %r10d, %r11d | |
126 | shrq $32, %r10 | |
127 | ||
128 | vinsertf128 $1, (%r11, %rax), %ymm6, %ymm6 | |
129 | ||
130 | vmovupd 16(%r8, %rax), %xmm1 | |
131 | vinsertf128 $1, 16(%rsi, %rax), %ymm1, %ymm1 | |
132 | vmovupd (%rdx, %rax), %xmm3 | |
133 | vinsertf128 $1, (%r10, %rax), %ymm3, %ymm3 | |
134 | ||
135 | vunpcklpd %ymm3, %ymm6, %ymm7 | |
2c632117 | 136 | vunpckhpd %ymm3, %ymm6, %ymm6 |
c0f36fc3 | 137 | |
bcc41f66 NG |
138 | vunpcklpd %ymm1, %ymm5, %ymm3 |
139 | vunpckhpd %ymm1, %ymm5, %ymm1 | |
c0f36fc3 | 140 | |
bcc41f66 NG |
141 | vmovaps TANHF_DATA(_sAbsMask)(%rip), %ymm11 |
142 | /* Store special cases in ymm15. */ | |
143 | vpcmpgtd TANHF_DATA(_iExpMask)(%rip), %ymm4, %ymm15 | |
c0f36fc3 | 144 | |
bcc41f66 | 145 | vandps %ymm11, %ymm0, %ymm4 |
c0f36fc3 | 146 | |
bcc41f66 | 147 | vcvtps2pd %xmm4, %ymm5 |
c0f36fc3 | 148 | |
bcc41f66 NG |
149 | vextractf128 $1, %ymm4, %xmm4 |
150 | vcvtps2pd %xmm4, %ymm4 | |
c0f36fc3 | 151 | |
bcc41f66 NG |
152 | vmovupd 16(%rcx, %rax), %xmm2 |
153 | vinsertf128 $1, 16(%r11, %rax), %ymm2, %ymm2 | |
c0f36fc3 | 154 | |
bcc41f66 NG |
155 | vfmadd213pd %ymm3, %ymm5, %ymm1 |
156 | ||
157 | vmovupd 16(%rdx, %rax), %xmm3 | |
158 | vinsertf128 $1, 16(%r10, %rax), %ymm3, %ymm3 | |
159 | ||
160 | vunpcklpd %ymm3, %ymm2, %ymm10 | |
161 | vunpckhpd %ymm3, %ymm2, %ymm2 | |
162 | ||
163 | vfmadd213pd %ymm10, %ymm4, %ymm2 | |
164 | vfmadd213pd %ymm6, %ymm4, %ymm2 | |
165 | vfmadd213pd %ymm7, %ymm4, %ymm2 | |
166 | vcvtpd2ps %ymm2, %xmm2 | |
167 | ||
168 | vmovupd (%r9, %rax), %xmm7 | |
169 | vinsertf128 $1, (%rdi, %rax), %ymm7, %ymm7 | |
170 | ||
171 | vmovupd (%r8, %rax), %xmm3 | |
172 | vinsertf128 $1, (%rsi, %rax), %ymm3, %ymm3 | |
173 | ||
174 | vunpckhpd %ymm3, %ymm7, %ymm4 | |
175 | vunpcklpd %ymm3, %ymm7, %ymm7 | |
c0f36fc3 | 176 | |
bcc41f66 NG |
177 | vfmadd213pd %ymm4, %ymm5, %ymm1 |
178 | vfmadd213pd %ymm7, %ymm5, %ymm1 | |
179 | ||
180 | ||
181 | vcvtpd2ps %ymm1, %xmm1 | |
182 | vinsertf128 $1, %xmm2, %ymm1, %ymm1 | |
183 | ||
184 | vmovmskps %ymm15, %edx | |
185 | vandnps %ymm0, %ymm11, %ymm2 | |
186 | testl %edx, %edx | |
187 | /* Go to special inputs processing branch */ | |
188 | jne L(SPECIAL_VALUES_BRANCH) | |
189 | # LOE rbx r12 r13 r14 r15 ymm0 ymm1 ymm2 | |
190 | /* Wait until after branch of write over ymm0. */ | |
191 | vorps %ymm2, %ymm1, %ymm0 | |
192 | /* No stack restoration on the fastpath. */ | |
193 | ret | |
c0f36fc3 | 194 | |
c0f36fc3 | 195 | |
bcc41f66 NG |
196 | /* Cold case. edx has 1s where there was a special value that |
197 | needs to be handled by a tanhf call. Optimize for code size | |
198 | more so than speed here. */ | |
199 | L(SPECIAL_VALUES_BRANCH): | |
200 | # LOE rbx rdx r12 r13 r14 r15 ymm0 ymm1 ymm2 | |
201 | /* Use r13 to save/restore the stack. This allows us to use rbp as | |
202 | callee save register saving code size. */ | |
203 | pushq %r13 | |
204 | cfi_adjust_cfa_offset(8) | |
205 | cfi_offset(r13, -16) | |
206 | /* Need to callee save registers to preserve state across tanhf calls. | |
2c632117 | 207 | */ |
bcc41f66 NG |
208 | pushq %rbx |
209 | cfi_adjust_cfa_offset(8) | |
210 | cfi_offset(rbx, -24) | |
211 | pushq %rbp | |
212 | cfi_adjust_cfa_offset(8) | |
213 | cfi_offset(rbp, -32) | |
214 | movq %rsp, %r13 | |
215 | cfi_def_cfa_register(r13) | |
216 | ||
217 | /* Align stack and make room for 2x ymm vectors. */ | |
218 | andq $-32, %rsp | |
219 | addq $-64, %rsp | |
220 | ||
221 | /* Save all already computed inputs. */ | |
222 | vorps %ymm2, %ymm1, %ymm1 | |
223 | vmovaps %ymm1, (%rsp) | |
224 | /* Save original input (ymm0 unchanged up to this point). */ | |
225 | vmovaps %ymm0, 32(%rsp) | |
226 | ||
227 | vzeroupper | |
c0f36fc3 | 228 | |
bcc41f66 NG |
229 | /* edx has 1s where there was a special value that needs to be handled |
230 | by a tanhf call. */ | |
231 | movl %edx, %ebx | |
c0f36fc3 | 232 | L(SPECIAL_VALUES_LOOP): |
bcc41f66 NG |
233 | # LOE rbx rbp r12 r13 r14 r15 |
234 | /* use rbp as index for special value that is saved across calls to | |
235 | tanhf. We technically don't need a callee save register here as offset | |
236 | to rsp is always [0, 28] so we can restore rsp by realigning to 64. | |
237 | Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions | |
238 | in the loop. Realigning also costs more code size. */ | |
239 | xorl %ebp, %ebp | |
240 | tzcntl %ebx, %ebp | |
c0f36fc3 | 241 | |
bcc41f66 | 242 | /* Scalar math function call to process special input. */ |
3079f652 | 243 | vmovss 32(%rsp, %rbp, 4), %xmm0 |
2c632117 | 244 | call tanhf@PLT |
c0f36fc3 | 245 | |
bcc41f66 NG |
246 | /* No good way to avoid the store-forwarding fault this will cause on |
247 | return. `lfence` avoids the SF fault but at greater cost as it | |
248 | serialized stack/callee save restoration. */ | |
3079f652 | 249 | vmovss %xmm0, (%rsp, %rbp, 4) |
bcc41f66 NG |
250 | |
251 | blsrl %ebx, %ebx | |
252 | jnz L(SPECIAL_VALUES_LOOP) | |
253 | # LOE r12 r13 r14 r15 | |
c0f36fc3 | 254 | |
c0f36fc3 | 255 | |
bcc41f66 NG |
256 | /* All results have been written to (%rsp). */ |
257 | vmovups (%rsp), %ymm0 | |
258 | /* Restore rsp. */ | |
259 | movq %r13, %rsp | |
260 | cfi_def_cfa_register(rsp) | |
261 | /* Restore callee save registers. */ | |
262 | popq %rbp | |
263 | cfi_adjust_cfa_offset(-8) | |
264 | cfi_restore(rbp) | |
265 | popq %rbx | |
266 | cfi_adjust_cfa_offset(-8) | |
267 | cfi_restore(rbp) | |
268 | popq %r13 | |
269 | cfi_adjust_cfa_offset(-8) | |
270 | cfi_restore(r13) | |
271 | ret | |
272 | END(_ZGVdN8v_tanhf_avx2) |