]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
Add freopen special-case tests: chroot, EFBIG, stdin/stdout/stderr
[thirdparty/glibc.git] / sysdeps / x86_64 / fpu / multiarch / svml_s_tanhf16_core_avx512.S
CommitLineData
c0f36fc3 1/* Function tanhf vectorized with AVX-512.
dff8da6b 2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
c0f36fc3
SP
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 * NOTE: Since the hyperbolic tangent function is odd
23 * (tanh(x) = -tanh(-x)), below algorithm deals with the absolute
24 * value of the argument |x|: tanh(x) = sign(x) * tanh(|x|)
25 *
26 * We use a table lookup method to compute tanh(|x|).
27 * The basic idea is to split the input range into a number of subintervals
28 * and to approximate tanh(.) with a polynomial on each of them.
29 *
30 * IEEE SPECIAL CONDITIONS:
638d6a55 31 * x = [+, -]0, r = [+, -]0
c0f36fc3
SP
32 * x = +Inf, r = +1
33 * x = -Inf, r = -1
34 * x = QNaN, r = QNaN
35 * x = SNaN, r = QNaN
36 *
37 *
38 * ALGORITHM DETAILS
39 * We handle special values in a callout function, aside from main path
40 * computations. "Special" for this algorithm are:
41 * INF, NAN, |x| > HUGE_THRESHOLD
42 *
43 *
44 * Main path computations are organized as follows:
45 * Actually we split the interval [0, SATURATION_THRESHOLD)
46 * into a number of subintervals. On each subinterval we approximate tanh(.)
47 * with a minimax polynomial of pre-defined degree. Polynomial coefficients
48 * are computed beforehand and stored in table. We also use
49 *
50 * y := |x| + B,
51 *
52 * here B depends on subinterval and is used to make argument
53 * closer to zero.
54 * We also add large fake interval [SATURATION_THRESHOLD, HUGE_THRESHOLD],
55 * where 1.0 + 0.0*y + 0.0*y^2 ... coefficients are stored - just to
56 * preserve main path computation logic but return 1.0 for all arguments.
57 *
58 * Hence reconstruction looks as follows:
59 * we extract proper polynomial and range reduction coefficients
60 * (Pj and B), corresponding to subinterval, to which |x| belongs,
61 * and return
62 *
63 * r := sign(x) * (P0 + P1 * y + ... + Pn * y^n)
64 *
65 * NOTE: we use multiprecision technique to multiply and sum the first
66 * K terms of the polynomial. So Pj, j = 0..K are stored in
67 * table each as a pair of target precision numbers (Pj and PLj) to
68 * achieve wider than target precision.
69 *
70 *
71 */
72
e560b3c2
NG
73/* Offsets for data table __svml_stanh_data_internal_avx512. Ordered
74 by use in the function. On cold-starts this might help the
75 prefetcher. Possibly a better idea is to interleave start/end so
76 that the prefetcher is less likely to detect a stream and pull
77 irrelevant lines into cache. */
78
79/* Offsets for data table __svml_stanh_data_internal. 4 bytes each.
c0f36fc3 80 */
e560b3c2
NG
81#define _iExpMantMask_UISA 0
82#define _iMinIdxOfsMask_UISA 4
83#define _iMaxIdxMask_UISA 8
84#define _iExpMask 12
85
86/* Offsets for data table __svml_stanh_data_internal_al64. 64 bytes
87 each. */
88#define _sC_lo 0
89#define _sC_hi 64
90#define _sP7_lo 128
91#define _sP7_hi 192
92#define _sSignMask 256
93#define _sP6_lo 320
94#define _sP6_hi 384
95#define _sP5_lo 448
96#define _sP5_hi 512
97#define _sP4_lo 576
98#define _sP4_hi 640
99#define _sP3_lo 704
100#define _sP3_hi 768
101#define _sP2_lo 832
102#define _sP2_hi 896
103#define _sP0_lo 960
104#define _sP0_hi 1024
c0f36fc3
SP
105
106#include <sysdep.h>
e560b3c2
NG
107#define TANHF_DATA(x) ((x)+__svml_stanh_data_internal_al64)
108#define TANHF_DATA_UNALIGNED(x) ((x)+__svml_stanh_data_internal)
c0f36fc3 109
95177b78 110 .section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN16v_tanhf_skx)
	/* tanhf over 16 packed single-precision lanes.

	   In:   zmm0 = x.
	   Out:  zmm0 = per-lane result; its sign bit is taken from x.

	   Fast path (no call): writes zmm1-zmm10, k1, edx and flags.
	   Cold path: additionally calls scalar tanhf once per flagged
	   lane, so every caller-saved register may be clobbered there.

	   Register roles on the fast path:
	     zmm1  masked exponent/mantissa bits of x, later y = |x| - C
	     zmm2  index scratch, later the Horner accumulator
	     zmm3  per-lane table index (0..31) consumed by vpermt2ps
	     zmm4  sign mask (0x80000000 per lane)
	     k1    lanes that must be recomputed by scalar tanhf.  */

	/* Here huge arguments, INF and NaNs are filtered out to callout.  */
	vpandd	TANHF_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
	vpsubd	TANHF_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2

	/* Clamp (signed) the biased value into [0, 0x03e00000] in zmm3.  */
	vpxord	%zmm3, %zmm3, %zmm3
	vpmaxsd	%zmm3, %zmm2, %zmm3
	vpminsd	TANHF_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3

	/* Setup permute indices in zmm3: after the shift each lane holds
	   a value in [0, 31].  */
	vpsrld	$21, %zmm3, %zmm3

	/* k1 = lanes whose masked bits are greater than _iExpMask
	   (predicate 6 = signed "not less-or-equal").  These are the
	   special cases.  */
	vpcmpd	$6, TANHF_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1

	/* Each vmovaps/vpermt2ps pair below is a 32-entry lookup: the
	   register holds table entries 0-15, the memory operand entries
	   16-31, and zmm3 selects one entry per lane.  */
	vmovaps	TANHF_DATA(_sC_lo)(%rip), %zmm5
	vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5

	vmovaps	TANHF_DATA(_sP7_lo)(%rip), %zmm2
	vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2

	/* zmm1 = |x| (sign bit cleared), then y = |x| - C.  */
	vmovaps	TANHF_DATA(_sSignMask)(%rip), %zmm4
	vandnps	%zmm0, %zmm4, %zmm1
	vsubps	{rn-sae}, %zmm5, %zmm1, %zmm1

	vmovaps	TANHF_DATA(_sP6_lo)(%rip), %zmm5
	vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5

	vmovaps	TANHF_DATA(_sP5_lo)(%rip), %zmm6
	vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6

	/* Horner evaluation: zmm2 = zmm2 * y + next coefficient.  */
	vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
	vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2

	vmovaps	TANHF_DATA(_sP4_lo)(%rip), %zmm7
	vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7

	vmovaps	TANHF_DATA(_sP3_lo)(%rip), %zmm8
	vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8

	vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
	vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2

	vmovaps	TANHF_DATA(_sP2_lo)(%rip), %zmm9
	vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9

	vmovaps	TANHF_DATA(_sP0_lo)(%rip), %zmm10
	vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10

	vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
	vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2

	kmovw	%k1, %edx
	testl	%edx, %edx

	/* Go to special inputs processing branch.  */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 zmm0 zmm2 zmm4
	/* zmm0 is only overwritten after the branch because the special
	   path still needs the original input.  Imm 0xec computes
	   zmm2 | (zmm0 & zmm4), i.e. the result with the sign bit of x.  */
	vpternlogd $0xec, %zmm4, %zmm2, %zmm0

	/* No stack restoration on the fastpath.  */
	ret

	/* Cold case.  edx has 1s where there was a special value that
	   needs to be handled by a tanhf call.  Optimize for code size
	   more so than speed here.  */
L(SPECIAL_VALUES_BRANCH):
	# LOE rbx rdx r12 r13 r14 r15 zmm0 zmm2 zmm4
	/* r13 keeps the pre-alignment rsp so it can be restored later;
	   that leaves rbp free as an ordinary callee-saved register.  */
	pushq	%r13
	cfi_adjust_cfa_offset(8)
	cfi_offset(r13, -16)
	/* rbx and rbp hold state that must survive the tanhf calls.  */
	pushq	%rbx
	cfi_adjust_cfa_offset(8)
	cfi_offset(rbx, -24)
	pushq	%rbp
	cfi_adjust_cfa_offset(8)
	cfi_offset(rbp, -32)
	movq	%rsp, %r13
	cfi_def_cfa_register(r13)

	/* Align stack to 64 bytes and make room for 2x zmm vectors:
	   (%rsp) holds the results, 64(%rsp) the original inputs.  */
	andq	$-64, %rsp
	addq	$-128, %rsp

	/* Save original input (zmm0 unchanged up to this point).  */
	vmovaps	%zmm0, 64(%rsp)
	/* Save all already computed results (sign applied as on the
	   fast path).  */
	vpternlogd $0xec, %zmm4, %zmm2, %zmm0
	vmovaps	%zmm0, (%rsp)

	vzeroupper

	/* edx has 1s where there was a special value that needs to be
	   handled by a tanhf call.  Keep the mask in callee-saved ebx.  */
	movl	%edx, %ebx
L(SPECIAL_VALUES_LOOP):
	# LOE rbx rbp r12 r13 r14 r15
	/* rbp = index of the lowest flagged lane.  It lives in a
	   callee-saved register because it is needed again after the
	   call.  The xor ahead of tzcnt presumably breaks a false
	   dependency on the destination (NOTE(review): confirm).  */
	xorl	%ebp, %ebp
	tzcntl	%ebx, %ebp

	/* Scalar math function call to process special input.  */
	vmovss	64(%rsp, %rbp, 4), %xmm0
	call	tanhf@PLT

	/* No good way to avoid the store-forwarding fault this will cause
	   on return.  `lfence` avoids the SF fault but at greater cost as
	   it serializes stack/callee save restoration.  */
	vmovss	%xmm0, (%rsp, %rbp, 4)

	/* Clear the lane just handled; loop while any remain.  */
	blsrl	%ebx, %ebx
	jnz	L(SPECIAL_VALUES_LOOP)
	# LOE r12 r13 r14 r15

	/* All results have been written to (%rsp).  */
	vmovaps	(%rsp), %zmm0
	/* Restore rsp.  */
	movq	%r13, %rsp
	cfi_def_cfa_register(rsp)
	/* Restore callee save registers, in reverse push order.  Each
	   cfi_restore must name the register that was just popped.  */
	popq	%rbp
	cfi_adjust_cfa_offset(-8)
	cfi_restore(rbp)
	popq	%rbx
	cfi_adjust_cfa_offset(-8)
	cfi_restore(rbx)
	popq	%r13
	cfi_adjust_cfa_offset(-8)
	cfi_restore(r13)
	ret
END(_ZGVeN16v_tanhf_skx)
253
638d6a55 254 .section .rodata, "a"
e560b3c2 255 .align 16
c0f36fc3
SP
/* Reference-only C view of the data table emitted further down.  The
   declaration is compiled only when __svml_stanh_data_internal_typedef
   is defined, which nothing visible in this file does (NOTE(review):
   confirm no included header defines it).  Field order follows the
   offset macros: four 4-byte scalars, then seventeen 64-byte-aligned
   arrays of 16 dwords each.  */
256#ifdef __svml_stanh_data_internal_typedef
257typedef unsigned int VUINT32;
e560b3c2
NG
258typedef struct
259 {
260 __declspec(align(4)) VUINT32 _iExpMantMask_UISA[1][1];
261 __declspec(align(4)) VUINT32 _iMinIdxOfsMask_UISA[1][1];
262 __declspec(align(4)) VUINT32 _iMaxIdxMask_UISA[1][1];
263 __declspec(align(4)) VUINT32 _iExpMask[1][1];
264 __declspec(align(64)) VUINT32 _sC_lo[16][1];
265 __declspec(align(64)) VUINT32 _sC_hi[16][1];
266 __declspec(align(64)) VUINT32 _sP7_lo[16][1];
267 __declspec(align(64)) VUINT32 _sP7_hi[16][1];
638d6a55 268 __declspec(align(64)) VUINT32 _sSignMask[16][1];
e560b3c2
NG
269 __declspec(align(64)) VUINT32 _sP6_lo[16][1];
270 __declspec(align(64)) VUINT32 _sP6_hi[16][1];
271 __declspec(align(64)) VUINT32 _sP5_lo[16][1];
272 __declspec(align(64)) VUINT32 _sP5_hi[16][1];
273 __declspec(align(64)) VUINT32 _sP4_lo[16][1];
274 __declspec(align(64)) VUINT32 _sP4_hi[16][1];
275 __declspec(align(64)) VUINT32 _sP3_lo[16][1];
276 __declspec(align(64)) VUINT32 _sP3_hi[16][1];
277 __declspec(align(64)) VUINT32 _sP2_lo[16][1];
278 __declspec(align(64)) VUINT32 _sP2_hi[16][1];
279 __declspec(align(64)) VUINT32 _sP0_lo[16][1];
280 __declspec(align(64)) VUINT32 _sP0_hi[16][1];
c0f36fc3
SP
281} __svml_stanh_data_internal;
282#endif
e560b3c2 283
c0f36fc3 284__svml_stanh_data_internal:
/* Four 4-byte integer constants.  The function reads each one with a
   {1to16} broadcast, so only 4-byte alignment is needed here.  */
e560b3c2
NG
285 .align 4
286 /* _iExpMantMask_UISA: keeps bits 30..21 of x (exponent plus the top two mantissa bits). */
287 .long 0x7fe00000
288
289 .align 4
290 /* _iMinIdxOfsMask_UISA: subtracted from the masked bits before they are clamped and shifted into a table index. */
291 .long 0x3d400000
292
293 .align 4
294 /* _iMaxIdxMask_UISA: upper clamp for the biased value; 0x03e00000 >> 21 == 31, the last table index. */
295 .long 0x03e00000
296
297 .align 4
298 /* _iExpMask: lanes whose masked bits compare greater than this are handed to scalar tanhf. */
299 .long 0x7f000000
300
301 .align 64
302__svml_stanh_data_internal_al64:
/* 64-byte-aligned lookup tables, 16 dwords (one zmm register) each.
   Every X_lo / X_hi pair forms one 32-entry table: the function loads
   X_lo into a register and uses vpermt2ps with X_hi as the second
   source, so X_lo supplies entries 0-15 and X_hi entries 16-31.
   _sC is the per-interval constant subtracted from |x|; _sP7 .. _sP0
   are the polynomial coefficients in the order the Horner chain
   consumes them.  Entry 31 is 0 in every table except _sP0_hi, where
   it is 0x3f800000 (1.0f), so that index evaluates to exactly 1.0.  */
303 .align 64
304 /* _sC_lo */
638d6a55
SP
305 .long 0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
306 .long 0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
307 .long 0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
308 .long 0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
e560b3c2
NG
309
310 .align 64
311 /* _sC_hi */
638d6a55
SP
312 .long 0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
313 .long 0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
314 .long 0x40500000, 0x40700000, 0x40900000, 0x40b00000
315 .long 0x40d00000, 0x40f00000, 0x41100000, 0x00000000
e560b3c2 316
638d6a55 317 .align 64
e560b3c2
NG
318 /* _sP7_lo */
319 .long 0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
320 .long 0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
321 .long 0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
322 .long 0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
323
638d6a55 324 .align 64
e560b3c2
NG
325 /* _sP7_hi */
326 .long 0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
327 .long 0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
328 .long 0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
329 .long 0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
330
638d6a55 331 .align 64
e560b3c2
NG
332 /* _sSignMask: sign bit in every lane; splits x into sign and magnitude. */
333 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
334 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
335 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
336 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
337
638d6a55 338 .align 64
e560b3c2 339 /* _sP6_lo */
638d6a55
SP
340 .long 0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
341 .long 0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
342 .long 0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
343 .long 0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
e560b3c2
NG
344
345 .align 64
346 /* _sP6_hi */
638d6a55
SP
347 .long 0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
348 .long 0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
349 .long 0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
350 .long 0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
e560b3c2 351
638d6a55 352 .align 64
e560b3c2
NG
353 /* _sP5_lo */
354 .long 0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
355 .long 0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
356 .long 0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
357 .long 0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
358
638d6a55 359 .align 64
e560b3c2
NG
360 /* _sP5_hi */
361 .long 0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
362 .long 0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
363 .long 0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
364 .long 0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
365
638d6a55 366 .align 64
e560b3c2
NG
367 /* _sP4_lo */
368 .long 0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
369 .long 0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
370 .long 0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
371 .long 0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
372
638d6a55 373 .align 64
e560b3c2
NG
374 /* _sP4_hi */
375 .long 0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
376 .long 0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
377 .long 0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
378 .long 0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
379
638d6a55 380 .align 64
e560b3c2
NG
381 /* _sP3_lo */
382 .long 0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
383 .long 0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
384 .long 0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
385 .long 0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
386
638d6a55 387 .align 64
e560b3c2
NG
388 /* _sP3_hi */
389 .long 0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
390 .long 0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
391 .long 0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
392 .long 0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
393
638d6a55 394 .align 64
e560b3c2
NG
395 /* _sP2_lo */
396 .long 0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
397 .long 0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
398 .long 0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
399 .long 0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
400
638d6a55 401 .align 64
e560b3c2
NG
402 /* _sP2_hi */
403 .long 0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
404 .long 0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
405 .long 0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
406 .long 0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
407
638d6a55 408 .align 64
e560b3c2
NG
409 /* _sP0_lo */
410 .long 0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
411 .long 0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
412 .long 0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
413 .long 0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
414
638d6a55 415 .align 64
e560b3c2
NG
416 /* _sP0_hi: the last entry is 1.0f, the value returned for table index 31. */
417 .long 0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
418 .long 0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
419 .long 0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
420 .long 0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
421
638d6a55 422 .align 64
e560b3c2
NG
423 .type __svml_stanh_data_internal_al64, @object
424 .size __svml_stanh_data_internal_al64, .-__svml_stanh_data_internal_al64
638d6a55
SP
425 .type __svml_stanh_data_internal, @object
426 .size __svml_stanh_data_internal, .-__svml_stanh_data_internal