1 /* Function asinhf vectorized with AVX-512.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 * ALGORITHM DESCRIPTION:
22 * Compute asinh(x) as log(x + sqrt(x*x + 1))
23 * using RSQRT instructions for starting the
24 * square root approximation, and small table lookups for log
25 * that map to AVX-512 permute instructions
29 * asinh(NaN) = quiet NaN, and raise invalid exception
30 * asinh(INF) = that INF
35 /* Offsets for data table __svml_sasinh_data_internal_avx512
/* NOTE(review): these are byte offsets into the 64-byte-aligned table
   defined in .rodata below; each 16 x 32-bit field occupies 64 bytes.
   Only a subset of the offset #defines is visible in this view --
   the missing ones (One, AbsMask, Threshold, ca1, c1s/c2s, AddB5,
   OneEighth, Four, L2H, L2L, Log_tbl_H/L) presumably sit in the gaps;
   confirm against the full file.  */
41 #define SmallThreshold 384
43 #define LargeThreshold 512
48 #define RcpBitMask 832
51 #define poly_coeff3 1024
52 #define poly_coeff2 1088
53 #define poly_coeff1 1152
/* Code section restricted to CPUs with EVEX/AVX-512 support.  */
59 .section .text.evex512, "ax", @progbits
/*
 * _ZGVeN16v_asinhf_skx -- AVX-512 vector asinhf: 16 floats per call.
 * Computes asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1)), with a small
 * polynomial for tiny |x| and a scaled path for very large |x|.
 *
 * NOTE(review): the embedded original line numbers are non-contiguous,
 * so this view of the instruction stream is incomplete.  In particular
 * the prologue, the move of the input from %zmm0 into %zmm10 (presumed
 * -- TODO confirm), the flag-setting test before the jne, and the
 * scalar call sequence are not visible.  Comments below state only
 * what the visible instructions establish.
 */
60 ENTRY(_ZGVeN16v_asinhf_skx)
62 cfi_def_cfa_offset(16)
71 vmulps {rn-sae}, %zmm10, %zmm10, %zmm0 /* x2 = x*x (%zmm10 holds x) */
72 vmovups One+__svml_sasinh_data_internal_avx512(%rip), %zmm2 /* 1.0f */
74 /* polynomial computation for small inputs */
75 vmovups ca1+__svml_sasinh_data_internal_avx512(%rip), %zmm1
77 /* not a very small input ? */
78 vmovups SmallThreshold+__svml_sasinh_data_internal_avx512(%rip), %zmm11
81 vaddps {rn-sae}, %zmm2, %zmm0, %zmm7 /* Yh = x^2 + 1 (high part) */
84 vandps AbsMask+__svml_sasinh_data_internal_avx512(%rip), %zmm10, %zmm12 /* |x| */
/* max/min split of (x^2, 1): two-sum to recover the bits x^2+1 lost */
87 vmaxps {sae}, %zmm0, %zmm2, %zmm14
88 vrsqrt14ps %zmm7, %zmm8 /* R0 ~ 1/sqrt(Yh), 14-bit approx */
91 vminps {sae}, %zmm0, %zmm2, %zmm15
92 vcmpps $21, {sae}, %zmm11, %zmm12, %k2 /* $21=NLT_UQ: k2 = |x| >= SmallThreshold (or NaN) */
95 vsubps {rn-sae}, %zmm14, %zmm7, %zmm9 /* Yh - max(x^2,1) */
98 vxorps %zmm10, %zmm12, %zmm13 /* sign bit of x */
100 /* Sh ~sqrt(1+x^2) */
101 vmulps {rn-sae}, %zmm8, %zmm7, %zmm6
102 vmovups LargeThreshold+__svml_sasinh_data_internal_avx512(%rip), %zmm14
105 vsubps {rn-sae}, %zmm9, %zmm15, %zmm3 /* B_low = min(x^2,1) - (Yh - max) */
108 vaddps {rn-sae}, %zmm12, %zmm6, %zmm15 /* Xin0 = |x| + Sh */
111 vfmsub213ps {rn-sae}, %zmm6, %zmm8, %zmm7 /* (Yh*R0)_low = Yh*R0 - Sh */
112 vmulps {rn-sae}, %zmm1, %zmm0, %zmm9 /* ca1*x^2 */
113 vcmpps $22, {sae}, %zmm14, %zmm12, %k0 /* $22=NLE_UQ: k0 = |x| > LargeThreshold (or NaN) */
114 vmovups c1s+__svml_sasinh_data_internal_avx512(%rip), %zmm1
116 /* polynomial computation for small inputs */
117 vfmadd213ps {rn-sae}, %zmm12, %zmm12, %zmm9 /* poly = |x| + ca1*x^2*|x| */
/* (x^2)_low = x*x - x2 (rounded) via FMA */
121 vmovaps %zmm10, %zmm4
122 vfmsub213ps {rn-sae}, %zmm0, %zmm10, %zmm4
124 /* Yl = (x^2)_low + B_low */
125 vaddps {rn-sae}, %zmm4, %zmm3, %zmm5
127 /* rel. error term: Eh=1-Sh*R0 */
129 vfnmadd231ps {rn-sae}, %zmm6, %zmm8, %zmm0
131 /* Sl = (Yh*R0)_low+(R0*Yl) */
132 vfmadd213ps {rn-sae}, %zmm7, %zmm8, %zmm5
134 /* very large inputs ? */
135 vmovups Threshold+__svml_sasinh_data_internal_avx512(%rip), %zmm7
137 /* rel. error term: Eh=(1-Sh*R0)-Sl*R0 */
138 vfnmadd231ps {rn-sae}, %zmm5, %zmm8, %zmm0
140 /* sqrt(1+x^2) ~ Sh + Sl + Sh*Eh*poly_s */
141 vmovups c2s+__svml_sasinh_data_internal_avx512(%rip), %zmm8
142 vcmpps $21, {sae}, %zmm7, %zmm12, %k1 /* $21=NLT_UQ: k1 = |x| >= Threshold (very large) */
145 vmulps {rn-sae}, %zmm0, %zmm6, %zmm4 /* Sh*Eh */
146 vfmadd231ps {rn-sae}, %zmm0, %zmm8, %zmm1 /* poly_s = c1s + c2s*Eh */
148 /* Sl + Sh*Eh*poly_s */
149 vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm4
152 vsubps {rn-sae}, %zmm6, %zmm15, %zmm5 /* (|x|+Sh) - Sh */
154 /* fixup for very large inputs */
155 vmovups OneEighth+__svml_sasinh_data_internal_avx512(%rip), %zmm6
157 /* Xin0+Sl+Sh*Eh*poly_s ~ x+sqrt(1+x^2) */
158 vaddps {rn-sae}, %zmm4, %zmm15, %zmm3
161 vsubps {rn-sae}, %zmm5, %zmm12, %zmm5 /* Xin0_low = |x| - ((|x|+Sh)-Sh) */
164 vsubps {rn-sae}, %zmm15, %zmm3, %zmm0 /* rounding of the Xin sum */
165 vmulps {rn-sae}, %zmm6, %zmm12, %zmm3{%k1} /* very large x: log argument = |x|/8 */
168 vmovups L2H+__svml_sasinh_data_internal_avx512(%rip), %zmm15
171 vsubps {rn-sae}, %zmm0, %zmm4, %zmm1
172 vrcp14ps %zmm3, %zmm6 /* Rcp ~ 1/Xin, 14-bit approx */
175 vmovups __svml_sasinh_data_internal_avx512(%rip), %zmm0 /* Log_tbl_H base (offset 0) */
178 vaddps {rn-sae}, %zmm5, %zmm1, %zmm7 /* Xin_low */
180 /* round reciprocal to 1+4b mantissas */
181 vpaddd AddB5+__svml_sasinh_data_internal_avx512(%rip), %zmm6, %zmm4
182 vmovups poly_coeff1+__svml_sasinh_data_internal_avx512(%rip), %zmm5
183 vandps RcpBitMask+__svml_sasinh_data_internal_avx512(%rip), %zmm4, %zmm8
185 /* fixup for very large inputs */
186 vxorps %zmm7, %zmm7, %zmm7{%k1} /* no low part on the scaled path */
189 vmovups poly_coeff3+__svml_sasinh_data_internal_avx512(%rip), %zmm4
191 /* reduced argument for log(): (Rcp*Xin-1)+Rcp*Xin_low */
192 vfmsub231ps {rn-sae}, %zmm8, %zmm3, %zmm2 /* R = Rcp*Xin - 1 */
193 vmovups Four+__svml_sasinh_data_internal_avx512(%rip), %zmm3
196 vgetexpps {sae}, %zmm8, %zmm1 /* exponent from the rounded reciprocal */
198 /* Prepare table index */
199 vpsrld $18, %zmm8, %zmm14 /* top mantissa bits of rounded Rcp */
200 vfmadd231ps {rn-sae}, %zmm8, %zmm7, %zmm2 /* R += Rcp*Xin_low */
201 vmovups poly_coeff2+__svml_sasinh_data_internal_avx512(%rip), %zmm7
202 vsubps {rn-sae}, %zmm3, %zmm1, %zmm1{%k1} /* exponent fixup (k1) for the |x|/8 scaling -- TODO confirm factor */
203 vpermt2ps Log_tbl_H+64+__svml_sasinh_data_internal_avx512(%rip), %zmm14, %zmm0 /* Th = Log_tbl_H[idx] */
204 vmovups Log_tbl_L+__svml_sasinh_data_internal_avx512(%rip), %zmm3
205 vfmadd231ps {rn-sae}, %zmm2, %zmm4, %zmm7 /* c2 + c3*R */
206 vfnmadd231ps {rn-sae}, %zmm1, %zmm15, %zmm0 /* Th - expon*L2H */
209 vmulps {rn-sae}, %zmm2, %zmm2, %zmm6 /* R^2 */
210 vfmadd213ps {rn-sae}, %zmm5, %zmm2, %zmm7 /* (c2+c3*R)*R + c1 */
211 vpermt2ps Log_tbl_L+64+__svml_sasinh_data_internal_avx512(%rip), %zmm14, %zmm3 /* Tl = Log_tbl_L[idx] */
214 vmovups L2L+__svml_sasinh_data_internal_avx512(%rip), %zmm14
215 vfnmadd213ps {rn-sae}, %zmm3, %zmm14, %zmm1 /* Tl - expon*L2L */
218 vfmadd213ps {rn-sae}, %zmm1, %zmm6, %zmm7 /* Poly*R^2 + Tl' */
220 /* R+Tl + R^2*Poly */
221 vaddps {rn-sae}, %zmm2, %zmm7, %zmm2
222 vaddps {rn-sae}, %zmm2, %zmm0, %zmm9{%k2} /* log result overwrites small-input poly where k2 set */
223 vxorps %zmm13, %zmm9, %zmm0 /* restore sign of x */
226 /* Go to special inputs processing branch */
/* NOTE(review): the flag-setting test (presumably on the k0 special
   mask moved to edx) is not visible in this view.  */
227 jne L(SPECIAL_VALUES_BRANCH)
228 # LOE rbx r12 r13 r14 r15 edx zmm0 zmm10
231 * and exit the function
/* Slow path: spill input and fast-path result, then fix up the lanes
   flagged in the range mask one at a time with the scalar routine.  */
247 L(SPECIAL_VALUES_BRANCH):
248 vmovups %zmm10, 64(%rsp) /* spill input vector */
249 vmovups %zmm0, 128(%rsp) /* spill fast-path results */
250 # LOE rbx r12 r13 r14 r15 edx zmm0
253 # LOE rbx r12 r13 r14 r15 eax edx
257 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
258 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
261 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
262 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
265 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
266 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
267 # LOE rbx r15 r12d r13d
276 /* Call scalar math function */
277 jc L(SCALAR_MATH_CALL)
278 # LOE rbx r15 r12d r13d
284 L(SPECIAL_VALUES_LOOP):
288 /* Check bits in range mask */
289 jl L(RANGEMASK_CHECK)
290 # LOE rbx r15 r12d r13d
298 vmovups 128(%rsp), %zmm0 /* reload fixed-up result vector */
302 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
303 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
304 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
305 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
306 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
307 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
308 # LOE rbx r12 r13 r14 r15 zmm0
310 /* Scalar math function call
311 * to process special input
/* NOTE(review): the actual scalar call instruction is not visible in
   this view; %r14 indexes the special lane.  */
316 vmovss 64(%rsp, %r14, 4), %xmm0 /* load special input lane */
318 # LOE rbx r14 r15 r12d r13d xmm0
320 vmovss %xmm0, 128(%rsp, %r14, 4) /* store scalar result back into result slot */
322 /* Process special inputs in loop */
323 jmp L(SPECIAL_VALUES_LOOP)
324 # LOE rbx r15 r12d r13d
325 END(_ZGVeN16v_asinhf_skx)
327 .section .rodata, "a"
/* Data table for the vector asinhf kernel.  The #ifdef block below is
   documentation-only pseudo-C describing the table layout (it is never
   compiled as part of the assembly).  */
330 #ifdef __svml_sasinh_data_internal_avx512_typedef
331 typedef unsigned int VUINT32;
333 __declspec(align(64)) VUINT32 Log_tbl_H[32][1];
334 __declspec(align(64)) VUINT32 Log_tbl_L[32][1];
335 __declspec(align(64)) VUINT32 One[16][1];
336 __declspec(align(64)) VUINT32 AbsMask[16][1];
337 __declspec(align(64)) VUINT32 SmallThreshold[16][1];
338 __declspec(align(64)) VUINT32 Threshold[16][1];
339 __declspec(align(64)) VUINT32 LargeThreshold[16][1];
340 __declspec(align(64)) VUINT32 ca1[16][1];
341 __declspec(align(64)) VUINT32 c2s[16][1];
342 __declspec(align(64)) VUINT32 c1s[16][1];
343 __declspec(align(64)) VUINT32 AddB5[16][1];
344 __declspec(align(64)) VUINT32 RcpBitMask[16][1];
345 __declspec(align(64)) VUINT32 OneEighth[16][1];
346 __declspec(align(64)) VUINT32 Four[16][1];
347 __declspec(align(64)) VUINT32 poly_coeff3[16][1];
348 __declspec(align(64)) VUINT32 poly_coeff2[16][1];
349 __declspec(align(64)) VUINT32 poly_coeff1[16][1];
350 __declspec(align(64)) VUINT32 L2H[16][1];
351 __declspec(align(64)) VUINT32 L2L[16][1];
352 } __svml_sasinh_data_internal_avx512;
354 __svml_sasinh_data_internal_avx512:
/* NOTE(review): the Log_tbl_H/Log_tbl_L rows are not visible in this
   view of the file.  The field-name comments below follow the struct
   layout above and are consistent with the visible offset #defines
   (e.g. SmallThreshold=384, RcpBitMask=832, poly_coeff3=1024) and the
   encoded float values -- confirm against the full file.  */
/* One = 1.0f */
424 .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
/* AbsMask = 0x7fffffff (clears the sign bit) */
427 .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
/* SmallThreshold = 2^-6 = 0.015625f */
430 .long 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000, 0x3c800000
/* Threshold = 2^63 */
433 .long 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000
/* LargeThreshold = FLT_MAX */
436 .long 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff
/* ca1 ~ -0.16665 (small-input asinh polynomial coefficient, ~ -1/6) */
439 .long 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE
/* c2s = 0.375f (sqrt-refinement polynomial) */
442 .long 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000
/* c1s = 0.5f (sqrt-refinement polynomial) */
445 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
/* AddB5: rounding bias used to round the reciprocal to 1+4b mantissa bits */
448 .long 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
/* RcpBitMask: keeps exponent + top mantissa bits of the reciprocal */
451 .long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
/* OneEighth = 0.125f (scaling for very large inputs) */
454 .long 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000
/* Four = 4.0f (exponent fixup for the scaled very-large path) */
457 .long 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000
/* poly_coeff3 ~ -0.2500 (log(1+R) polynomial) */
460 .long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
/* poly_coeff2 ~ 0.3333 (log(1+R) polynomial) */
463 .long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
/* poly_coeff1 = -0.5f (log(1+R) polynomial) */
466 .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
467 /* L2H = log(2)_high */
469 .long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
470 /* L2L = log(2)_low */
472 .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
474 .type __svml_sasinh_data_internal_avx512, @object
475 .size __svml_sasinh_data_internal_avx512, .-__svml_sasinh_data_internal_avx512