1 /* Function expm1f vectorized with AVX-512.
2 Copyright (C) 2021-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
/*
 * ALGORITHM DESCRIPTION:
 * After computing exp(x) in high-low parts, an accurate computation is performed to obtain exp(x)-1
 * Typical exp() implementation, except that:
 * - tables are small (32 elements), allowing for fast gathers
 * - all arguments processed in the main path
 * - final VSCALEF assists branch-free design (correct overflow/underflow and special case responses)
 * - a VAND is used to ensure the reduced argument |R|<2, even for large inputs
 * - RZ mode used to avoid overflow to +/-Inf for x*log2(e); helps with special case handling
 */
/* Offsets (in bytes) for data table __svml_sexpm1_data_internal_avx512.
   They must match the layout of the table emitted at the bottom of this
   file.  NOTE(review): only two of the offset macros are visible in this
   excerpt; the others (L2E, Shifter, Threshold, SgnMask, L2H, L2L, EMask,
   Exp_tbl_L, Exp_tbl_H, One) are referenced by the code but their
   #defines are not shown here.  */
#define poly_coeff3	704
#define poly_coeff2	768
/*
 * _ZGVeN16v_expm1f_skx — vectorized expm1f for 16 packed floats (AVX-512).
 *
 * In:  zmm0 = x (16 single-precision lanes), per the x86-64 vector ABI.
 *      The body below reads the argument from zmm1 — NOTE(review): the
 *      zmm0->zmm1 copy and the stack-frame prologue are elided from this
 *      excerpt.
 * Out: zmm0 = expm1(x), per lane.
 *
 * Main path is branch-free: exp(x) is built as Th*2^Z0 from a 32-entry
 * table plus a short polynomial, then expm1(x) is assembled as
 * (Th2-1) + Th2*(Tlr + R + R^2*Poly), with the sign of x OR'ed back in.
 * Lanes whose input exceeds Threshold are redirected to a scalar
 * fallback loop.
 */
	.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN16v_expm1f_skx)
	cfi_def_cfa_offset(16)

	/* Load constants and the high halves of the lookup tables.  */
	vmovups	L2E+__svml_sexpm1_data_internal_avx512(%rip), %zmm5
	vmovups	Shifter+__svml_sexpm1_data_internal_avx512(%rip), %zmm3
	vmovups	L2H+__svml_sexpm1_data_internal_avx512(%rip), %zmm8
	vmovups	L2L+__svml_sexpm1_data_internal_avx512(%rip), %zmm4
	vmovups	__svml_sexpm1_data_internal_avx512(%rip), %zmm6
	vmovups	poly_coeff3+__svml_sexpm1_data_internal_avx512(%rip), %zmm9
	vmovups	poly_coeff2+__svml_sexpm1_data_internal_avx512(%rip), %zmm12
	vmovups	Exp_tbl_L+__svml_sexpm1_data_internal_avx512(%rip), %zmm11
	vmovups	Threshold+__svml_sexpm1_data_internal_avx512(%rip), %zmm2
	vmovups	One+__svml_sexpm1_data_internal_avx512(%rip), %zmm14

	/* 2^(23-5)*1.5 + x * log2(e): the Shifter trick rounds x*log2(e)
	   to 5 fractional bits (single precision: 23-bit mantissa, so the
	   shifter is 2^18*1.5 = 0x48c00000, see the data table).  */
	vfmadd213ps {rn-sae}, %zmm3, %zmm1, %zmm5
	/* k0 = lanes with x >= Threshold (imm8 29 = _CMP_GE_OQ); those
	   lanes need the special-value path.  */
	vcmpps	$29, {sae}, %zmm2, %zmm1, %k0

	/* Z0 ~ x*log2(e), rounded to 5 fractional bits.  */
	vsubps	{rn-sae}, %zmm3, %zmm5, %zmm7
	/* Table lookups: zmm5's low index bits select Th (zmm6) and
	   Tlr (zmm11) from the two 32-entry tables.  */
	vpermt2ps Exp_tbl_H+64+__svml_sexpm1_data_internal_avx512(%rip), %zmm5, %zmm6
	vpermt2ps Exp_tbl_L+64+__svml_sexpm1_data_internal_avx512(%rip), %zmm5, %zmm11
	/* zmm0 = sign bit of x (SgnMask = 0x80000000), re-attached to the
	   result at the end.  */
	vandps	SgnMask+__svml_sexpm1_data_internal_avx512(%rip), %zmm1, %zmm0

	/* R = x - Z0*log(2), using the hi/lo split of log(2) for extra
	   precision (hi part here, lo part two instructions below).  */
	vfnmadd213ps {rn-sae}, %zmm1, %zmm7, %zmm8
	/* Th2 = Th * 2^Z0 — VSCALEF yields correct overflow/underflow and
	   special-case responses without branches.  */
	vscalefps {rn-sae}, %zmm7, %zmm6, %zmm2
	vfnmadd231ps {rn-sae}, %zmm7, %zmm4, %zmm8

	/* Ensure |R| < 2 even for special cases: EMask = 0xbfffffff clears
	   the exponent MSB, bounding the magnitude below 2.  */
	vandps	EMask+__svml_sexpm1_data_internal_avx512(%rip), %zmm8, %zmm13
	/* Th2 - 1: the "(Th-1)" term of expm1.  */
	vsubps	{rn-sae}, %zmm14, %zmm2, %zmm8
	/* R^2 */
	vmulps	{rn-sae}, %zmm13, %zmm13, %zmm10
	/* Poly = poly_coeff2 + R*poly_coeff3 */
	vfmadd231ps {rn-sae}, %zmm13, %zmm9, %zmm12

	/* Tlr + R^2*Poly */
	vfmadd213ps {rn-sae}, %zmm11, %zmm10, %zmm12
	/* Tlr + R + R^2*Poly */
	vaddps	{rn-sae}, %zmm13, %zmm12, %zmm15

	/* (Th2-1) + Th2*(Tlr + R + R^2*Poly) */
	vfmadd213ps {rn-sae}, %zmm8, %zmm15, %zmm2
	/* Re-attach the sign of x.  */
	vorps	%zmm0, %zmm2, %zmm0

	/* Go to special inputs processing branch.
	   NOTE(review): the k0->edx move and the testl feeding this jne
	   are elided from this excerpt.  */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm1

	/* Main path done: result is in zmm0; restore registers
	   and exit the function (epilogue elided from this excerpt).  */

L(SPECIAL_VALUES_BRANCH):
	/* Spill the input (zmm1) and the vector result (zmm0) so the
	   scalar loop below can patch individual lanes in place.  */
	vmovups	%zmm1, 64(%rsp)
	vmovups	%zmm0, 128(%rsp)
	# LOE rbx r12 r13 r14 r15 edx zmm0

	# LOE rbx r12 r13 r14 r15 eax edx

	/* Unwind info for r12/r13/r14 saved in the (elided) aligned
	   stack frame.  */
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Call scalar math function for the current lane if its
	   range-mask bit is set (carry flag produced by elided code).  */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

L(SPECIAL_VALUES_LOOP):
	/* Check bits in range mask: iterate over lanes that still need
	   the scalar fallback (RANGEMASK_CHECK label elided here).  */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	/* All special lanes processed: reload the patched result.  */
	vmovups	128(%rsp), %zmm0

	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 zmm0

	/* Scalar math function call to process one special input:
	   load lane r14 of the spilled argument.
	   NOTE(review): the L(SCALAR_MATH_CALL) label and the call to the
	   scalar expm1f are elided from this excerpt.  */
	vmovss	64(%rsp, %r14, 4), %xmm0
	# LOE rbx r14 r15 r12d r13d xmm0

	/* Store the scalar result back into lane r14 of the output.  */
	vmovss	%xmm0, 128(%rsp, %r14, 4)

	/* Process special inputs in loop.  */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVeN16v_expm1f_skx)
	.section .rodata, "a"

#ifdef __svml_sexpm1_data_internal_avx512_typedef
/* C-style description of the table layout below; never compiled (the
   guard macro is not defined anywhere).  NOTE(review): the
   "typedef struct {" opener and the matching #endif are not visible
   in this excerpt.  */
typedef unsigned int VUINT32;
	__declspec(align(64)) VUINT32 Exp_tbl_H[32][1];
	__declspec(align(64)) VUINT32 Exp_tbl_L[32][1];
	__declspec(align(64)) VUINT32 L2E[16][1];
	__declspec(align(64)) VUINT32 Shifter[16][1];
	__declspec(align(64)) VUINT32 Threshold[16][1];
	__declspec(align(64)) VUINT32 SgnMask[16][1];
	__declspec(align(64)) VUINT32 L2H[16][1];
	__declspec(align(64)) VUINT32 L2L[16][1];
	__declspec(align(64)) VUINT32 EMask[16][1];
	__declspec(align(64)) VUINT32 poly_coeff3[16][1];
	__declspec(align(64)) VUINT32 poly_coeff2[16][1];
	__declspec(align(64)) VUINT32 One[16][1];
} __svml_sexpm1_data_internal_avx512;
#endif
__svml_sexpm1_data_internal_avx512:
	/* Exp_tbl_H: 32 entries starting at 1.0f (0x3f800000) and ending
	   just below 2.0f — per the file header, the high parts of the
	   32-element exponential table (presumably 2^(j/32), j=0..31).  */
	.long	0x3f800000, 0x3f82cd87, 0x3f85aac3, 0x3f88980f
	.long	0x3f8b95c2, 0x3f8ea43a, 0x3f91c3d3, 0x3f94f4f0
	.long	0x3f9837f0, 0x3f9b8d3a, 0x3f9ef532, 0x3fa27043
	.long	0x3fa5fed7, 0x3fa9a15b, 0x3fad583f, 0x3fb123f6
	.long	0x3fb504f3, 0x3fb8fbaf, 0x3fbd08a4, 0x3fc12c4d
	.long	0x3fc5672a, 0x3fc9b9be, 0x3fce248c, 0x3fd2a81e
	.long	0x3fd744fd, 0x3fdbfbb8, 0x3fe0ccdf, 0x3fe5b907
	.long	0x3feac0c7, 0x3fefe4ba, 0x3ff5257d, 0x3ffa83b3
	/* Exp_tbl_L: 32 low-order correction entries (tiny magnitudes;
	   first entry is 0.0).  */
	.long	0x00000000, 0xb34a3a0a, 0x3346cb6a, 0xb36ed17e
	.long	0xb24e0611, 0xb3517dd9, 0x334b2482, 0xb31586de
	.long	0x33092801, 0xb2e6f467, 0x331b85f2, 0x3099b6f1
	.long	0xb3051aa8, 0xb2e2a0da, 0xb2006c56, 0xb3365942
	.long	0x329302ae, 0x32c595dc, 0xb302e5a2, 0xb28e10a1
	.long	0x31b3d0e5, 0xb31a472b, 0x31d1daf2, 0xb305bf64
	.long	0xb27ce182, 0xb2f26443, 0xb1b4b0da, 0xb1da8a8f
	.long	0xb1d290be, 0xb2d5b899, 0x31b0a147, 0xb2156afc
	/* L2E = log2(e) ~= 1.4426950 (0x3fB8AA3B), replicated x16.  */
	.long 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B
	/* Shifter=2^(23-5)*1.5 (0x48c00000 = 393216.0f).  */
	.long 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000
	/* Threshold (0x42AD496B ~= 86.64): lanes with x >= this value are
	   sent to the special-value path (see vcmpps in the code).  */
	.long 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B
	/* SgnMask: IEEE-754 single-precision sign-bit mask.  */
	.long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
	/* L2H = log(2)_high */
	.long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
	/* L2L = log(2)_low */
	.long 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308
	/* EMask = 0xbfffffff: ANDing clears the exponent MSB, which keeps
	   the reduced argument's magnitude below 2.  */
	.long 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff
	/* poly_coeff3 ~= 0.166672 (close to 1/6).  */
	.long 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3
	/* poly_coeff2 ~= 0.5000147.  */
	.long 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6
	/* One = 1.0f.  */
	.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	.type	__svml_sexpm1_data_internal_avx512, @object
	.size	__svml_sexpm1_data_internal_avx512, .-__svml_sexpm1_data_internal_avx512