1 /* Function log10 vectorized with AVX-512.
2 Copyright (C) 2021-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
/*
 * ALGORITHM DESCRIPTION:
 *
 *    Get short reciprocal approximation Rcp ~ 1/mantissa(x)
 *    R = Rcp*x - 1.0
 *    log10(x) = k*log10(2.0) - log10(Rcp) + poly_approximation(R)
 *       log10(Rcp) is tabulated
 */
/* Offsets (in bytes) into the data table
   __svml_dlog10_data_internal_avx512 defined at the bottom of this file.
   NOTE(review): the offsets for Log_tbl (0), One, C075 and L2 that the
   code below also uses are not visible in this view of the file --
   presumably defined alongside these; confirm against the full source.  */
#define poly_coeff9 256
#define poly_coeff8 320
#define poly_coeff7 384
#define poly_coeff6 448
#define poly_coeff5 512
#define poly_coeff4 576
#define poly_coeff3 640
#define poly_coeff2 704
#define poly_coeff1 768
/* _ZGVeN8v_log10_skx: vectorized log10 of 8 doubles (AVX-512, SKX),
   glibc libmvec vector-function ABI.

   Per the algorithm comment above: with x = 2^k * mantissa,
   DblRcp ~ 1/mantissa rounded to a few fraction bits, and
   R = DblRcp*mantissa - 1,
       log10(x) ~= k*log10(2) - log10(DblRcp) + poly(R)
   where -log10(DblRcp) is looked up in the 16-entry Log_tbl.

   NOTE(review): this view of the file is missing several lines, e.g.
   the stack-frame setup implied by cfi_def_cfa_offset(16), the copy of
   the input vector into %zmm7, the vrcp14pd producing %zmm8, the
   kmovw/test that sets flags for the jne below, the fast-path return,
   and the L(RANGEMASK_CHECK)/L(SCALAR_MATH_CALL) labels.  Comments
   below describe only what is visible; confirm the rest against the
   complete source.  */
        .section .text.evex512,"ax",@progbits
ENTRY(_ZGVeN8v_log10_skx)
        /* NOTE(review): the push/sub establishing CFA offset 16 is not
           visible in this view.  */
        cfi_def_cfa_offset(16)

        /* zmm6 = mantissa(x) normalized into [1,2)
           (imm $8: interval 00b = [1,2), sign control 10b).  */
        vgetmantpd $8, {sae}, %zmm7, %zmm6
        /* Load 1.0 and the odd polynomial coefficient vectors.  */
        vmovups One+__svml_dlog10_data_internal_avx512(%rip), %zmm3
        vmovups poly_coeff5+__svml_dlog10_data_internal_avx512(%rip), %zmm12
        vmovups poly_coeff3+__svml_dlog10_data_internal_avx512(%rip), %zmm13

        /* Start polynomial evaluation */
        vmovups poly_coeff9+__svml_dlog10_data_internal_avx512(%rip), %zmm10
        vmovups poly_coeff8+__svml_dlog10_data_internal_avx512(%rip), %zmm1
        vmovups poly_coeff7+__svml_dlog10_data_internal_avx512(%rip), %zmm11
        vmovups poly_coeff6+__svml_dlog10_data_internal_avx512(%rip), %zmm14

        /* Prepare exponent correction: DblRcp<0.75? */
        vmovups C075+__svml_dlog10_data_internal_avx512(%rip), %zmm2

        /* zmm5 = first half of Log_tbl (combined with the second half
           by the vpermt2pd below).  */
        vmovups __svml_dlog10_data_internal_avx512(%rip), %zmm5

        /* zmm0 = unbiased exponent k of x.  */
        vgetexppd {sae}, %zmm7, %zmm0

        /* DblRcp ~ 1/Mantissa */
        /* k0 flags special inputs (imm $94 = +/-0, +/-Inf, negative);
           tested further below to take the slow path.  */
        vfpclasspd $94, %zmm7, %k0

        /* round DblRcp to 4 fractional bits (RN mode, no Precision exception) */
        /* NOTE(review): %zmm8 (the raw reciprocal) is produced by an
           instruction not visible in this view (presumably vrcp14pd).  */
        vrndscalepd $88, {sae}, %zmm8, %zmm4
        /* zmm8 is dead after the rounding above; reuse it for c4.  */
        vmovups poly_coeff4+__svml_dlog10_data_internal_avx512(%rip), %zmm8

        /* Reduced argument: R = DblRcp*Mantissa - 1 */
        vfmsub213pd {rn-sae}, %zmm3, %zmm4, %zmm6
        /* k1 = (DblRcp < 0.75); imm $17 = LT_OQ.  */
        vcmppd $17, {sae}, %zmm2, %zmm4, %k1
        /* zmm8 = c4 + c5*R.  */
        vfmadd231pd {rn-sae}, %zmm6, %zmm12, %zmm8
        vmovups poly_coeff2+__svml_dlog10_data_internal_avx512(%rip), %zmm12
        /* zmm1 = c8 + c9*R.  */
        vfmadd231pd {rn-sae}, %zmm6, %zmm10, %zmm1
        /* zmm14 = c6 + c7*R.  */
        vfmadd231pd {rn-sae}, %zmm6, %zmm11, %zmm14
        vmovups poly_coeff1+__svml_dlog10_data_internal_avx512(%rip), %zmm2
        /* zmm15 = R^2.  */
        vmulpd {rn-sae}, %zmm6, %zmm6, %zmm15
        /* zmm12 = c2 + c3*R.  */
        vfmadd231pd {rn-sae}, %zmm6, %zmm13, %zmm12

        /* Prepare table index */
        /* Top mantissa bits of the rounded reciprocal select one of the
           16 Log_tbl entries.  */
        vpsrlq $48, %zmm4, %zmm9

        /* add 1 to Expon if DblRcp<0.75 */
        vaddpd {rn-sae}, %zmm3, %zmm0, %zmm0{%k1}
        /* zmm13 = R^4.  */
        vmulpd {rn-sae}, %zmm15, %zmm15, %zmm13
        /* zmm1 = (c6 + c7*R) + R^2*(c8 + c9*R).  */
        vfmadd213pd {rn-sae}, %zmm14, %zmm15, %zmm1
        /* zmm8 = (c2 + c3*R) + R^2*(c4 + c5*R).  */
        vfmadd213pd {rn-sae}, %zmm12, %zmm15, %zmm8
        /* zmm5 = tabulated -log10(DblRcp), gathered across both table
           halves by index zmm9.  */
        vpermt2pd Log_tbl+64+__svml_dlog10_data_internal_avx512(%rip), %zmm9, %zmm5
        /* Join high/low polynomial halves via R^4.  */
        vfmadd213pd {rn-sae}, %zmm8, %zmm13, %zmm1
        /* zmm1 = c1 + R*poly.  */
        vfmadd213pd {rn-sae}, %zmm2, %zmm6, %zmm1
        /* zmm6 = table + R*(c1 + R*poly).  */
        vfmadd213pd {rn-sae}, %zmm5, %zmm1, %zmm6
        vmovups L2+__svml_dlog10_data_internal_avx512(%rip), %zmm1
        /* Final result: zmm0 = k*log10(2) + (table + poly).  */
        vfmadd213pd {rn-sae}, %zmm6, %zmm1, %zmm0

        /* Go to special inputs processing branch */
        /* NOTE(review): the kmovw/test of %k0 that sets ZF for this jne
           is not visible in this view.  */
        jne L(SPECIAL_VALUES_BRANCH)
        # LOE rbx r12 r13 r14 r15 edx zmm0 zmm7

        /* ... and exit the function.  (Fast-path return and the rest of
           this comment are not visible in this view.)  */

        /* Slow path: spill the input (zmm7) and the vector result
           (zmm0) to the stack, then fix up flagged lanes one at a time
           with the scalar log10.  */
L(SPECIAL_VALUES_BRANCH):
        vmovups %zmm7, 64(%rsp)
        vmovups %zmm0, 128(%rsp)
        # LOE rbx r12 r13 r14 r15 edx zmm0

        # LOE rbx r12 r13 r14 r15 eax edx

        /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22

        /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22

        /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
        # LOE rbx r15 r12d r13d

        /* Call scalar math function */
        /* NOTE(review): the L(SCALAR_MATH_CALL) label and the
           instruction producing CF are not visible in this view.  */
        jc L(SCALAR_MATH_CALL)
        # LOE rbx r15 r12d r13d

L(SPECIAL_VALUES_LOOP):

        /* Check bits in range mask */
        /* NOTE(review): the L(RANGEMASK_CHECK) label and the compare
           setting SF/OF are not visible in this view.  */
        jl L(RANGEMASK_CHECK)
        # LOE rbx r15 r12d r13d

        /* All flagged lanes processed: reload the patched result.  */
        vmovups 128(%rsp), %zmm0

        /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
        /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
        /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
        # LOE rbx r12 r13 r14 r15 zmm0

        /* Scalar math function call
         * to process special input.  */
        /* Load the lane's original input from the spill area.  */
        movsd 64(%rsp,%r14,8), %xmm0
        /* NOTE(review): the call to the scalar log10 is not visible in
           this view.  */
        # LOE rbx r14 r15 r12d r13d xmm0
        /* Store the scalar result back over the lane's slot.  */
        movsd %xmm0, 128(%rsp,%r14,8)

        /* Process special inputs in loop */
        jmp L(SPECIAL_VALUES_LOOP)
        # LOE rbx r15 r12d r13d
END(_ZGVeN8v_log10_skx)
        .section .rodata, "a"
        /* NOTE(review): an alignment directive (.align 64) presumably
           precedes the table but is not visible in this view.  */

#ifdef __svml_dlog10_data_internal_avx512_typedef
/* Documentation-only C layout of the table below; the guard macro is
   never defined, so this block is not compiled.  NOTE(review): the
   opening "typedef struct {" and the closing "#endif" are not visible
   in this view of the file.  */
typedef unsigned int VUINT32;
__declspec(align(64)) VUINT32 Log_tbl[16][2];
__declspec(align(64)) VUINT32 One[8][2];
__declspec(align(64)) VUINT32 C075[8][2];
__declspec(align(64)) VUINT32 poly_coeff9[8][2];
__declspec(align(64)) VUINT32 poly_coeff8[8][2];
__declspec(align(64)) VUINT32 poly_coeff7[8][2];
__declspec(align(64)) VUINT32 poly_coeff6[8][2];
__declspec(align(64)) VUINT32 poly_coeff5[8][2];
__declspec(align(64)) VUINT32 poly_coeff4[8][2];
__declspec(align(64)) VUINT32 poly_coeff3[8][2];
__declspec(align(64)) VUINT32 poly_coeff2[8][2];
__declspec(align(64)) VUINT32 poly_coeff1[8][2];
__declspec(align(64)) VUINT32 L2[8][2];
} __svml_dlog10_data_internal_avx512;
__svml_dlog10_data_internal_avx512:
        /*== Log_tbl ==*/
        /* 16 double entries consumed by the vpermt2pd lookup above
           (first 8 have the sign bit set, last 8 clear).  */
        .quad 0x0000000000000000
        .quad 0xbf9af5f92b00e610
        .quad 0xbfaa30a9d609efea
        .quad 0xbfb31b3055c47118
        .quad 0xbfb8cf183886480d
        .quad 0xbfbe3bc1ab0e19fe
        .quad 0xbfc1b3e71ec94f7b
        .quad 0xbfc42c7e7fe3fc02
        .quad 0x3fbffbfc2bbc7803
        .quad 0x3fbb721cd17157e3
        .quad 0x3fb715d0ce367afc
        .quad 0x3fb2e3a740b7800f
        .quad 0x3fadb11ed766abf4
        .quad 0x3fa5e3966b7e9295
        .quad 0x3f9cb38fccd8bfdb
        .quad 0x3f8c3d0837784c41
        /*== One = 1.0 ==*/
        .quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
        /*== C075 = 0.75 (exponent-correction threshold) ==*/
        .quad 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000
        /*== poly_coeff9 ==*/
        .quad 0x3fa8c2d828480370, 0x3fa8c2d828480370, 0x3fa8c2d828480370, 0x3fa8c2d828480370, 0x3fa8c2d828480370, 0x3fa8c2d828480370, 0x3fa8c2d828480370, 0x3fa8c2d828480370
        /*== poly_coeff8 ==*/
        .quad 0xbfabd80d96029814, 0xbfabd80d96029814, 0xbfabd80d96029814, 0xbfabd80d96029814, 0xbfabd80d96029814, 0xbfabd80d96029814, 0xbfabd80d96029814, 0xbfabd80d96029814
        /*== poly_coeff7 ==*/
        .quad 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2
        /*== poly_coeff6 ==*/
        .quad 0xbfb287a63464dc80, 0xbfb287a63464dc80, 0xbfb287a63464dc80, 0xbfb287a63464dc80, 0xbfb287a63464dc80, 0xbfb287a63464dc80, 0xbfb287a63464dc80, 0xbfb287a63464dc80
        /*== poly_coeff5 ==*/
        .quad 0x3fb63c62777f27d9, 0x3fb63c62777f27d9, 0x3fb63c62777f27d9, 0x3fb63c62777f27d9, 0x3fb63c62777f27d9, 0x3fb63c62777f27d9, 0x3fb63c62777f27d9, 0x3fb63c62777f27d9
        /*== poly_coeff4 ==*/
        .quad 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3
        /*== poly_coeff3 ==*/
        .quad 0x3fc287a7636f428c, 0x3fc287a7636f428c, 0x3fc287a7636f428c, 0x3fc287a7636f428c, 0x3fc287a7636f428c, 0x3fc287a7636f428c, 0x3fc287a7636f428c, 0x3fc287a7636f428c
        /*== poly_coeff2 ==*/
        .quad 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db
        /*== poly_coeff1 ==*/
        .quad 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e
        /*== L2 = log10(2) ==*/
        .quad 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff
        .type __svml_dlog10_data_internal_avx512,@object
        .size __svml_dlog10_data_internal_avx512,.-__svml_dlog10_data_internal_avx512