/* Source: glibc (git.ipfire.org mirror of thirdparty/glibc.git),
   sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S
   (snapshot of commit "Update copyright dates with
   scripts/update-copyrights").  */
1 /* Function log10 vectorized with AVX-512.
2 Copyright (C) 2021-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19 /*
20 * ALGORITHM DESCRIPTION:
21 *
22 * Get short reciprocal approximation Rcp ~ 1/mantissa(x)
23 * R = Rcp*x - 1.0
24 * log10(x) = k*log10(2.0) - log10(Rcp) + poly_approximation(R)
25 * log10(Rcp) is tabulated
26 *
27 *
28 */
29
/* Byte offsets into the data table __svml_dlog10_data_internal_avx512
   emitted at the end of this file.  Log_tbl is a 16-entry lookup table
   (128 bytes); every other field is one ZMM-wide row of eight identical
   doubles (64 bytes).  These offsets must stay in sync with the table
   layout below.  */
#define Log_tbl 0
#define One 128
#define C075 192
#define poly_coeff9 256
#define poly_coeff8 320
#define poly_coeff7 384
#define poly_coeff6 448
#define poly_coeff5 512
#define poly_coeff4 576
#define poly_coeff3 640
#define poly_coeff2 704
#define poly_coeff1 768
#define L2 832

#include <sysdep.h>
47
.text
.section .text.evex512,"ax",@progbits

/* double log10 over 8 lanes, AVX-512 (SKX libmvec variant).
   Vector call ABI: input x in %zmm0, result returned in %zmm0.
   The fast path is branch-free; lanes flagged by vfpclasspd
   (zeros, negatives, infinities) are redone one at a time with
   the scalar C log10().  */
ENTRY(_ZGVeN8v_log10_skx)
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* 64-byte-align the stack and reserve 192 bytes: the special-case
   path spills r12-r14 at 0..23(%rsp), the input vector at
   64(%rsp) and the result vector at 128(%rsp).  */
andq $-64, %rsp
subq $192, %rsp
/* zmm7 = x, preserved for classification and the special-case spill.  */
vmovaps %zmm0, %zmm7
/* Mantissa(x); imm $8 selects the [1,2) normalization interval
   (per the ISA encoding — negative inputs become QNaN here, but
   those lanes are caught by vfpclasspd below).  */
vgetmantpd $8, {sae}, %zmm7, %zmm6
vmovups One+__svml_dlog10_data_internal_avx512(%rip), %zmm3
vmovups poly_coeff5+__svml_dlog10_data_internal_avx512(%rip), %zmm12
vmovups poly_coeff3+__svml_dlog10_data_internal_avx512(%rip), %zmm13

/* Start polynomial evaluation */
vmovups poly_coeff9+__svml_dlog10_data_internal_avx512(%rip), %zmm10
vmovups poly_coeff8+__svml_dlog10_data_internal_avx512(%rip), %zmm1
vmovups poly_coeff7+__svml_dlog10_data_internal_avx512(%rip), %zmm11
vmovups poly_coeff6+__svml_dlog10_data_internal_avx512(%rip), %zmm14

/* Prepare exponent correction: DblRcp<0.75? */
vmovups C075+__svml_dlog10_data_internal_avx512(%rip), %zmm2

/* Table lookup: load the low half of Log_tbl; vpermt2pd below pulls
   in the high half.  */
vmovups __svml_dlog10_data_internal_avx512(%rip), %zmm5

/* GetExp(x) */
vgetexppd {sae}, %zmm7, %zmm0

/* DblRcp ~ 1/Mantissa */
vrcp14pd %zmm6, %zmm8

/* x<=0?  imm $94 (0x5e) flags +0/-0, +Inf/-Inf and negative lanes
   (per the vfpclasspd bit encoding); k0 marks every lane that needs
   the scalar fallback.  */
vfpclasspd $94, %zmm7, %k0

/* round DblRcp to 4 fractional bits (RN mode, no Precision exception) */
vrndscalepd $88, {sae}, %zmm8, %zmm4
vmovups poly_coeff4+__svml_dlog10_data_internal_avx512(%rip), %zmm8
/* edx = special-lane bitmask, tested after the fast path finishes.  */
kmovw %k0, %edx

/* Reduced argument: R = DblRcp*Mantissa - 1 */
vfmsub213pd {rn-sae}, %zmm3, %zmm4, %zmm6
/* k1 = (DblRcp < 0.75); predicate 17 is CMP_LT_OQ.  */
vcmppd $17, {sae}, %zmm2, %zmm4, %k1
vfmadd231pd {rn-sae}, %zmm6, %zmm12, %zmm8
vmovups poly_coeff2+__svml_dlog10_data_internal_avx512(%rip), %zmm12
vfmadd231pd {rn-sae}, %zmm6, %zmm10, %zmm1
vfmadd231pd {rn-sae}, %zmm6, %zmm11, %zmm14
vmovups poly_coeff1+__svml_dlog10_data_internal_avx512(%rip), %zmm2

/* R^2 */
vmulpd {rn-sae}, %zmm6, %zmm6, %zmm15
vfmadd231pd {rn-sae}, %zmm6, %zmm13, %zmm12

/* Prepare table index from the rounded reciprocal's fraction bits */
vpsrlq $48, %zmm4, %zmm9

/* add 1 to Expon if DblRcp<0.75 */
vaddpd {rn-sae}, %zmm3, %zmm0, %zmm0{%k1}
/* R^4 */
vmulpd {rn-sae}, %zmm15, %zmm15, %zmm13
vfmadd213pd {rn-sae}, %zmm14, %zmm15, %zmm1
vfmadd213pd {rn-sae}, %zmm12, %zmm15, %zmm8
/* zmm5 = tabulated -log10(DblRcp) term (see algorithm header).  */
vpermt2pd Log_tbl+64+__svml_dlog10_data_internal_avx512(%rip), %zmm9, %zmm5

/* polynomial: combine the two Estrin halves, add c1*R and the
   table term, giving log10(Mantissa*DblRcp) correction.  */
vfmadd213pd {rn-sae}, %zmm8, %zmm13, %zmm1
vfmadd213pd {rn-sae}, %zmm2, %zmm6, %zmm1
vfmadd213pd {rn-sae}, %zmm5, %zmm1, %zmm6
vmovups L2+__svml_dlog10_data_internal_avx512(%rip), %zmm1
/* result = Expon*log10(2) + (table + poly(R))  */
vfmadd213pd {rn-sae}, %zmm6, %zmm1, %zmm0
testl %edx, %edx

/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 edx zmm0 zmm7

/* Restore registers
 * and exit the function
 */

L(EXIT):
movq %rbp, %rsp
popq %rbp
cfi_def_cfa(7, 8)
cfi_restore(6)
ret
cfi_def_cfa(6, 16)
cfi_offset(6, -16)

/* Branch to process
 * special inputs
 */

L(SPECIAL_VALUES_BRANCH):
/* Spill the original input and the fast-path result so the scalar
   loop below can patch individual lanes in place.  */
vmovups %zmm7, 64(%rsp)
vmovups %zmm0, 128(%rsp)
# LOE rbx r12 r13 r14 r15 edx zmm0

/* eax = lane cursor, counted up to 8 by the special-value loop.  */
xorl %eax, %eax
# LOE rbx r12 r13 r14 r15 eax edx

vzeroupper
movq %r12, 16(%rsp)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
movl %eax, %r12d
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
/* r13d = range mask: bit i set => lane i needs the scalar call.  */
movl %edx, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d

/* Range mask
 * bits check
 */

L(RANGEMASK_CHECK):
/* CF = bit r12d of the mask: does the current lane need fixing?  */
btl %r12d, %r13d

/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d

/* Special inputs
 * processing loop
 */

L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $8, %r12d

/* Check bits in range mask (8 lanes total) */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d

/* All lanes done: restore callee-saved registers and reload the
   patched result vector.  */
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
vmovups 128(%rsp), %zmm0

/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 zmm0

/* Scalar math function call
 * to process special input
 */

L(SCALAR_MATH_CALL):
movl %r12d, %r14d
/* xmm0 = x[lane]; result is stored back into that lane's slot of
   the spilled result vector at 128(%rsp).  */
movsd 64(%rsp,%r14,8), %xmm0
call log10@PLT
# LOE rbx r14 r15 r12d r13d xmm0

movsd %xmm0, 128(%rsp,%r14,8)

/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVeN8v_log10_skx)
221
.section .rodata, "a"
.align 64

#ifdef __svml_dlog10_data_internal_avx512_typedef
/* C-layout description of the table below (documentation only; this
   guard macro is not defined when glibc assembles the file).  Field
   offsets must match the #defines at the top of the file.  */
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(64)) VUINT32 Log_tbl[16][2];
__declspec(align(64)) VUINT32 One[8][2];
__declspec(align(64)) VUINT32 C075[8][2];
__declspec(align(64)) VUINT32 poly_coeff9[8][2];
__declspec(align(64)) VUINT32 poly_coeff8[8][2];
__declspec(align(64)) VUINT32 poly_coeff7[8][2];
__declspec(align(64)) VUINT32 poly_coeff6[8][2];
__declspec(align(64)) VUINT32 poly_coeff5[8][2];
__declspec(align(64)) VUINT32 poly_coeff4[8][2];
__declspec(align(64)) VUINT32 poly_coeff3[8][2];
__declspec(align(64)) VUINT32 poly_coeff2[8][2];
__declspec(align(64)) VUINT32 poly_coeff1[8][2];
__declspec(align(64)) VUINT32 L2[8][2];
} __svml_dlog10_data_internal_avx512;
#endif
__svml_dlog10_data_internal_avx512:
/*== Log_tbl ==  tabulated -log10(Rcp) for the 16 possible rounded
   reciprocals (entry 0: Rcp == 1.0); see the algorithm header.  */
.quad 0x0000000000000000
.quad 0xbf9af5f92b00e610
.quad 0xbfaa30a9d609efea
.quad 0xbfb31b3055c47118
.quad 0xbfb8cf183886480d
.quad 0xbfbe3bc1ab0e19fe
.quad 0xbfc1b3e71ec94f7b
.quad 0xbfc42c7e7fe3fc02
.quad 0x3fbffbfc2bbc7803
.quad 0x3fbb721cd17157e3
.quad 0x3fb715d0ce367afc
.quad 0x3fb2e3a740b7800f
.quad 0x3fadb11ed766abf4
.quad 0x3fa5e3966b7e9295
.quad 0x3f9cb38fccd8bfdb
.quad 0x3f8c3d0837784c41
/*== One ==  1.0 */
.align 64
.quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
/*== 0.75 ==  threshold for the exponent correction in the kernel */
.align 64
.quad 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000
/*== poly_coeff9 ==  degree-9..1 coefficients of log10(1+R); each is
   close to the Taylor value (-1)^(k+1)*log10(e)/k, minimax-tuned.  */
.align 64
.quad 0x3fa8c2d828480370, 0x3fa8c2d828480370, 0x3fa8c2d828480370, 0x3fa8c2d828480370, 0x3fa8c2d828480370, 0x3fa8c2d828480370, 0x3fa8c2d828480370, 0x3fa8c2d828480370
/*== poly_coeff8 ==*/
.align 64
.quad 0xbfabd80d96029814, 0xbfabd80d96029814, 0xbfabd80d96029814, 0xbfabd80d96029814, 0xbfabd80d96029814, 0xbfabd80d96029814, 0xbfabd80d96029814, 0xbfabd80d96029814
/*== poly_coeff7 ==*/
.align 64
.quad 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2
/*== poly_coeff6 ==*/
.align 64
.quad 0xbfb287a63464dc80, 0xbfb287a63464dc80, 0xbfb287a63464dc80, 0xbfb287a63464dc80, 0xbfb287a63464dc80, 0xbfb287a63464dc80, 0xbfb287a63464dc80, 0xbfb287a63464dc80
/*== poly_coeff5 ==*/
.align 64
.quad 0x3fb63c62777f27d9, 0x3fb63c62777f27d9, 0x3fb63c62777f27d9, 0x3fb63c62777f27d9, 0x3fb63c62777f27d9, 0x3fb63c62777f27d9, 0x3fb63c62777f27d9, 0x3fb63c62777f27d9
/*== poly_coeff4 ==*/
.align 64
.quad 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3
/*== poly_coeff3 ==*/
.align 64
.quad 0x3fc287a7636f428c, 0x3fc287a7636f428c, 0x3fc287a7636f428c, 0x3fc287a7636f428c, 0x3fc287a7636f428c, 0x3fc287a7636f428c, 0x3fc287a7636f428c, 0x3fc287a7636f428c
/*== poly_coeff2 ==*/
.align 64
.quad 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db
/*== poly_coeff1 ==  ~ log10(e) = 0.4342944819...  */
.align 64
.quad 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e
/*== L2 ==  log10(2) = 0.3010299957...  */
.align 64
.quad 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff
.align 64
.type __svml_dlog10_data_internal_avx512,@object
.size __svml_dlog10_data_internal_avx512,.-__svml_dlog10_data_internal_avx512