1 /* Function log10f vectorized with AVX2.
2 Copyright (C) 2021-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
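20 /* ALGORITHM DESCRIPTION:
22 * Get short reciprocal approximation Rcp ~ 1/mantissa(x)
24 * log10(x) = k*log10(2.0) - log10(Rcp) + poly_approximation(R)
25 * log10(Rcp) is tabulated
 *
 * NOTE(review): the instructions visible in this file do not compute a
 * reciprocal or read a log10(Rcp) table.  They reduce the argument around
 * the iBrkValue constant and evaluate a polynomial from sPoly.  Confirm
 * whether this description was carried over from a table-based variant.
 */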
30 /* Offsets for data table __svml_slog10_data_internal */
37 #define iOffExpoMask 160
45 .section .text.avx2,"ax",@progbits
46 ENTRY(_ZGVdN8v_log10f_avx2)
48 cfi_def_cfa_offset(16)
55 /* reduction: compute r,n */
/* Main path overview, as read from the instructions below.  The input is
 * the packed single-precision vector in ymm0 (called x here).
 *   d      = bits(x) - iBrkValue                  (32-bit integer lanes)
 *   n      = d >> 23                              (arithmetic shift, then
 *                                                  converted to float)
 *   r      = asfloat((d & iOffExpoMask) + iBrkValue) - One
 *   result = n*L2H + (r*P(r) + n*L2L)             (left in ymm1)
 * P is the polynomial whose nine coefficients are the 32-byte rows of
 * sPoly, coeff9 first and coeff1 last.  Lanes with x < MinNorm, or for
 * which x <= MaxNorm does not hold (this includes NaN), are flagged in
 * ymm7 for the special-values path.  */
/* ymm4 = iBrkValue (described in the data table as SP 2/3).  */
56 vmovups iBrkValue+__svml_slog10_data_internal(%rip), %ymm4
/* Preload sPoly rows 0, 2, 4 and 6 (coeff9, coeff7, coeff5, coeff3).
 * Each one seeds an independent two-term Horner step below.  */
57 vmovups sPoly+__svml_slog10_data_internal(%rip), %ymm15
58 vmovups sPoly+64+__svml_slog10_data_internal(%rip), %ymm9
59 vmovups sPoly+128+__svml_slog10_data_internal(%rip), %ymm10
60 vmovups sPoly+192+__svml_slog10_data_internal(%rip), %ymm12
/* ymm1 = d = bits(x) - iBrkValue, as packed 32-bit integers.  */
61 vpsubd %ymm4, %ymm0, %ymm1
/* ymm5 = lanes with x < MinNorm (ordered compare, false for NaN).
 * ymm6 = lanes where x <= MaxNorm does not hold (unordered compare,
 * true for NaN).  */
62 vcmplt_oqps MinNorm+__svml_slog10_data_internal(%rip), %ymm0, %ymm5
63 vcmpnle_uqps MaxNorm+__svml_slog10_data_internal(%rip), %ymm0, %ymm6
/* ymm3 = d masked with iOffExpoMask (the significand field).  */
64 vpand iOffExpoMask+__svml_slog10_data_internal(%rip), %ymm1, %ymm3
/* ymm2 = n = d shifted right arithmetically by 23 (signed integer).  */
65 vpsrad $23, %ymm1, %ymm2
/* ymm8 = bit pattern of the reduced argument (masked d + iBrkValue).  */
66 vpaddd %ymm4, %ymm3, %ymm8
/* ymm1 = n converted to single precision.  */
67 vcvtdq2ps %ymm2, %ymm1
/* ymm13 = r = reduced argument - One.  */
68 vsubps One+__svml_slog10_data_internal(%rip), %ymm8, %ymm13
/* ymm14 = n * L2L.  */
69 vmulps L2L+__svml_slog10_data_internal(%rip), %ymm1, %ymm14
/* Four independent pairs, interleaved with ymm11 = r*r:
 *   ymm15 = coeff9*r + coeff8      ymm9  = coeff7*r + coeff6
 *   ymm10 = coeff5*r + coeff4      ymm12 = coeff3*r + coeff2  */
70 vfmadd213ps sPoly+32+__svml_slog10_data_internal(%rip), %ymm13, %ymm15
71 vfmadd213ps sPoly+96+__svml_slog10_data_internal(%rip), %ymm13, %ymm9
72 vmulps %ymm13, %ymm13, %ymm11
73 vfmadd213ps sPoly+160+__svml_slog10_data_internal(%rip), %ymm13, %ymm10
74 vfmadd213ps sPoly+224+__svml_slog10_data_internal(%rip), %ymm13, %ymm12
/* Fold the pairs with r*r:
 *   ymm15 = ((ymm15*r2 + ymm9)*r2 + ymm10)*r2 + ymm12  */
75 vfmadd213ps %ymm9, %ymm11, %ymm15
76 vfmadd213ps %ymm10, %ymm11, %ymm15
77 vfmadd213ps %ymm12, %ymm11, %ymm15
/* ymm15 = ymm15*r + coeff1, which completes P(r).  */
78 vfmadd213ps sPoly+256+__svml_slog10_data_internal(%rip), %ymm13, %ymm15
/* ymm15 = P(r)*r + n*L2L.  */
79 vfmadd213ps %ymm14, %ymm13, %ymm15
/* ymm7 = ymm5 | ymm6, the per-lane mask of special inputs.  */
80 vorps %ymm6, %ymm5, %ymm7
82 /* combine and get argument value range mask */
/* ymm1 = n*L2H + ymm15, the main-path result for every lane.  */
84 vfmadd132ps L2H+__svml_slog10_data_internal(%rip), %ymm15, %ymm1
/* NOTE(review): the instructions that set the flags tested by the jne
 * below are not part of this excerpt.  The LOE note lists edx as live,
 * presumably holding the bits of the ymm7 mask - confirm.  */
87 /* Go to special inputs processing branch */
88 jne L(SPECIAL_VALUES_BRANCH)
89 # LOE rbx r12 r13 r14 r15 edx ymm0 ymm1
92 * and exit the function
109 L(SPECIAL_VALUES_BRANCH):
/* Spill the input vector (ymm0) to 32(%rsp) and the main-path result
 * (ymm1) to 64(%rsp) so that single lanes can be addressed as
 * 32(%rsp,i,4) and 64(%rsp,i,4) further down.  */
110 vmovups %ymm0, 32(%rsp)
111 vmovups %ymm1, 64(%rsp)
112 # LOE rbx r12 r13 r14 r15 edx ymm1
115 # LOE rbx r12 r13 r14 r15 eax edx
/* The .cfi_escape lines below encode the DW_CFA_expression spelled out
 * in the comment that precedes each of them (unwind locations for r12,
 * r13 and r14).  */
119 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
120 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
123 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
124 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
127 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
128 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
129 # LOE rbx r15 r12d r13d
/* NOTE(review): neither the instruction that sets the carry flag tested
 * below nor the L(SCALAR_MATH_CALL) label is part of this excerpt.  */
138 /* Call scalar math function */
139 jc L(SCALAR_MATH_CALL)
140 # LOE rbx r15 r12d r13d
146 L(SPECIAL_VALUES_LOOP):
/* NOTE(review): the compare feeding the jl below and the
 * L(RANGEMASK_CHECK) label are not part of this excerpt.  */
150 /* Check bits in range mask */
151 jl L(RANGEMASK_CHECK)
152 # LOE rbx r15 r12d r13d
/* Reload the result vector from its stack slot.  It now includes any
 * lanes that were overwritten there by the scalar store further down.  */
160 vmovups 64(%rsp), %ymm1
164 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
165 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
166 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
167 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
168 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
169 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
170 # LOE rbx r12 r13 r14 r15 ymm1
172 /* Scalar math function call
173 * to process special input
 * The load that follows reads lane r14 of the saved input vector into
 * xmm0, and the store after it writes xmm0 into lane r14 of the saved
 * result vector.
 * NOTE(review): the call instruction itself is not part of this excerpt.
178 movss 32(%rsp,%r14,4), %xmm0
180 # LOE rbx r14 r15 r12d r13d xmm0
182 movss %xmm0, 64(%rsp,%r14,4)
184 /* Process special inputs in loop */
185 jmp L(SPECIAL_VALUES_LOOP)
186 # LOE rbx r15 r12d r13d
187 END(_ZGVdN8v_log10f_avx2)
189 .section .rodata, "a"
/* Constant table read by the code as NAME+__svml_slog10_data_internal(%rip).
 * Every field is one 32-byte row holding the same 32-bit pattern in all
 * eight lanes, except sPoly, which is nine such rows.  The rows follow the
 * order of the layout sketch below, for example iOffExpoMask is the sixth
 * row, which matches its offset of 160.  */
192 #ifdef __svml_slog10_data_internal_typedef
193 typedef unsigned int VUINT32;
195 __declspec(align(32)) VUINT32 MinNorm[8][1];
196 __declspec(align(32)) VUINT32 MaxNorm[8][1];
197 __declspec(align(32)) VUINT32 L2H[8][1];
198 __declspec(align(32)) VUINT32 L2L[8][1];
199 __declspec(align(32)) VUINT32 iBrkValue[8][1];
200 __declspec(align(32)) VUINT32 iOffExpoMask[8][1];
201 __declspec(align(32)) VUINT32 One[8][1];
202 __declspec(align(32)) VUINT32 sPoly[9][8][1];
203 __declspec(align(32)) VUINT32 L2[8][1];
204 } __svml_slog10_data_internal;
206 __svml_slog10_data_internal:
/*== MinNorm = 0x00800000, the smallest positive normal SP value ==*/
208 .long 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000
/*== MaxNorm = 0x7f7fffff, the largest finite SP value ==*/
211 .long 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff
/*== L2H = multiplier of n in the final fused multiply-add.  L2H + L2L is
     approximately log10(2), presumably a high/low split - confirm ==*/
214 .long 0x3e9a2100, 0x3e9a2100, 0x3e9a2100, 0x3e9a2100, 0x3e9a2100, 0x3e9a2100, 0x3e9a2100, 0x3e9a2100
/*== L2L = small negative correction term, also multiplied by n ==*/
217 .long 0xb64AF600, 0xb64AF600, 0xb64AF600, 0xb64AF600, 0xb64AF600, 0xb64AF600, 0xb64AF600, 0xb64AF600
218 /*== iBrkValue = SP 2/3 ==*/
220 .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
221 /*== iOffExpoMask = SP significand mask ==*/
223 .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
224 /*== sOne = SP 1.0 ==*/
226 .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
/*== sPoly = nine polynomial coefficient rows, highest order first.  The
     code addresses row k as sPoly+32*k ==*/
229 .long 0x3d8063B4, 0x3d8063B4, 0x3d8063B4, 0x3d8063B4, 0x3d8063B4, 0x3d8063B4, 0x3d8063B4, 0x3d8063B4 /* coeff9 */
230 .long 0xbd890073, 0xbd890073, 0xbd890073, 0xbd890073, 0xbd890073, 0xbd890073, 0xbd890073, 0xbd890073 /* coeff8 */
231 .long 0x3d775317, 0x3d775317, 0x3d775317, 0x3d775317, 0x3d775317, 0x3d775317, 0x3d775317, 0x3d775317 /* coeff7 */
232 .long 0xbd91FB27, 0xbd91FB27, 0xbd91FB27, 0xbd91FB27, 0xbd91FB27, 0xbd91FB27, 0xbd91FB27, 0xbd91FB27 /* coeff6 */
233 .long 0x3dB20B96, 0x3dB20B96, 0x3dB20B96, 0x3dB20B96, 0x3dB20B96, 0x3dB20B96, 0x3dB20B96, 0x3dB20B96 /* coeff5 */
234 .long 0xbdDE6E20, 0xbdDE6E20, 0xbdDE6E20, 0xbdDE6E20, 0xbdDE6E20, 0xbdDE6E20, 0xbdDE6E20, 0xbdDE6E20 /* coeff4 */
235 .long 0x3e143CE5, 0x3e143CE5, 0x3e143CE5, 0x3e143CE5, 0x3e143CE5, 0x3e143CE5, 0x3e143CE5, 0x3e143CE5 /* coeff3 */
236 .long 0xbe5E5BC5, 0xbe5E5BC5, 0xbe5E5BC5, 0xbe5E5BC5, 0xbe5E5BC5, 0xbe5E5BC5, 0xbe5E5BC5, 0xbe5E5BC5 /* coeff2 */
237 .long 0x3eDE5BD9, 0x3eDE5BD9, 0x3eDE5BD9, 0x3eDE5BD9, 0x3eDE5BD9, 0x3eDE5BD9, 0x3eDE5BD9, 0x3eDE5BD9 /* coeff1 */
/*== L2 = 0x3e9a209b, SP log10(2), about 0.30103.  Not referenced by the
     instructions visible in this file ==*/
240 .long 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b
242 .type __svml_slog10_data_internal,@object
243 .size __svml_slog10_data_internal,.-__svml_slog10_data_internal