1 /* Function log2f vectorized with SSE4.
2 Copyright (C) 2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 /* ALGORITHM DESCRIPTION:
22 * Get short reciprocal approximation Rcp ~ 1/mantissa(x), and let R = Rcp*x - 1.0
24 * log2(x) = k - log2(Rcp) + poly_approximation(R)
25 * log2(Rcp) is tabulated */
30 /* Offsets for data table __svml_slog2_data_internal */
35 #define iOffExpoMask 48
42 .section .text.sse4,"ax",@progbits
43 ENTRY(_ZGVbN4v_log2f_sse4)
45 cfi_def_cfa_offset(80)
48 /* reduction: compute r,n */
/* Entry point of the SSE4 variant of vectorized log2f (see the file
   header).  NOTE(review): this listing is an excerpt.  Instructions
   between the numbered lines (stack setup, the multiplies of the
   polynomial, the mask test, the scalar call, the return) are not
   visible here, so the notes below describe only what is shown.  */
/* Load the iBrkValue and iOffExpoMask constant vectors from the data
   table into xmm2 and xmm10.  */
49 movdqu iBrkValue+__svml_slog2_data_internal(%rip), %xmm2
51 movdqu iOffExpoMask+__svml_slog2_data_internal(%rip), %xmm10
/* Start four accumulators from every second 16-byte row of sPoly.
   Assuming sPoly is the offset of the coeff9 row of the table, these
   are coeff9, coeff7, coeff5 and coeff3 -- TODO confirm against the
   sPoly offset definition, which is not part of this excerpt.  */
57 movups sPoly+__svml_slog2_data_internal(%rip), %xmm5
58 movups sPoly+32+__svml_slog2_data_internal(%rip), %xmm6
59 movups sPoly+64+__svml_slog2_data_internal(%rip), %xmm7
60 movups sPoly+96+__svml_slog2_data_internal(%rip), %xmm9
/* Per-lane masks: xmm4 = (xmm4 < MinNorm) and
   xmm3 = not (xmm3 <= MaxNorm).  The second predicate is also true
   for unordered lanes (NaN).  How xmm3 and xmm4 were filled is not
   visible here.  */
61 cmpltps MinNorm+__svml_slog2_data_internal(%rip), %xmm4
62 cmpnleps MaxNorm+__svml_slog2_data_internal(%rip), %xmm3
/* Subtract the One row (SP 1.0) from every lane of xmm10.  */
64 subps One+__svml_slog2_data_internal(%rip), %xmm10
/* Add the remaining odd rows of sPoly (coeff8, coeff6, coeff4 and
   coeff2 under the same assumption) to the four accumulators.  */
69 addps sPoly+16+__svml_slog2_data_internal(%rip), %xmm5
71 addps sPoly+48+__svml_slog2_data_internal(%rip), %xmm6
74 addps sPoly+80+__svml_slog2_data_internal(%rip), %xmm7
75 addps sPoly+112+__svml_slog2_data_internal(%rip), %xmm9
80 /* combine and get argument value range mask */
/* Add the last sPoly row (coeff1 under the same assumption) to xmm9.  */
86 addps sPoly+128+__svml_slog2_data_internal(%rip), %xmm9
91 /* Go to special inputs processing branch */
/* The flags tested by this branch are set by an instruction that is
   not part of this excerpt.  */
92 jne L(SPECIAL_VALUES_BRANCH)
93 # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm1
96 * and exit the function
/* CFA offset annotations: 8, then back to 80.  Presumably they bracket
   the stack release and return of the fast path -- TODO confirm.  */
102 cfi_def_cfa_offset(8)
104 cfi_def_cfa_offset(80)
110 L(SPECIAL_VALUES_BRANCH):
/* Spill xmm0 to 32(%rsp) and xmm1 to 48(%rsp).  The code further down
   reads scalars from the first area and writes scalars into the
   second, so these look like the input vector and the result vector
   -- confirm against the full file.  */
111 movups %xmm0, 32(%rsp)
112 movups %xmm1, 48(%rsp)
113 # LOE rbx rbp r12 r13 r14 r15 edx
124 # LOE rbx rbp r15 r12d r13d
133 /* Call scalar math function */
/* The carry flag tested here is produced by an instruction that is
   not part of this excerpt.  */
134 jc L(SCALAR_MATH_CALL)
135 # LOE rbx rbp r15 r12d r13d
141 L(SPECIAL_VALUES_LOOP):
145 /* Check bits in range mask */
/* Neither the comparison feeding this branch nor the
   L(RANGEMASK_CHECK) label is visible in this excerpt.  */
146 jl L(RANGEMASK_CHECK)
147 # LOE rbx rbp r15 r12d r13d
/* Reload xmm1 from the 48(%rsp) area, which the scalar path below
   stores into.  */
155 movups 48(%rsp), %xmm1
162 # LOE rbx rbp r12 r13 r14 r15 xmm1
/* Scalar path: load the 4-byte element with index r14 from the
   32(%rsp) area into xmm0, then store xmm0 to the element with the
   same index in the 48(%rsp) area.  The L(SCALAR_MATH_CALL) label and
   the call itself are not visible in this excerpt.  */
164 /* Scalar math function call
165 * to process special input
170 movss 32(%rsp,%r14,4), %xmm0
172 # LOE rbx rbp r14 r15 r12d r13d xmm0
174 movss %xmm0, 48(%rsp,%r14,4)
176 /* Process special inputs in loop */
177 jmp L(SPECIAL_VALUES_LOOP)
178 # LOE rbx rbp r15 r12d r13d
179 END(_ZGVbN4v_log2f_sse4)
/* Read-only constant table used by the log2f kernel.  Every row is
   four identical 32-bit words (16 bytes), i.e. one value broadcast to
   all four SSE lanes.  */
181 .section .rodata, "a"
/* C-style description of the table layout, guarded by
   __svml_slog2_data_internal_typedef.  The field order matches the
   row order of the data below.  NOTE(review): the struct opener and
   the closing of this conditional are not visible in this excerpt.  */
184 #ifdef __svml_slog2_data_internal_typedef
185 typedef unsigned int VUINT32;
187 __declspec(align(16)) VUINT32 MinNorm[4][1];
188 __declspec(align(16)) VUINT32 MaxNorm[4][1];
189 __declspec(align(16)) VUINT32 iBrkValue[4][1];
190 __declspec(align(16)) VUINT32 iOffExpoMask[4][1];
191 __declspec(align(16)) VUINT32 One[4][1];
192 __declspec(align(16)) VUINT32 sPoly[9][4][1];
193 } __svml_slog2_data_internal;
195 __svml_slog2_data_internal:
/*== MinNorm = smallest positive normal SP value ==*/
197 .long 0x00800000, 0x00800000, 0x00800000, 0x00800000
/*== MaxNorm = largest finite SP value ==*/
200 .long 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff
201 /*== iBrkValue = SP 2/3 ==*/
203 .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
204 /*== iOffExpoMask = SP significand mask ==*/
/* This is the fourth row, i.e. byte offset 48, matching the
   iOffExpoMask offset define near the top of the file.  */
206 .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
207 /*== One = SP 1.0 ==*/
209 .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
/*== sPoly = nine polynomial coefficient rows, highest degree first ==*/
212 .long 0x3e554012, 0x3e554012, 0x3e554012, 0x3e554012 /* coeff9 */
213 .long 0xbe638E14, 0xbe638E14, 0xbe638E14, 0xbe638E14 /* coeff8 */
214 .long 0x3e4D660B, 0x3e4D660B, 0x3e4D660B, 0x3e4D660B /* coeff7 */
215 .long 0xbe727824, 0xbe727824, 0xbe727824, 0xbe727824 /* coeff6 */
216 .long 0x3e93DD07, 0x3e93DD07, 0x3e93DD07, 0x3e93DD07 /* coeff5 */
217 .long 0xbeB8B969, 0xbeB8B969, 0xbeB8B969, 0xbeB8B969 /* coeff4 */
218 .long 0x3eF637C0, 0x3eF637C0, 0x3eF637C0, 0x3eF637C0 /* coeff3 */
219 .long 0xbf38AA2B, 0xbf38AA2B, 0xbf38AA2B, 0xbf38AA2B /* coeff2 */
220 .long 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B /* coeff1 */
/* Mark the table as a data object and record its size (distance from
   the label to this point).  */
222 .type __svml_slog2_data_internal,@object
223 .size __svml_slog2_data_internal,.-__svml_slog2_data_internal