/* Function log2f vectorized with SSE4.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *    Get short reciprocal approximation Rcp ~ 1/mantissa(x)
 *    R = Rcp*x - 1.0
 *    log2(x) = k - log2(Rcp) + poly_approximation(R)
 *       log2(Rcp) is tabulated
 *
 */
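
/* For reference only (this block is a comment, not part of the build): a
   scalar C sketch, under the assumption that it mirrors the reduction and
   polynomial evaluation performed by the vector code below on in-range
   inputs (out-of-range lanes fall back to scalar log2f).  Function and
   helper names are hypothetical; the vector code evaluates the same
   polynomial in a different (Estrin-like) order.

     #include <stdint.h>
     #include <string.h>

     static inline float
     asfloat (uint32_t u)
     {
       float f;
       memcpy (&f, &u, sizeof (f));
       return f;
     }

     static float
     log2f_sketch (float x)
     {
       uint32_t ix;
       memcpy (&ix, &x, sizeof (ix));
       // iBrkValue = 0x3f2aaaab (~2/3): write x = 2^k * m with m in [2/3, 4/3).
       uint32_t off = ix - 0x3f2aaaab;
       int32_t k = (int32_t) off >> 23;
       float m = asfloat ((off & 0x007fffff) + 0x3f2aaaab);
       float r = m - 1.0f;                        // r in [-1/3, 1/3)
       // Degree-9 polynomial from sPoly below; coeff1 is log2(e), so
       // r * P(r) approximates log2(1 + r).
       float p = asfloat (0x3e554012);            // coeff9
       p = p * r + asfloat (0xbe638E14);          // coeff8
       p = p * r + asfloat (0x3e4D660B);          // coeff7
       p = p * r + asfloat (0xbe727824);          // coeff6
       p = p * r + asfloat (0x3e93DD07);          // coeff5
       p = p * r + asfloat (0xbeB8B969);          // coeff4
       p = p * r + asfloat (0x3eF637C0);          // coeff3
       p = p * r + asfloat (0xbf38AA2B);          // coeff2
       p = p * r + asfloat (0x3fB8AA3B);          // coeff1 = log2(e)
       return (float) k + r * p;
     }
 */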

/* Offsets for data table __svml_slog2_data_internal
 */
#define MinNorm                         0
#define MaxNorm                         16
#define iBrkValue                       32
#define iOffExpoMask                    48
#define One                             64
#define sPoly                           80

#include <sysdep.h>

        .text
        .section .text.sse4,"ax",@progbits
ENTRY(_ZGVbN4v_log2f_sse4)
        subq    $72, %rsp
        cfi_def_cfa_offset(80)
        movaps  %xmm0, %xmm1

        /* reduction: compute r,n */
        movdqu  iBrkValue+__svml_slog2_data_internal(%rip), %xmm2
        movaps  %xmm0, %xmm4
        movdqu  iOffExpoMask+__svml_slog2_data_internal(%rip), %xmm10
        psubd   %xmm2, %xmm1
        pand    %xmm1, %xmm10
        movaps  %xmm0, %xmm3
        paddd   %xmm2, %xmm10
        psrad   $23, %xmm1
        movups  sPoly+__svml_slog2_data_internal(%rip), %xmm5
        movups  sPoly+32+__svml_slog2_data_internal(%rip), %xmm6
        movups  sPoly+64+__svml_slog2_data_internal(%rip), %xmm7
        movups  sPoly+96+__svml_slog2_data_internal(%rip), %xmm9
        cmpltps MinNorm+__svml_slog2_data_internal(%rip), %xmm4
        cmpnleps MaxNorm+__svml_slog2_data_internal(%rip), %xmm3
        cvtdq2ps %xmm1, %xmm1
        subps   One+__svml_slog2_data_internal(%rip), %xmm10
        mulps   %xmm10, %xmm5
        movaps  %xmm10, %xmm8
        mulps   %xmm10, %xmm6
        mulps   %xmm10, %xmm8
        addps   sPoly+16+__svml_slog2_data_internal(%rip), %xmm5
        mulps   %xmm10, %xmm7
        addps   sPoly+48+__svml_slog2_data_internal(%rip), %xmm6
        mulps   %xmm10, %xmm9
        mulps   %xmm8, %xmm5
        addps   sPoly+80+__svml_slog2_data_internal(%rip), %xmm7
        addps   sPoly+112+__svml_slog2_data_internal(%rip), %xmm9
        addps   %xmm5, %xmm6
        mulps   %xmm8, %xmm6
        orps    %xmm3, %xmm4

        /* combine and get argument value range mask */
        movmskps %xmm4, %edx
        addps   %xmm6, %xmm7
        mulps   %xmm7, %xmm8
        addps   %xmm8, %xmm9
        mulps   %xmm10, %xmm9
        addps   sPoly+128+__svml_slog2_data_internal(%rip), %xmm9
        mulps   %xmm9, %xmm10
        addps   %xmm10, %xmm1
        testl   %edx, %edx

        /* Go to special inputs processing branch */
        jne     L(SPECIAL_VALUES_BRANCH)
        # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm1

        /* Restore registers
         * and exit the function
         */

L(EXIT):
        movaps  %xmm1, %xmm0
        addq    $72, %rsp
        cfi_def_cfa_offset(8)
        ret
        cfi_def_cfa_offset(80)

        /* Branch to process
         * special inputs
         */

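/* Illustration only (comment, not assembled): under the assumption that the
   mask in %edx carries one bit per lane, the special-value path below behaves
   like this hypothetical C loop, with xmm0 spilled to 32(%rsp) as the input
   lanes and xmm1 to 48(%rsp) as the result lanes:

     for (int i = 0; i < 4; i++)
       if (mask & (1 << i))          // lane flagged by the range check
         result[i] = log2f (input[i]);
 */
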
L(SPECIAL_VALUES_BRANCH):
        movups  %xmm0, 32(%rsp)
        movups  %xmm1, 48(%rsp)
        # LOE rbx rbp r12 r13 r14 r15 edx

        xorl    %eax, %eax
        movq    %r12, 16(%rsp)
        cfi_offset(12, -64)
        movl    %eax, %r12d
        movq    %r13, 8(%rsp)
        cfi_offset(13, -72)
        movl    %edx, %r13d
        movq    %r14, (%rsp)
        cfi_offset(14, -80)
        # LOE rbx rbp r15 r12d r13d

        /* Range mask
         * bits check
         */

L(RANGEMASK_CHECK):
        btl     %r12d, %r13d

        /* Call scalar math function */
        jc      L(SCALAR_MATH_CALL)
        # LOE rbx rbp r15 r12d r13d

        /* Special inputs
         * processing loop
         */

L(SPECIAL_VALUES_LOOP):
        incl    %r12d
        cmpl    $4, %r12d

        /* Check bits in range mask */
        jl      L(RANGEMASK_CHECK)
        # LOE rbx rbp r15 r12d r13d

        movq    16(%rsp), %r12
        cfi_restore(12)
        movq    8(%rsp), %r13
        cfi_restore(13)
        movq    (%rsp), %r14
        cfi_restore(14)
        movups  48(%rsp), %xmm1

        /* Go to exit */
        jmp     L(EXIT)
        cfi_offset(12, -64)
        cfi_offset(13, -72)
        cfi_offset(14, -80)
        # LOE rbx rbp r12 r13 r14 r15 xmm1

        /* Scalar math function call
         * to process special input
         */

L(SCALAR_MATH_CALL):
        movl    %r12d, %r14d
        movss   32(%rsp,%r14,4), %xmm0
        call    log2f@PLT
        # LOE rbx rbp r14 r15 r12d r13d xmm0

        movss   %xmm0, 48(%rsp,%r14,4)

        /* Process special inputs in loop */
        jmp     L(SPECIAL_VALUES_LOOP)
        # LOE rbx rbp r15 r12d r13d
END(_ZGVbN4v_log2f_sse4)

        .section .rodata, "a"
        .align 16

#ifdef __svml_slog2_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
        __declspec(align(16)) VUINT32 MinNorm[4][1];
        __declspec(align(16)) VUINT32 MaxNorm[4][1];
        __declspec(align(16)) VUINT32 iBrkValue[4][1];
        __declspec(align(16)) VUINT32 iOffExpoMask[4][1];
        __declspec(align(16)) VUINT32 One[4][1];
        __declspec(align(16)) VUINT32 sPoly[9][4][1];
} __svml_slog2_data_internal;
#endif
__svml_slog2_data_internal:
        /*== MinNorm ==*/
        .long 0x00800000, 0x00800000, 0x00800000, 0x00800000
        /*== MaxNorm ==*/
        .align 16
        .long 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff
        /*== iBrkValue = SP 2/3 ==*/
        .align 16
        .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
        /*== iOffExpoMask = SP significand mask ==*/
        .align 16
        .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
        /*== sOne = SP 1.0 ==*/
        .align 16
        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
        /*== sPoly[9] ==*/
        .align 16
        .long 0x3e554012, 0x3e554012, 0x3e554012, 0x3e554012 /* coeff9 */
        .long 0xbe638E14, 0xbe638E14, 0xbe638E14, 0xbe638E14 /* coeff8 */
        .long 0x3e4D660B, 0x3e4D660B, 0x3e4D660B, 0x3e4D660B /* coeff7 */
        .long 0xbe727824, 0xbe727824, 0xbe727824, 0xbe727824 /* coeff6 */
        .long 0x3e93DD07, 0x3e93DD07, 0x3e93DD07, 0x3e93DD07 /* coeff5 */
        .long 0xbeB8B969, 0xbeB8B969, 0xbeB8B969, 0xbeB8B969 /* coeff4 */
        .long 0x3eF637C0, 0x3eF637C0, 0x3eF637C0, 0x3eF637C0 /* coeff3 */
        .long 0xbf38AA2B, 0xbf38AA2B, 0xbf38AA2B, 0xbf38AA2B /* coeff2 */
        .long 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B /* coeff1 */
        .align 16
        .type __svml_slog2_data_internal,@object
        .size __svml_slog2_data_internal,.-__svml_slog2_data_internal