]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/x86_64/fpu/multiarch/svml_s_asinf4_core_sse4.S
x86-64: Add vector asin/asinf implementation to libmvec
[thirdparty/glibc.git] / sysdeps / x86_64 / fpu / multiarch / svml_s_asinf4_core_sse4.S
1 /* Function asinf vectorized with SSE4.
2 Copyright (C) 2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19 /*
20 * ALGORITHM DESCRIPTION:
21 *
22 * SelMask = (|x| >= 0.5) ? 1 : 0;
23 * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
24 * asin(x) = (SelMask ? (Pi/2 - 2*Poly(R)) : Poly(R))*(-1)^sign(x)
25 *
26 *
27 */
28
29 /* Offsets for data table __svml_sasin_data_internal: byte offsets from the table label; each row is four .long values (16 bytes).
30 */
31 #define AbsMask 0 /* row 0 */
32 #define OneHalf 16 /* row 1 */
33 #define SmallNorm 32 /* row 2 */
34 #define One 48 /* row 3 */
35 #define Two 64 /* row 4 */
36 #define sqrt_coeff 80 /* rows 5-6: two coefficient rows (80 and 96) */
37 #define poly_coeff 112 /* rows 7-11: five coefficient rows (112 .. 176) */
38 #define Pi2H 192 /* row 12 */
39
40 #include <sysdep.h>
41
42 .text
43 .section .text.sse4,"ax",@progbits
44 ENTRY(_ZGVbN4v_asinf_sse4) /* In: xmm0 = four packed floats. Out: xmm0 = per-lane result of the algorithm described in the file header; lanes with |arg| > 1 are recomputed via scalar asinf. */
45 subq $72, %rsp /* Scratch frame, used only on the special path: r14/r13/r12 spills at 0/8/16, input copy at 32, result copy at 48. */
46 cfi_def_cfa_offset(80)
47 movaps %xmm0, %xmm2 /* xmm2 = original argument (needed later for the sign and for the scalar fallback). */
48 movups __svml_sasin_data_internal(%rip), %xmm1 /* xmm1 = AbsMask row (table offset 0). */
49 movups OneHalf+__svml_sasin_data_internal(%rip), %xmm5 /* xmm5 = OneHalf row. */
50
51 /* x = |arg| */
52 movaps %xmm1, %xmm0
53 andps %xmm2, %xmm0 /* xmm0 = x */
54
55 /* Y = 0.5 - 0.5*x */
56 movaps %xmm5, %xmm3
57 mulps %xmm0, %xmm3 /* xmm3 = 0.5*x */
58 movaps %xmm5, %xmm8
59
60 /* x^2 */
61 movaps %xmm0, %xmm14
62 movaps %xmm0, %xmm15
63 mulps %xmm0, %xmm14 /* xmm14 = x^2 */
64 subps %xmm3, %xmm8 /* xmm8 = Y */
65 cmpnltps %xmm5, %xmm15 /* xmm15 = SelMask: all-ones in lanes where x is NOT less than 0.5. */
66
67 /* SQ ~ -2*sqrt(Y) */
68 rsqrtps %xmm8, %xmm6 /* xmm6 = approximate 1/sqrt(Y) */
69 minps %xmm8, %xmm14 /* xmm14 = min(x^2, Y); for x <= 1 this is R^2 on either side of the 0.5 split. */
70 movaps %xmm8, %xmm9
71 movaps %xmm14, %xmm10
72 cmpltps SmallNorm+__svml_sasin_data_internal(%rip), %xmm9 /* xmm9 = mask of lanes with Y < SmallNorm. */
73 mulps %xmm14, %xmm10 /* xmm10 = R^4 */
74 addps %xmm8, %xmm8 /* xmm8 = 2*Y */
75 andnps %xmm6, %xmm9 /* xmm9 = rsqrt estimate, forced to 0 in lanes where Y < SmallNorm. */
76 movaps %xmm15, %xmm3
77 movaps %xmm9, %xmm7
78 andnps %xmm0, %xmm3 /* xmm3 = x in lanes outside SelMask, 0 in SelMask lanes. */
79 mulps %xmm9, %xmm7 /* xmm7 = rsqrt^2 */
80 andnps %xmm2, %xmm1 /* xmm1 = ~AbsMask & arg, i.e. only the sign bits of the original argument. */
81 mulps %xmm8, %xmm9 /* xmm9 = 2*Y*rsqrt ~ 2*sqrt(Y) */
82 mulps %xmm7, %xmm8 /* xmm8 = 2*Y*rsqrt^2 ~ 2 */
83
84 /* polynomial */
85 movups poly_coeff+__svml_sasin_data_internal(%rip), %xmm11 /* first poly row (labelled poly_coeff5 in the table) */
86 mulps %xmm14, %xmm11
87 subps Two+__svml_sasin_data_internal(%rip), %xmm8 /* xmm8 = E = 2*Y*rsqrt^2 - 2, the residual of the rsqrt estimate. */
88 movups poly_coeff+32+__svml_sasin_data_internal(%rip), %xmm12 /* third poly row (poly_coeff3) */
89 mulps %xmm14, %xmm12
90 addps poly_coeff+16+__svml_sasin_data_internal(%rip), %xmm11 /* xmm11 = poly_coeff5*R^2 + poly_coeff4 */
91 mulps %xmm10, %xmm11 /* ... times R^4 */
92 addps poly_coeff+48+__svml_sasin_data_internal(%rip), %xmm12 /* xmm12 = poly_coeff3*R^2 + poly_coeff2 */
93 movups sqrt_coeff+__svml_sasin_data_internal(%rip), %xmm13 /* first sqrt row (sqrt_coeff2) */
94 addps %xmm11, %xmm12 /* combine the high and low halves of the polynomial */
95 mulps %xmm8, %xmm13 /* xmm13 = sqrt_coeff2*E */
96 mulps %xmm9, %xmm8 /* xmm8 = E * (2*Y*rsqrt) */
97 mulps %xmm14, %xmm12
98 addps sqrt_coeff+16+__svml_sasin_data_internal(%rip), %xmm13 /* xmm13 = sqrt_coeff2*E + sqrt_coeff1 */
99 addps poly_coeff+64+__svml_sasin_data_internal(%rip), %xmm12 /* xmm12 = P(R^2): degree-4 polynomial in R^2 with constant term poly_coeff1. */
100 mulps %xmm8, %xmm13 /* xmm13 = refinement term for the sqrt estimate */
101 mulps %xmm12, %xmm14 /* xmm14 = R^2 * P(R^2) */
102 subps %xmm9, %xmm13 /* xmm13 = refinement - 2*Y*rsqrt = SQ ~ -2*sqrt(Y) */
103 andps %xmm15, %xmm13 /* keep SQ only in SelMask lanes */
104 orps %xmm13, %xmm3 /* xmm3 = SelMask ? SQ : x */
105 mulps %xmm3, %xmm14
106 movups One+__svml_sasin_data_internal(%rip), %xmm4
107 addps %xmm14, %xmm3 /* xmm3 = xmm3 * (1 + R^2*P(R^2)) evaluated as xmm3 + xmm3*R^2*P(R^2) */
108 cmpltps %xmm0, %xmm4 /* xmm4 = mask of lanes where 1 < x, i.e. |arg| > 1 */
109 movups Pi2H+__svml_sasin_data_internal(%rip), %xmm0
110 andps %xmm15, %xmm0 /* Pi2H in SelMask lanes, 0 elsewhere */
111 movmskps %xmm4, %edx /* edx = one bit per lane that needs the scalar fallback */
112 addps %xmm3, %xmm0 /* add the Pi2H term (SelMask lanes only) */
113 pxor %xmm1, %xmm0 /* XOR in the argument's sign bits */
114 testl %edx, %edx
115
116 /* Go to special inputs processing branch */
117 jne L(SPECIAL_VALUES_BRANCH)
118 # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm2
119
120 /* Restore registers
121 * and exit the function
122 */
123
124 L(EXIT):
125 addq $72, %rsp
126 cfi_def_cfa_offset(8)
127 ret
128 cfi_def_cfa_offset(80) /* code below the ret still runs with the 72-byte frame allocated */
129
130 /* Branch to process
131 * special inputs
132 */
133
134 L(SPECIAL_VALUES_BRANCH):
135 movups %xmm2, 32(%rsp) /* spill the original arguments so single lanes can be reloaded */
136 movups %xmm0, 48(%rsp) /* spill the vector result so single lanes can be overwritten */
137 # LOE rbx rbp r12 r13 r14 r15 edx
138
139 xorl %eax, %eax
140 movq %r12, 16(%rsp) /* r12/r13/r14 are saved here and restored before the jump to L(EXIT) */
141 cfi_offset(12, -64)
142 movl %eax, %r12d /* r12d = lane index, starts at 0 */
143 movq %r13, 8(%rsp)
144 cfi_offset(13, -72)
145 movl %edx, %r13d /* r13d = special-lane bit mask from movmskps */
146 movq %r14, (%rsp)
147 cfi_offset(14, -80)
148 # LOE rbx rbp r15 r12d r13d
149
150 /* Range mask
151 * bits check
152 */
153
154 L(RANGEMASK_CHECK):
155 btl %r12d, %r13d /* CF = mask bit for the current lane */
156
157 /* Call scalar math function */
158 jc L(SCALAR_MATH_CALL)
159 # LOE rbx rbp r15 r12d r13d
160
161 /* Special inputs
162 * processing loop
163 */
164
165 L(SPECIAL_VALUES_LOOP):
166 incl %r12d
167 cmpl $4, %r12d /* four lanes in total */
168
169 /* Check bits in range mask */
170 jl L(RANGEMASK_CHECK)
171 # LOE rbx rbp r15 r12d r13d
172
173 movq 16(%rsp), %r12
174 cfi_restore(12)
175 movq 8(%rsp), %r13
176 cfi_restore(13)
177 movq (%rsp), %r14
178 cfi_restore(14)
179 movups 48(%rsp), %xmm0 /* reload the result vector, now including any scalar-patched lanes */
180
181 /* Go to exit */
182 jmp L(EXIT)
183 cfi_offset(12, -64) /* the code below is reached only while r12/r13/r14 are still spilled */
184 cfi_offset(13, -72)
185 cfi_offset(14, -80)
186 # LOE rbx rbp r12 r13 r14 r15 xmm0
187
188 /* Scalar math function call
189 * to process special input
190 */
191
192 L(SCALAR_MATH_CALL):
193 movl %r12d, %r14d /* r14 = lane index, zero-extended for use in the address below */
194 movss 32(%rsp,%r14,4), %xmm0 /* load this lane's original argument */
195 call asinf@PLT
196 # LOE rbx rbp r14 r15 r12d r13d xmm0
197
198 movss %xmm0, 48(%rsp,%r14,4) /* overwrite this lane of the spilled result */
199
200 /* Process special inputs in loop */
201 jmp L(SPECIAL_VALUES_LOOP)
202 # LOE rbx rbp r15 r12d r13d
203 END(_ZGVbN4v_asinf_sse4)
204
205 .section .rodata, "a"
206 .align 16
207
208 #ifdef __svml_sasin_data_internal_typedef /* C-syntax layout description; only reaches the assembler if this macro is defined */
209 typedef unsigned int VUINT32;
210 typedef struct {
211 __declspec(align(16)) VUINT32 AbsMask[4][1];
212 __declspec(align(16)) VUINT32 OneHalf[4][1];
213 __declspec(align(16)) VUINT32 SmallNorm[4][1];
214 __declspec(align(16)) VUINT32 One[4][1];
215 __declspec(align(16)) VUINT32 Two[4][1];
216 __declspec(align(16)) VUINT32 sqrt_coeff[2][4][1];
217 __declspec(align(16)) VUINT32 poly_coeff[5][4][1];
218 __declspec(align(16)) VUINT32 Pi2H[4][1];
219 } __svml_sasin_data_internal;
220 #endif
221 __svml_sasin_data_internal: /* every row repeats one 32-bit pattern four times, one copy per vector lane */
222 /*== AbsMask: all bits except the IEEE-754 single-precision sign bit ==*/
223 .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
224 /*== OneHalf: 0.5f ==*/
225 .align 16
226 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
227 /*== SmallNorm: 2^-32 as a float ==*/
228 .align 16
229 .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
230 /*== One: 1.0f ==*/
231 .align 16
232 .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
233 /*== Two: 2.0f ==*/
234 .align 16
235 .long 0x40000000, 0x40000000, 0x40000000, 0x40000000
236 /*== sqrt_coeff[2]: stored highest index first ==*/
237 .align 16
238 .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2, ~ -0.09375 */
239 .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1, ~ 0.25 */
240 /*== poly_coeff[5]: stored highest index first ==*/
241 .align 16
242 .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
243 .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
244 .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
245 .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
246 .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1, ~ 0.16667 */
247 /*== Pi2H: ~ 1.5707964f (pi/2 rounded to single precision) ==*/
248 .align 16
249 .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
250 .align 16
251 .type __svml_sasin_data_internal,@object
252 .size __svml_sasin_data_internal,.-__svml_sasin_data_internal