1 /* Function asinf vectorized with SSE4.
2 Copyright (C) 2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 * ALGORITHM DESCRIPTION:
22 * SelMask = (|x| >= 0.5) ? 1 : 0;
23 * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
24 * asin(x) = (SelMask ? (Pi/2 - 2*Poly(R)) : Poly(R))*(-1)^sign(x)
29 /* Offsets for data table __svml_sasin_data_internal
37 #define poly_coeff 112
43 .section .text.sse4,"ax",@progbits
/* _ZGVbN4v_asinf_sse4: SSE4 variant of the vector asinf kernel.
   Constants are fetched RIP-relative from __svml_sasin_data_internal.
   Inputs flagged as special leave the main path through
   L(SPECIAL_VALUES_BRANCH), where lanes are patched one at a time
   through stack buffers at 32(%rsp) and 48(%rsp).  */
44 ENTRY(_ZGVbN4v_asinf_sse4)
46 cfi_def_cfa_offset(80)
/* First table row (offset 0, AbsMask in the layout typedef) -> xmm1.  */
48 movups __svml_sasin_data_internal(%rip), %xmm1
49 movups OneHalf+__svml_sasin_data_internal(%rip), %xmm5
/* Per-lane mask: xmm15 = !(xmm15 < OneHalf).  This matches SelMask in the
   algorithm description, assuming xmm15 holds |x| at this point.  */
65 cmpnltps %xmm5, %xmm15
/* Per-lane mask: xmm9 = (xmm9 < SmallNorm).  */
72 cmpltps SmallNorm+__svml_sasin_data_internal(%rip), %xmm9
/* Polynomial coefficients are split over two accumulators: table rows
   poly_coeff+0 and +16 feed xmm11, rows +32, +48 and +64 feed xmm12.  */
85 movups poly_coeff+__svml_sasin_data_internal(%rip), %xmm11
87 subps Two+__svml_sasin_data_internal(%rip), %xmm8
88 movups poly_coeff+32+__svml_sasin_data_internal(%rip), %xmm12
90 addps poly_coeff+16+__svml_sasin_data_internal(%rip), %xmm11
92 addps poly_coeff+48+__svml_sasin_data_internal(%rip), %xmm12
/* Both sqrt_coeff rows (+0 and +16) go through xmm13.  */
93 movups sqrt_coeff+__svml_sasin_data_internal(%rip), %xmm13
98 addps sqrt_coeff+16+__svml_sasin_data_internal(%rip), %xmm13
99 addps poly_coeff+64+__svml_sasin_data_internal(%rip), %xmm12
106 movups One+__svml_sasin_data_internal(%rip), %xmm4
109 movups Pi2H+__svml_sasin_data_internal(%rip), %xmm0
116 /* Go to special inputs processing branch */
117 jne L(SPECIAL_VALUES_BRANCH)
118 # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm2
121 * and exit the function
126 cfi_def_cfa_offset(8)
128 cfi_def_cfa_offset(80)
134 L(SPECIAL_VALUES_BRANCH):
/* Spill the two live vectors: xmm2 to 32(%rsp) and xmm0 to 48(%rsp).
   The loop below reads scalars from the 32(%rsp) buffer and writes
   scalars into the 48(%rsp) buffer.  */
135 movups %xmm2, 32(%rsp)
136 movups %xmm0, 48(%rsp)
137 # LOE rbx rbp r12 r13 r14 r15 edx
148 # LOE rbx rbp r15 r12d r13d
157 /* Call scalar math function */
158 jc L(SCALAR_MATH_CALL)
159 # LOE rbx rbp r15 r12d r13d
165 L(SPECIAL_VALUES_LOOP):
169 /* Check bits in range mask */
170 jl L(RANGEMASK_CHECK)
171 # LOE rbx rbp r15 r12d r13d
/* Reload the result vector from the 48(%rsp) buffer, which now includes
   any lanes rewritten by the scalar path.  */
179 movups 48(%rsp), %xmm0
186 # LOE rbx rbp r12 r13 r14 r15 xmm0
/* Scalar fallback for one lane: r14 is the lane index (scaled by 4 bytes).
   The scalar is read from 32(%rsp,%r14,4) and the value left in xmm0 is
   stored to the same lane of the buffer at 48(%rsp).  */
188 /* Scalar math function call
189 * to process special input
194 movss 32(%rsp,%r14,4), %xmm0
196 # LOE rbx rbp r14 r15 r12d r13d xmm0
198 movss %xmm0, 48(%rsp,%r14,4)
200 /* Process special inputs in loop */
201 jmp L(SPECIAL_VALUES_LOOP)
202 # LOE rbx rbp r15 r12d r13d
203 END(_ZGVbN4v_asinf_sse4)
205 .section .rodata, "a"
/* Constant table for the vector asinf kernel.  Every row is four identical
   32-bit words (16 bytes), one per lane of an XMM register.  Rows appear in
   the same order as the fields of the layout typedef below.  */
208 #ifdef __svml_sasin_data_internal_typedef
209 typedef unsigned int VUINT32;
211 __declspec(align(16)) VUINT32 AbsMask[4][1];
212 __declspec(align(16)) VUINT32 OneHalf[4][1];
213 __declspec(align(16)) VUINT32 SmallNorm[4][1];
214 __declspec(align(16)) VUINT32 One[4][1];
215 __declspec(align(16)) VUINT32 Two[4][1];
216 __declspec(align(16)) VUINT32 sqrt_coeff[2][4][1];
217 __declspec(align(16)) VUINT32 poly_coeff[5][4][1];
218 __declspec(align(16)) VUINT32 Pi2H[4][1];
219 } __svml_sasin_data_internal;
221 __svml_sasin_data_internal:
/*== AbsMask (offset 0): every bit set except the sign bit ==*/
223 .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
/*== OneHalf (offset 16): 0.5f ==*/
226 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
/*== SmallNorm (offset 32): 2^-32 as a single-precision value ==*/
229 .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
/*== One (offset 48): 1.0f ==*/
232 .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
/*== Two (offset 64): 2.0f ==*/
235 .long 0x40000000, 0x40000000, 0x40000000, 0x40000000
/* sqrt_coeff starts at offset 80; rows are stored highest index first.  */
236 /*== sqrt_coeff[2] ==*/
238 .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
239 .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
/* poly_coeff starts at offset 112 (the value of the poly_coeff define);
   rows are stored from poly_coeff5 down to poly_coeff1.  */
240 /*== poly_coeff[5] ==*/
242 .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
243 .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
244 .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
245 .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
246 .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
/*== Pi2H (offset 192): pi/2 rounded to single precision ==*/
249 .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
/* Mark the symbol as a data object and record its total size.  */
251 .type __svml_sasin_data_internal,@object
252 .size __svml_sasin_data_internal,.-__svml_sasin_data_internal