1 /* Function asinf vectorized with SSE4.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 * ALGORITHM DESCRIPTION:
22 * SelMask = (|x| >= 0.5) ? 1 : 0;
23 * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
24 * asin(x) = (SelMask ? (Pi/2 - 2*Poly(R)) : Poly(R))*(-1)^sign(x)
/* poly_coeff must equal the byte offset of the first polynomial
   coefficient vector inside __svml_sasin_data_internal.  The table starts
   with AbsMask, OneHalf, SmallNorm, One and Two (one 16-byte vector each)
   followed by the two sqrt_coeff vectors, hence 7 * 16 = 112.  Keep this
   value in sync with the table if entries are added or reordered.  */
29 /* Offsets for data table __svml_sasin_data_internal
37 #define poly_coeff 112
/* _ZGVbN4v_asinf_sse4: SSE4 vector variant of asinf (four floats per
   xmm register).

   Scheme, as stated in the algorithm description at the top of the file:
     SelMask = (|x| >= 0.5)
     R       = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
     asin(x) = (SelMask ? Pi/2 - 2*Poly(R) : Poly(R)) * (-1)^sign(x)

   Constants are fetched RIP-relative from __svml_sasin_data_internal;
   <Name>+__svml_sasin_data_internal addresses the 16-byte vector located
   at byte offset <Name> of that table.

   Register and stack roles that can be read off the code below:
     xmm0       result vector.  On the special-value path it is spilled to
                48(%rsp), patched one lane at a time, and reloaded.
     xmm2       spilled to 32(%rsp) and re-read one float at a time as the
                argument of the scalar fallback.
                NOTE(review): presumably the unmodified input - confirm.
     r14        lane index; scaled by 4 to address single floats inside
                the 32(%rsp) and 48(%rsp) spill slots.
     edx, r12d, r13d
                kept live around the special-value handling according to
                the LOE annotations.
                NOTE(review): presumably lane mask / loop counter - confirm.

   Special-value handling: a lane that needs it is loaded as a scalar from
   32(%rsp, %r14, 4) into xmm0, handed to the scalar math function, and the
   value left in xmm0 is stored to 48(%rsp, %r14, 4); the loop then
   continues at L(SPECIAL_VALUES_LOOP).  */
42 .section .text.sse4, "ax", @progbits
43 ENTRY(_ZGVbN4v_asinf_sse4)
45 cfi_def_cfa_offset(80)	/* unwind info: CFA is at offset 80 from here on */
47 movups __svml_sasin_data_internal(%rip), %xmm1	/* xmm1 = vector at table offset 0 (AbsMask per the layout typedef) */
48 movups OneHalf+__svml_sasin_data_internal(%rip), %xmm5	/* xmm5 = OneHalf vector (0.5f in every lane) */
64 cmpnltps %xmm5, %xmm15	/* per lane: xmm15 = all-ones where NOT (xmm15 < xmm5) */
71 cmpltps SmallNorm+__svml_sasin_data_internal(%rip), %xmm9	/* per lane: xmm9 = all-ones where xmm9 < SmallNorm */
84 movups poly_coeff+__svml_sasin_data_internal(%rip), %xmm11	/* xmm11 chain starts from poly_coeff5 */
86 subps Two+__svml_sasin_data_internal(%rip), %xmm8	/* xmm8 -= 2.0f in every lane */
87 movups poly_coeff+32+__svml_sasin_data_internal(%rip), %xmm12	/* xmm12 chain starts from poly_coeff3 */
89 addps poly_coeff+16+__svml_sasin_data_internal(%rip), %xmm11	/* xmm11 += poly_coeff4 */
91 addps poly_coeff+48+__svml_sasin_data_internal(%rip), %xmm12	/* xmm12 += poly_coeff2 */
92 movups sqrt_coeff+__svml_sasin_data_internal(%rip), %xmm13	/* xmm13 = sqrt_coeff2 */
97 addps sqrt_coeff+16+__svml_sasin_data_internal(%rip), %xmm13	/* xmm13 += sqrt_coeff1 */
98 addps poly_coeff+64+__svml_sasin_data_internal(%rip), %xmm12	/* xmm12 += poly_coeff1 */
105 movups One+__svml_sasin_data_internal(%rip), %xmm4	/* xmm4 = 1.0f in every lane */
108 movups Pi2H+__svml_sasin_data_internal(%rip), %xmm0	/* xmm0 = Pi2H vector (pi/2 rounded to float) */
115 /* Go to special inputs processing branch */
116 jne L(SPECIAL_VALUES_BRANCH)
117 # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm2
120 * and exit the function
125 cfi_def_cfa_offset(8)	/* unwind info: CFA offset 8 (presumably covers the return itself - confirm) */
127 cfi_def_cfa_offset(80)	/* unwind info: the code that follows runs with CFA offset 80 again */
133 L(SPECIAL_VALUES_BRANCH):
134 movups %xmm2, 32(%rsp)	/* spill xmm2; read back lane by lane as the scalar argument */
135 movups %xmm0, 48(%rsp)	/* spill the vector result; individual lanes are overwritten below */
136 # LOE rbx rbp r12 r13 r14 r15 edx
147 # LOE rbx rbp r15 r12d r13d
156 /* Call scalar math function */
157 jc L(SCALAR_MATH_CALL)	/* taken when the carry flag is set */
158 # LOE rbx rbp r15 r12d r13d
164 L(SPECIAL_VALUES_LOOP):
168 /* Check bits in range mask */
169 jl L(RANGEMASK_CHECK)	/* signed less-than: keep iterating */
170 # LOE rbx rbp r15 r12d r13d
178 movups 48(%rsp), %xmm0	/* reload the result vector from its spill slot */
185 # LOE rbx rbp r12 r13 r14 r15 xmm0
187 /* Scalar math function call
188 * to process special input
193 movss 32(%rsp, %r14, 4), %xmm0
195 # LOE rbx rbp r14 r15 r12d r13d xmm0
197 movss %xmm0, 48(%rsp, %r14, 4)
199 /* Process special inputs in loop */
200 jmp L(SPECIAL_VALUES_LOOP)
201 # LOE rbx rbp r15 r12d r13d
202 END(_ZGVbN4v_asinf_sse4)
204 .section .rodata, "a"
/* C-style description of the constant table layout.  It is only seen by
   the preprocessor when __svml_sasin_data_internal_typedef is defined.
   Every member is 16-byte aligned and holds groups of four 32-bit words,
   i.e. one xmm-sized vector per [4][1] group; the byte offsets noted on
   each member follow from that and must agree with the offset macros used
   by the code (poly_coeff is 112).  */
207 #ifdef __svml_sasin_data_internal_typedef
208 typedef unsigned int VUINT32;
210 __declspec(align(16)) VUINT32 AbsMask[4][1];	/* offset 0 */
211 __declspec(align(16)) VUINT32 OneHalf[4][1];	/* offset 16 */
212 __declspec(align(16)) VUINT32 SmallNorm[4][1];	/* offset 32 */
213 __declspec(align(16)) VUINT32 One[4][1];	/* offset 48 */
214 __declspec(align(16)) VUINT32 Two[4][1];	/* offset 64 */
215 __declspec(align(16)) VUINT32 sqrt_coeff[2][4][1];	/* offset 80, two vectors */
216 __declspec(align(16)) VUINT32 poly_coeff[5][4][1];	/* offset 112, five vectors */
217 __declspec(align(16)) VUINT32 Pi2H[4][1];	/* offset 192 */
218 } __svml_sasin_data_internal;
/* Constant table for the vector asinf code.  Each row is one 16-byte
   vector with the same IEEE-754 single-precision bit pattern replicated
   in all four lanes.  Rows appear in the order given by the layout
   typedef: AbsMask, OneHalf, SmallNorm, One, Two, sqrt_coeff[2],
   poly_coeff[5], Pi2H.  The code addresses rows by byte offset, so the
   order and count of rows must not change without updating the offset
   macros.  */
220 __svml_sasin_data_internal:
222 .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff	/* AbsMask: every bit except the sign bit */
225 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000	/* OneHalf: 0.5f */
228 .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000	/* SmallNorm: 2^-32 */
231 .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000	/* One: 1.0f */
234 .long 0x40000000, 0x40000000, 0x40000000, 0x40000000	/* Two: 2.0f */
237 .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
238 .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
241 .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
242 .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
243 .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
244 .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
245 .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
248 .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB	/* Pi2H: pi/2 rounded to float (about 1.5707964) */
250 .type __svml_sasin_data_internal, @object
251 .size __svml_sasin_data_internal, .-__svml_sasin_data_internal	/* size = current location minus the table label */