1 /* Function asin vectorized with AVX-512.
2 Copyright (C) 2021-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
/*
20 * ALGORITHM DESCRIPTION:
22 * SelMask = (|x| >= 0.5) ? 1 : 0;
23 * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
24 * asin(x) = (SelMask ? (Pi/2 - 2*Poly(R)) : Poly(R))*(-1)^sign(x)
 */
28 /* Offsets for data table __svml_dasin_data_internal */
/* Byte offsets of the sqrt_coeff and poly_coeff rows inside
   __svml_dasin_data_internal.  Consecutive values differ by 64, the
   size of one table row (eight 64-bit lanes).
   NOTE(review): the code also uses OneHalf, SmallNorm, One, Two and
   Pi2H as offsets; their definitions are not visible in this view.  */
35 #define sqrt_coeff_1 320
36 #define sqrt_coeff_2 384
37 #define sqrt_coeff_3 448
38 #define sqrt_coeff_4 512
39 #define poly_coeff_1 576
40 #define poly_coeff_2 640
41 #define poly_coeff_3 704
42 #define poly_coeff_4 768
43 #define poly_coeff_5 832
44 #define poly_coeff_6 896
45 #define poly_coeff_7 960
46 #define poly_coeff_8 1024
47 #define poly_coeff_9 1088
48 #define poly_coeff_10 1152
49 #define poly_coeff_11 1216
50 #define poly_coeff_12 1280
55 .section .text.evex512, "ax", @progbits
/* _ZGVeN8v_asin_skx: asin evaluated on the eight double-precision
   lanes of a zmm register; the final value is left in %zmm0.
   NOTE(review): this view of the routine has gaps.  The stack frame
   setup, the copy of the argument into %zmm6, the initial value of
   %zmm11, the flag-setting tests in front of the conditional jumps and
   some branch targets are not visible, so the comments below describe
   only the instructions that are shown.  */
56 ENTRY(_ZGVeN8v_asin_skx)
58 cfi_def_cfa_offset(16)
/* Load the constants needed first from __svml_dasin_data_internal.  */
64 vmovups OneHalf+__svml_dasin_data_internal(%rip), %zmm8
67 vmovups SmallNorm+__svml_dasin_data_internal(%rip), %zmm10
68 vmovups Two+__svml_dasin_data_internal(%rip), %zmm14
69 vmovups sqrt_coeff_1+__svml_dasin_data_internal(%rip), %zmm15
70 vmovups sqrt_coeff_2+__svml_dasin_data_internal(%rip), %zmm2
71 vmovups sqrt_coeff_3+__svml_dasin_data_internal(%rip), %zmm1
72 vmovups One+__svml_dasin_data_internal(%rip), %zmm9
/* zmm4 = zmm6 & (table row at offset 0, the 0x7fff... mask) = |zmm6|.
   zmm6 presumably holds a copy of the argument; TODO confirm.  */
76 vandpd __svml_dasin_data_internal(%rip), %zmm6, %zmm4
/* zmm11 -= OneHalf * |x|.  Presumably zmm11 was preset to OneHalf so
   that this forms Y = 0.5 - 0.5*|x|; TODO confirm.  */
80 vfnmadd231pd {rn-sae}, %zmm4, %zmm8, %zmm11
/* zmm7 = |x|^2; zmm12 = r, the vrsqrt14pd estimate of 1/sqrt(Y).  */
83 vmulpd {rn-sae}, %zmm4, %zmm4, %zmm7
84 vrsqrt14pd %zmm11, %zmm12
/* Predicate 17 is LT_OQ, 21 is NLT_UQ:
   k1 = (Y < SmallNorm)
   k2 = !(|x| < OneHalf)   -- SelMask of the algorithm description
   k0 = (One < |x|)
   zmm8, zmm9 and zmm10 are free for reuse after these compares.  */
85 vcmppd $17, {sae}, %zmm10, %zmm11, %k1
86 vcmppd $21, {sae}, %zmm8, %zmm4, %k2
87 vcmppd $17, {sae}, %zmm4, %zmm9, %k0
88 vmovups poly_coeff_5+__svml_dasin_data_internal(%rip), %zmm10
91 vmovups poly_coeff_1+__svml_dasin_data_internal(%rip), %zmm8
92 vmovups poly_coeff_3+__svml_dasin_data_internal(%rip), %zmm9
/* zmm3 = R2 = min(|x|^2, Y): the squared reduced argument.  */
93 vminpd {sae}, %zmm11, %zmm7, %zmm3
/* Zero the rsqrt estimate in the lanes where Y < SmallNorm.  */
94 vxorpd %zmm12, %zmm12, %zmm12{%k1}
/* zmm0 = 2*Y.  */
95 vaddpd {rn-sae}, %zmm11, %zmm11, %zmm0
/* zmm5 = zmm6 ^ |zmm6|: only the sign bit of each lane survives.  */
96 vxorpd %zmm6, %zmm4, %zmm5
/* zmm13 = r^2; zmm7 = S = 2*Y*r, which approximates 2*sqrt(Y).  */
97 vmulpd {rn-sae}, %zmm12, %zmm12, %zmm13
98 vmulpd {rn-sae}, %zmm12, %zmm0, %zmm7
99 vmovups poly_coeff_7+__svml_dasin_data_internal(%rip), %zmm11
100 vmovups poly_coeff_4+__svml_dasin_data_internal(%rip), %zmm12
/* zmm0 = E = 2*Y*r^2 - Two, the residual of the rsqrt estimate.
   From here on the sqrt correction (in E) and the asin polynomial
   (in R2) are interleaved.  */
101 vfmsub213pd {rn-sae}, %zmm14, %zmm13, %zmm0
102 vmovups sqrt_coeff_4+__svml_dasin_data_internal(%rip), %zmm13
/* zmm12 = poly_coeff_3*R2 + poly_coeff_4.  */
103 vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm12
104 vmovups poly_coeff_11+__svml_dasin_data_internal(%rip), %zmm9
/* zmm2 = sqrt_coeff_1*E + sqrt_coeff_2.  */
105 vfmadd231pd {rn-sae}, %zmm0, %zmm15, %zmm2
106 vmovups poly_coeff_9+__svml_dasin_data_internal(%rip), %zmm15
/* zmm14 = S*E.  */
107 vmulpd {rn-sae}, %zmm0, %zmm7, %zmm14
/* zmm2 = zmm2*E + sqrt_coeff_3.  */
108 vfmadd213pd {rn-sae}, %zmm1, %zmm0, %zmm2
109 vmovups poly_coeff_2+__svml_dasin_data_internal(%rip), %zmm1
/* zmm2 = zmm2*E + sqrt_coeff_4.  */
111 vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm2
/* zmm1 = poly_coeff_1*R2 + poly_coeff_2.  */
112 vfmadd231pd {rn-sae}, %zmm3, %zmm8, %zmm1
113 vmovups poly_coeff_10+__svml_dasin_data_internal(%rip), %zmm8
/* zmm0 = R4 = R2*R2 (E is no longer needed).  */
114 vmulpd {rn-sae}, %zmm3, %zmm3, %zmm0
/* zmm2 = (S*E)*zmm2 - S: the corrected sqrt term with negative sign.  */
115 vfmsub213pd {rn-sae}, %zmm7, %zmm14, %zmm2
116 vmovups poly_coeff_6+__svml_dasin_data_internal(%rip), %zmm7
/* zmm8 = poly_coeff_9*R2 + poly_coeff_10.  */
117 vfmadd231pd {rn-sae}, %zmm3, %zmm15, %zmm8
/* zmm1 = zmm1*R4 + (poly_coeff_3*R2 + poly_coeff_4).  */
118 vfmadd213pd {rn-sae}, %zmm12, %zmm0, %zmm1
/* zmm2 = k2 ? zmm2 : |x| -- per-lane choice of the reduced argument.  */
119 vblendmpd %zmm2, %zmm4, %zmm2{%k2}
/* zmm7 = poly_coeff_5*R2 + poly_coeff_6.  */
120 vfmadd231pd {rn-sae}, %zmm3, %zmm10, %zmm7
121 vmovups poly_coeff_8+__svml_dasin_data_internal(%rip), %zmm10
122 vmovups Pi2H+__svml_dasin_data_internal(%rip), %zmm4
/* zmm10 = poly_coeff_7*R2 + poly_coeff_8.  */
123 vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10
124 vmovups poly_coeff_12+__svml_dasin_data_internal(%rip), %zmm11
/* zmm7 = zmm7*R4 + zmm10.  */
125 vfmadd213pd {rn-sae}, %zmm10, %zmm0, %zmm7
/* zmm11 = poly_coeff_11*R2 + poly_coeff_12.  */
126 vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm11
/* zmm10 = R8 = R4*R4.  */
127 vmulpd {rn-sae}, %zmm0, %zmm0, %zmm10
/* Combine the partial sums: zmm1 = ((zmm1*R8 + zmm7)*R4 + zmm8)*R4 + zmm11.  */
128 vfmadd213pd {rn-sae}, %zmm7, %zmm10, %zmm1
129 vfmadd213pd {rn-sae}, %zmm8, %zmm0, %zmm1
130 vfmadd213pd {rn-sae}, %zmm11, %zmm0, %zmm1
/* zmm3 = zmm2 + zmm2*(R2*zmm1).  */
131 vmulpd {rn-sae}, %zmm3, %zmm1, %zmm3
132 vfmadd213pd {rn-sae}, %zmm2, %zmm2, %zmm3
/* Add Pi2H in the k2 lanes only, then put the argument sign back.  */
133 vaddpd {rn-sae}, %zmm4, %zmm3, %zmm3{%k2}
134 vxorpd %zmm5, %zmm3, %zmm0
137 /* Go to special inputs processing branch */
138 jne L(SPECIAL_VALUES_BRANCH)
139 # LOE rbx r12 r13 r14 r15 edx zmm0 zmm6
142 * and exit the function
158 L(SPECIAL_VALUES_BRANCH):
/* Spill zmm6 to 64(%rsp) and the vector result to 128(%rsp); the
   per-lane code further down reads element r14 of the first buffer
   and writes element r14 of the second.  */
159 vmovups %zmm6, 64(%rsp)
160 vmovups %zmm0, 128(%rsp)
161 # LOE rbx r12 r13 r14 r15 edx zmm0
164 # LOE rbx r12 r13 r14 r15 eax edx
168 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
169 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
172 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
173 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
176 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
177 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
178 # LOE rbx r15 r12d r13d
187 /* Call scalar math function */
188 jc L(SCALAR_MATH_CALL)
189 # LOE rbx r15 r12d r13d
195 L(SPECIAL_VALUES_LOOP):
199 /* Check bits in range mask */
200 jl L(RANGEMASK_CHECK)
201 # LOE rbx r15 r12d r13d
/* Reload the result vector from its stack slot.  */
209 vmovups 128(%rsp), %zmm0
213 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
214 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
215 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
216 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
217 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
218 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
219 # LOE rbx r12 r13 r14 r15 zmm0
221 /* Scalar math function call
222 * to process special input
227 vmovsd 64(%rsp, %r14, 8), %xmm0
229 # LOE rbx r14 r15 r12d r13d xmm0
231 vmovsd %xmm0, 128(%rsp, %r14, 8)
233 /* Process special inputs in loop */
234 jmp L(SPECIAL_VALUES_LOOP)
235 # LOE rbx r15 r12d r13d
236 END(_ZGVeN8v_asin_skx)
/* Constant table for the AVX-512 asin kernel.  Every row holds one
   value replicated in eight 64-bit lanes (64 bytes), so the byte
   offset of a row is its index times 64.  */
238 .section .rodata, "a"
/* C-side description of the row layout, in table order.
   NOTE(review): the opening of the struct and the matching #endif are
   not visible in this view.  */
241 #ifdef __svml_dasin_data_internal_typedef
242 typedef unsigned int VUINT32;
244 __declspec(align(64)) VUINT32 AbsMask[8][2];
245 __declspec(align(64)) VUINT32 OneHalf[8][2];
246 __declspec(align(64)) VUINT32 SmallNorm[8][2];
247 __declspec(align(64)) VUINT32 One[8][2];
248 __declspec(align(64)) VUINT32 Two[8][2];
249 __declspec(align(64)) VUINT32 sqrt_coeff[4][8][2];
250 __declspec(align(64)) VUINT32 poly_coeff[12][8][2];
251 __declspec(align(64)) VUINT32 Pi2H[8][2];
252 } __svml_dasin_data_internal;
254 __svml_dasin_data_internal:
/* AbsMask (offset 0): every bit set except the sign bit.  */
256 .quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
/* OneHalf: 0.5  */
259 .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
/* SmallNorm: 2^-255  */
262 .quad 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000
/* One: 1.0  */
265 .quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
/* Two: 2.0  */
268 .quad 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000
/* sqrt_coeff rows, starting at offset 320.  The trailing labels count
   in the opposite direction from the offset macros: the row at offset
   sqrt_coeff_1 (320) is the one labelled sqrt_coeff4.  */
271 .quad 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
272 .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
273 .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
274 .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
/* poly_coeff rows, starting at offset 576.  Same remark as above: the
   row at offset poly_coeff_1 (576) is the one labelled poly_coeff12.  */
277 .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
278 .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
279 .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
280 .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
281 .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
282 .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
283 .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
284 .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
285 .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
286 .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
287 .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
288 .quad 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
/* Pi2H: Pi/2 rounded to double.  */
291 .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
293 .type __svml_dasin_data_internal, @object
294 .size __svml_dasin_data_internal, .-__svml_dasin_data_internal