1 /* Function acos vectorized with AVX2.
2 Copyright (C) 2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 /* ALGORITHM DESCRIPTION:
22 * SelMask = (|x| >= 0.5) ? 1 : 0;
23 * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
24 * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
25 * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|) */
29 /* Offsets for data table __svml_dacos_data_internal */
/* Byte offsets of the two coefficient arrays inside __svml_dacos_data_internal. */
/* Each table row is four 64-bit values (32 bytes). Five single-row constants precede sqrt_coeff (5*32 = 160), and the four sqrt_coeff rows precede poly_coeff (160 + 4*32 = 288). */
36 #define sqrt_coeff 160
37 #define poly_coeff 288
/* Four-lane double-precision acos kernel for AVX2, following the SelMask / R / Poly scheme in the file header. */
/* NOTE(review): this view omits lines. The prologue, the copy that fills ymm5, several labels, the flag-setting test before the jne and the scalar call are not visible here. */
44 .section .text.avx2,"ax",@progbits
45 ENTRY(_ZGVdN4v_acos_avx2)
47 cfi_def_cfa_offset(16)
/* ymm6 = first table row (sign-bit mask 0x8000000000000000), ymm7 = OneHalf row (0.5) */
53 vmovupd __svml_dacos_data_internal(%rip), %ymm6
54 vmovupd OneHalf+__svml_dacos_data_internal(%rip), %ymm7
/* ymm4 = ymm5 with the sign bit forced on, i.e. -|ymm5|. ymm5 is presumably the input vector - TODO confirm, its load is not visible */
58 vorpd %ymm5, %ymm6, %ymm4
60 /* Y = 0.5 + 0.5*(-x) */
61 vfmadd231pd %ymm4, %ymm7, %ymm7
/* ymm8 = ymm4 squared */
64 vmulpd %ymm4, %ymm4, %ymm8
/* ymm0 = first sqrt_coeff row, seed of the Horner chain used to refine the square root */
67 vmovupd sqrt_coeff+__svml_dacos_data_internal(%rip), %ymm0
/* ymm12 = mask of lanes where Y is below the SmallNorm row */
68 vcmplt_oqpd SmallNorm+__svml_dacos_data_internal(%rip), %ymm7, %ymm12
/* ymm2 = min(x^2, Y), the argument fed to the poly_coeff polynomial */
69 vminpd %ymm7, %ymm8, %ymm2
/* ymm9 = mask of lanes where (ymm4 >= MOne row) is false, which also catches NaN. It is turned into the bitmask in edx further down */
71 /* NaN processed in special branch (so wind test passed) */
72 vcmpnge_uqpd MOne+__svml_dacos_data_internal(%rip), %ymm4, %ymm9
/* Low-precision reciprocal square root of Y: narrow to float here, vrsqrtps, then widen back to double */
73 vcvtpd2ps %ymm7, %xmm10
74 vmovupd poly_coeff+64+__svml_dacos_data_internal(%rip), %ymm8
/* ymm1 = mask of lanes where (ymm2 < Y) is false, used later to select the square-root path and the additive constant */
75 vcmpnlt_uqpd %ymm7, %ymm2, %ymm1
76 vrsqrtps %xmm10, %xmm11
/* ymm8 = ymm8*ymm2 + next coefficient row (213 form: dst = src2*dst + src3) */
77 vfmadd213pd poly_coeff+96+__svml_dacos_data_internal(%rip), %ymm2, %ymm8
78 vcvtps2pd %xmm11, %ymm13
79 vmovupd poly_coeff+128+__svml_dacos_data_internal(%rip), %ymm11
/* ymm14 = rsqrt estimate, zeroed in the lanes flagged by the SmallNorm mask */
80 vandnpd %ymm13, %ymm12, %ymm14
/* ymm15 = estimate squared */
81 vmulpd %ymm14, %ymm14, %ymm15
82 vfmadd213pd poly_coeff+160+__svml_dacos_data_internal(%rip), %ymm2, %ymm11
/* ymm13 = ymm2^2 */
83 vmulpd %ymm2, %ymm2, %ymm13
84 vmovupd poly_coeff+256+__svml_dacos_data_internal(%rip), %ymm12
/* ymm10 = ymm2^4 */
85 vmulpd %ymm13, %ymm13, %ymm10
86 vfmadd213pd poly_coeff+288+__svml_dacos_data_internal(%rip), %ymm2, %ymm12
/* ymm3 = sign bit of ymm5 */
87 vandpd %ymm5, %ymm6, %ymm3
/* ymm6 = 2*Y */
88 vaddpd %ymm7, %ymm7, %ymm6
/* ymm7 = 2*Y * estimate, a first approximation of 2*sqrt(Y) */
89 vmulpd %ymm6, %ymm14, %ymm7
/* ymm6 = estimate^2 * 2*Y - 2.0, the residual of the estimate */
90 vfmsub213pd Two+__svml_dacos_data_internal(%rip), %ymm15, %ymm6
91 vmovupd poly_coeff+320+__svml_dacos_data_internal(%rip), %ymm14
/* Horner steps over the sqrt_coeff rows in the residual ymm6 */
92 vfmadd213pd sqrt_coeff+32+__svml_dacos_data_internal(%rip), %ymm6, %ymm0
/* ymm15 = residual * first approximation */
93 vmulpd %ymm6, %ymm7, %ymm15
94 vfmadd213pd poly_coeff+352+__svml_dacos_data_internal(%rip), %ymm2, %ymm14
95 vfmadd213pd sqrt_coeff+64+__svml_dacos_data_internal(%rip), %ymm6, %ymm0
96 vfmadd213pd sqrt_coeff+96+__svml_dacos_data_internal(%rip), %ymm6, %ymm0
/* Main polynomial in ymm2: coefficient pairs are combined with ymm2, then merged using ymm2^2 (ymm13) and ymm2^4 (ymm10) */
99 vmovupd poly_coeff+__svml_dacos_data_internal(%rip), %ymm6
/* ymm0 = ymm7 - ymm15*ymm0, the corrected square-root term */
100 vfnmadd213pd %ymm7, %ymm15, %ymm0
101 vfmadd213pd poly_coeff+32+__svml_dacos_data_internal(%rip), %ymm2, %ymm6
/* ymm0 = corrected square-root term where the ymm1 mask is set, otherwise ymm4 */
102 vblendvpd %ymm1, %ymm0, %ymm4, %ymm0
103 vfmadd213pd %ymm8, %ymm13, %ymm6
/* edx = one bit per lane that needs the special-value path */
104 vmovmskpd %ymm9, %edx
105 vmovupd poly_coeff+192+__svml_dacos_data_internal(%rip), %ymm9
106 vfmadd213pd poly_coeff+224+__svml_dacos_data_internal(%rip), %ymm2, %ymm9
107 vfmadd213pd %ymm9, %ymm13, %ymm11
108 vfmadd213pd %ymm11, %ymm10, %ymm6
109 vfmadd213pd %ymm12, %ymm13, %ymm6
110 vfmadd213pd %ymm14, %ymm13, %ymm6
/* ymm9 = polynomial value * ymm2 */
111 vmulpd %ymm6, %ymm2, %ymm9
/* ymm6 = mask of lanes where ymm5 < ymm2. ymm2 is never negative, so for ordinary inputs this marks the negative lanes */
114 vcmplt_oqpd %ymm2, %ymm5, %ymm6
/* Additive constant: PiH row where both the ymm1 mask and the ymm6 mask are set, Pi2H row where the ymm1 mask is clear */
115 vandpd PiH+__svml_dacos_data_internal(%rip), %ymm1, %ymm2
116 vandnpd Pi2H+__svml_dacos_data_internal(%rip), %ymm1, %ymm7
/* ymm1 = selected term with the sign bit of ymm5 flipped in */
117 vxorpd %ymm3, %ymm0, %ymm1
/* ymm9 = ymm1*ymm9 + ymm1 */
118 vfmadd213pd %ymm1, %ymm1, %ymm9
119 vandpd %ymm6, %ymm2, %ymm2
120 vaddpd %ymm7, %ymm2, %ymm8
/* ymm0 = additive constant + polynomial term, the vector result */
121 vaddpd %ymm9, %ymm8, %ymm0
/* NOTE(review): the instruction that sets the flags for this jne is not visible here. Presumably it tests edx - TODO confirm */
124 /* Go to special inputs processing branch */
125 jne L(SPECIAL_VALUES_BRANCH)
126 # LOE rbx r12 r13 r14 r15 edx ymm0 ymm5
129 * and exit the function
/* Spill ymm5 to 32(%rsp) and the vector result to 64(%rsp). The scalar path further down reads and patches these slots one lane at a time */
145 L(SPECIAL_VALUES_BRANCH):
146 vmovupd %ymm5, 32(%rsp)
147 vmovupd %ymm0, 64(%rsp)
148 # LOE rbx r12 r13 r14 r15 edx ymm0
151 # LOE rbx r12 r13 r14 r15 eax edx
/* The escapes below give the unwinder the stack locations of r12, r13 and r14 as DWARF expressions. The stores that save those registers are not visible here */
155 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
156 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
159 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
160 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
163 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
164 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
165 # LOE rbx r15 r12d r13d
/* NOTE(review): the instruction that sets the carry flag for this jc is not visible here */
174 /* Call scalar math function */
175 jc L(SCALAR_MATH_CALL)
176 # LOE rbx r15 r12d r13d
/* Loop head. NOTE(review): the counter update and compare that feed the jl are not visible here */
182 L(SPECIAL_VALUES_LOOP):
186 /* Check bits in range mask */
187 jl L(RANGEMASK_CHECK)
188 # LOE rbx r15 r12d r13d
/* Reload the patched vector result from its spill slot */
196 vmovupd 64(%rsp), %ymm0
200 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
201 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
202 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
203 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
204 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
205 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
206 # LOE rbx r12 r13 r14 r15 ymm0
/* Scalar fallback for one lane: load the double at index r14 from the 32(%rsp) spill into xmm0, then store xmm0 into the matching 64(%rsp) result slot. NOTE(review): the call between the load and the store is not visible here */
208 /* Scalar math function call
209 * to process special input
214 movsd 32(%rsp,%r14,8), %xmm0
216 # LOE rbx r14 r15 r12d r13d xmm0
218 movsd %xmm0, 64(%rsp,%r14,8)
220 /* Process special inputs in loop */
221 jmp L(SPECIAL_VALUES_LOOP)
222 # LOE rbx r15 r12d r13d
223 END(_ZGVdN4v_acos_avx2)
/* Read-only constant table for the AVX2 acos kernel. Every row holds the same 64-bit pattern four times, one copy per vector lane (32 bytes per row). */
225 .section .rodata, "a"
/* C-style description of the table layout, under a preprocessor guard. NOTE(review): the struct opener and the end of the guard are not visible in this view */
228 #ifdef __svml_dacos_data_internal_typedef
229 typedef unsigned int VUINT32;
231 __declspec(align(32)) VUINT32 SgnBit[4][2];
232 __declspec(align(32)) VUINT32 OneHalf[4][2];
233 __declspec(align(32)) VUINT32 SmallNorm[4][2];
234 __declspec(align(32)) VUINT32 MOne[4][2];
235 __declspec(align(32)) VUINT32 Two[4][2];
236 __declspec(align(32)) VUINT32 sqrt_coeff[4][4][2];
237 __declspec(align(32)) VUINT32 poly_coeff[12][4][2];
238 __declspec(align(32)) VUINT32 PiH[4][2];
239 __declspec(align(32)) VUINT32 Pi2H[4][2];
240 } __svml_dacos_data_internal;
/* Rows follow the field order declared above. The labels on the single-row constants are derived from that order */
242 __svml_dacos_data_internal:
/*== SgnBit: IEEE-754 double sign-bit mask ==*/
244 .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
/*== OneHalf: 0.5 ==*/
247 .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
/*== SmallNorm: threshold compared against Y in the kernel ==*/
250 .quad 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000
/*== MOne: -1.0 ==*/
253 .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
/*== Two: 2.0 ==*/
256 .quad 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000
257 /*== sqrt_coeff[4] ==*/
/* Starts at byte offset 160. Rows are stored highest-numbered coefficient first, matching the Horner order used by the kernel */
259 .quad 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
260 .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
261 .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
262 .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
263 /*== poly_coeff[12] ==*/
/* Starts at byte offset 288. The kernel addresses rows as poly_coeff + 32*k, so k = 0 is poly_coeff12 and k = 11 is poly_coeff1 */
265 .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
266 .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
267 .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
268 .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
269 .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
270 .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
271 .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
272 .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
273 .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
274 .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
275 .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
276 .quad 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
/*== PiH: the double nearest to Pi ==*/
279 .quad 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18
/*== Pi2H: the double nearest to Pi/2 ==*/
282 .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
/* Record the symbol type and total size for the ELF symbol table */
284 .type __svml_dacos_data_internal,@object
285 .size __svml_dacos_data_internal,.-__svml_dacos_data_internal