1 /* Function acosf vectorized with AVX-512.
2 Copyright (C) 2021-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
/*
 * ALGORITHM DESCRIPTION:
 *
 *      SelMask = (|x| >= 0.5) ? 1 : 0;
 *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
 *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
 *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
 */
30 /* Offsets for data table __svml_sacos_data_internal.
   Each value is a byte offset from the table label.  Consecutive entries
   are 64 bytes apart, i.e. one row of sixteen 32-bit words.
   The numeric suffix is the storage index and runs opposite to the
   coefficient tags written on the table rows: sqrt_coeff_1 addresses the
   row tagged sqrt_coeff2, and poly_coeff_1 the row tagged poly_coeff5.
   NOTE(review): OneHalf, SmallNorm, MOne, Two, Pi2H and PiH are used as
   offsets by the code but are not defined in the lines shown here;
   confirm their definitions in the full file.  */
37 #define sqrt_coeff_1 320
38 #define sqrt_coeff_2 384
39 #define poly_coeff_1 448
40 #define poly_coeff_2 512
41 #define poly_coeff_3 576
42 #define poly_coeff_4 640
43 #define poly_coeff_5 704
/* _ZGVeN16v_acosf_skx: acosf evaluated on sixteen single-precision lanes
   held in one zmm register (AVX-512).
   Fast path: straight-line evaluation of every lane, result in %zmm0.
   Slow path: L(SPECIAL_VALUES_BRANCH) spills %zmm4 and %zmm0 to the stack
   and rewrites individual result lanes from a scalar computation.
   NOTE(review): this listing does not show the frame setup, the write to
   %zmm4, the writes to %k3 and %edx, the flag-setting instructions ahead
   of the conditional jumps, the scalar call itself, or the labels
   L(RANGEMASK_CHECK) and L(SCALAR_MATH_CALL).  Comments that depend on
   them are worded as assumptions; confirm against the complete file.
   NOTE(review): the section name below reads "exex512"; possibly meant
   "evex512".  Confirm before renaming, since the section name is visible
   to the linker.  */
49 .section .text.exex512, "ax", @progbits
50 ENTRY(_ZGVeN16v_acosf_skx)
52 cfi_def_cfa_offset(16)
/* Load constants.  %zmm5 = table row at offset 0 (0x80000000 per lane).  */
58 vmovups __svml_sacos_data_internal(%rip), %zmm5
59 vmovups OneHalf+__svml_sacos_data_internal(%rip), %zmm6
62 vmovups SmallNorm+__svml_sacos_data_internal(%rip), %zmm9
63 vmovups MOne+__svml_sacos_data_internal(%rip), %zmm8
64 vmovups Two+__svml_sacos_data_internal(%rip), %zmm12
65 vmovups sqrt_coeff_1+__svml_sacos_data_internal(%rip), %zmm13
/* %zmm3 = %zmm4 | %zmm5 and %zmm2 = %zmm4 & %zmm5.  Assuming %zmm4 holds
   the argument x, these are -|x| and the isolated sign bit of x.  */
69 vorps %zmm4, %zmm5, %zmm3
70 vandps %zmm4, %zmm5, %zmm2
71 vmovups sqrt_coeff_2+__svml_sacos_data_internal(%rip), %zmm0
73 /* Y = 0.5 + 0.5*(-|x|): %zmm6 = %zmm6 * %zmm3 + %zmm6 */
74 vfmadd231ps {rn-sae}, %zmm3, %zmm6, %zmm6
/* %zmm7 = %zmm3 * %zmm3 = x^2; %zmm10 = approximate 1/sqrt(Y).  */
77 vmulps {rn-sae}, %zmm3, %zmm3, %zmm7
78 vrsqrt14ps %zmm6, %zmm10
/* k1 = (Y < SmallNorm)      predicate 17, ordered less-than.
   k0 = !(MOne <= %zmm3)     predicate 22, not-less-or-equal; also set for
   NaN lanes.  k0 is not consumed by any instruction shown here;
   presumably it feeds the special-input test before the jne below.  */
79 vcmpps $17, {sae}, %zmm9, %zmm6, %k1
80 vcmpps $22, {sae}, %zmm3, %zmm8, %k0
81 vmovups poly_coeff_4+__svml_sacos_data_internal(%rip), %zmm9
/* %zmm1 = min (x^2, Y): the polynomial argument.  */
82 vminps {sae}, %zmm6, %zmm7, %zmm1
83 vmovups poly_coeff_3+__svml_sacos_data_internal(%rip), %zmm7
/* Zero the reciprocal-sqrt estimate in k1 lanes.  */
84 vxorps %zmm10, %zmm10, %zmm10{%k1}
/* %zmm14 = 2Y, %zmm8 = %zmm1^2, %zmm11 = rsqrt^2, %zmm5 = 2Y * rsqrt.  */
85 vaddps {rn-sae}, %zmm6, %zmm6, %zmm14
86 vmulps {rn-sae}, %zmm1, %zmm1, %zmm8
87 vmulps {rn-sae}, %zmm10, %zmm10, %zmm11
88 vmulps {rn-sae}, %zmm10, %zmm14, %zmm5
/* k4 = !(%zmm1 < Y)         predicate 21: lanes where the minimum is Y.
   k2 = (%zmm4 < %zmm1)      predicate 17.  */
89 vcmpps $21, {sae}, %zmm6, %zmm1, %k4
92 vcmpps $17, {sae}, %zmm1, %zmm4, %k2
95 vmovups poly_coeff_1+__svml_sacos_data_internal(%rip), %zmm6
/* %zmm14 = %zmm11 * %zmm14 - Two = 2Y * rsqrt^2 - 2.  */
96 vfmsub213ps {rn-sae}, %zmm12, %zmm11, %zmm14
97 vmovups poly_coeff_2+__svml_sacos_data_internal(%rip), %zmm11
/* %zmm9 = poly_coeff_3 * %zmm1 + poly_coeff_4.  */
98 vfmadd231ps {rn-sae}, %zmm1, %zmm7, %zmm9
99 vmovups poly_coeff_5+__svml_sacos_data_internal(%rip), %zmm10
100 vmovups Pi2H+__svml_sacos_data_internal(%rip), %zmm12
/* %zmm0 = sqrt_coeff_1 * %zmm14 + sqrt_coeff_2.  */
101 vfmadd231ps {rn-sae}, %zmm14, %zmm13, %zmm0
/* %zmm11 = poly_coeff_1 * %zmm1 + poly_coeff_2.  */
102 vfmadd231ps {rn-sae}, %zmm1, %zmm6, %zmm11
/* %zmm15 = %zmm5 * %zmm14.  */
103 vmulps {rn-sae}, %zmm14, %zmm5, %zmm15
/* %zmm11 = %zmm11 * %zmm8 + %zmm9.  */
104 vfmadd213ps {rn-sae}, %zmm9, %zmm8, %zmm11
/* Clear the Pi2H addend in k4 lanes.  */
105 vxorps %zmm12, %zmm12, %zmm12{%k4}
/* %zmm0 = %zmm5 - %zmm15 * %zmm0.  */
106 vfnmadd213ps {rn-sae}, %zmm5, %zmm15, %zmm0
/* %zmm11 = %zmm11 * %zmm1 + poly_coeff_5.  */
107 vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm11
/* %zmm13 = %zmm11 * %zmm1.  */
109 vmulps {rn-sae}, %zmm1, %zmm11, %zmm13
/* %zmm0 = k4 ? %zmm0 : %zmm3, then %zmm1 = %zmm0 ^ sign bit of x.  */
110 vblendmps %zmm0, %zmm3, %zmm0{%k4}
111 vxorps %zmm2, %zmm0, %zmm1
/* %zmm13 = %zmm13 * %zmm1 + %zmm1.  */
113 vfmadd213ps {rn-sae}, %zmm1, %zmm1, %zmm13
/* OR PiH into the addend in k3 lanes.
   NOTE(review): no instruction shown here writes %k3.  OR-ing PiH into a
   lane that still holds Pi2H would give 0x7fc90FDB, so k3 is presumably
   limited to lanes already cleared through k4 -- confirm in the full
   file before touching this mask.  */
114 vorps PiH+__svml_sacos_data_internal(%rip), %zmm12, %zmm12{%k3}
/* Result: %zmm0 = %zmm12 + %zmm13.  */
115 vaddps {rn-sae}, %zmm13, %zmm12, %zmm0
118 /* Go to special inputs processing branch */
/* The flags tested here are set by an instruction not shown in this
   listing.  */
119 jne L(SPECIAL_VALUES_BRANCH)
120 # LOE rbx r12 r13 r14 r15 edx zmm0 zmm4
123 * and exit the function
/* Slow path entry: spill %zmm4 to 64(%rsp) and the fast-path result to
   128(%rsp) so single lanes can be read and rewritten as scalars.  */
139 L(SPECIAL_VALUES_BRANCH):
140 vmovups %zmm4, 64(%rsp)
141 vmovups %zmm0, 128(%rsp)
142 # LOE rbx r12 r13 r14 r15 edx zmm0
145 # LOE rbx r12 r13 r14 r15 eax edx
/* Unwind descriptions for r12, r13 and r14; the instructions that store
   these registers are not shown in this listing.  */
149 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
150 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
153 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
154 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
157 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
158 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
159 # LOE rbx r15 r12d r13d
168 /* Call scalar math function */
/* Taken on carry; the instruction that sets the carry flag is not shown
   in this listing.  */
169 jc L(SCALAR_MATH_CALL)
170 # LOE rbx r15 r12d r13d
176 L(SPECIAL_VALUES_LOOP):
180 /* Check bits in range mask */
/* Signed less-than on flags set by an instruction not shown here.  */
181 jl L(RANGEMASK_CHECK)
182 # LOE rbx r15 r12d r13d
/* Reload the result vector, including any lanes rewritten at
   128(%rsp, %r14, 4).  */
190 vmovups 128(%rsp), %zmm0
194 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
195 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
196 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
197 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
198 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
199 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
200 # LOE rbx r12 r13 r14 r15 zmm0
202 /* Scalar math function call
203 * to process special input.  */
/* %xmm0 = lane %r14 of the spilled input.  */
208 movss 64(%rsp, %r14, 4), %xmm0
210 # LOE rbx r14 r15 r12d r13d xmm0
/* Store %xmm0 into lane %r14 of the spilled result.  The call expected
   between the load and this store is not shown in this listing.  */
212 movss %xmm0, 128(%rsp, %r14, 4)
214 /* Process special inputs in loop */
215 jmp L(SPECIAL_VALUES_LOOP)
216 # LOE rbx r15 r12d r13d
217 END(_ZGVeN16v_acosf_skx)
/* Read-only constants referenced by _ZGVeN16v_acosf_skx.  */
219 .section .rodata, "a"
/* Layout description only: the declarations under this #ifdef list the
   table members in the same order as the rows emitted after the
   __svml_sacos_data_internal label.  Each [16][1] member is sixteen
   32-bit words (64 bytes), so member k starts at byte offset 64 * k.
   NOTE(review): the struct opener that matches the closing brace and the
   #endif that matches this #ifdef are not visible in this listing --
   confirm they are present in the full file.  */
222 #ifdef __svml_sacos_data_internal_typedef
223 typedef unsigned int VUINT32;
225 __declspec(align(64)) VUINT32 SgnBit[16][1];
226 __declspec(align(64)) VUINT32 OneHalf[16][1];
227 __declspec(align(64)) VUINT32 SmallNorm[16][1];
228 __declspec(align(64)) VUINT32 MOne[16][1];
229 __declspec(align(64)) VUINT32 Two[16][1];
/* Two sqrt-refinement rows followed by five polynomial rows.  */
230 __declspec(align(64)) VUINT32 sqrt_coeff[2][16][1];
231 __declspec(align(64)) VUINT32 poly_coeff[5][16][1];
232 __declspec(align(64)) VUINT32 Pi2H[16][1];
233 __declspec(align(64)) VUINT32 PiH[16][1];
234 } __svml_sacos_data_internal;
/* Constant table.  Every row is one value broadcast to sixteen 32-bit
   lanes (64 bytes); the code addresses rows as OFFSET + label, so row
   order and size must not change.  Row names below follow the member
   order of the layout declaration guarded by
   __svml_sacos_data_internal_typedef.
   The trailing sqrt_coeffN / poly_coeffN tags count in the opposite
   direction to the sqrt_coeff_N / poly_coeff_N offset macros: the macro
   sqrt_coeff_1 (320) selects the row tagged sqrt_coeff2, and poly_coeff_1
   (448) selects the row tagged poly_coeff5.  */
236 __svml_sacos_data_internal:
/* Offset 0: SgnBit -- single-precision sign-bit mask.  */
238 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
/* Offset 64: OneHalf -- 0.5f.  */
241 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
/* Offset 128: SmallNorm -- 2^-32.  */
244 .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
/* Offset 192: MOne -- -1.0f.  */
247 .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
/* Offset 256: Two -- 2.0f.  */
250 .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000
/* Offsets 320 and 384: sqrt refinement coefficients.  */
253 .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
254 .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
/* Offsets 448 through 704: polynomial coefficients.  */
257 .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
258 .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
259 .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
260 .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
261 .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
/* Offset 768: Pi2H -- pi/2 rounded to single precision.  */
264 .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
/* Offset 832: PiH -- pi rounded to single precision.  */
267 .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
/* Symbol type and size (label to current location) for the ELF symbol
   table.  */
269 .type __svml_sacos_data_internal, @object
270 .size __svml_sacos_data_internal, .-__svml_sacos_data_internal