]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/x86_64/fpu/multiarch/svml_d_atan4_core_avx2.S
x86-64: Add vector atan/atanf implementation to libmvec
[thirdparty/glibc.git] / sysdeps / x86_64 / fpu / multiarch / svml_d_atan4_core_avx2.S
1 /* Function atan vectorized with AVX2.
2 Copyright (C) 2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19 /*
20 * ALGORITHM DESCRIPTION:
21 *
22 * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
23 * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
24 * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
25 * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
26 * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
27 * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
28 *
29 */
30
31 /* Byte offsets of the fields of the data table __svml_datan_data_internal_avx512.
32 Every field starts on a 32-byte boundary.  Tbl_H and Tbl_L are 256 bytes each, coeff is 192 bytes, every other field is 32 bytes (one 4 x double vector). */
33 #define AbsMask 0  /* mask ANDed with x to obtain |x|; the code addresses it as the bare table symbol (offset 0) */
34 #define Shifter 32  /* added to |x| and subtracted again to round |x| to a table point */
35 #define MaxThreshold 64  /* lanes with |x| >= this value take the -1/x path */
36 #define MOne 96  /* -1.0, numerator of the -1/x path */
37 #define One 128  /* 1.0, addend of the denominator 1 + B*|x| */
38 #define LargeX 160  /* upper clamp applied to |x| (via vminpd) on the -1/x path */
39 #define Zero 192  /* 0.0; not referenced by the instructions of this file */
40 #define Tbl_H 224  /* 32 doubles; the value looked up by index and added to the polynomial result */
41 #define Tbl_L 480  /* 32 doubles; not referenced by the instructions of this file */
42 #define dIndexMed 736  /* (|x| + Shifter) >= this value selects the upper 16 entries of Tbl_H */
43 #define Pi2 768  /* replaces the table value in lanes with |x| >= MaxThreshold */
44 #define Pi2_low 800  /* not referenced by the instructions of this file */
45 #define coeff 832  /* six polynomial coefficients, each replicated across 4 lanes */
46
47 #include <sysdep.h>
48
49 .text
50 .section .text.avx2,"ax",@progbits
/* _ZGVdN4v_atan_avx2: atan of the four packed doubles in %ymm0; the
   result is returned in %ymm0.

   Steps, as performed by the instructions below:
     1. ax = |x|; the sign bit of x is kept aside in %ymm1.
     2. B = (ax + Shifter) - Shifter rounds ax to a table point; the low
        bits of (ax + Shifter) give the table index.
     3. R = (ax - B) / (1 + B*ax).  Lanes with ax >= MaxThreshold use
        R = -1 / min(ax, LargeX) instead.
     4. T = Tbl_H[index], or Pi2 in the lanes with ax >= MaxThreshold.
     5. result = (T + (R + R^3 * P(R^2))) XOR sign, where P is the
        degree-5 polynomial with the six coefficients stored at coeff
        (highest degree first).

   Registers written: %rax, %rcx, %rdx, %rsi, %rdi and %ymm0-%ymm15.
   There is no push/pop or %rsp adjustment; the only stack access is ret.  */
51 ENTRY(_ZGVdN4v_atan_avx2)
52 lea Tbl_H+128+__svml_datan_data_internal_avx512(%rip), %rdi  /* rdi = Tbl_H + 128 bytes (entry 16), so -128(%rdi) is entry 0 */
53 vmovupd Shifter+__svml_datan_data_internal_avx512(%rip), %ymm4  /* ymm4 = Shifter */
54 vmovupd One+__svml_datan_data_internal_avx512(%rip), %ymm9  /* ymm9 = 1.0, accumulator for the denominator */
55
56 /* saturate X range */
57 vmovupd LargeX+__svml_datan_data_internal_avx512(%rip), %ymm6  /* ymm6 = LargeX */
58 vandpd __svml_datan_data_internal_avx512(%rip), %ymm0, %ymm7  /* ymm7 = ax = x & AbsMask (AbsMask is at offset 0) */
59 vaddpd %ymm4, %ymm7, %ymm2  /* ymm2 = ax + Shifter; its low bits carry the table index */
60 vcmpge_oqpd MaxThreshold+__svml_datan_data_internal_avx512(%rip), %ymm7, %ymm3  /* ymm3 = lane mask: ax >= MaxThreshold (ordered compare) */
61 vminpd %ymm7, %ymm6, %ymm10  /* ymm10 = min(LargeX, ax), denominator of the -1/x path */
62 vsubpd %ymm4, %ymm2, %ymm5  /* ymm5 = B = (ax + Shifter) - Shifter, ax rounded to a table point */
63
64 /*
65 * table lookup sequence
66 * VPERMUTE not available
67 */
68 vpsllq $3, %ymm2, %ymm13  /* shift the bits of (ax + Shifter) left by 3: index * 8 = byte offset of a double */
69 vsubpd %ymm5, %ymm7, %ymm8  /* ymm8 = DiffX = ax - B */
70 vcmpge_oqpd dIndexMed+__svml_datan_data_internal_avx512(%rip), %ymm2, %ymm2  /* ymm2 = lane mask: (ax + Shifter) >= dIndexMed, i.e. upper table half */
71 vfmadd231pd %ymm7, %ymm5, %ymm9  /* ymm9 = Y = 1.0 + B*ax */
72 vpand .FLT_11(%rip), %ymm13, %ymm14  /* ymm14 = byte offset masked with 0x78: 0..120 in steps of 8 */
73 vblendvpd %ymm3, MOne+__svml_datan_data_internal_avx512(%rip), %ymm8, %ymm11  /* numerator: -1.0 where ymm3 is set, else DiffX */
74 vblendvpd %ymm3, %ymm10, %ymm9, %ymm12  /* denominator: min(LargeX, ax) where ymm3 is set, else Y */
75 vxorpd %ymm0, %ymm7, %ymm1  /* ymm1 = x ^ ax = sign bit of x, reapplied at the end */
76
77 /* R = DiffX/Y; only this single quotient is computed, there is no low-order correction term (Rl) */
78 vdivpd %ymm12, %ymm11, %ymm0  /* ymm0 = R = numerator / denominator */
79 vextractf128 $1, %ymm14, %xmm4  /* xmm4 = offsets of lanes 2 and 3 */
80 vmovd %xmm14, %eax  /* eax = offset of lane 0 */
81 vmovd %xmm4, %ecx  /* ecx = offset of lane 2 */
82 movslq %eax, %rax
83 vpextrd $2, %xmm14, %edx  /* edx = offset of lane 1 (low dword of the second qword) */
84 movslq %ecx, %rcx
85 vpextrd $2, %xmm4, %esi  /* esi = offset of lane 3 */
86 movslq %edx, %rdx
87 movslq %esi, %rsi
88 vmovsd -128(%rax,%rdi), %xmm15  /* lane 0, lower half of Tbl_H (entries 0-15) */
89 vmovsd (%rdi,%rax), %xmm7  /* lane 0, upper half of Tbl_H (entries 16-31) */
90 vmovsd -128(%rcx,%rdi), %xmm5  /* lane 2, lower half */
91 vmovsd (%rdi,%rcx), %xmm9  /* lane 2, upper half */
92 vmovhpd -128(%rdx,%rdi), %xmm15, %xmm15  /* xmm15 = {lane 0, lane 1} lower-half values */
93 vmovhpd (%rdi,%rdx), %xmm7, %xmm8  /* xmm8 = {lane 0, lane 1} upper-half values */
94 vmovhpd -128(%rsi,%rdi), %xmm5, %xmm6  /* xmm6 = {lane 2, lane 3} lower-half values */
95 vmovhpd (%rdi,%rsi), %xmm9, %xmm10  /* xmm10 = {lane 2, lane 3} upper-half values */
96
97 /* polynomial evaluation */
98 vmulpd %ymm0, %ymm0, %ymm5  /* ymm5 = R^2 */
99 vmulpd %ymm5, %ymm5, %ymm4  /* ymm4 = R^4 */
100 vinsertf128 $1, %xmm6, %ymm15, %ymm11  /* ymm11 = lower-half table values for all 4 lanes */
101 vinsertf128 $1, %xmm10, %ymm8, %ymm12  /* ymm12 = upper-half table values for all 4 lanes */
102 vblendvpd %ymm2, %ymm12, %ymm11, %ymm13  /* ymm13 = upper-half value where ymm2 is set, else lower-half value */
103 vmovupd coeff+__svml_datan_data_internal_avx512(%rip), %ymm8  /* ymm8 = c0 */
104 vmovupd coeff+64+__svml_datan_data_internal_avx512(%rip), %ymm2  /* ymm2 = c2 (the index mask in ymm2 is no longer needed) */
105 vmulpd %ymm5, %ymm0, %ymm6  /* ymm6 = R^3 */
106 vfmadd213pd coeff+32+__svml_datan_data_internal_avx512(%rip), %ymm5, %ymm8  /* ymm8 = c0*R^2 + c1 */
107 vfmadd213pd coeff+96+__svml_datan_data_internal_avx512(%rip), %ymm5, %ymm2  /* ymm2 = c2*R^2 + c3 */
108
109 /* set table value to Pi/2 for large X */
110 vblendvpd %ymm3, Pi2+__svml_datan_data_internal_avx512(%rip), %ymm13, %ymm7  /* ymm7 = Pi2 where ax >= MaxThreshold, else the table value */
111 vmovupd coeff+128+__svml_datan_data_internal_avx512(%rip), %ymm3  /* ymm3 = c4 (the threshold mask in ymm3 is no longer needed) */
112 vfmadd213pd %ymm2, %ymm4, %ymm8  /* ymm8 = (c0*R^2 + c1)*R^4 + (c2*R^2 + c3) */
113 vfmadd213pd coeff+160+__svml_datan_data_internal_avx512(%rip), %ymm3, %ymm5  /* ymm5 = c4*R^2 + c5 */
114 vfmadd213pd %ymm5, %ymm4, %ymm8  /* ymm8 = P(R^2) = ymm8*R^4 + (c4*R^2 + c5) */
115 vfmadd213pd %ymm0, %ymm6, %ymm8  /* ymm8 = R + R^3 * P(R^2) */
116 vaddpd %ymm8, %ymm7, %ymm0  /* ymm0 = table value + polynomial result */
117 vxorpd %ymm1, %ymm0, %ymm0  /* reapply the sign bit of the original x */
118 ret
119
120 END(_ZGVdN4v_atan_avx2)
121
122 .section .rodata, "a"
123 .align 32
124
/* .FLT_11: 0x78 in each of the four 64-bit lanes.  ANDed with
   (|x| + Shifter) << 3 it leaves a byte offset 0..120 in steps of 8,
   i.e. one of 16 consecutive doubles.  */
125 .FLT_11:
126 .long 0x00000078,0x00000000,0x00000078,0x00000000,0x00000078,0x00000000,0x00000078,0x00000000
127 .type .FLT_11,@object
128 .size .FLT_11,32
129 .align 32
130
/* C-style description of the table layout below.  It is only emitted when
   __svml_datan_data_internal_avx512_typedef is defined; no line of this
   file defines that macro.  */
131 #ifdef __svml_datan_data_internal_avx512_typedef
132 typedef unsigned int VUINT32;
133 typedef struct {
134 __declspec(align(32)) VUINT32 AbsMask[4][2];
135 __declspec(align(32)) VUINT32 Shifter[4][2];
136 __declspec(align(32)) VUINT32 MaxThreshold[4][2];
137 __declspec(align(32)) VUINT32 MOne[4][2];
138 __declspec(align(32)) VUINT32 One[4][2];
139 __declspec(align(32)) VUINT32 LargeX[4][2];
140 __declspec(align(32)) VUINT32 Zero[4][2];
141 __declspec(align(32)) VUINT32 Tbl_H[32][2];
142 __declspec(align(32)) VUINT32 Tbl_L[32][2];
143 __declspec(align(32)) VUINT32 dIndexMed[4][2];
144 __declspec(align(32)) VUINT32 Pi2[4][2];
145 __declspec(align(32)) VUINT32 Pi2_low[4][2];
146 __declspec(align(32)) VUINT32 coeff[6][4][2];
147 } __svml_datan_data_internal_avx512;
148 #endif
149 __svml_datan_data_internal_avx512:
150 /*== AbsMask: every bit except the sign bit; x & AbsMask = |x| ==*/
151 .quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
152 /*== Shifter: 1.5 * 2^50; doubles of this magnitude are spaced 0.25 apart, so adding it rounds |x| to a multiple of 0.25 ==*/
153 .align 32
154 .quad 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000
155 /*== MaxThreshold: 7.875; lanes with |x| >= this use -1/x and Pi2 ==*/
156 .align 32
157 .quad 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000
158 /*== MOne: -1.0 ==*/
159 .align 32
160 .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
161 /*== One: 1.0 ==*/
162 .align 32
163 .quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
164 /*== LargeX: 2^128; upper clamp for |x| before the division ==*/
165 .align 32
166 .quad 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000
167 /*== Zero: 0.0; not referenced by the code in this file ==*/
168 .align 32
169 .quad 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
170 /*== Tbl_H: 32 doubles; entry k decodes to atan(k/4) (spot-checked for k = 0, 1, 2, 4 and 31). The code reads entries 0-15 at -128(%rdi) and entries 16-31 at 0(%rdi). ==*/
171 .align 32
172 .quad 0x0000000000000000, 0x3fcf5b75f92c80dd
173 .quad 0x3fddac670561bb4f, 0x3fe4978fa3269ee1
174 .quad 0x3fe921fb54442d18, 0x3fecac7c57846f9e
175 .quad 0x3fef730bd281f69b, 0x3ff0d38f2c5ba09f
176 .quad 0x3ff1b6e192ebbe44, 0x3ff270ef55a53a25
177 .quad 0x3ff30b6d796a4da8, 0x3ff38d6a6ce13353
178 .quad 0x3ff3fc176b7a8560, 0x3ff45b54837351a0
179 .quad 0x3ff4ae10fc6589a5, 0x3ff4f68dea672617
180 .quad 0x3ff5368c951e9cfd, 0x3ff56f6f33a3e6a7
181 .quad 0x3ff5a25052114e60, 0x3ff5d013c41adabd
182 .quad 0x3ff5f97315254857, 0x3ff61f06c6a92b89
183 .quad 0x3ff6414d44094c7c, 0x3ff660b02c736a06
184 .quad 0x3ff67d8863bc99bd, 0x3ff698213a9d5053
185 .quad 0x3ff6b0bae830c070, 0x3ff6c78c7edeb195
186 .quad 0x3ff6dcc57bb565fd, 0x3ff6f08f07435fec
187 .quad 0x3ff7030cf9403197, 0x3ff7145eac2088a4
188 /*== Tbl_L: 32 doubles of much smaller magnitude, presumably the low-order parts of the Tbl_H entries; not referenced by the code in this file ==*/
189 .align 32
190 .quad 0x0000000000000000, 0x3c68ab6e3cf7afbd
191 .quad 0x3c7a2b7f222f65e2, 0x3c72419a87f2a458
192 .quad 0x3c81a62633145c07, 0x3c80dae13ad18a6b
193 .quad 0x3c7007887af0cbbd, 0xbc9bd0dc231bfd70
194 .quad 0x3c9b1b466a88828e, 0xbc9a66b1af5f84fb
195 .quad 0x3c96254cb03bb199, 0xbc812c77e8a80f5c
196 .quad 0xbc4441a3bd3f1084, 0x3c79e4a72eedacc4
197 .quad 0xbc93b03e8a27f555, 0x3c9934f9f2b0020e
198 .quad 0xbc996f47948a99f1, 0xbc7df6edd6f1ec3b
199 .quad 0x3c78c2d0c89de218, 0x3c9f82bba194dd5d
200 .quad 0xbc831151a43b51ca, 0xbc8487d50bceb1a5
201 .quad 0xbc9c5f60a65c7397, 0xbc7acb6afb332a0f
202 .quad 0xbc99b7bd2e1e8c9c, 0xbc9b9839085189e3
203 .quad 0xbc97d1ab82ffb70b, 0x3c99239ad620ffe2
204 .quad 0xbc929c86447928e7, 0xbc8957a7170df016
205 .quad 0xbc7cbe1896221608, 0xbc9fda5797b32a0b
206 /*== dIndexMed: bit pattern of Shifter plus 0x10, i.e. (|x| + Shifter) with table index 16; values at or above it select Tbl_H entries 16-31 ==*/
207 .align 32
208 .quad 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010
209 /*== Pi2: pi/2 rounded to double; used in place of the table value when |x| >= MaxThreshold ==*/
210 .align 32
211 .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
212 /*== Pi2_low: presumably the low-order part of pi/2; not referenced by the code in this file ==*/
213 .align 32
214 .quad 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07
215 /*== coeff: six polynomial coefficients c0..c5, one row of 4 identical lanes each, highest degree first; P(t) = c0*t^5 + c1*t^4 + c2*t^3 + c3*t^2 + c4*t + c5 with t = R^2 ==*/
216 .align 32
217 .quad 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97
218 .quad 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc
219 .quad 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0
220 .quad 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da
221 .quad 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e
222 .quad 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d
223 .align 32
224 .type __svml_datan_data_internal_avx512,@object
225 .size __svml_datan_data_internal_avx512,.-__svml_datan_data_internal_avx512