/* Function atan2 vectorized with SSE4.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *      For 0.0       <= x <= 7.0/16.0:  atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
 *      For 7.0/16.0  <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
 *      For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
 *      For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
 *      For 39.0/16.0 <= x <= inf      : atan(x) = atan(inf) + atan(s), where s=-1.0/x
 *      Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
 *
 */
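
/* NOTE (illustrative, not part of the original sources): the vector code
   below reduces atan2(y, x) to one atan evaluation per lane.  A rough scalar
   C sketch of that reduction, with hypothetical names and with atan()
   standing in for the dA19..dA01 polynomial stored in the table at the end
   of this file (zero/NaN inputs go to the special-value paths instead),
   could look like:

     #include <math.h>

     static double
     atan2_reduction_sketch (double y, double x)
     {
       double ay = fabs (y), ax = fabs (x);
       double r, base;
       if (ay < ax)
         {
           r = ay / ax;                // 1) |y| <  |x|: base angle 0
           base = 0.0;
         }
       else
         {
           r = -ax / ay;               // 2) |y| >= |x|: base angle pi/2
           base = M_PI_2;
         }
       double res = base + atan (r);   // kernel: r + r*r^2*Poly(r^2)
       if (x < 0.0)
         res = M_PI - res;             // quadrant correction for x < 0
       return copysign (res, y);       // restore the sign of y
     }
 */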

/* Offsets for data table __svml_datan2_data_internal.  */
#define dPI			0
#define dPIO2			16
#define dA19			32
#define dA18			48
#define dA17			64
#define dA16			80
#define dA15			96
#define dA14			112
#define dA13			128
#define dA12			144
#define dA11			160
#define dA10			176
#define dA09			192
#define dA08			208
#define dA07			224
#define dA06			240
#define dA05			256
#define dA04			272
#define dA03			288
#define dA02			304
#define dA01			320
#define dA00			336
#define dSIGN_MASK		352
#define iCHK_WORK_SUB		368
#define iCHK_WORK_CMP		384
#define dABS_MASK		400
#define dZERO			416

#include <sysdep.h>

	.text
	.section .text.sse4,"ax",@progbits
ENTRY(_ZGVbN2vv_atan2_sse4)
	subq	$88, %rsp
	cfi_def_cfa_offset(96)
	movaps	%xmm0, %xmm8

/*
 * #define NO_VECTOR_ZERO_ATAN2_ARGS
 *  Declarations
 *  Variables
 *  Constants
 *  The end of declarations
 *  Implementation
 *  Get r0~=1/B
 *  Cannot be replaced by VQRCP(D, dR0, dB);
 *  Argument Absolute values
 */
	movups	dABS_MASK+__svml_datan2_data_internal(%rip), %xmm4
	movaps	%xmm1, %xmm9
	movaps	%xmm4, %xmm1
	andps	%xmm8, %xmm4
	andps	%xmm9, %xmm1
	movaps	%xmm4, %xmm2
	cmpnltpd %xmm1, %xmm2

/* Argument signs */
	movups	dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm3
	movaps	%xmm2, %xmm0
	movups	dPIO2+__svml_datan2_data_internal(%rip), %xmm5
	movaps	%xmm3, %xmm7
	movaps	%xmm3, %xmm6

/*
 * 1) If y<x then a=y, b=x, PIO2=0
 * 2) If y>x then a=-x, b=y, PIO2=Pi/2
 */
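/* (A hedged reading of the mask blends below: after cmpnltpd, %xmm2 holds the
   |y| >= |x| mask, so the and/andn/or sequence computes
       a    = (|y| >= |x|) ? -|x| : |y|
       b    = (|y| >= |x|) ?  |y| : |x|
       PIO2 = (|y| >= |x|) ?  Pi/2 : 0
   and the divpd below forms the reduced argument r = a / b.)  */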
	orps	%xmm1, %xmm3
	movaps	%xmm2, %xmm10
	andps	%xmm2, %xmm5
	andnps	%xmm4, %xmm0
	andps	%xmm2, %xmm3
	andnps	%xmm1, %xmm10
	andps	%xmm4, %xmm2
	orps	%xmm3, %xmm0
	orps	%xmm2, %xmm10
	divpd	%xmm10, %xmm0
	movq	iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm11

/* if x<0, dPI = Pi, else dPI = 0 */
	movaps	%xmm9, %xmm3

/* Check if y and x are on main path. */
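/* (A hedged note on the check: the high 32 bits of |y| and |x| are biased by
   subtracting iCHK_WORK_SUB and compared against iCHK_WORK_CMP, so lanes with
   very small (zero/subnormal), very large, Inf or NaN inputs raise the
   callout mask and are reprocessed on the special-values path.)  */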
	pshufd	$221, %xmm1, %xmm12
	andps	%xmm9, %xmm7
	psubd	%xmm11, %xmm12
	andps	%xmm8, %xmm6
	movq	iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm13
	xorl	%edx, %edx
	movups	%xmm4, 16(%rsp)
	xorl	%eax, %eax
	pshufd	$221, %xmm4, %xmm14
	movdqa	%xmm12, %xmm4
	pcmpgtd	%xmm13, %xmm4
	pcmpeqd	%xmm13, %xmm12
	por	%xmm12, %xmm4

/* Polynomial. */
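/* (The polynomial in r^2 with coefficients dA19..dA01 is evaluated as four
   interleaved Horner strands stepping by r^8: r^2 is built in %xmm12, r^4 in
   %xmm14 and r^8 in %xmm15; the strands are recombined before the
   reconstruction step, and dA00 = 1.0 is folded into the final R + R*dP.)  */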
	movaps	%xmm0, %xmm12
	mulpd	%xmm0, %xmm12
	cmplepd	dZERO+__svml_datan2_data_internal(%rip), %xmm3
	psubd	%xmm11, %xmm14
	movdqa	%xmm14, %xmm15
	pcmpeqd	%xmm13, %xmm14
	pcmpgtd	%xmm13, %xmm15
	por	%xmm14, %xmm15
	movaps	%xmm12, %xmm14
	mulpd	%xmm12, %xmm14
	por	%xmm15, %xmm4
	movaps	%xmm14, %xmm15
	mulpd	%xmm14, %xmm15
	movmskps %xmm4, %ecx
	movups	%xmm10, (%rsp)
	movups	dA19+__svml_datan2_data_internal(%rip), %xmm10
	mulpd	%xmm15, %xmm10
	movups	dA18+__svml_datan2_data_internal(%rip), %xmm13
	movups	dA17+__svml_datan2_data_internal(%rip), %xmm11
	addpd	dA15+__svml_datan2_data_internal(%rip), %xmm10
	mulpd	%xmm15, %xmm13
	mulpd	%xmm15, %xmm11
	mulpd	%xmm15, %xmm10
	addpd	dA14+__svml_datan2_data_internal(%rip), %xmm13
	addpd	dA13+__svml_datan2_data_internal(%rip), %xmm11
	addpd	dA11+__svml_datan2_data_internal(%rip), %xmm10
	mulpd	%xmm15, %xmm13
	mulpd	%xmm15, %xmm11
	mulpd	%xmm15, %xmm10
	addpd	dA10+__svml_datan2_data_internal(%rip), %xmm13
	addpd	dA09+__svml_datan2_data_internal(%rip), %xmm11
	addpd	dA07+__svml_datan2_data_internal(%rip), %xmm10
	mulpd	%xmm15, %xmm13
	mulpd	%xmm15, %xmm11
	mulpd	%xmm15, %xmm10
	addpd	dA06+__svml_datan2_data_internal(%rip), %xmm13
	addpd	dA05+__svml_datan2_data_internal(%rip), %xmm11
	addpd	dA03+__svml_datan2_data_internal(%rip), %xmm10
	mulpd	%xmm15, %xmm13
	mulpd	%xmm15, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	dA02+__svml_datan2_data_internal(%rip), %xmm13
	addpd	dA01+__svml_datan2_data_internal(%rip), %xmm11
	addpd	%xmm10, %xmm13
	mulpd	%xmm11, %xmm12
	mulpd	%xmm13, %xmm14
	movups	dA16+__svml_datan2_data_internal(%rip), %xmm2
	mulpd	%xmm15, %xmm2
	addpd	dA12+__svml_datan2_data_internal(%rip), %xmm2
	mulpd	%xmm15, %xmm2
	addpd	dA08+__svml_datan2_data_internal(%rip), %xmm2
	mulpd	%xmm15, %xmm2
	addpd	dA04+__svml_datan2_data_internal(%rip), %xmm2

/* A00=1.0, account for it later VQFMA(D, dP4, dP4, dR8, dA00); */
	mulpd	%xmm2, %xmm15
	addpd	%xmm12, %xmm15
	addpd	%xmm14, %xmm15

/*
 * Reconstruction.
 * dP=(R+R*dP) + dPIO2
 */
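/* (A hedged summary of the rest of the reconstruction: the sign of x is ORed
   into the nonnegative partial result, Pi is added for negative x via the
   dPI term selected in %xmm3, and the sign of y is ORed in last, i.e.
   res = copysign ((x < 0) ? Pi - v : v, y) with v the value built above;
   zero/NaN lanes are fixed up on the special paths.)  */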
	mulpd	%xmm0, %xmm15
	addpd	%xmm15, %xmm0
	addpd	%xmm5, %xmm0
	andps	__svml_datan2_data_internal(%rip), %xmm3
	orps	%xmm7, %xmm0
	addpd	%xmm3, %xmm0

/* Special branch for fast (vector) processing of zero arguments */
	movups	16(%rsp), %xmm11
	orps	%xmm6, %xmm0
	testb	$3, %cl

/* Go to auxiliary branch */
	jne	L(AUX_BRANCH)
	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm11

/* Return from auxiliary branch
 * for out of main path inputs
 */

L(AUX_BRANCH_RETURN):
/*
 * Special branch for fast (vector) processing of zero arguments
 * The end of implementation
 */
	testl	%edx, %edx

/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9

/* Restore registers
 * and exit the function
 */

L(EXIT):
	addq	$88, %rsp
	cfi_def_cfa_offset(8)
	ret
	cfi_def_cfa_offset(96)

/* Branch to process
 * special inputs
 */

L(SPECIAL_VALUES_BRANCH):
	movups	%xmm8, 32(%rsp)
	movups	%xmm9, 48(%rsp)
	movups	%xmm0, 64(%rsp)
	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0

	movq	%r12, 16(%rsp)
	cfi_offset(12, -80)
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	cfi_offset(13, -88)
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	cfi_offset(14, -96)
	# LOE rbx rbp r15 r12d r13d

/* Range mask
 * bits check
 */
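/* (Each set bit in the range mask (%r13d) marks a lane whose original y and
   x are reloaded from the stack slots saved above and handed to the scalar
   atan2; the loop below walks both lanes of the vector.)  */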

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d

/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx rbp r15 r12d r13d

/* Special inputs
 * processing loop
 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$2, %r12d

/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx rbp r15 r12d r13d

	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	movups	64(%rsp), %xmm0

/* Go to exit */
	jmp	L(EXIT)
	cfi_offset(12, -80)
	cfi_offset(13, -88)
	cfi_offset(14, -96)
	# LOE rbx rbp r12 r13 r14 r15 xmm0

/* Scalar math function call
 * to process special input
 */

L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d
	movsd	32(%rsp, %r14, 8), %xmm0
	movsd	48(%rsp, %r14, 8), %xmm1
	call	atan2@PLT
	# LOE rbx rbp r14 r15 r12d r13d xmm0

	movsd	%xmm0, 64(%rsp, %r14, 8)

/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	cfi_restore(12)
	cfi_restore(13)
	cfi_restore(14)
	# LOE rbx rbp r15 r12d r13d

/* Auxiliary branch
 * for out of main path inputs
 */

L(AUX_BRANCH):
/* Check if at least one of X or Y is zero: iAXAYZERO */
	movups	dZERO+__svml_datan2_data_internal(%rip), %xmm2

/* Check if both X & Y are not NaNs: iXYnotNAN */
	movaps	%xmm9, %xmm12
	movaps	%xmm8, %xmm10
	cmpordpd %xmm9, %xmm12
	cmpordpd %xmm8, %xmm10
	cmpeqpd	%xmm2, %xmm1
	cmpeqpd	%xmm2, %xmm11
	andps	%xmm10, %xmm12
	orps	%xmm11, %xmm1
	pshufd	$221, %xmm1, %xmm1
	pshufd	$221, %xmm12, %xmm11

/* Check if at least one of X or Y is zero and not NaN: iAXAYZEROnotNAN */
	pand	%xmm11, %xmm1

/* Exclude from previous callout mask zero (and not NaN) arguments */
	movdqa	%xmm1, %xmm13
	pandn	%xmm4, %xmm13

/*
 * Path for zero arguments (at least one of both)
 * Check if both args are zeros (den. is zero)
 */
	movups	(%rsp), %xmm4
	cmpeqpd	%xmm2, %xmm4

/* Go to callout */
	movmskps %xmm13, %edx

/* Set sPIO2 to zero if den. is zero */
	movaps	%xmm4, %xmm15
	andps	%xmm2, %xmm4
	andnps	%xmm5, %xmm15
	andl	$3, %edx
	orps	%xmm4, %xmm15
	pshufd	$221, %xmm9, %xmm5
	orps	%xmm7, %xmm15

/* Res = sign(Y) * ((X<0) ? (PIO2+PI) : PIO2) */
	pshufd	$221, %xmm2, %xmm7
	pcmpgtd	%xmm5, %xmm7
	pshufd	$80, %xmm7, %xmm14
	andps	%xmm3, %xmm14
	addpd	%xmm14, %xmm15

/* Merge results from main and spec path */
	pshufd	$80, %xmm1, %xmm3
	orps	%xmm6, %xmm15
	movdqa	%xmm3, %xmm6
	andps	%xmm3, %xmm15
	andnps	%xmm0, %xmm6
	movaps	%xmm6, %xmm0
	orps	%xmm15, %xmm0

/* Return to main vector processing path */
	jmp	L(AUX_BRANCH_RETURN)
	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
END(_ZGVbN2vv_atan2_sse4)

	.section .rodata, "a"
	.align	16

#ifdef __svml_datan2_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(16)) VUINT32 dPI[2][2];
	__declspec(align(16)) VUINT32 dPIO2[2][2];
	__declspec(align(16)) VUINT32 dA19[2][2];
	__declspec(align(16)) VUINT32 dA18[2][2];
	__declspec(align(16)) VUINT32 dA17[2][2];
	__declspec(align(16)) VUINT32 dA16[2][2];
	__declspec(align(16)) VUINT32 dA15[2][2];
	__declspec(align(16)) VUINT32 dA14[2][2];
	__declspec(align(16)) VUINT32 dA13[2][2];
	__declspec(align(16)) VUINT32 dA12[2][2];
	__declspec(align(16)) VUINT32 dA11[2][2];
	__declspec(align(16)) VUINT32 dA10[2][2];
	__declspec(align(16)) VUINT32 dA09[2][2];
	__declspec(align(16)) VUINT32 dA08[2][2];
	__declspec(align(16)) VUINT32 dA07[2][2];
	__declspec(align(16)) VUINT32 dA06[2][2];
	__declspec(align(16)) VUINT32 dA05[2][2];
	__declspec(align(16)) VUINT32 dA04[2][2];
	__declspec(align(16)) VUINT32 dA03[2][2];
	__declspec(align(16)) VUINT32 dA02[2][2];
	__declspec(align(16)) VUINT32 dA01[2][2];
	__declspec(align(16)) VUINT32 dA00[2][2];
	__declspec(align(16)) VUINT32 dSIGN_MASK[2][2];
	__declspec(align(16)) VUINT32 iCHK_WORK_SUB[4][1];
	__declspec(align(16)) VUINT32 iCHK_WORK_CMP[4][1];
	__declspec(align(16)) VUINT32 dABS_MASK[2][2];
	__declspec(align(16)) VUINT32 dZERO[2][2];
} __svml_datan2_data_internal;
#endif
__svml_datan2_data_internal:
	.quad	0x400921FB54442D18, 0x400921FB54442D18 // dPI
	.align	16
	.quad	0x3FF921FB54442D18, 0x3FF921FB54442D18 // dPIO2
	.align	16
	.quad	0xBEF4FDB537ABC7A3, 0xBEF4FDB537ABC7A3 // dA19
	.align	16
	.quad	0x3F2CED0A36665209, 0x3F2CED0A36665209 // dA18
	.align	16
	.quad	0xBF52E67C93954C23, 0xBF52E67C93954C23 // dA17
	.align	16
	.quad	0x3F6F5A1DAE82AFB3, 0x3F6F5A1DAE82AFB3 // dA16
	.align	16
	.quad	0xBF82B2EC618E4BAD, 0xBF82B2EC618E4BAD // dA15
	.align	16
	.quad	0x3F914F4C661116A5, 0x3F914F4C661116A5 // dA14
	.align	16
	.quad	0xBF9A5E83B081F69C, 0xBF9A5E83B081F69C // dA13
	.align	16
	.quad	0x3FA169980CB6AD4F, 0x3FA169980CB6AD4F // dA12
	.align	16
	.quad	0xBFA4EFA2E563C1BC, 0xBFA4EFA2E563C1BC // dA11
	.align	16
	.quad	0x3FA7EC0FBC50683B, 0x3FA7EC0FBC50683B // dA10
	.align	16
	.quad	0xBFAAD261EAA09954, 0xBFAAD261EAA09954 // dA09
	.align	16
	.quad	0x3FAE1749BD612DCF, 0x3FAE1749BD612DCF // dA08
	.align	16
	.quad	0xBFB11084009435E0, 0xBFB11084009435E0 // dA07
	.align	16
	.quad	0x3FB3B12A49295651, 0x3FB3B12A49295651 // dA06
	.align	16
	.quad	0xBFB745D009BADA94, 0xBFB745D009BADA94 // dA05
	.align	16
	.quad	0x3FBC71C707F7D5B5, 0x3FBC71C707F7D5B5 // dA04
	.align	16
	.quad	0xBFC2492491EE55C7, 0xBFC2492491EE55C7 // dA03
	.align	16
	.quad	0x3FC999999997EE34, 0x3FC999999997EE34 // dA02
	.align	16
	.quad	0xBFD55555555553C5, 0xBFD55555555553C5 // dA01
	.align	16
	.quad	0x3FF0000000000000, 0x3FF0000000000000 // dA00
	.align	16
	.quad	0x8000000000000000, 0x8000000000000000 // dSIGN_MASK
	.align	16
	.long	0x80300000, 0x80300000, 0x80300000, 0x80300000 // iCHK_WORK_SUB
	.align	16
	.long	0xfdd00000, 0xfdd00000, 0xfdd00000, 0xfdd00000 // iCHK_WORK_CMP
	.align	16
	.quad	0x7fffffffffffffff, 0x7fffffffffffffff // dABS_MASK
	.align	16
	.quad	0x0000000000000000, 0x0000000000000000 // dZERO
	.align	16
	.type	__svml_datan2_data_internal, @object
	.size	__svml_datan2_data_internal, .-__svml_datan2_data_internal