1 /* Function atan2f vectorized with AVX-512.
2 Copyright (C) 2021-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 * ALGORITHM DESCRIPTION:
21 * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
22 * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
23 * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
24 * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
25 * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
26 * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
31 /* Offsets for data table __svml_satan2_data_internal
/* Each table row is 16 x 32-bit lanes = 64 bytes, so every offset below
   is a multiple of 64 (sSIGN_MASK is the 3rd row, iCHK_WORK_SUB the 16th,
   iCHK_WORK_CMP the 17th — matching the rodata layout at end of file).  */
35 #define sSIGN_MASK 128
48 #define iCHK_WORK_SUB 960
49 #define iCHK_WORK_CMP 1024
/*
 * _ZGVeN16vv_atan2f_skx — AVX-512 vectorized atan2f over 16 packed floats.
 * By the time the visible code runs, %zmm0 holds y and %zmm8 holds x
 * (the "if x<0" test below compares %zmm8 against zero); the entry-point
 * register shuffling is outside this fragment — NOTE(review): confirm
 * against the full file.  The 16 results are returned in %zmm0.
 */
53 .section .text.evex512, "ax", @progbits
54 ENTRY(_ZGVeN16vv_atan2f_skx)
56 cfi_def_cfa_offset(16)
65 * #define NO_VECTOR_ZERO_ATAN2_ARGS
69 * The end of declarations
/* Load the |value| mask (0x7FFFFFFF per lane) and the 1.0f constant. */
73 vmovups sABS_MASK+__svml_satan2_data_internal(%rip), %zmm6
74 vmovups sONE+__svml_satan2_data_internal(%rip), %zmm3
76 /* Testing on working interval. */
77 vmovups iCHK_WORK_SUB+__svml_satan2_data_internal(%rip), %zmm9
78 vmovups iCHK_WORK_CMP+__svml_satan2_data_internal(%rip), %zmm14
81 * 1) If y<x then a= y, b=x, PIO2=0
82 * 2) If y>x then a=-x, b=y, PIO2=Pi/2
84 vmovups sPIO2+__svml_satan2_data_internal(%rip), %zmm4
/* vpternlogd with imm8=0xFF and identical operands sets every bit:
   the standard AVX-512 "all-ones" idiom. */
85 vpternlogd $255, %zmm13, %zmm13, %zmm13
/* zmm2 = |x|, zmm1 = |y|; zmm7/zmm6 capture the original sign bits
   of x and y respectively (value XOR |value| leaves only the sign). */
87 vandps %zmm6, %zmm8, %zmm2
88 vandps %zmm6, %zmm0, %zmm1
/* zmm5 = -|x| (|x| with the sign bit forced on), the "a=-x" candidate. */
89 vorps sSIGN_MASK+__svml_satan2_data_internal(%rip), %zmm2, %zmm5
/* Integer-bias |x| and |y| by iCHK_WORK_SUB for the range test below. */
90 vpsubd %zmm9, %zmm2, %zmm10
91 vpsubd %zmm9, %zmm1, %zmm12
92 vxorps %zmm2, %zmm8, %zmm7
93 vxorps %zmm1, %zmm0, %zmm6
/* k1 = |y| < |x| (predicate 17 = LT_OQ, {sae} suppresses exceptions). */
94 vcmpps $17, {sae}, %zmm2, %zmm1, %k1
/* k2/k3 flag lanes whose biased |x| / |y| lie inside the fast-path
   working range (iCHK_WORK_CMP > biased value, signed compare). */
95 vpcmpgtd %zmm10, %zmm14, %k2
96 vpcmpgtd %zmm12, %zmm14, %k3
97 vmovups sPC6+__svml_satan2_data_internal(%rip), %zmm14
/* Select numerator a (zmm11) and denominator b (zmm5) per the rule
   above, and zero the Pi/2 addend in lanes where |y| < |x|. */
98 vblendmps %zmm1, %zmm5, %zmm11{%k1}
99 vblendmps %zmm2, %zmm1, %zmm5{%k1}
100 vxorps %zmm4, %zmm4, %zmm4{%k1}
104 * Enabled when FMA is available and
105 * performance is better with NR iteration
/* Divide a/b without vdivps: 14-bit reciprocal estimate of b,
   one Newton-Raphson refinement (e = 1 - b*r; r += r*e), then a
   second FMA correction applied to the quotient q = a*r.
   Result: zmm15 = s = a/b to ~full single precision. */
107 vrcp14ps %zmm5, %zmm15
108 vfnmadd231ps {rn-sae}, %zmm5, %zmm15, %zmm3
109 vfmadd213ps {rn-sae}, %zmm15, %zmm3, %zmm15
110 vmulps {rn-sae}, %zmm15, %zmm11, %zmm3
111 vfnmadd231ps {rn-sae}, %zmm5, %zmm3, %zmm11
112 vfmadd213ps {rn-sae}, %zmm3, %zmm11, %zmm15
113 vmovups sPC8+__svml_satan2_data_internal(%rip), %zmm11
114 vpternlogd $255, %zmm3, %zmm3, %zmm3
/* Powers of s for the polynomial: zmm9 = s^2, zmm10 = s^4. */
117 vmulps {rn-sae}, %zmm15, %zmm15, %zmm9
/* Merge-masked ANDN of a register with itself writes 0 to the lanes
   selected by the mask: in-range lanes (k2/k3) are cleared while
   out-of-range lanes keep the all-ones pattern set above. */
118 vpandnd %zmm10, %zmm10, %zmm13{%k2}
119 vmulps {rn-sae}, %zmm9, %zmm9, %zmm10
120 vfmadd231ps {rn-sae}, %zmm10, %zmm11, %zmm14
121 vmovups sPC5+__svml_satan2_data_internal(%rip), %zmm11
122 vpandnd %zmm12, %zmm12, %zmm3{%k3}
/* zmm3 = callout mask: all-ones in lanes where either input fell
   outside the working range and needs the scalar fallback. */
123 vpord %zmm3, %zmm13, %zmm3
124 vmovups sPC4+__svml_satan2_data_internal(%rip), %zmm13
125 vmovups sPC7+__svml_satan2_data_internal(%rip), %zmm12
/* k0 != 0 iff any lane needs the callout; presumably transferred to
   edx by elided code before the jne below — TODO confirm in full file. */
126 vptestmd %zmm3, %zmm3, %k0
/* Two interleaved Horner chains in s^4 (even/odd coefficients),
   merged by the final FMA in s^2. */
127 vfmadd213ps {rn-sae}, %zmm13, %zmm10, %zmm14
128 vfmadd231ps {rn-sae}, %zmm10, %zmm12, %zmm11
129 vmovups sPC3+__svml_satan2_data_internal(%rip), %zmm12
130 vmovups sPC2+__svml_satan2_data_internal(%rip), %zmm13
132 /* Special branch for fast (vector) processing of zero arguments */
134 vfmadd213ps {rn-sae}, %zmm12, %zmm10, %zmm11
135 vmovups sPC1+__svml_satan2_data_internal(%rip), %zmm12
136 vfmadd213ps {rn-sae}, %zmm13, %zmm10, %zmm14
137 vmovups sPC0+__svml_satan2_data_internal(%rip), %zmm13
138 vfmadd213ps {rn-sae}, %zmm12, %zmm10, %zmm11
139 vfmadd213ps {rn-sae}, %zmm13, %zmm10, %zmm14
140 vfmadd213ps {rn-sae}, %zmm14, %zmm9, %zmm11
142 /* Reconstruction. */
/* atan(a/b) = s * Poly(s^2) + PIO2-addend (zmm4). */
143 vfmadd213ps {rn-sae}, %zmm4, %zmm15, %zmm11
145 /* if x<0, sPI = Pi, else sPI =0 */
/* Table offset 0 is sZERO: zmm15 = 0.0 per lane. */
146 vmovups __svml_satan2_data_internal(%rip), %zmm15
/* Re-apply the quadrant sign saved in zmm7 (sign of x path). */
147 vorps %zmm7, %zmm11, %zmm9
/* k4 = x <= 0 (predicate 18 = LE_OS family, ordered): add Pi there. */
148 vcmpps $18, {sae}, %zmm15, %zmm8, %k4
149 vmovups sPI+__svml_satan2_data_internal(%rip), %zmm11
150 vaddps {rn-sae}, %zmm11, %zmm9, %zmm9{%k4}
/* Apply the sign of y (zmm6): zmm10 = final fast-path result. */
151 vorps %zmm6, %zmm9, %zmm10
153 /* Go to auxiliary branch */
155 # LOE rbx r12 r13 r14 r15 edx zmm0 zmm1 zmm2 zmm3 zmm4 zmm5 zmm6 zmm7 zmm8 zmm10 zmm11
157 /* Return from auxiliary branch
158 * for out of main path inputs
161 L(AUX_BRANCH_RETURN):
163 * Special branch for fast (vector) processing of zero arguments
164 * The end of implementation
168 /* Go to special inputs processing branch */
/* Flags come from an elided test of the callout mask — TODO confirm. */
169 jne L(SPECIAL_VALUES_BRANCH)
170 # LOE rbx r12 r13 r14 r15 edx zmm0 zmm8 zmm10
173 * and exit the function
/* Fast path done: move the packed results into the return register. */
177 vmovaps %zmm10, %zmm0
190 L(SPECIAL_VALUES_BRANCH):
/* Spill y (zmm0), x (zmm8) and the vector result (zmm10) so the
   scalar fallback loop below can patch individual lanes in memory. */
191 vmovups %zmm0, 64(%rsp)
192 vmovups %zmm8, 128(%rsp)
193 vmovups %zmm10, 192(%rsp)
194 # LOE rbx r12 r13 r14 r15 edx zmm10
197 # LOE rbx r12 r13 r14 r15 eax edx
201 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -240; DW_OP_plus) */
202 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x10, 0xff, 0xff, 0xff, 0x22
205 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -248; DW_OP_plus) */
206 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x08, 0xff, 0xff, 0xff, 0x22
209 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -256; DW_OP_plus) */
210 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x00, 0xff, 0xff, 0xff, 0x22
211 # LOE rbx r15 r12d r13d
220 /* Call scalar math function */
221 jc L(SCALAR_MATH_CALL)
222 # LOE rbx r15 r12d r13d
228 L(SPECIAL_VALUES_LOOP):
232 /* Check bits in range mask */
233 jl L(RANGEMASK_CHECK)
234 # LOE rbx r15 r12d r13d
/* All special lanes patched: reload the fixed-up result vector. */
242 vmovups 192(%rsp), %zmm10
246 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -240; DW_OP_plus) */
247 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x10, 0xff, 0xff, 0xff, 0x22
248 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -248; DW_OP_plus) */
249 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x08, 0xff, 0xff, 0xff, 0x22
250 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -256; DW_OP_plus) */
251 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x00, 0xff, 0xff, 0xff, 0x22
252 # LOE rbx r12 r13 r14 r15 zmm10
254 /* Scalar math function call
255 * to process special input
/* Fetch lane %r14 of the spilled y and x as scalar call arguments;
   the call itself (presumably atan2f) is elided from this fragment. */
260 vmovss 64(%rsp, %r14, 4), %xmm0
261 vmovss 128(%rsp, %r14, 4), %xmm1
263 # LOE rbx r14 r15 r12d r13d xmm0
/* Store the scalar result back into lane %r14 of the result vector. */
265 vmovss %xmm0, 192(%rsp, %r14, 4)
267 /* Process special inputs in loop */
268 jmp L(SPECIAL_VALUES_LOOP)
272 # LOE rbx r15 r12d r13d
275 * for out of main path inputs
279 /* Check if at least one of X or Y is zero: iAXAYZERO */
280 vmovups __svml_satan2_data_internal(%rip), %zmm9
282 /* Check if both X & Y are not NaNs: iXYnotNAN */
/* Predicate 3 = UNORD_Q: k1/k2 flag NaN lanes of x and y. */
283 vcmpps $3, {sae}, %zmm8, %zmm8, %k1
284 vcmpps $3, {sae}, %zmm0, %zmm0, %k2
/* Predicate 4 = NEQ: k3/k4 flag lanes where |x| / |y| is nonzero. */
285 vpcmpd $4, %zmm9, %zmm2, %k3
286 vpcmpd $4, %zmm9, %zmm1, %k4
289 * Path for zero arguments (at least one of both)
290 * Check if both args are zeros (den. is zero)
292 vcmpps $4, {sae}, %zmm9, %zmm5, %k5
294 /* Res = sign(Y)*(X<0)?(PIO2+PI):PIO2 */
/* Signed integer 0 > x is true exactly when x's sign bit is set. */
295 vpcmpgtd %zmm8, %zmm9, %k6
/* Build all-ones patterns, then clear the k-selected lanes with the
   merge-masked self-ANDN idiom (see main path) to form bit masks. */
296 vpternlogd $255, %zmm14, %zmm14, %zmm14
297 vpternlogd $255, %zmm12, %zmm12, %zmm12
298 vpternlogd $255, %zmm13, %zmm13, %zmm13
299 vpandnd %zmm2, %zmm2, %zmm14{%k3}
300 vpternlogd $255, %zmm2, %zmm2, %zmm2
301 vpandnd %zmm1, %zmm1, %zmm2{%k4}
/* zmm15 = all-ones where |x|==0 or |y|==0 (iAXAYZERO). */
302 vpord %zmm2, %zmm14, %zmm15
303 vpternlogd $255, %zmm2, %zmm2, %zmm2
/* zmm2 = all-ones where the denominator b is zero (both args zero). */
304 vpandnd %zmm5, %zmm5, %zmm2{%k5}
306 /* Set sPIO2 to zero if den. is zero */
307 vpandnd %zmm4, %zmm2, %zmm4
308 vpandd %zmm2, %zmm9, %zmm5
309 vpord %zmm5, %zmm4, %zmm2
310 vorps %zmm7, %zmm2, %zmm7
/* Add Pi in lanes where x < 0 (k6), then apply the sign of y. */
311 vaddps {rn-sae}, %zmm11, %zmm7, %zmm7{%k6}
312 vorps %zmm6, %zmm7, %zmm6
/* NaN lanes cleared: zmm12 = "x not NaN", zmm13 = "y not NaN". */
313 vpandnd %zmm8, %zmm8, %zmm12{%k1}
314 vpandnd %zmm0, %zmm0, %zmm13{%k2}
315 vandps %zmm13, %zmm12, %zmm12
317 /* Check if at least one of X or Y is zero and not NaN: iAXAYZEROnotNAN */
318 vpandd %zmm12, %zmm15, %zmm1
320 /* Exclude from previous callout mask zero (and not NaN) arguments */
321 vpandnd %zmm3, %zmm1, %zmm3
/* Re-test the reduced callout mask for any remaining special lanes. */
324 vptestmd %zmm3, %zmm3, %k0
327 /* Merge results from main and spec path */
/* Bit-select: zero-arg lanes take zmm6, the rest keep zmm10. */
328 vpandnd %zmm10, %zmm1, %zmm10
329 vpandd %zmm1, %zmm6, %zmm11
330 vpord %zmm11, %zmm10, %zmm10
332 /* Return to main vector processing path */
333 jmp L(AUX_BRANCH_RETURN)
334 # LOE rbx r12 r13 r14 r15 edx zmm0 zmm8 zmm10
335 END(_ZGVeN16vv_atan2f_skx)
/*
 * Constant table for the vector atan2f kernel.  Every row holds one
 * 32-bit constant replicated across all 16 lanes (64 bytes per row,
 * matching the byte offsets #defined at the top of the file).
 */
337 .section .rodata, "a"
340 #ifdef __svml_satan2_data_internal_typedef
/* Documentation-only C view of the table layout (never compiled). */
341 typedef unsigned int VUINT32;
343 __declspec(align(64)) VUINT32 sZERO[16][1];
344 __declspec(align(64)) VUINT32 sONE[16][1];
345 __declspec(align(64)) VUINT32 sSIGN_MASK[16][1];
346 __declspec(align(64)) VUINT32 sABS_MASK[16][1];
347 __declspec(align(64)) VUINT32 sPIO2[16][1];
348 __declspec(align(64)) VUINT32 sPI[16][1];
349 __declspec(align(64)) VUINT32 sPC8[16][1];
350 __declspec(align(64)) VUINT32 sPC7[16][1];
351 __declspec(align(64)) VUINT32 sPC6[16][1];
352 __declspec(align(64)) VUINT32 sPC5[16][1];
353 __declspec(align(64)) VUINT32 sPC4[16][1];
354 __declspec(align(64)) VUINT32 sPC3[16][1];
355 __declspec(align(64)) VUINT32 sPC2[16][1];
356 __declspec(align(64)) VUINT32 sPC1[16][1];
357 __declspec(align(64)) VUINT32 sPC0[16][1];
358 __declspec(align(64)) VUINT32 iCHK_WORK_SUB[16][1];
359 __declspec(align(64)) VUINT32 iCHK_WORK_CMP[16][1];
360 } __svml_satan2_data_internal;
362 __svml_satan2_data_internal:
363 .long 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // sZERO
365 .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 // sONE = 1.0f
367 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 // sSIGN_MASK
369 .long 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF // sABS_MASK
371 .long 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB // sPIO2 = pi/2 as float
373 .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB // sPI = pi as float
/* Polynomial coefficients for atan(s).  Renamed from the original
   Intel sA08..sA00 labels to sPC8..sPC0 so the comments match the
   offsets and the typedef above, which are what the code references. */
375 .long 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0 // sPC8
377 .long 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631 // sPC7
379 .long 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384 // sPC6
381 .long 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629 // sPC5
383 .long 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474 // sPC4
385 .long 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8 // sPC3
387 .long 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F // sPC2
389 .long 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49 // sPC1 (~ -1/3, leading atan series term)
391 .long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 // sPC0 = 1.0f
393 .long 0x81000000, 0x81000000, 0x81000000, 0x81000000, 0x81000000, 0x81000000, 0x81000000, 0x81000000, 0x81000000, 0x81000000, 0x81000000, 0x81000000, 0x81000000, 0x81000000, 0x81000000, 0x81000000 // iCHK_WORK_SUB
395 .long 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000 // iCHK_WORK_CMP
397 .type __svml_satan2_data_internal, @object
398 .size __svml_satan2_data_internal, .-__svml_satan2_data_internal