1 /* Function sincosf vectorized with AVX-512. KNL and SKX versions.
2 Copyright (C) 2014-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
20 #include "svml_s_trig_data.h"
21 #include "svml_s_wrapper_impl.h"
24 ALGORITHM DESCRIPTION:
26 1) Range reduction to [-Pi/4; +Pi/4] interval
27 a) Grab sign from source argument and save it.
28 b) Remove sign using AND operation
29 c) Getting octant Y by 2/Pi multiplication
30 d) Add "Right Shifter" value
31 e) Treat obtained value as integer S for destination sign setting.
32 SS = ((S-S&1)&2)<<30; For sin part
33 SC = ((S+S&1)&2)<<30; For cos part
34 f) Change destination sign if source sign is negative
36 g) Subtract "Right Shifter" (0x4B000000) value
37 h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 4 parts:
38 X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
39 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
40 a) Calculate X^2 = X * X
41 b) Calculate 2 polynomials for sin and cos:
42 RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
43 RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4))));
44 c) Swap RS & RC if first bit of obtained value after
45 Right Shifting is set to 1. Using And, Andnot & Or operations.
46 3) Destination sign setting
47 a) Set shifted destination sign using XOR operation:
49 R2 = XOR( RC, SC ). */
51 .section .text.evex512, "ax", @progbits
52 ENTRY (_ZGVeN16vl4l4_sincosf_knl)
/* 16-lane AVX-512 sincosf, KNL tuning.
   In the visible fast path: %zmm2 = 16 float arguments,
   %rdi = pointer to the 16-float sin results, %rsi = cos results.
   NOTE(review): this excerpt elides several original lines (the
   pushq %rbp paired with the CFI directives below, the branch that
   consumes %k0, the loop control around the per-lane fixups, and the
   setup of %edx used by the masked vpbroadcastd) — confirm against
   the full file before relying on the control flow described here.  */
54 cfi_adjust_cfa_offset (8)
55 cfi_rel_offset (%rbp, 0)
57 cfi_def_cfa_register (%rbp)
/* %rax = base of the libmvec single-precision trig constant table
   (PIC access through the GOT).  */
60 movq __svml_s_trig_data@GOTPCREL(%rip), %rax
63 vmovups __sAbsMask(%rax), %zmm0
64 vmovups __sInvPI(%rax), %zmm3
66 /* Absolute argument computation */
/* %zmm1 = |x|; the vpandnd below leaves only the sign bits of x in
   %zmm0 (applied to the sin result at the end).  */
67 vpandd %zmm0, %zmm2, %zmm1
68 vmovups __sPI1_FMA(%rax), %zmm5
69 vmovups __sSignMask(%rax), %zmm9
70 vpandnd %zmm2, %zmm0, %zmm0
72 /* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 3 parts:
73 X = X - Y*PI1 - Y*PI2 - Y*PI3 */
77 /* c) Getting octant Y by 2/Pi multiplication
78 d) Add "Right Shifter" value */
79 vfmadd213ps __sRShifter(%rax), %zmm1, %zmm3
80 vmovups __sPI3_FMA(%rax), %zmm7
82 /* g) Subtract "Right Shifter" (0x4B000000) value */
/* %zmm12 = Y, the rounded octant count.  */
83 vsubps __sRShifter(%rax), %zmm3, %zmm12
85 /* e) Treat obtained value as integer S for destination sign setting */
86 vpslld $31, %zmm3, %zmm13
87 vmovups __sA7_FMA(%rax), %zmm14
88 vfnmadd231ps %zmm12, %zmm5, %zmm6
90 /* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
91 a) Calculate X^2 = X * X
92 b) Calculate 2 polynomials for sin and cos:
93 RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
94 RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */
95 vmovaps %zmm14, %zmm15
96 vmovups __sA9_FMA(%rax), %zmm3
/* Flag lanes whose |x| exceeds the range-reduction threshold
   (predicate 22 = NLE_UQ); %k1 selects the special-case lanes.  */
97 vcmpps $22, __sRangeReductionVal(%rax), %zmm1, %k1
/* Zero-masked broadcast of %edx builds a nonzero marker in each
   special lane (value of %edx comes from an elided line).  */
98 vpbroadcastd %edx, %zmm1{%k1}{z}
99 vfnmadd231ps __sPI2_FMA(%rax), %zmm12, %zmm6
/* %k0 != 0 iff any lane needs the scalar fallback below; the branch
   testing %k0 is not visible in this excerpt.  */
100 vptestmd %zmm1, %zmm1, %k0
101 vpandd %zmm6, %zmm9, %zmm11
103 vpxord __sOneHalf(%rax), %zmm11, %zmm4
105 /* Result sign calculations */
/* vpternlogd imm 150 (0x96) is a three-way XOR:
   %zmm11 = %zmm11 ^ %zmm9 ^ %zmm13.  */
106 vpternlogd $150, %zmm13, %zmm9, %zmm11
108 /* Add correction term 0.5 for cos() part */
109 vaddps %zmm4, %zmm12, %zmm10
110 vfnmadd213ps %zmm6, %zmm7, %zmm12
111 vfnmadd231ps %zmm10, %zmm5, %zmm8
112 vpxord %zmm13, %zmm12, %zmm13
/* %zmm12 = X^2 for the sin-side polynomial.  */
113 vmulps %zmm13, %zmm13, %zmm12
114 vfnmadd231ps __sPI2_FMA(%rax), %zmm10, %zmm8
115 vfmadd231ps __sA9_FMA(%rax), %zmm12, %zmm15
116 vfnmadd213ps %zmm8, %zmm7, %zmm10
117 vfmadd213ps __sA5_FMA(%rax), %zmm12, %zmm15
118 vpxord %zmm11, %zmm10, %zmm5
/* %zmm4 = X^2 for the cos-side polynomial.  */
119 vmulps %zmm5, %zmm5, %zmm4
120 vfmadd213ps __sA3(%rax), %zmm12, %zmm15
121 vfmadd213ps %zmm14, %zmm4, %zmm3
122 vmulps %zmm12, %zmm15, %zmm14
123 vfmadd213ps __sA5_FMA(%rax), %zmm4, %zmm3
124 vfmadd213ps %zmm13, %zmm13, %zmm14
125 vfmadd213ps __sA3(%rax), %zmm4, %zmm3
/* Re-apply the saved argument sign bits to the sin result.  */
126 vpxord %zmm0, %zmm14, %zmm0
127 vmulps %zmm4, %zmm3, %zmm3
128 vfmadd213ps %zmm5, %zmm5, %zmm3
/* Fast-path exit: 16 sin results to *%rdi, 16 cos results to *%rsi.  */
134 vmovups %zmm0, (%rdi)
135 vmovups %zmm3, (%rsi)
137 cfi_def_cfa_register (%rsp)
139 cfi_adjust_cfa_offset (-8)
/* Special-case path: spill the original argument vector and the
   fast-path results so the scalar calls can patch individual lanes.  */
145 vmovups %zmm2, 1152(%rsp)
146 vmovups %zmm0, 1216(%rsp)
147 vmovups %zmm3, 1280(%rsp)
/* Preserve mask registers %k4-%k7 and %zmm16-%zmm31 across the
   scalar sinf/cosf calls (restored below).  */
151 kmovw %k4, 1048(%rsp)
153 kmovw %k5, 1040(%rsp)
154 kmovw %k6, 1032(%rsp)
155 kmovw %k7, 1024(%rsp)
156 vmovups %zmm16, 960(%rsp)
157 vmovups %zmm17, 896(%rsp)
158 vmovups %zmm18, 832(%rsp)
159 vmovups %zmm19, 768(%rsp)
160 vmovups %zmm20, 704(%rsp)
161 vmovups %zmm21, 640(%rsp)
162 vmovups %zmm22, 576(%rsp)
163 vmovups %zmm23, 512(%rsp)
164 vmovups %zmm24, 448(%rsp)
165 vmovups %zmm25, 384(%rsp)
166 vmovups %zmm26, 320(%rsp)
167 vmovups %zmm27, 256(%rsp)
168 vmovups %zmm28, 192(%rsp)
169 vmovups %zmm29, 128(%rsp)
170 vmovups %zmm30, 64(%rsp)
171 vmovups %zmm31, (%rsp)
/* Save %rsi (cos output pointer) and the callee-saved GPRs this
   path uses as scratch.  */
172 movq %rsi, 1056(%rsp)
173 movq %r12, 1096(%rsp)
174 cfi_offset_rel_rsp (12, 1096)
176 movq %r13, 1088(%rsp)
177 cfi_offset_rel_rsp (13, 1088)
179 movq %r14, 1080(%rsp)
180 cfi_offset_rel_rsp (14, 1080)
182 movq %r15, 1072(%rsp)
183 cfi_offset_rel_rsp (15, 1072)
184 movq %rbx, 1064(%rsp)
/* After all lanes are fixed up: restore everything saved above and
   reload the (patched) result vectors.  */
204 kmovw 1048(%rsp), %k4
205 movq 1056(%rsp), %rsi
206 kmovw 1040(%rsp), %k5
207 movq 1096(%rsp), %r12
209 kmovw 1032(%rsp), %k6
210 movq 1088(%rsp), %r13
212 kmovw 1024(%rsp), %k7
213 vmovups 960(%rsp), %zmm16
214 vmovups 896(%rsp), %zmm17
215 vmovups 832(%rsp), %zmm18
216 vmovups 768(%rsp), %zmm19
217 vmovups 704(%rsp), %zmm20
218 vmovups 640(%rsp), %zmm21
219 vmovups 576(%rsp), %zmm22
220 vmovups 512(%rsp), %zmm23
221 vmovups 448(%rsp), %zmm24
222 vmovups 384(%rsp), %zmm25
223 vmovups 320(%rsp), %zmm26
224 vmovups 256(%rsp), %zmm27
225 vmovups 192(%rsp), %zmm28
226 vmovups 128(%rsp), %zmm29
227 vmovups 64(%rsp), %zmm30
228 vmovups (%rsp), %zmm31
229 movq 1080(%rsp), %r14
231 movq 1072(%rsp), %r15
233 movq 1064(%rsp), %rbx
234 vmovups 1216(%rsp), %zmm0
235 vmovups 1280(%rsp), %zmm3
/* Per-lane scalar fixup, lane index in %r15 (loop control is on
   elided lines): reload the original float argument, call scalar
   sinf then cosf, and overwrite that lane in the spilled result
   vectors.  Offsets 1156/1220/1284 address the odd 32-bit half of
   each 8-byte pair; 1152/1216/1280 below address the even half.  */
241 vmovss 1156(%rsp,%r15,8), %xmm0
243 call JUMPTARGET(sinf)
245 vmovss %xmm0, 1220(%rsp,%r15,8)
246 vmovss 1156(%rsp,%r15,8), %xmm0
248 call JUMPTARGET(cosf)
250 vmovss %xmm0, 1284(%rsp,%r15,8)
/* Same fixup for the even 32-bit half of the 8-byte pair.  */
255 vmovss 1152(%rsp,%r15,8), %xmm0
257 call JUMPTARGET(sinf)
259 vmovss %xmm0, 1216(%rsp,%r15,8)
260 vmovss 1152(%rsp,%r15,8), %xmm0
262 call JUMPTARGET(cosf)
264 vmovss %xmm0, 1280(%rsp,%r15,8)
266 END (_ZGVeN16vl4l4_sincosf_knl)
267 libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_knl)
269 ENTRY (_ZGVeN16vl4l4_sincosf_skx)
/* 16-lane AVX-512 sincosf, SKX tuning.  Same algorithm as the KNL
   variant, with a different register allocation and a mask-vector
   trick (all-ones vpternlogd + masked clear) to detect special lanes.
   In the visible fast path: %zmm4 = 16 float arguments,
   %rdi = sin result array, %rsi = cos result array.
   NOTE(review): several original lines are elided in this excerpt
   (prologue push, the branch consuming %k0, and the fixup loop
   control) — confirm against the full file.  */
271 cfi_adjust_cfa_offset (8)
272 cfi_rel_offset (%rbp, 0)
274 cfi_def_cfa_register (%rbp)
/* %rax = base of the libmvec single-precision trig constant table.  */
277 movq __svml_s_trig_data@GOTPCREL(%rip), %rax
279 vmovups __sAbsMask(%rax), %zmm3
280 vmovups __sInvPI(%rax), %zmm5
281 vmovups __sRShifter(%rax), %zmm6
282 vmovups __sPI1_FMA(%rax), %zmm9
283 vmovups __sPI2_FMA(%rax), %zmm10
284 vmovups __sSignMask(%rax), %zmm14
285 vmovups __sOneHalf(%rax), %zmm7
286 vmovups __sPI3_FMA(%rax), %zmm12
288 /* Absolute argument computation */
/* %zmm2 = |x|.  */
289 vandps %zmm3, %zmm4, %zmm2
291 /* c) Getting octant Y by 2/Pi multiplication
292 d) Add "Right Shifter" value */
293 vfmadd213ps %zmm6, %zmm2, %zmm5
/* %k1 = lanes with |x| <= the range-reduction threshold (predicate
   18 = LE_OQ), i.e. the lanes the fast path fully handles.  */
294 vcmpps $18, __sRangeReductionVal(%rax), %zmm2, %k1
296 /* e) Treat obtained value as integer S for destination sign setting */
297 vpslld $31, %zmm5, %zmm0
299 /* g) Subtract "Right Shifter" (0x4B000000) value */
300 vsubps %zmm6, %zmm5, %zmm5
301 vmovups __sA3(%rax), %zmm6
303 /* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 3 parts:
304 X = X - Y*PI1 - Y*PI2 - Y*PI3 */
305 vmovaps %zmm2, %zmm11
306 vfnmadd231ps %zmm5, %zmm9, %zmm11
307 vfnmadd231ps %zmm5, %zmm10, %zmm11
308 vandps %zmm11, %zmm14, %zmm1
309 vxorps %zmm1, %zmm7, %zmm8
311 /* Result sign calculations */
/* vpternlogd imm 150 (0x96) is a three-way XOR:
   %zmm1 = %zmm1 ^ %zmm14 ^ %zmm0.  */
312 vpternlogd $150, %zmm0, %zmm14, %zmm1
/* imm 0xff sets every bit: %zmm14 = all-ones (special-lane marker,
   cleared for in-range lanes further below).  */
313 vpternlogd $0xff, %zmm14, %zmm14, %zmm14
315 /* Add correction term 0.5 for cos() part */
316 vaddps %zmm8, %zmm5, %zmm15
317 vfnmadd213ps %zmm11, %zmm12, %zmm5
/* %zmm11 = sign bits of the original argument x.  */
318 vandnps %zmm4, %zmm3, %zmm11
319 vmovups __sA7_FMA(%rax), %zmm3
320 vmovaps %zmm2, %zmm13
321 vfnmadd231ps %zmm15, %zmm9, %zmm13
322 vxorps %zmm0, %zmm5, %zmm9
323 vmovups __sA5_FMA(%rax), %zmm0
324 vfnmadd231ps %zmm15, %zmm10, %zmm13
/* %zmm8 = X^2 for the sin-side polynomial.  */
325 vmulps %zmm9, %zmm9, %zmm8
326 vfnmadd213ps %zmm13, %zmm12, %zmm15
327 vmovups __sA9_FMA(%rax), %zmm12
328 vxorps %zmm1, %zmm15, %zmm1
/* %zmm13 = X^2 for the cos-side polynomial.  */
329 vmulps %zmm1, %zmm1, %zmm13
331 /* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
332 a) Calculate X^2 = X * X
333 b) Calculate 2 polynomials for sin and cos:
334 RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
335 RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */
/* Two interleaved Horner evaluations: %zmm7 for the sin side,
   %zmm12 for the cos side.  */
336 vmovaps %zmm12, %zmm7
337 vfmadd213ps %zmm3, %zmm8, %zmm7
338 vfmadd213ps %zmm3, %zmm13, %zmm12
339 vfmadd213ps %zmm0, %zmm8, %zmm7
340 vfmadd213ps %zmm0, %zmm13, %zmm12
341 vfmadd213ps %zmm6, %zmm8, %zmm7
342 vfmadd213ps %zmm6, %zmm13, %zmm12
343 vmulps %zmm8, %zmm7, %zmm10
344 vmulps %zmm13, %zmm12, %zmm3
345 vfmadd213ps %zmm9, %zmm9, %zmm10
346 vfmadd213ps %zmm1, %zmm1, %zmm3
/* Re-apply the argument sign bits to the sin result.  */
347 vxorps %zmm11, %zmm10, %zmm0
/* vpandnd reg,reg = 0; under %k1 this clears the in-range lanes of
   the all-ones %zmm14, leaving nonzero only in special lanes.  */
348 vpandnd %zmm2, %zmm2, %zmm14{%k1}
/* %k0 != 0 iff any lane needs the scalar fallback; the branch
   testing %k0 is on an elided line.  */
349 vptestmd %zmm14, %zmm14, %k0
/* Fast-path exit: 16 sin results to *%rdi, 16 cos results to *%rsi.  */
356 vmovups %zmm0, (%rdi)
357 vmovups %zmm3, (%rsi)
359 cfi_def_cfa_register (%rsp)
361 cfi_adjust_cfa_offset (-8)
/* Special-case path: spill the original argument vector (%zmm4 here)
   and the fast-path results for per-lane patching.  */
367 vmovups %zmm4, 1152(%rsp)
368 vmovups %zmm0, 1216(%rsp)
369 vmovups %zmm3, 1280(%rsp)
/* Preserve %k4-%k7 and %zmm16-%zmm31 across the scalar calls.  */
374 kmovw %k4, 1048(%rsp)
375 kmovw %k5, 1040(%rsp)
376 kmovw %k6, 1032(%rsp)
377 kmovw %k7, 1024(%rsp)
378 vmovups %zmm16, 960(%rsp)
379 vmovups %zmm17, 896(%rsp)
380 vmovups %zmm18, 832(%rsp)
381 vmovups %zmm19, 768(%rsp)
382 vmovups %zmm20, 704(%rsp)
383 vmovups %zmm21, 640(%rsp)
384 vmovups %zmm22, 576(%rsp)
385 vmovups %zmm23, 512(%rsp)
386 vmovups %zmm24, 448(%rsp)
387 vmovups %zmm25, 384(%rsp)
388 vmovups %zmm26, 320(%rsp)
389 vmovups %zmm27, 256(%rsp)
390 vmovups %zmm28, 192(%rsp)
391 vmovups %zmm29, 128(%rsp)
392 vmovups %zmm30, 64(%rsp)
393 vmovups %zmm31, (%rsp)
/* Save %rsi (cos output pointer) and the callee-saved GPRs used as
   scratch on this path.  */
394 movq %rsi, 1056(%rsp)
395 movq %r12, 1096(%rsp)
396 cfi_offset_rel_rsp (12, 1096)
398 movq %r13, 1088(%rsp)
399 cfi_offset_rel_rsp (13, 1088)
401 movq %r14, 1080(%rsp)
402 cfi_offset_rel_rsp (14, 1080)
404 movq %r15, 1072(%rsp)
405 cfi_offset_rel_rsp (15, 1072)
406 movq %rbx, 1064(%rsp)
/* After all lanes are fixed up: restore saved state and reload the
   (patched) result vectors.  */
425 kmovw 1048(%rsp), %k4
427 kmovw 1040(%rsp), %k5
428 kmovw 1032(%rsp), %k6
429 kmovw 1024(%rsp), %k7
430 vmovups 960(%rsp), %zmm16
431 vmovups 896(%rsp), %zmm17
432 vmovups 832(%rsp), %zmm18
433 vmovups 768(%rsp), %zmm19
434 vmovups 704(%rsp), %zmm20
435 vmovups 640(%rsp), %zmm21
436 vmovups 576(%rsp), %zmm22
437 vmovups 512(%rsp), %zmm23
438 vmovups 448(%rsp), %zmm24
439 vmovups 384(%rsp), %zmm25
440 vmovups 320(%rsp), %zmm26
441 vmovups 256(%rsp), %zmm27
442 vmovups 192(%rsp), %zmm28
443 vmovups 128(%rsp), %zmm29
444 vmovups 64(%rsp), %zmm30
445 vmovups (%rsp), %zmm31
446 vmovups 1216(%rsp), %zmm0
447 vmovups 1280(%rsp), %zmm3
448 movq 1056(%rsp), %rsi
449 movq 1096(%rsp), %r12
451 movq 1088(%rsp), %r13
453 movq 1080(%rsp), %r14
455 movq 1072(%rsp), %r15
457 movq 1064(%rsp), %rbx
/* Per-lane scalar fixup, lane index in %r15 (loop control elided):
   reload the argument, call scalar sinf/cosf, patch the spilled
   result vectors.  Offsets 1156/1220/1284 address the odd 32-bit
   half of each 8-byte pair.  The duplicated vmovss loads bracket
   elided lines in this excerpt.  */
463 vmovss 1156(%rsp,%r15,8), %xmm0
465 vmovss 1156(%rsp,%r15,8), %xmm0
467 call JUMPTARGET(sinf)
469 vmovss %xmm0, 1220(%rsp,%r15,8)
470 vmovss 1156(%rsp,%r15,8), %xmm0
472 call JUMPTARGET(cosf)
474 vmovss %xmm0, 1284(%rsp,%r15,8)
/* Same fixup for the even 32-bit half (offsets 1152/1216/1280).  */
479 vmovss 1152(%rsp,%r15,8), %xmm0
481 vmovss 1152(%rsp,%r15,8), %xmm0
483 call JUMPTARGET(sinf)
485 vmovss %xmm0, 1216(%rsp,%r15,8)
486 vmovss 1152(%rsp,%r15,8), %xmm0
488 call JUMPTARGET(cosf)
490 vmovss %xmm0, 1280(%rsp,%r15,8)
492 END (_ZGVeN16vl4l4_sincosf_skx)
493 libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_skx)
495 /* Wrapper between vvv and vl4l4 vector variants. */
496 .macro WRAPPER_AVX512_vvv_vl4l4 callee
/* Adapt the vvv sincosf interface (sin/cos destinations passed as
   vectors of per-lane pointers in %zmm1..) to the vl4l4 kernel
   \callee, which writes two contiguous 16-float arrays.  Strategy
   visible here: spill the pointer vectors, call \callee with
   temporary buffers, then read each scalar result and store it
   through the corresponding saved pointer.
   NOTE(review): this excerpt elides most of the macro, including the
   prologue push, the pointer-reload/store instructions that pair
   with the movl/vmovss sequences below, the second (ILP32/x32?)
   branch marker explaining the 32-bit %ebp/%esi/%edi addressing, and
   the closing .endm — confirm against the full file.  */
499 cfi_adjust_cfa_offset (8)
500 cfi_rel_offset (%rbp, 0)
502 cfi_def_cfa_register (%rbp)
/* Spill the incoming pointer vectors before the vector call.
   NOTE(review): the mixed %rsp/%rdi bases here depend on elided
   setup lines — verify offsets against the full file.  */
505 vmovups %zmm1, 128(%rsp)
507 vmovups %zmm2, 192(%rdi)
508 vmovups %zmm3, 256(%rdi)
509 vmovups %zmm4, 320(%rdi)
511 call HIDDEN_JUMPTARGET(\callee)
/* Load saved per-lane destination pointers (part of a longer
   scatter sequence whose other lines are elided).  */
590 movl 104(%rsp), %r10d
603 movl 124(%rsp), %r11d
609 cfi_def_cfa_register (%rsp)
611 cfi_adjust_cfa_offset (-8)
/* DWARF expressions: restore rule for %rbp and CFA = %rbp-based
   frame for the 32-bit-pointer branch below.  */
620 .cfi_escape 0x10,0x6,0x2,0x76,0
623 .cfi_escape 0xf,0x3,0x76,0x78,0x6
/* 32-bit address registers (%esi/%edi/%ebp): this appears to be the
   ILP32 (x32) branch of the wrapper — sin buffer at -112(%rbp),
   cos buffer at -176(%rbp), pointer vectors spilled at -240/-304.  */
624 leal -112(%rbp), %esi
625 leal -176(%rbp), %edi
627 vmovdqa64 %zmm1, -240(%ebp)
628 vmovdqa64 %zmm2, -304(%ebp)
629 call HIDDEN_JUMPTARGET(\callee)
/* Scatter loop, fully unrolled: each pair loads one saved 32-bit
   destination pointer into %eax and the matching float result into
   %xmm0; the store through (%eax) follows on elided lines.
   First 16 pairs: sin results (-240.. pointers, -176.. values).  */
630 movl -240(%ebp), %eax
631 vmovss -176(%ebp), %xmm0
633 movl -236(%ebp), %eax
634 vmovss -172(%ebp), %xmm0
636 movl -232(%ebp), %eax
637 vmovss -168(%ebp), %xmm0
639 movl -228(%ebp), %eax
640 vmovss -164(%ebp), %xmm0
642 movl -224(%ebp), %eax
643 vmovss -160(%ebp), %xmm0
645 movl -220(%ebp), %eax
646 vmovss -156(%ebp), %xmm0
648 movl -216(%ebp), %eax
649 vmovss -152(%ebp), %xmm0
651 movl -212(%ebp), %eax
652 vmovss -148(%ebp), %xmm0
654 movl -208(%ebp), %eax
655 vmovss -144(%ebp), %xmm0
657 movl -204(%ebp), %eax
658 vmovss -140(%ebp), %xmm0
660 movl -200(%ebp), %eax
661 vmovss -136(%ebp), %xmm0
663 movl -196(%ebp), %eax
664 vmovss -132(%ebp), %xmm0
666 movl -192(%ebp), %eax
667 vmovss -128(%ebp), %xmm0
669 movl -188(%ebp), %eax
670 vmovss -124(%ebp), %xmm0
672 movl -184(%ebp), %eax
673 vmovss -120(%ebp), %xmm0
675 movl -180(%ebp), %eax
676 vmovss -116(%ebp), %xmm0
/* Next 16 pairs: cos results (-304.. pointers, -112.. values).  */
678 movl -304(%ebp), %eax
679 vmovss -112(%ebp), %xmm0
681 movl -300(%ebp), %eax
682 vmovss -108(%ebp), %xmm0
684 movl -296(%ebp), %eax
685 vmovss -104(%ebp), %xmm0
687 movl -292(%ebp), %eax
688 vmovss -100(%ebp), %xmm0
690 movl -288(%ebp), %eax
691 vmovss -96(%ebp), %xmm0
693 movl -284(%ebp), %eax
694 vmovss -92(%ebp), %xmm0
696 movl -280(%ebp), %eax
697 vmovss -88(%ebp), %xmm0
699 movl -276(%ebp), %eax
700 vmovss -84(%ebp), %xmm0
702 movl -272(%ebp), %eax
703 vmovss -80(%ebp), %xmm0
705 movl -268(%ebp), %eax
706 vmovss -76(%ebp), %xmm0
708 movl -264(%ebp), %eax
709 vmovss -72(%ebp), %xmm0
711 movl -260(%ebp), %eax
712 vmovss -68(%ebp), %xmm0
714 movl -256(%ebp), %eax
715 vmovss -64(%ebp), %xmm0
717 movl -252(%ebp), %eax
718 vmovss -60(%ebp), %xmm0
720 movl -248(%ebp), %eax
721 vmovss -56(%ebp), %xmm0
723 movl -244(%ebp), %eax
724 vmovss -52(%ebp), %xmm0
/* vvv-interface entry point (per-lane destination pointers), KNL:
   implemented by wrapping the vl4l4 kernel with the macro above.  */
736 ENTRY (_ZGVeN16vvv_sincosf_knl)
737 WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_knl
738 END (_ZGVeN16vvv_sincosf_knl)
/* vvv-interface entry point (per-lane destination pointers), SKX:
   implemented by wrapping the vl4l4 kernel with the macro above.  */
740 ENTRY (_ZGVeN16vvv_sincosf_skx)
741 WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
742 END (_ZGVeN16vvv_sincosf_skx)