/* Function sincos vectorized with AVX-512. KNL and SKX versions.
   Copyright (C) 2014-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "svml_d_trig_data.h"
#include "svml_d_wrapper_impl.h"

/*
   ALGORITHM DESCRIPTION:

     ( low accuracy ( < 4ulp ) or enhanced performance
      ( half of correct mantissa ) implementation )

     Argument representation:
     arg = N*Pi + R

     Result calculation:
     sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R)
     arg + Pi/2 = (N'*Pi + R')
     cos(arg) = sin(arg+Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R')
     sin(R), sin(R') are approximated by corresponding polynomial.  */

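/* For orientation, a scalar C sketch of the scheme above (illustrative
   only: PI, INV_PI and C1..C7 are hypothetical stand-ins for the
   __dPI*_FMA, __dInvPI and __dC1..__dC7 entries of __svml_d_trig_data
   used below; the vector code handles signs with bit masks instead of
   branches, and arguments beyond __dRangeVal take the scalar fallback
   path instead):

     #include <math.h>

     static double poly_sin (double r)
     {
       double r2 = r * r;
       double p = C7;
       p = p * r2 + C6;
       p = p * r2 + C5;
       p = p * r2 + C4;
       p = p * r2 + C3;
       p = p * r2 + C2;
       p = p * r2 + C1;
       return (p * r2) * r + r;               // ~= sin(R)
     }

     static void sincos_sketch (double x, double *sp, double *cp)
     {
       double ax = fabs (x);
       double n = nearbyint (ax * INV_PI);    // N
       double r = ax - n * PI;                // R, |R| <= Pi/2
       double nc = n + copysign (0.5, r);     // N' = N +- 0.5
       double rc = ax - nc * PI;              // R'
       double par = ((long long) n & 1) ? -1.0 : 1.0;  // (-1)^N
       *sp = copysign (1.0, x) * par * poly_sin (r);
       *cp = par * (r >= 0.0 ? -1.0 : 1.0) * poly_sin (rc);
     }
*/
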
        .text
ENTRY (_ZGVeN8vl8l8_sincos_knl)
#ifndef HAVE_AVX512DQ_ASM_SUPPORT
WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos
#else
        pushq %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq $-64, %rsp
        subq $1344, %rsp
        movq __svml_d_trig_data@GOTPCREL(%rip), %rax
        vmovaps %zmm0, %zmm4
        movq $-1, %rdx
        vmovups __dSignMask(%rax), %zmm12
        vmovups __dInvPI(%rax), %zmm5

/* ARGUMENT RANGE REDUCTION:
   Absolute argument: X' = |X| */
        vpandnq %zmm4, %zmm12, %zmm3
        vmovups __dPI1_FMA(%rax), %zmm7
        vmovups __dPI3_FMA(%rax), %zmm9

/* SinR = X' - SinN*Pi1 */
        vmovaps %zmm3, %zmm8

/* CosR = X' - CosN*Pi1 */
        vmovaps %zmm3, %zmm10

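/* The "right shifter" __dRShifter (assumed here to be the usual SVML
   constant 1.5*2^52) is added so that rounding leaves round(X'/Pi) in
   the low mantissa bits; subtracting it back out below recovers SinN
   without an explicit float-to-int conversion, and N's parity bit lands
   in the LSB for the Y<<63 sign trick.  */
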
/* SinY = X'*InvPi + RS : right shifter add */
        vfmadd213pd __dRShifter(%rax), %zmm3, %zmm5
        vmovups __dC6(%rax), %zmm13

/* SinN = Y - RS : right shifter sub */
        vsubpd __dRShifter(%rax), %zmm5, %zmm1
        vmovaps %zmm13, %zmm14

/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */
        vpsllq $63, %zmm5, %zmm2
        vcmppd $22, __dRangeVal(%rax), %zmm3, %k1

/* Update CosRSign and CosSignRes signs */
        vmovaps %zmm12, %zmm5
        vfnmadd231pd %zmm1, %zmm7, %zmm8

/* SinR = SinR - SinN*Pi2 */
        vfnmadd231pd __dPI2_FMA(%rax), %zmm1, %zmm8

/* Sine result sign: SinRSign = SignMask & SinR */
        vpandq %zmm8, %zmm12, %zmm11

/* Set SinRSign to 0.5 */
        vporq __dOneHalf(%rax), %zmm11, %zmm6
        vpternlogq $150, %zmm2, %zmm11, %zmm5

/* Update sign SinSignRes */
        vpternlogq $120, %zmm4, %zmm12, %zmm2

/* Polynomial approximation */
        vmovups __dC7(%rax), %zmm11

/* CosN = SinN +(-)0.5 */
        vaddpd %zmm6, %zmm1, %zmm0

/* SinR = SinR - SinN*Pi3 */
        vfnmadd213pd %zmm8, %zmm9, %zmm1
        vfnmadd231pd %zmm0, %zmm7, %zmm10

/* SinR2 = SinR^2 */
        vmulpd %zmm1, %zmm1, %zmm15

/* Grab SignX
   CosR = CosR - CosN*Pi2 */
        vfnmadd231pd __dPI2_FMA(%rax), %zmm0, %zmm10
        vfmadd231pd __dC7(%rax), %zmm15, %zmm14

/* CosR = CosR - CosN*Pi3 */
        vfnmadd213pd %zmm10, %zmm9, %zmm0
        vfmadd213pd __dC5(%rax), %zmm15, %zmm14

/* CosR2 = CosR^2 */
        vmulpd %zmm0, %zmm0, %zmm12
        vfmadd213pd __dC4(%rax), %zmm15, %zmm14
        vfmadd213pd %zmm13, %zmm12, %zmm11

/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */
        vfmadd213pd __dC3(%rax), %zmm15, %zmm14
        vfmadd213pd __dC5(%rax), %zmm12, %zmm11

/* SinPoly = C2 + SinR2*SinPoly */
        vfmadd213pd __dC2(%rax), %zmm15, %zmm14
        vfmadd213pd __dC4(%rax), %zmm12, %zmm11

/* SinPoly = C1 + SinR2*SinPoly */
        vfmadd213pd __dC1(%rax), %zmm15, %zmm14

/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */
        vfmadd213pd __dC3(%rax), %zmm12, %zmm11

/* SinPoly = SinR2*SinPoly */
        vmulpd %zmm15, %zmm14, %zmm13

/* CosPoly = C2 + CosR2*CosPoly */
        vfmadd213pd __dC2(%rax), %zmm12, %zmm11

/* SinPoly = SinR*SinPoly */
        vfmadd213pd %zmm1, %zmm1, %zmm13
        vpbroadcastq %rdx, %zmm1{%k1}{z}

/* CosPoly = C1 + CosR2*CosPoly */
        vfmadd213pd __dC1(%rax), %zmm12, %zmm11
        vptestmq %zmm1, %zmm1, %k0
        kmovw %k0, %ecx

/* CosPoly = CosR2*CosPoly */
        vmulpd %zmm12, %zmm11, %zmm14
        movzbl %cl, %ecx

/* CosPoly = CosR*CosPoly */
        vfmadd213pd %zmm0, %zmm0, %zmm14

/* Final reconstruction.
   Update Sin result's sign */
        vpxorq %zmm2, %zmm13, %zmm0

/* Update Cos result's sign */
        vpxorq %zmm5, %zmm14, %zmm2
        testl %ecx, %ecx
        jne .LBL_1_3

.LBL_1_2:
        cfi_remember_state
        vmovups %zmm0, (%rdi)
        vmovups %zmm2, (%rsi)
        movq %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

.LBL_1_3:
        cfi_restore_state
        vmovups %zmm4, 1152(%rsp)
        vmovups %zmm0, 1216(%rsp)
        vmovups %zmm2, 1280(%rsp)
        je .LBL_1_2

        xorb %dl, %dl
        kmovw %k4, 1048(%rsp)
        xorl %eax, %eax
        kmovw %k5, 1040(%rsp)
        kmovw %k6, 1032(%rsp)
        kmovw %k7, 1024(%rsp)
        vmovups %zmm16, 960(%rsp)
        vmovups %zmm17, 896(%rsp)
        vmovups %zmm18, 832(%rsp)
        vmovups %zmm19, 768(%rsp)
        vmovups %zmm20, 704(%rsp)
        vmovups %zmm21, 640(%rsp)
        vmovups %zmm22, 576(%rsp)
        vmovups %zmm23, 512(%rsp)
        vmovups %zmm24, 448(%rsp)
        vmovups %zmm25, 384(%rsp)
        vmovups %zmm26, 320(%rsp)
        vmovups %zmm27, 256(%rsp)
        vmovups %zmm28, 192(%rsp)
        vmovups %zmm29, 128(%rsp)
        vmovups %zmm30, 64(%rsp)
        vmovups %zmm31, (%rsp)
        movq %rsi, 1056(%rsp)
        movq %r12, 1096(%rsp)
        cfi_offset_rel_rsp (12, 1096)
        movb %dl, %r12b
        movq %r13, 1088(%rsp)
        cfi_offset_rel_rsp (13, 1088)
        movl %eax, %r13d
        movq %r14, 1080(%rsp)
        cfi_offset_rel_rsp (14, 1080)
        movl %ecx, %r14d
        movq %r15, 1072(%rsp)
        cfi_offset_rel_rsp (15, 1072)
        movq %rbx, 1064(%rsp)
        movq %rdi, %rbx
        cfi_remember_state

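/* Scalar fallback: at least one lane failed the range check against
   __dRangeVal.  Scan the saved lane mask in %r14d bit by bit and
   recompute every flagged lane with the scalar sin and cos.  */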
.LBL_1_6:
        btl %r13d, %r14d
        jc .LBL_1_13

.LBL_1_7:
        lea 1(%r13), %esi
        btl %esi, %r14d
        jc .LBL_1_10

.LBL_1_8:
        addb $1, %r12b
        addl $2, %r13d
        cmpb $16, %r12b
        jb .LBL_1_6

        movq %rbx, %rdi
        kmovw 1048(%rsp), %k4
        movq 1056(%rsp), %rsi
        kmovw 1040(%rsp), %k5
        movq 1096(%rsp), %r12
        cfi_restore (%r12)
        kmovw 1032(%rsp), %k6
        movq 1088(%rsp), %r13
        cfi_restore (%r13)
        kmovw 1024(%rsp), %k7
        vmovups 960(%rsp), %zmm16
        vmovups 896(%rsp), %zmm17
        vmovups 832(%rsp), %zmm18
        vmovups 768(%rsp), %zmm19
        vmovups 704(%rsp), %zmm20
        vmovups 640(%rsp), %zmm21
        vmovups 576(%rsp), %zmm22
        vmovups 512(%rsp), %zmm23
        vmovups 448(%rsp), %zmm24
        vmovups 384(%rsp), %zmm25
        vmovups 320(%rsp), %zmm26
        vmovups 256(%rsp), %zmm27
        vmovups 192(%rsp), %zmm28
        vmovups 128(%rsp), %zmm29
        vmovups 64(%rsp), %zmm30
        vmovups (%rsp), %zmm31
        movq 1080(%rsp), %r14
        cfi_restore (%r14)
        movq 1072(%rsp), %r15
        cfi_restore (%r15)
        movq 1064(%rsp), %rbx
        vmovups 1216(%rsp), %zmm0
        vmovups 1280(%rsp), %zmm2
        jmp .LBL_1_2

.LBL_1_10:
        cfi_restore_state
        movzbl %r12b, %r15d
        shlq $4, %r15
        vmovsd 1160(%rsp,%r15), %xmm0

        call JUMPTARGET(sin)

        vmovsd %xmm0, 1224(%rsp,%r15)
        vmovsd 1160(%rsp,%r15), %xmm0

        call JUMPTARGET(cos)

        vmovsd %xmm0, 1288(%rsp,%r15)
        jmp .LBL_1_8

.LBL_1_13:
        movzbl %r12b, %r15d
        shlq $4, %r15
        vmovsd 1152(%rsp,%r15), %xmm0

        call JUMPTARGET(sin)

        vmovsd %xmm0, 1216(%rsp,%r15)
        vmovsd 1152(%rsp,%r15), %xmm0

        call JUMPTARGET(cos)

        vmovsd %xmm0, 1280(%rsp,%r15)
        jmp .LBL_1_7

#endif
END (_ZGVeN8vl8l8_sincos_knl)
libmvec_hidden_def(_ZGVeN8vl8l8_sincos_knl)

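/* The SKX variant below implements the same algorithm as the KNL
   variant above; only the instruction selection (AVX512DQ forms such
   as vandnpd/vxorpd and a different special-lane test) and the
   scheduling differ.  */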
ENTRY (_ZGVeN8vl8l8_sincos_skx)
#ifndef HAVE_AVX512DQ_ASM_SUPPORT
WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos
#else
        pushq %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq $-64, %rsp
        subq $1344, %rsp
        movq __svml_d_trig_data@GOTPCREL(%rip), %rax
        vmovaps %zmm0, %zmm8
        vmovups __dSignMask(%rax), %zmm4
        vmovups __dInvPI(%rax), %zmm9
        vmovups __dRShifter(%rax), %zmm10
        vmovups __dPI1_FMA(%rax), %zmm13
        vmovups __dPI2_FMA(%rax), %zmm14
        vmovups __dOneHalf(%rax), %zmm11
        vmovups __dPI3_FMA(%rax), %zmm2

/* ARGUMENT RANGE REDUCTION:
   Absolute argument: X' = |X| */
        vandnpd %zmm8, %zmm4, %zmm7

/* SinY = X'*InvPi + RS : right shifter add */
        vfmadd213pd %zmm10, %zmm7, %zmm9
        vcmppd $18, __dRangeVal(%rax), %zmm7, %k1

/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */
        vpsllq $63, %zmm9, %zmm6

/* SinN = Y - RS : right shifter sub */
        vsubpd %zmm10, %zmm9, %zmm5
        vmovups __dC5(%rax), %zmm9
        vmovups __dC4(%rax), %zmm10

/* SinR = X' - SinN*Pi1 */
        vmovaps %zmm7, %zmm15
        vfnmadd231pd %zmm5, %zmm13, %zmm15

/* SinR = SinR - SinN*Pi2 */
        vfnmadd231pd %zmm5, %zmm14, %zmm15

/* Sine result sign: SinRSign = SignMask & SinR */
        vandpd %zmm15, %zmm4, %zmm1

/* Set SinRSign to 0.5 */
        vorpd %zmm1, %zmm11, %zmm12
        vmovups __dC3(%rax), %zmm11

/* CosN = SinN +(-)0.5 */
        vaddpd %zmm12, %zmm5, %zmm3

/* SinR = SinR - SinN*Pi3 */
        vfnmadd213pd %zmm15, %zmm2, %zmm5
        vmovups __dC2(%rax), %zmm12

/* SinR2 = SinR^2 */
        vmulpd %zmm5, %zmm5, %zmm15

/* CosR = X' - CosN*Pi1 */
        vmovaps %zmm7, %zmm0
        vfnmadd231pd %zmm3, %zmm13, %zmm0
        vmovups __dC1(%rax), %zmm13

/* Grab SignX
   CosR = CosR - CosN*Pi2 */
        vfnmadd231pd %zmm3, %zmm14, %zmm0

/* CosR = CosR - CosN*Pi3 */
        vfnmadd213pd %zmm0, %zmm2, %zmm3

/* Polynomial approximation */
        vmovups __dC7(%rax), %zmm0

/* Update CosRSign and CosSignRes signs */
        vmovaps %zmm4, %zmm2
        vpternlogq $150, %zmm6, %zmm1, %zmm2

/* Update sign SinSignRes */
        vpternlogq $120, %zmm8, %zmm4, %zmm6

/* CosR2 = CosR^2 */
        vmulpd %zmm3, %zmm3, %zmm1
        vmovups __dC6(%rax), %zmm4
        vmovaps %zmm0, %zmm14
        vfmadd213pd %zmm4, %zmm1, %zmm0
        vfmadd213pd %zmm4, %zmm15, %zmm14
        vfmadd213pd %zmm9, %zmm1, %zmm0
        vfmadd213pd %zmm9, %zmm15, %zmm14
        vfmadd213pd %zmm10, %zmm1, %zmm0
        vfmadd213pd %zmm10, %zmm15, %zmm14

/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */
        vfmadd213pd %zmm11, %zmm1, %zmm0

/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */
        vfmadd213pd %zmm11, %zmm15, %zmm14

/* CosPoly = C2 + CosR2*CosPoly */
        vfmadd213pd %zmm12, %zmm1, %zmm0

/* SinPoly = C2 + SinR2*SinPoly */
        vfmadd213pd %zmm12, %zmm15, %zmm14

/* CosPoly = C1 + CosR2*CosPoly */
        vfmadd213pd %zmm13, %zmm1, %zmm0

/* SinPoly = C1 + SinR2*SinPoly */
        vfmadd213pd %zmm13, %zmm15, %zmm14

/* CosPoly = CosR2*CosPoly */
        vmulpd %zmm1, %zmm0, %zmm1

/* SinPoly = SinR2*SinPoly */
        vmulpd %zmm15, %zmm14, %zmm4

/* CosPoly = CosR*CosPoly */
        vfmadd213pd %zmm3, %zmm3, %zmm1

/* SinPoly = SinR*SinPoly */
        vfmadd213pd %zmm5, %zmm5, %zmm4
        vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3

/* Update Cos result's sign */
        vxorpd %zmm2, %zmm1, %zmm1

/* Final reconstruction.
   Update Sin result's sign */
        vxorpd %zmm6, %zmm4, %zmm0
        vpandnq %zmm7, %zmm7, %zmm3{%k1}
        vcmppd $3, %zmm3, %zmm3, %k0
        kmovw %k0, %ecx
        testl %ecx, %ecx
        jne .LBL_2_3

.LBL_2_2:
        cfi_remember_state
        vmovups %zmm0, (%rdi)
        vmovups %zmm1, (%rsi)
        movq %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

.LBL_2_3:
        cfi_restore_state
        vmovups %zmm8, 1152(%rsp)
        vmovups %zmm0, 1216(%rsp)
        vmovups %zmm1, 1280(%rsp)
        je .LBL_2_2

        xorb %dl, %dl
        xorl %eax, %eax
        kmovw %k4, 1048(%rsp)
        kmovw %k5, 1040(%rsp)
        kmovw %k6, 1032(%rsp)
        kmovw %k7, 1024(%rsp)
        vmovups %zmm16, 960(%rsp)
        vmovups %zmm17, 896(%rsp)
        vmovups %zmm18, 832(%rsp)
        vmovups %zmm19, 768(%rsp)
        vmovups %zmm20, 704(%rsp)
        vmovups %zmm21, 640(%rsp)
        vmovups %zmm22, 576(%rsp)
        vmovups %zmm23, 512(%rsp)
        vmovups %zmm24, 448(%rsp)
        vmovups %zmm25, 384(%rsp)
        vmovups %zmm26, 320(%rsp)
        vmovups %zmm27, 256(%rsp)
        vmovups %zmm28, 192(%rsp)
        vmovups %zmm29, 128(%rsp)
        vmovups %zmm30, 64(%rsp)
        vmovups %zmm31, (%rsp)
        movq %rsi, 1056(%rsp)
        movq %r12, 1096(%rsp)
        cfi_offset_rel_rsp (12, 1096)
        movb %dl, %r12b
        movq %r13, 1088(%rsp)
        cfi_offset_rel_rsp (13, 1088)
        movl %eax, %r13d
        movq %r14, 1080(%rsp)
        cfi_offset_rel_rsp (14, 1080)
        movl %ecx, %r14d
        movq %r15, 1072(%rsp)
        cfi_offset_rel_rsp (15, 1072)
        movq %rbx, 1064(%rsp)
        movq %rdi, %rbx
        cfi_remember_state

.LBL_2_6:
        btl %r13d, %r14d
        jc .LBL_2_13

.LBL_2_7:
        lea 1(%r13), %esi
        btl %esi, %r14d
        jc .LBL_2_10

.LBL_2_8:
        incb %r12b
        addl $2, %r13d
        cmpb $16, %r12b
        jb .LBL_2_6

        kmovw 1048(%rsp), %k4
        movq %rbx, %rdi
        kmovw 1040(%rsp), %k5
        kmovw 1032(%rsp), %k6
        kmovw 1024(%rsp), %k7
        vmovups 960(%rsp), %zmm16
        vmovups 896(%rsp), %zmm17
        vmovups 832(%rsp), %zmm18
        vmovups 768(%rsp), %zmm19
        vmovups 704(%rsp), %zmm20
        vmovups 640(%rsp), %zmm21
        vmovups 576(%rsp), %zmm22
        vmovups 512(%rsp), %zmm23
        vmovups 448(%rsp), %zmm24
        vmovups 384(%rsp), %zmm25
        vmovups 320(%rsp), %zmm26
        vmovups 256(%rsp), %zmm27
        vmovups 192(%rsp), %zmm28
        vmovups 128(%rsp), %zmm29
        vmovups 64(%rsp), %zmm30
        vmovups (%rsp), %zmm31
        vmovups 1216(%rsp), %zmm0
        vmovups 1280(%rsp), %zmm1
        movq 1056(%rsp), %rsi
        movq 1096(%rsp), %r12
        cfi_restore (%r12)
        movq 1088(%rsp), %r13
        cfi_restore (%r13)
        movq 1080(%rsp), %r14
        cfi_restore (%r14)
        movq 1072(%rsp), %r15
        cfi_restore (%r15)
        movq 1064(%rsp), %rbx
        jmp .LBL_2_2

.LBL_2_10:
        cfi_restore_state
        movzbl %r12b, %r15d
        shlq $4, %r15
        vmovsd 1160(%rsp,%r15), %xmm0
        vzeroupper
        vmovsd 1160(%rsp,%r15), %xmm0

        call JUMPTARGET(sin)

        vmovsd %xmm0, 1224(%rsp,%r15)
        vmovsd 1160(%rsp,%r15), %xmm0

        call JUMPTARGET(cos)

        vmovsd %xmm0, 1288(%rsp,%r15)
        jmp .LBL_2_8

.LBL_2_13:
        movzbl %r12b, %r15d
        shlq $4, %r15
        vmovsd 1152(%rsp,%r15), %xmm0
        vzeroupper
        vmovsd 1152(%rsp,%r15), %xmm0

        call JUMPTARGET(sin)

        vmovsd %xmm0, 1216(%rsp,%r15)
        vmovsd 1152(%rsp,%r15), %xmm0

        call JUMPTARGET(cos)

        vmovsd %xmm0, 1280(%rsp,%r15)
        jmp .LBL_2_7

#endif
END (_ZGVeN8vl8l8_sincos_skx)
libmvec_hidden_def(_ZGVeN8vl8l8_sincos_skx)

/* Wrapper between vvv and vl8l8 vector variants.  */
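/* The vvv variant receives the eight sin and eight cos destination
   addresses as two vectors of pointers (%zmm1/%zmm2) instead of two
   flat output arrays.  The macro spills both pointer vectors, calls
   the vl8l8 variant with two 64-byte stack buffers, then stores each
   lane's result through its saved pointer.  In rough C (hypothetical
   names, a sketch only):

     void vvv (__m512d x, double *sptr[8], double *cptr[8])
     {
       double s[8], c[8];
       vl8l8 (x, s, c);
       for (int i = 0; i < 8; i++)
         {
           *sptr[i] = s[i];
           *cptr[i] = c[i];
         }
     }
*/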
.macro WRAPPER_AVX512_vvv_vl8l8 callee
#ifndef __ILP32__
        pushq %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq $-64, %rsp
        subq $256, %rsp
        vmovups %zmm1, 128(%rsp)
        lea (%rsp), %rdi
        vmovups %zmm2, 192(%rdi)
        lea 64(%rsp), %rsi
        call HIDDEN_JUMPTARGET(\callee)
        movq 128(%rsp), %rdx
        movq 136(%rsp), %rsi
        movq 144(%rsp), %r8
        movq 152(%rsp), %r10
        movq (%rsp), %rax
        movq 8(%rsp), %rcx
        movq 16(%rsp), %rdi
        movq 24(%rsp), %r9
        movq %rax, (%rdx)
        movq %rcx, (%rsi)
        movq 160(%rsp), %rax
        movq 168(%rsp), %rcx
        movq %rdi, (%r8)
        movq %r9, (%r10)
        movq 176(%rsp), %rdi
        movq 184(%rsp), %r9
        movq 32(%rsp), %r11
        movq 40(%rsp), %rdx
        movq 48(%rsp), %rsi
        movq 56(%rsp), %r8
        movq %r11, (%rax)
        movq %rdx, (%rcx)
        movq 192(%rsp), %r11
        movq 200(%rsp), %rdx
        movq %rsi, (%rdi)
        movq %r8, (%r9)
        movq 208(%rsp), %rsi
        movq 216(%rsp), %r8
        movq 64(%rsp), %r10
        movq 72(%rsp), %rax
        movq 80(%rsp), %rcx
        movq 88(%rsp), %rdi
        movq %r10, (%r11)
        movq %rax, (%rdx)
        movq 224(%rsp), %r10
        movq 232(%rsp), %rax
        movq %rcx, (%rsi)
        movq %rdi, (%r8)
        movq 240(%rsp), %rcx
        movq 248(%rsp), %rdi
        movq 96(%rsp), %r9
        movq 104(%rsp), %r11
        movq 112(%rsp), %rdx
        movq 120(%rsp), %rsi
        movq %r9, (%r10)
        movq %r11, (%rax)
        movq %rdx, (%rcx)
        movq %rsi, (%rdi)
        movq %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret
#else
        leal 8(%rsp), %r10d
        .cfi_def_cfa 10, 0
        andl $-64, %esp
        pushq -8(%r10d)
        pushq %rbp
        .cfi_escape 0x10,0x6,0x2,0x76,0
        movl %esp, %ebp
        pushq %r10
        .cfi_escape 0xf,0x3,0x76,0x78,0x6
        leal -112(%rbp), %esi
        leal -176(%rbp), %edi
        subl $232, %esp
        vmovdqa %ymm1, -208(%ebp)
        vmovdqa %ymm2, -240(%ebp)
        call HIDDEN_JUMPTARGET(\callee)
        vmovdqa -208(%ebp), %xmm0
        vmovq %xmm0, %rax
        vmovsd -176(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        shrq $32, %rax
        vmovsd -168(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        movq -200(%ebp), %rax
        vmovsd -160(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        shrq $32, %rax
        vmovsd -152(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        movq -192(%ebp), %rax
        vmovsd -144(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        shrq $32, %rax
        vmovsd -136(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        movq -184(%ebp), %rax
        vmovsd -128(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        shrq $32, %rax
        vmovsd -120(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        vmovdqa -240(%ebp), %xmm0
        vmovq %xmm0, %rax
        vmovsd -112(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        shrq $32, %rax
        vmovsd -104(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        movq -232(%ebp), %rax
        vmovsd -96(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        shrq $32, %rax
        vmovsd -88(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        movq -224(%ebp), %rax
        vmovsd -80(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        shrq $32, %rax
        vmovsd -72(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        movq -216(%ebp), %rax
        vmovsd -64(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        shrq $32, %rax
        vmovsd -56(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        addl $232, %esp
        popq %r10
        .cfi_def_cfa 10, 0
        popq %rbp
        leal -8(%r10), %esp
        .cfi_def_cfa 7, 8
        ret
#endif
.endm

ENTRY (_ZGVeN8vvv_sincos_knl)
WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_knl
END (_ZGVeN8vvv_sincos_knl)

ENTRY (_ZGVeN8vvv_sincos_skx)
WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
END (_ZGVeN8vvv_sincos_skx)

        .section .rodata, "a"
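/* 64-bit all-ones pattern; broadcast in the SKX path to tag the lanes
   that need the scalar fallback.  */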
.L_2il0floatpacket.15:
        .long 0xffffffff,0xffffffff
        .type .L_2il0floatpacket.15,@object