1 /* Function sinf vectorized with AVX-512. KNL and SKX versions.
2 Copyright (C) 2014-2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
20 #include "svml_s_trig_data.h"
21 #include "svml_s_wrapper_impl.h"
/* _ZGVeN16v_sinf_knl: the KNL variant of the AVX-512 vectorized sinf
   named in the file header.  In the lines visible here the argument
   vector is read from %zmm0 and the result vector is built in %zmm1.
   NOTE(review): this view of the function has gaps -- the number
   embedded at the start of each line skips values, and no prologue,
   labels, jumps or return are shown -- so the comments added below
   describe only the instructions that are actually visible.  */
24 ENTRY(_ZGVeN16v_sinf_knl)
/* When HAVE_AVX512DQ_ASM_SUPPORT is not defined, the wrapper macro is
   expanded with _ZGVdN8v_sinf.  The macro is not defined in this file
   (presumably svml_s_wrapper_impl.h -- confirm), and the matching
   #else/#endif are not visible here.  */
25 #ifndef HAVE_AVX512DQ_ASM_SUPPORT
26 WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
29 ALGORITHM DESCRIPTION:
31 1) Range reduction to [-Pi/2; +Pi/2] interval
32 a) Grab sign from source argument and save it.
33 b) Remove sign using AND operation
34 c) Getting octant Y by 1/Pi multiplication
35 d) Add "Right Shifter" value
36 e) Treat obtained value as integer for destination sign setting.
37 Shift first bit of this value to the last (sign) position
38 f) Change destination sign if source sign is negative
40 g) Subtract "Right Shifter" value
41 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
42 X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
43 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
44 a) Calculate X^2 = X * X
45 b) Calculate polynomial:
46 R = X + X * X^2 * (A3 + x^2 * (A5 + ......
47 3) Destination sign setting
48 a) Set shifted destination sign using XOR operation:
52 cfi_adjust_cfa_offset (8)
53 cfi_rel_offset (%rbp, 0)
55 cfi_def_cfa_register (%rbp)
/* %rax = address of __svml_s_trig_data, fetched through the GOT.  Every
   __s* memory operand below is a displacement from this base.  */
58 movq __svml_s_trig_data@GOTPCREL(%rip), %rax
60 /* Check for large and special values */
/* %zmm4 = __sAbsMask, %zmm1 = __sInvPI.  */
62 vmovups __sAbsMask(%rax), %zmm4
63 vmovups __sInvPI(%rax), %zmm1
65 /* b) Remove sign using AND operation */
/* %zmm12 = %zmm0 & %zmm4.  */
66 vpandd %zmm4, %zmm0, %zmm12
/* %zmm2 = __sPI1_FMA; %zmm7 = __sA9, the starting value of the
   polynomial accumulator used further down.  */
67 vmovups __sPI1_FMA(%rax), %zmm2
68 vmovups __sA9(%rax), %zmm7
71 f) Change destination sign if source sign is negative
/* %zmm11 = ~%zmm4 & %zmm0, i.e. the input bits outside the abs mask.  */
74 vpandnd %zmm0, %zmm4, %zmm11
77 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
78 X = X - Y*PI1 - Y*PI2 - Y*PI3;
83 c) Getting octant Y by 1/Pi multiplication
84 d) Add "Right Shifter" value
/* %zmm1 = %zmm12 * %zmm1 + __sRShifter.  */
86 vfmadd213ps __sRShifter(%rax), %zmm12, %zmm1
/* Predicate 22 is NLE_UQ: %k1 gets a bit for every lane in which %zmm12
   is not <= __sRangeReductionVal; NaN lanes are included.  */
87 vcmpps $22, __sRangeReductionVal(%rax), %zmm12, %k1
/* %zmm13 = %edx in the %k1 lanes and zero in all others ({z}).
   NOTE(review): %edx is not written in the visible lines -- confirm
   the value it holds here.  */
88 vpbroadcastd %edx, %zmm13{%k1}{z}
90 /* g) Subtract "Right Shifter" value */
/* %zmm5 = %zmm1 - __sRShifter.  */
91 vsubps __sRShifter(%rax), %zmm1, %zmm5
94 e) Treat obtained value as integer for destination sign setting.
95 Shift first bit of this value to the last (sign) position
/* %zmm6 = every dword of %zmm1 shifted left by 31, which moves its
   bit 0 into bit 31.  */
97 vpslld $31, %zmm1, %zmm6
/* %k0 = lanes in which %zmm13 is non-zero.  */
98 vptestmd %zmm13, %zmm13, %k0
/* Three fused negated multiply-adds:
     %zmm3 = %zmm3 - %zmm2 * %zmm5
     %zmm3 = %zmm3 - %zmm5 * __sPI2_FMA
     %zmm5 = %zmm3 - %zmm5 * __sPI3_FMA
   NOTE(review): %zmm3 is not initialised in the visible lines.  The
   description text says "4 parts" and mentions PI4, but only the three
   constants __sPI1_FMA..__sPI3_FMA are used here.  */
99 vfnmadd231ps %zmm5, %zmm2, %zmm3
101 vfnmadd231ps __sPI2_FMA(%rax), %zmm5, %zmm3
102 vfnmadd132ps __sPI3_FMA(%rax), %zmm3, %zmm5
105 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
106 a) Calculate X^2 = X * X
107 b) Calculate polynomial:
108 R = X + X * X^2 * (A3 + x^2 * (A5 + ......
/* %zmm8 = %zmm5 * %zmm5.  */
110 vmulps %zmm5, %zmm5, %zmm8
/* %zmm9 = %zmm5 ^ %zmm6.  */
111 vpxord %zmm6, %zmm5, %zmm9
/* Horner steps on the accumulator loaded with __sA9:
   %zmm7 = ((%zmm7 * %zmm8 + __sA7) * %zmm8 + __sA5) * %zmm8 + __sA3.  */
112 vfmadd213ps __sA7(%rax), %zmm8, %zmm7
113 vfmadd213ps __sA5(%rax), %zmm8, %zmm7
114 vfmadd213ps __sA3(%rax), %zmm8, %zmm7
/* %zmm10 = %zmm7 * %zmm8, then %zmm10 = %zmm9 * %zmm10 + %zmm9.  */
115 vmulps %zmm8, %zmm7, %zmm10
116 vfmadd213ps %zmm9, %zmm9, %zmm10
119 3) Destination sign setting
120 a) Set shifted destination sign using XOR operation:
/* %zmm1 = %zmm10 ^ %zmm11.  */
123 vpxord %zmm11, %zmm10, %zmm1
131 cfi_def_cfa_register (%rsp)
133 cfi_adjust_cfa_offset (-8)
/* Spill the argument vector (%zmm0) to 1152(%rsp) and the vector result
   (%zmm1) to 1216(%rsp).  The scalar sinf calls at the end of the
   function read from the first area and write into the second.  */
139 vmovups %zmm0, 1152(%rsp)
140 vmovups %zmm1, 1216(%rsp)
/* Save %k4-%k7, %zmm16-%zmm31, %rsi, %rdi and %r12-%r15 in the frame.
   Each cfi_offset_rel_rsp line records the slot just used for DWARF
   register 12..15, matching the %r12..%r15 store before it.  */
144 kmovw %k4, 1048(%rsp)
146 kmovw %k5, 1040(%rsp)
147 kmovw %k6, 1032(%rsp)
148 kmovw %k7, 1024(%rsp)
149 vmovups %zmm16, 960(%rsp)
150 vmovups %zmm17, 896(%rsp)
151 vmovups %zmm18, 832(%rsp)
152 vmovups %zmm19, 768(%rsp)
153 vmovups %zmm20, 704(%rsp)
154 vmovups %zmm21, 640(%rsp)
155 vmovups %zmm22, 576(%rsp)
156 vmovups %zmm23, 512(%rsp)
157 vmovups %zmm24, 448(%rsp)
158 vmovups %zmm25, 384(%rsp)
159 vmovups %zmm26, 320(%rsp)
160 vmovups %zmm27, 256(%rsp)
161 vmovups %zmm28, 192(%rsp)
162 vmovups %zmm29, 128(%rsp)
163 vmovups %zmm30, 64(%rsp)
164 vmovups %zmm31, (%rsp)
165 movq %rsi, 1064(%rsp)
166 movq %rdi, 1056(%rsp)
167 movq %r12, 1096(%rsp)
168 cfi_offset_rel_rsp (12, 1096)
170 movq %r13, 1088(%rsp)
171 cfi_offset_rel_rsp (13, 1088)
173 movq %r14, 1080(%rsp)
174 cfi_offset_rel_rsp (14, 1080)
176 movq %r15, 1072(%rsp)
177 cfi_offset_rel_rsp (15, 1072)
/* Reload every register saved above from the same frame slots.  */
195 kmovw 1048(%rsp), %k4
196 movq 1064(%rsp), %rsi
197 kmovw 1040(%rsp), %k5
198 movq 1056(%rsp), %rdi
199 kmovw 1032(%rsp), %k6
200 movq 1096(%rsp), %r12
202 movq 1088(%rsp), %r13
204 kmovw 1024(%rsp), %k7
205 vmovups 960(%rsp), %zmm16
206 vmovups 896(%rsp), %zmm17
207 vmovups 832(%rsp), %zmm18
208 vmovups 768(%rsp), %zmm19
209 vmovups 704(%rsp), %zmm20
210 vmovups 640(%rsp), %zmm21
211 vmovups 576(%rsp), %zmm22
212 vmovups 512(%rsp), %zmm23
213 vmovups 448(%rsp), %zmm24
214 vmovups 384(%rsp), %zmm25
215 vmovups 320(%rsp), %zmm26
216 vmovups 256(%rsp), %zmm27
217 vmovups 192(%rsp), %zmm28
218 vmovups 128(%rsp), %zmm29
219 vmovups 64(%rsp), %zmm30
220 vmovups (%rsp), %zmm31
221 movq 1080(%rsp), %r14
223 movq 1072(%rsp), %r15
/* Reload the result vector from 1216(%rsp), the area the scalar stores
   below write into.  */
225 vmovups 1216(%rsp), %zmm1
/* Scalar call for element 2*%r15+1: load the float at byte offset
   4 + 8*%r15 of the saved argument, call sinf, and store %xmm0 at the
   same offset of the saved result.  */
231 vmovss 1156(%rsp,%r15,8), %xmm0
232 call JUMPTARGET(sinf)
233 vmovss %xmm0, 1220(%rsp,%r15,8)
/* Scalar call for element 2*%r15: same sequence at byte offset 8*%r15.  */
238 vmovss 1152(%rsp,%r15,8), %xmm0
239 call JUMPTARGET(sinf)
240 vmovss %xmm0, 1216(%rsp,%r15,8)
243 END(_ZGVeN16v_sinf_knl)
/* _ZGVeN16v_sinf_skx: the SKX variant of the AVX-512 vectorized sinf
   named in the file header.  In the lines visible here the argument
   vector is read from %zmm0 and the result vector is built in %zmm1.
   NOTE(review): this view of the function has gaps -- the number
   embedded at the start of each line skips values, and no prologue,
   labels, jumps or return are shown -- so the comments added below
   describe only the instructions that are actually visible.  */
245 ENTRY (_ZGVeN16v_sinf_skx)
/* When HAVE_AVX512DQ_ASM_SUPPORT is not defined, the wrapper macro is
   expanded with _ZGVdN8v_sinf.  The macro is not defined in this file
   (presumably svml_s_wrapper_impl.h -- confirm), and the matching
   #else/#endif are not visible here.  */
246 #ifndef HAVE_AVX512DQ_ASM_SUPPORT
247 WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
250 ALGORITHM DESCRIPTION:
252 1) Range reduction to [-Pi/2; +Pi/2] interval
253 a) Grab sign from source argument and save it.
254 b) Remove sign using AND operation
255 c) Getting octant Y by 1/Pi multiplication
256 d) Add "Right Shifter" value
257 e) Treat obtained value as integer for destination sign setting.
258 Shift first bit of this value to the last (sign) position
259 f) Change destination sign if source sign is negative
261 g) Subtract "Right Shifter" value
262 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
263 X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
264 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
265 a) Calculate X^2 = X * X
266 b) Calculate polynomial:
267 R = X + X * X^2 * (A3 + x^2 * (A5 + ......
268 3) Destination sign setting
269 a) Set shifted destination sign using XOR operation:
274 cfi_adjust_cfa_offset (8)
275 cfi_rel_offset (%rbp, 0)
277 cfi_def_cfa_register (%rbp)
/* %rax = address of __svml_s_trig_data, fetched through the GOT.  Every
   __s* memory operand below is a displacement from this base.  */
280 movq __svml_s_trig_data@GOTPCREL(%rip), %rax
282 /* Check for large and special values */
/* %zmm14 = the 64-byte all-ones constant .L_2il0floatpacket.11 defined
   after this function.  The remaining loads set %zmm5 = __sAbsMask,
   %zmm1 = __sInvPI, %zmm2 = __sRShifter, %zmm3 = __sPI1_FMA and
   %zmm8 = __sA9 (the starting value of the polynomial accumulator).  */
283 vmovups .L_2il0floatpacket.11(%rip), %zmm14
284 vmovups __sAbsMask(%rax), %zmm5
285 vmovups __sInvPI(%rax), %zmm1
286 vmovups __sRShifter(%rax), %zmm2
287 vmovups __sPI1_FMA(%rax), %zmm3
288 vmovups __sA9(%rax), %zmm8
290 /* b) Remove sign using AND operation */
/* %zmm13 = %zmm0 & %zmm5.  */
291 vandps %zmm5, %zmm0, %zmm13
294 f) Change destination sign if source sign is negative
/* %zmm12 = ~%zmm5 & %zmm0, i.e. the input bits outside the abs mask.  */
297 vandnps %zmm0, %zmm5, %zmm12
300 c) Getting octant Y by 1/Pi multiplication
301 d) Add "Right Shifter" value
/* %zmm1 = %zmm13 * %zmm1 + %zmm2.  */
303 vfmadd213ps %zmm2, %zmm13, %zmm1
/* Predicate 18 is LE_OQ: %k1 gets a bit for every lane in which
   %zmm13 <= __sRangeReductionVal; NaN lanes are left clear.  */
304 vcmpps $18, __sRangeReductionVal(%rax), %zmm13, %k1
307 e) Treat obtained value as integer for destination sign setting.
308 Shift first bit of this value to the last (sign) position
/* %zmm7 = every dword of %zmm1 shifted left by 31, which moves its
   bit 0 into bit 31.  */
310 vpslld $31, %zmm1, %zmm7
312 /* g) Subtract "Right Shifter" value */
/* %zmm6 = %zmm1 - %zmm2.  */
313 vsubps %zmm2, %zmm1, %zmm6
316 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
317 X = X - Y*PI1 - Y*PI2 - Y*PI3;
/* Start from %zmm4 = %zmm13, then three fused negated multiply-adds:
     %zmm4 = %zmm4 - %zmm3 * %zmm6
     %zmm4 = %zmm4 - %zmm6 * __sPI2_FMA
     %zmm6 = %zmm4 - %zmm6 * __sPI3_FMA
   NOTE(review): the description text says "4 parts" and mentions PI4,
   but only the three constants __sPI1_FMA..__sPI3_FMA are used here.  */
319 vmovaps %zmm13, %zmm4
320 vfnmadd231ps %zmm6, %zmm3, %zmm4
321 vfnmadd231ps __sPI2_FMA(%rax), %zmm6, %zmm4
322 vfnmadd132ps __sPI3_FMA(%rax), %zmm4, %zmm6
325 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
326 a) Calculate X^2 = X * X
327 b) Calculate polynomial:
328 R = X + X * X^2 * (A3 + x^2 * (A5 + ......
/* %zmm9 = %zmm6 * %zmm6.  */
330 vmulps %zmm6, %zmm6, %zmm9
/* %zmm10 = %zmm6 ^ %zmm7.  */
331 vxorps %zmm7, %zmm6, %zmm10
/* Horner steps on the accumulator loaded with __sA9:
   %zmm8 = ((%zmm8 * %zmm9 + __sA7) * %zmm9 + __sA5) * %zmm9 + __sA3.  */
332 vfmadd213ps __sA7(%rax), %zmm9, %zmm8
333 vfmadd213ps __sA5(%rax), %zmm9, %zmm8
334 vfmadd213ps __sA3(%rax), %zmm9, %zmm8
/* %zmm11 = %zmm8 * %zmm9, then %zmm11 = %zmm10 * %zmm11 + %zmm10.  */
335 vmulps %zmm9, %zmm8, %zmm11
336 vfmadd213ps %zmm10, %zmm10, %zmm11
339 3) Destination sign setting
340 a) Set shifted destination sign using XOR operation:
/* %zmm1 = %zmm11 ^ %zmm12.  */
343 vxorps %zmm12, %zmm11, %zmm1
/* ~%zmm13 & %zmm13 is zero, so this clears the %k1 lanes of %zmm14 and
   leaves the other lanes at their loaded all-ones value.  */
344 vpandnd %zmm13, %zmm13, %zmm14{%k1}
/* %k0 = lanes in which %zmm14 is still non-zero, i.e. the lanes that
   were not selected by %k1.  */
345 vptestmd %zmm14, %zmm14, %k0
354 cfi_def_cfa_register (%rsp)
356 cfi_adjust_cfa_offset (-8)
/* Spill the argument vector (%zmm0) to 1152(%rsp) and the vector result
   (%zmm1) to 1216(%rsp).  The scalar sinf calls at the end of the
   function read from the first area and write into the second.  */
362 vmovups %zmm0, 1152(%rsp)
363 vmovups %zmm1, 1216(%rsp)
/* Save %k4-%k7, %zmm16-%zmm31, %rsi, %rdi and %r12-%r15 in the frame.
   Each cfi_offset_rel_rsp line records the slot just used for DWARF
   register 12..15, matching the %r12..%r15 store before it.  */
368 kmovw %k4, 1048(%rsp)
369 kmovw %k5, 1040(%rsp)
370 kmovw %k6, 1032(%rsp)
371 kmovw %k7, 1024(%rsp)
372 vmovups %zmm16, 960(%rsp)
373 vmovups %zmm17, 896(%rsp)
374 vmovups %zmm18, 832(%rsp)
375 vmovups %zmm19, 768(%rsp)
376 vmovups %zmm20, 704(%rsp)
377 vmovups %zmm21, 640(%rsp)
378 vmovups %zmm22, 576(%rsp)
379 vmovups %zmm23, 512(%rsp)
380 vmovups %zmm24, 448(%rsp)
381 vmovups %zmm25, 384(%rsp)
382 vmovups %zmm26, 320(%rsp)
383 vmovups %zmm27, 256(%rsp)
384 vmovups %zmm28, 192(%rsp)
385 vmovups %zmm29, 128(%rsp)
386 vmovups %zmm30, 64(%rsp)
387 vmovups %zmm31, (%rsp)
388 movq %rsi, 1064(%rsp)
389 movq %rdi, 1056(%rsp)
390 movq %r12, 1096(%rsp)
391 cfi_offset_rel_rsp (12, 1096)
393 movq %r13, 1088(%rsp)
394 cfi_offset_rel_rsp (13, 1088)
396 movq %r14, 1080(%rsp)
397 cfi_offset_rel_rsp (14, 1080)
399 movq %r15, 1072(%rsp)
400 cfi_offset_rel_rsp (15, 1072)
/* Reload every register saved above from the same frame slots, and
   reload the result vector from 1216(%rsp), the area the scalar stores
   below write into.  */
418 kmovw 1048(%rsp), %k4
419 kmovw 1040(%rsp), %k5
420 kmovw 1032(%rsp), %k6
421 kmovw 1024(%rsp), %k7
422 vmovups 960(%rsp), %zmm16
423 vmovups 896(%rsp), %zmm17
424 vmovups 832(%rsp), %zmm18
425 vmovups 768(%rsp), %zmm19
426 vmovups 704(%rsp), %zmm20
427 vmovups 640(%rsp), %zmm21
428 vmovups 576(%rsp), %zmm22
429 vmovups 512(%rsp), %zmm23
430 vmovups 448(%rsp), %zmm24
431 vmovups 384(%rsp), %zmm25
432 vmovups 320(%rsp), %zmm26
433 vmovups 256(%rsp), %zmm27
434 vmovups 192(%rsp), %zmm28
435 vmovups 128(%rsp), %zmm29
436 vmovups 64(%rsp), %zmm30
437 vmovups (%rsp), %zmm31
438 vmovups 1216(%rsp), %zmm1
439 movq 1064(%rsp), %rsi
440 movq 1056(%rsp), %rdi
441 movq 1096(%rsp), %r12
443 movq 1088(%rsp), %r13
445 movq 1080(%rsp), %r14
447 movq 1072(%rsp), %r15
/* Scalar call for element 2*%r15+1: load the float at byte offset
   4 + 8*%r15 of the saved argument, call sinf, and store %xmm0 at the
   same offset of the saved result.
   NOTE(review): the load appears twice in a row with identical
   operands.  The embedded numbering skips one line between the two
   copies, so an instruction not visible here may sit between them;
   check before treating the first load as redundant.  */
454 vmovss 1156(%rsp,%r15,8), %xmm0
456 vmovss 1156(%rsp,%r15,8), %xmm0
458 call JUMPTARGET(sinf)
460 vmovss %xmm0, 1220(%rsp,%r15,8)
/* Scalar call for element 2*%r15: same sequence at byte offset 8*%r15,
   with the same doubled load (see the NOTE above).  */
465 vmovss 1152(%rsp,%r15,8), %xmm0
467 vmovss 1152(%rsp,%r15,8), %xmm0
469 call JUMPTARGET(sinf)
471 vmovss %xmm0, 1216(%rsp,%r15,8)
474 END (_ZGVeN16v_sinf_skx)
/* 64 bytes with every bit set: sixteen 32-bit lanes of 0xffffffff.
   _ZGVeN16v_sinf_skx loads this into %zmm14 as its initial per-lane
   mask and then clears the lanes selected by %k1.  */
	.section .rodata, "a"
.L_2il0floatpacket.11:
	.fill	16, 4, 0xffffffff
	.type	.L_2il0floatpacket.11,@object