1 /* Function sinf vectorized with AVX2.
2 Copyright (C) 2014-2019 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
20 #include "svml_s_trig_data.h"
/* NOTE(review): this listing carries embedded original line numbers with
   gaps, so some lines of the routine (labels, branches, prologue/epilogue
   instructions, comment delimiters) are not present in this view.  The
   comments added below describe only the instructions that are visible.  */
/* Sine kernel operating on packed single-precision lanes in ymm registers
   using FMA instructions.  The result vector is produced in %ymm0.  The
   input appears to be held in %ymm5 throughout (it is masked, sign-extracted
   and later spilled for the scalar sinf calls); the instruction that places
   it there is not visible in this view.  */
23 ENTRY(_ZGVdN8v_sinf_avx2)
25 ALGORITHM DESCRIPTION:
27 1) Range reduction to [-Pi/2; +Pi/2] interval
28 a) Grab sign from source argument and save it.
29 b) Remove sign using AND operation
30 c) Getting octant Y by 1/Pi multiplication
31 d) Add "Right Shifter" value
32 e) Treat obtained value as integer for destination sign setting.
33 Shift first bit of this value to the last (sign) position
34 f) Change destination sign if source sign is negative
36 g) Subtract "Right Shifter" value
37 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
38 X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
39 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
40 a) Calculate X^2 = X * X
41 b) Calculate polynomial:
42 R = X + X * X^2 * (A3 + x^2 * (A5 + ......
43 3) Destination sign setting
44 a) Set shifted destination sign using XOR operation:
/* Unwind annotations for a frame set up around %rbp.  The instructions they
   describe are not visible in this view.  */
48 cfi_adjust_cfa_offset (8)
49 cfi_rel_offset (%rbp, 0)
51 cfi_def_cfa_register (%rbp)
/* rax = address of the __svml_s_trig_data table, fetched through the GOT.
   All __s* operands below are offsets into that table.  */
54 movq __svml_s_trig_data@GOTPCREL(%rip), %rax
/* Preload constants: ymm3 = __sAbsMask, ymm7 = __sInvPI,
   ymm0 = __sRShifter, ymm1 = __sPI1_FMA.  */
56 vmovups __sAbsMask(%rax), %ymm3
57 vmovups __sInvPI(%rax), %ymm7
58 vmovups __sRShifter(%rax), %ymm0
59 vmovups __sPI1_FMA(%rax), %ymm1
61 /* b) Remove sign using AND operation */
/* ymm4 = ymm5 & ymm3 */
62 vandps %ymm3, %ymm5, %ymm4
65 c) Getting octant Y by 1/Pi multiplication
66 d) Add "Right Shifter" value
/* ymm7 = ymm4 * ymm7 + ymm0: one fused multiply-add performs both the
   multiplication by __sInvPI and the addition of __sRShifter.  */
68 vfmadd213ps %ymm0, %ymm4, %ymm7
70 /* g) Subtract "Right Shifter" value */
/* ymm2 = ymm7 - ymm0; ymm7 itself is kept for the shift below.  */
71 vsubps %ymm0, %ymm7, %ymm2
74 e) Treat obtained value as integer for destination sign setting.
75 Shift first bit of this value to the last (sign) position
/* ymm6 = ymm7 << 31 in each 32-bit lane, so bit 0 lands in bit 31.  */
77 vpslld $31, %ymm7, %ymm6
80 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
81 X = X - Y*PI1 - Y*PI2 - Y*PI3;
/* NOTE(review): the description speaks of 4 parts of PI, but the visible
   code subtracts three products (__sPI1_FMA, __sPI2_FMA, __sPI3_FMA),
   which agrees with the three-term formula on the line above.  */
/* ymm0 = ymm0 - ymm1 * ymm2.
   NOTE(review): in this view ymm0 was last loaded with __sRShifter; the
   embedded numbering skips line 83 here, so an instruction moving the
   absolute value (ymm4) into ymm0 is presumably missing -- confirm against
   the full file.  */
84 vfnmadd231ps %ymm2, %ymm1, %ymm0
86 /* Check for large and special values */
/* ymm4 = per-lane all-ones mask where ymm4 is NOT <= __sRangeReductionVal.
   The _uq predicate is unordered/quiet, so NaN lanes also set the mask.
   This overwrites the absolute value held in ymm4.  */
87 vcmpnle_uqps __sRangeReductionVal(%rax), %ymm4, %ymm4
/* ymm0 = ymm0 - ymm2 * __sPI2_FMA */
88 vfnmadd231ps __sPI2_FMA(%rax), %ymm2, %ymm0
/* ymm2 = ymm0 - ymm2 * __sPI3_FMA: ymm2 now holds the reduced argument
   instead of Y.  */
89 vfnmadd132ps __sPI3_FMA(%rax), %ymm0, %ymm2
92 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
93 a) Calculate X^2 = X * X
94 b) Calculate polynomial:
95 R = X + X * X^2 * (A3 + x^2 * (A5 + ......
/* ymm1 = ymm2 * ymm2 */
97 vmulps %ymm2, %ymm2, %ymm1
100 f) Change destination sign if source sign is negative
/* ymm0 = ~ymm3 & ymm5: the bits of ymm5 that lie outside __sAbsMask
   (presumably just the sign bit; the mask value is not visible here).  */
103 vandnps %ymm5, %ymm3, %ymm0
/* ymm3 = ymm2 ^ ymm6: reduced argument with its sign bit toggled by the
   shifted bit computed in step e.  ymm3 is reused; the mask is dead.  */
104 vxorps %ymm6, %ymm2, %ymm3
/* Horner evaluation in ymm1 (the square):
   ymm2 = ((A9 * ymm1 + A7) * ymm1 + A5) * ymm1 + A3.  */
105 vmovups __sA9(%rax), %ymm2
106 vfmadd213ps __sA7(%rax), %ymm1, %ymm2
107 vfmadd213ps __sA5(%rax), %ymm1, %ymm2
108 vfmadd213ps __sA3(%rax), %ymm1, %ymm2
/* ymm6 = ymm2 * ymm1 */
109 vmulps %ymm1, %ymm2, %ymm6
/* ymm6 = ymm3 * ymm6 + ymm3, i.e. X + X * (X^2 * poly).  */
110 vfmadd213ps %ymm3, %ymm3, %ymm6
/* ecx = one bit per lane taken from the top bit of the compare mask in
   ymm4.  The test/branch that consumes ecx is not visible in this view.  */
111 vmovmskps %ymm4, %ecx
114 3) Destination sign setting
115 a) Set shifted destination sign using XOR operation:
/* ymm0 = ymm6 ^ ymm0: fold the extracted input sign bits into the result.  */
118 vxorps %ymm0, %ymm6, %ymm0
/* Unwind annotations switching the CFA back to %rsp.  The instructions
   they describe are not visible in this view.  */
125 cfi_def_cfa_register (%rsp)
127 cfi_adjust_cfa_offset (-8)
/* NOTE(review): the embedded numbering jumps here, so the label and the
   branch leading to the code below are not visible.  Judging by the spills
   and the scalar sinf calls further down, this looks like the slow path
   for lanes flagged in the mask -- confirm against the full file.  */
/* Spill ymm5 to 320(%rsp) and the vector result to 384(%rsp).  */
133 vmovups %ymm5, 320(%rsp)
134 vmovups %ymm0, 384(%rsp)
/* Save ymm8..ymm15 to the stack (32-byte slots from 224(%rsp) down to
   (%rsp)); they are reloaded from the same slots further below.  */
139 vmovups %ymm8, 224(%rsp)
140 vmovups %ymm9, 192(%rsp)
141 vmovups %ymm10, 160(%rsp)
142 vmovups %ymm11, 128(%rsp)
143 vmovups %ymm12, 96(%rsp)
144 vmovups %ymm13, 64(%rsp)
145 vmovups %ymm14, 32(%rsp)
146 vmovups %ymm15, (%rsp)
/* cfi_offset_rel_rsp is a macro defined outside this view.  Presumably its
   first argument is a DWARF register number (12..15 would be r12..r15 on
   x86-64) and the second an %rsp-relative save offset -- TODO confirm.
   The stores these annotations refer to are not visible here.  */
150 cfi_offset_rel_rsp (12, 296)
153 cfi_offset_rel_rsp (13, 288)
156 cfi_offset_rel_rsp (14, 280)
159 cfi_offset_rel_rsp (15, 272)
/* Reload ymm8..ymm15 from the slots written above.  */
177 vmovups 224(%rsp), %ymm8
178 vmovups 192(%rsp), %ymm9
179 vmovups 160(%rsp), %ymm10
180 vmovups 128(%rsp), %ymm11
181 vmovups 96(%rsp), %ymm12
182 vmovups 64(%rsp), %ymm13
183 vmovups 32(%rsp), %ymm14
184 vmovups (%rsp), %ymm15
/* Reload the result vector from its spill slot at 384(%rsp).  */
185 vmovups 384(%rsp), %ymm0
/* Scalar call for float element 2*r15+1: load it from the area at
   320(%rsp) (offset 324 = 320 + 4, index scaled by 8), call sinf, and
   store the returned xmm0 at the matching position in the result area
   at 384(%rsp).  The code that sets r15 and selects this path is not
   visible in this view.  */
201 vmovss 324(%rsp,%r15,8), %xmm0
204 call JUMPTARGET(sinf)
206 vmovss %xmm0, 388(%rsp,%r15,8)
/* Same as above for float element 2*r15 (offsets 320 and 384).  */
211 vmovss 320(%rsp,%r15,8), %xmm0
214 call JUMPTARGET(sinf)
216 vmovss %xmm0, 384(%rsp,%r15,8)
219 END(_ZGVdN8v_sinf_avx2)