]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S
ed4582fc0a20bf871603a108293dbd2d7cacb22d
[thirdparty/glibc.git] / sysdeps / x86_64 / fpu / multiarch / svml_s_sinf8_core_avx2.S
1 /* Function sinf vectorized with AVX2.
2 Copyright (C) 2014-2019 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
18
19 #include <sysdep.h>
20 #include "svml_s_trig_data.h"
21
22 .text
ENTRY(_ZGVdN8v_sinf_avx2)
/*
   8-lane single-precision sine (AVX2 + FMA), AT&T syntax.

   In:   ymm0 = 8 packed float arguments
   Out:  ymm0 = 8 packed float results

   ALGORITHM DESCRIPTION:

   1) Range reduction to [-Pi/2; +Pi/2] interval
      a) Grab sign from source argument and save it.
      b) Remove sign using AND operation
      c) Getting octant Y by 1/Pi multiplication
      d) Add "Right Shifter" value
      e) Treat obtained value as integer for destination sign setting.
         Shift first bit of this value to the last (sign) position
      f) Change destination sign if source sign is negative
         using XOR operation.
      g) Subtract "Right Shifter" value
      h) Subtract Y*PI from X argument, where PI divided to 3 parts:
         X = X - Y*PI1 - Y*PI2 - Y*PI3;
   2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
      a) Calculate X^2 = X * X
      b) Calculate polynomial:
         R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * A9)))
   3) Destination sign setting
      a) Set shifted destination sign using XOR operation:
         R = XOR( R, S );

   Lanes whose |x| is not <= __sRangeReductionVal (including NaN, since
   the compare is unordered) are recomputed one by one with scalar sinf.

   Stack frame after the prologue (%rsp is 64-byte aligned, 448 bytes):
       0 .. 255   ymm15 .. ymm8 spill area (32 bytes each)
     256, 264     saved %rdi, %rsi
     272 .. 296   saved %r15, %r14, %r13, %r12
     320 .. 351   copy of the input vector
     384 .. 415   result vector (patched lane by lane on the slow path)

   Slow-path register roles:
     %r12b = lane-pair index, %r13d = special-lane bit mask,
     %r14d = bit number of the even lane of the pair,
     %r15  = zero-extended pair index used for addressing.
 */
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	subq	$448, %rsp
	movq	__svml_s_trig_data@GOTPCREL(%rip), %rax
	vmovdqa	%ymm0, %ymm5			/* ymm5 = x (kept for sign and slow path) */
	vmovups	__sAbsMask(%rax), %ymm3
	vmovups	__sInvPI(%rax), %ymm7
	vmovups	__sRShifter(%rax), %ymm0
	vmovups	__sPI1_FMA(%rax), %ymm1

	/* b) Remove sign using AND operation: ymm4 = |x| */
	vandps	%ymm3, %ymm5, %ymm4

	/*
	   c) Getting octant Y by 1/Pi multiplication
	   d) Add "Right Shifter" value
	   ymm7 = |x| * InvPI + RShifter
	 */
	vfmadd213ps	%ymm0, %ymm4, %ymm7

	/* g) Subtract "Right Shifter" value: ymm2 = Y */
	vsubps	%ymm0, %ymm7, %ymm2

	/*
	   e) Treat obtained value as integer for destination sign setting.
	      Shift first bit of this value to the last (sign) position
	 */
	vpslld	$31, %ymm7, %ymm6

	/*
	   h) Subtract Y*PI from X argument, where PI divided to 3 parts:
	      X = X - Y*PI1 - Y*PI2 - Y*PI3;
	 */
	vmovdqa	%ymm4, %ymm0
	vfnmadd231ps	%ymm2, %ymm1, %ymm0	/* ymm0 = |x| - Y*PI1 */

	/* Check for large and special values (all-ones in affected lanes) */
	vcmpnle_uqps	__sRangeReductionVal(%rax), %ymm4, %ymm4
	vfnmadd231ps	__sPI2_FMA(%rax), %ymm2, %ymm0	/* ymm0 -= Y*PI2 */
	vfnmadd132ps	__sPI3_FMA(%rax), %ymm0, %ymm2	/* ymm2 = ymm0 - Y*PI3 */

	/*
	   2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
	      a) Calculate X^2 = X * X
	 */
	vmulps	%ymm2, %ymm2, %ymm1

	/* a) Grab sign from source argument: ymm0 = x & ~AbsMask */
	vandnps	%ymm5, %ymm3, %ymm0

	/* Apply the octant-parity sign from step e) to the reduced argument */
	vxorps	%ymm6, %ymm2, %ymm3

	/*
	      b) Calculate polynomial:
	         R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * A9)))
	 */
	vmovups	__sA9(%rax), %ymm2
	vfmadd213ps	__sA7(%rax), %ymm1, %ymm2
	vfmadd213ps	__sA5(%rax), %ymm1, %ymm2
	vfmadd213ps	__sA3(%rax), %ymm1, %ymm2
	vmulps	%ymm1, %ymm2, %ymm6
	vfmadd213ps	%ymm3, %ymm3, %ymm6
	vmovmskps	%ymm4, %ecx		/* ecx = 8-bit mask of special lanes */

	/*
	   f) Change destination sign if source sign is negative
	      using XOR operation.
	   3) Destination sign setting
	      a) Set shifted destination sign using XOR operation:
	         R = XOR( R, S );
	 */
	vxorps	%ymm0, %ymm6, %ymm0
	testl	%ecx, %ecx
	jne	.LBL_1_3			/* some lane needs the scalar fallback */

.LBL_1_2:
	cfi_remember_state
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret

.LBL_1_3:
	cfi_restore_state
	/* Only reached with a non-zero mask: spill input and fast-path result
	   so individual lanes can be read and overwritten in memory.  */
	vmovups	%ymm5, 320(%rsp)
	vmovups	%ymm0, 384(%rsp)

	/* Preserve ymm8-ymm15 across vzeroupper and the scalar calls.  */
	vmovups	%ymm8, 224(%rsp)
	vmovups	%ymm9, 192(%rsp)
	vmovups	%ymm10, 160(%rsp)
	vmovups	%ymm11, 128(%rsp)
	vmovups	%ymm12, 96(%rsp)
	vmovups	%ymm13, 64(%rsp)
	vmovups	%ymm14, 32(%rsp)
	vmovups	%ymm15, (%rsp)
	movq	%rsi, 264(%rsp)
	movq	%rdi, 256(%rsp)
	movq	%r12, 296(%rsp)
	cfi_offset_rel_rsp (12, 296)
	xorl	%r12d, %r12d			/* r12b = lane-pair index = 0 */
	movq	%r13, 288(%rsp)
	cfi_offset_rel_rsp (13, 288)
	movl	%ecx, %r13d			/* r13d = special-lane mask */
	movq	%r14, 280(%rsp)
	cfi_offset_rel_rsp (14, 280)
	xorl	%r14d, %r14d			/* r14d = bit number of even lane = 0 */
	movq	%r15, 272(%rsp)
	cfi_offset_rel_rsp (15, 272)
	cfi_remember_state

	/* Walk the mask two lanes per iteration.  */
.LBL_1_6:
	btl	%r14d, %r13d			/* even lane of the pair flagged? */
	jc	.LBL_1_12

.LBL_1_7:
	lea	1(%r14), %esi
	btl	%esi, %r13d			/* odd lane of the pair flagged? */
	jc	.LBL_1_10

.LBL_1_8:
	incb	%r12b
	addl	$2, %r14d
	/* vmovmskps on a ymm register produces 8 mask bits, i.e. 4 lane
	   pairs; higher bits of %r13d are always zero.  */
	cmpb	$4, %r12b
	jb	.LBL_1_6

	vmovups	224(%rsp), %ymm8
	vmovups	192(%rsp), %ymm9
	vmovups	160(%rsp), %ymm10
	vmovups	128(%rsp), %ymm11
	vmovups	96(%rsp), %ymm12
	vmovups	64(%rsp), %ymm13
	vmovups	32(%rsp), %ymm14
	vmovups	(%rsp), %ymm15
	vmovups	384(%rsp), %ymm0		/* reload the patched result vector */
	movq	264(%rsp), %rsi
	movq	256(%rsp), %rdi
	movq	296(%rsp), %r12
	cfi_restore (%r12)
	movq	288(%rsp), %r13
	cfi_restore (%r13)
	movq	280(%rsp), %r14
	cfi_restore (%r14)
	movq	272(%rsp), %r15
	cfi_restore (%r15)
	jmp	.LBL_1_2

	/* Odd lane of pair %r12b: input at 324 + 8*pair, result at 388 + 8*pair.  */
.LBL_1_10:
	cfi_restore_state
	movzbl	%r12b, %r15d
	vmovss	324(%rsp,%r15,8), %xmm0
	vzeroupper

	call	JUMPTARGET(sinf)

	vmovss	%xmm0, 388(%rsp,%r15,8)
	jmp	.LBL_1_8

	/* Even lane of pair %r12b: input at 320 + 8*pair, result at 384 + 8*pair.  */
.LBL_1_12:
	movzbl	%r12b, %r15d
	vmovss	320(%rsp,%r15,8), %xmm0
	vzeroupper

	call	JUMPTARGET(sinf)

	vmovss	%xmm0, 384(%rsp,%r15,8)
	jmp	.LBL_1_7

END(_ZGVdN8v_sinf_avx2)