/* Function sincos vectorized with SSE4.
   Copyright (C) 2014-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "svml_d_trig_data.h"

        .text
ENTRY (_ZGVbN2vl8l8_sincos_sse4)
/*
   ALGORITHM DESCRIPTION:

   (Low-accuracy (< 4 ulp) or enhanced-performance
   (half of the mantissa bits correct) implementation.)

   Argument representation:
   arg = N*Pi + R

   Result calculation:
   sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R)
   arg + Pi/2 = N'*Pi + R'
   cos(arg) = sin(arg + Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R')
   sin(R) and sin(R') are approximated by the corresponding polynomial.  */

        pushq   %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq    %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq    $-64, %rsp
        subq    $320, %rsp
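
/* The frame is aligned to 64 bytes and 320 bytes of scratch are
   reserved.  The %xmm registers clobbered below are saved here, and
   the special-case path spills the input vector and both result
   vectors here so that individual lanes can be recomputed by scalar
   sin/cos.  */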
        movq    __svml_d_trig_data@GOTPCREL(%rip), %rax
        movups  %xmm11, 160(%rsp)
        movups  %xmm12, 144(%rsp)
        movups  __dSignMask(%rax), %xmm11

/* ARGUMENT RANGE REDUCTION:
   Absolute argument: X' = |X| */
        movaps  %xmm11, %xmm4

/* Grab sign bit from argument */
        movaps  %xmm11, %xmm7
        movups  __dInvPI(%rax), %xmm5
        andnps  %xmm0, %xmm4

/* SinY = X'*InvPi + RS : right shifter add */
        mulpd   %xmm4, %xmm5
        addpd   __dRShifter(%rax), %xmm5
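
/* Right-shifter trick: __dRShifter is a large constant (about
   1.5*2^52), so adding it to X'*(1/Pi) forces rounding to an
   integer: the low mantissa bits of the sum now hold
   N = round(X'/Pi).  Subtracting the constant again (below)
   recovers N as a double, while a copy of the biased sum keeps
   N's parity in its lowest bit for the result-sign computation.  */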

/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */
        movaps  %xmm5, %xmm12
        andps   %xmm0, %xmm7

/* SinN = Y - RS : right shifter sub */
        subpd   __dRShifter(%rax), %xmm5
        movups  %xmm10, 176(%rsp)
        psllq   $63, %xmm12
        movups  __dPI1(%rax), %xmm10

/* SinR = X' - SinN*Pi1 */
        movaps  %xmm10, %xmm1
        mulpd   %xmm5, %xmm1
        movups  __dPI2(%rax), %xmm6

/* SinR = SinR - SinN*Pi2 */
        movaps  %xmm6, %xmm2
        mulpd   %xmm5, %xmm2
        movups  %xmm13, 112(%rsp)
        movaps  %xmm4, %xmm13
        subpd   %xmm1, %xmm13
        subpd   %xmm2, %xmm13
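
/* Pi is split into four parts Pi1..Pi4 (highest to lowest bits), in
   the style of Cody-Waite reduction: each N*PiK product is
   subtracted separately so the large cancellation in X' - N*Pi1 is
   exact and R keeps full accuracy.  The Pi3 and Pi4 terms are
   folded in further below.  */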

/* Sine result sign: SinRSign = SignMask & SinR */
        movaps  %xmm11, %xmm2

/* CosR = X' - CosN*Pi1 */
        movaps  %xmm4, %xmm1
        movups  __dOneHalf(%rax), %xmm3
        andps   %xmm13, %xmm2

/* Apply SinRSign to 0.5: +-0.5 */
        orps    %xmm2, %xmm3

/* Update CosRSign and CosSignRes signs */
        xorps   %xmm11, %xmm2

/* CosN = SinN +(-)0.5 */
        addpd   %xmm5, %xmm3
        cmpnlepd __dRangeVal(%rax), %xmm4
        mulpd   %xmm3, %xmm10

/* CosR = CosR - CosN*Pi2 */
        mulpd   %xmm3, %xmm6
        subpd   %xmm10, %xmm1
        movmskpd %xmm4, %ecx
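
/* The cmpnlepd above flags lanes with |X| above __dRangeVal;
   movmskpd packs the two lane masks into %ecx so the end of the
   fast path can branch to the accurate scalar fallback.  */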
        movups  __dPI3(%rax), %xmm10
        xorps   %xmm12, %xmm2
        subpd   %xmm6, %xmm1

/* SinR = SinR - SinN*Pi3 */
        movaps  %xmm10, %xmm6

/* Final reconstruction.
   Combine Sin result's sign */
        xorps   %xmm7, %xmm12
        mulpd   %xmm5, %xmm6

/* CosR = CosR - CosN*Pi3 */
        mulpd   %xmm3, %xmm10
        subpd   %xmm6, %xmm13
        subpd   %xmm10, %xmm1
        movups  __dPI4(%rax), %xmm6

/* SinR = SinR - SinN*Pi4 */
        mulpd   %xmm6, %xmm5

/* CosR = CosR - CosN*Pi4 */
        mulpd   %xmm6, %xmm3
        subpd   %xmm5, %xmm13
        subpd   %xmm3, %xmm1

/* SinR2 = SinR^2 */
        movaps  %xmm13, %xmm6

/* CosR2 = CosR^2 */
        movaps  %xmm1, %xmm10
        mulpd   %xmm13, %xmm6
        mulpd   %xmm1, %xmm10

/* Polynomial approximation */
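/* Sine and cosine use the same odd polynomial
   sin(r) ~= r + r*r2*(C1 + r2*(C2 + ... + r2*C7)), r2 = r*r,
   evaluated by Horner's rule.  The two Horner chains (Sin in
   %xmm3/%xmm6, Cos in %xmm5/%xmm10) are interleaved so both
   multiply-add sequences can execute in parallel.  */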
        movups  __dC7(%rax), %xmm5
        movaps  %xmm5, %xmm3
        mulpd   %xmm6, %xmm3
        mulpd   %xmm10, %xmm5
        addpd   __dC6(%rax), %xmm3
        addpd   __dC6(%rax), %xmm5
        mulpd   %xmm6, %xmm3
        mulpd   %xmm10, %xmm5
        addpd   __dC5(%rax), %xmm3
        addpd   __dC5(%rax), %xmm5
        mulpd   %xmm6, %xmm3
        mulpd   %xmm10, %xmm5
        addpd   __dC4(%rax), %xmm3
        addpd   __dC4(%rax), %xmm5

/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */
        mulpd   %xmm6, %xmm3

/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */
        mulpd   %xmm10, %xmm5
        addpd   __dC3(%rax), %xmm3
        addpd   __dC3(%rax), %xmm5

/* SinPoly = C2 + SinR2*SinPoly */
        mulpd   %xmm6, %xmm3

/* CosPoly = C2 + CosR2*CosPoly */
        mulpd   %xmm10, %xmm5
        addpd   __dC2(%rax), %xmm3
        addpd   __dC2(%rax), %xmm5

/* SinPoly = C1 + SinR2*SinPoly */
        mulpd   %xmm6, %xmm3

/* CosPoly = C1 + CosR2*CosPoly */
        mulpd   %xmm10, %xmm5
        addpd   __dC1(%rax), %xmm3
        addpd   __dC1(%rax), %xmm5

/* SinPoly = SinR2*SinPoly */
        mulpd   %xmm3, %xmm6

/* CosPoly = CosR2*CosPoly */
        mulpd   %xmm5, %xmm10

/* SinPoly = SinR*SinPoly */
        mulpd   %xmm13, %xmm6

/* CosPoly = CosR*CosPoly */
        mulpd   %xmm1, %xmm10
        addpd   %xmm6, %xmm13
        addpd   %xmm10, %xmm1

/* Update Sin result's sign */
        xorps   %xmm12, %xmm13

/* Update Cos result's sign */
        xorps   %xmm2, %xmm1
        testl   %ecx, %ecx
        jne     .LBL_1_3
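
/* %ecx is nonzero iff some lane was flagged out of range; such
   lanes are recomputed on the special path below.  */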

.LBL_1_2:
        cfi_remember_state
        movups  176(%rsp), %xmm10
        movaps  %xmm13, (%rdi)
        movups  160(%rsp), %xmm11
        movups  144(%rsp), %xmm12
        movups  112(%rsp), %xmm13
        movups  %xmm1, (%rsi)
        movq    %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq    %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

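/* Special path: spill the input and both vector results, save the
   remaining registers, then fix up every flagged lane with calls to
   the scalar sin and cos routines.  */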
.LBL_1_3:
        cfi_restore_state
        movups  %xmm0, 128(%rsp)
        movups  %xmm13, 192(%rsp)
        movups  %xmm1, 256(%rsp)
        je      .LBL_1_2

        xorb    %dl, %dl
        xorl    %eax, %eax
        movups  %xmm8, 48(%rsp)
        movups  %xmm9, 32(%rsp)
        movups  %xmm14, 16(%rsp)
        movups  %xmm15, (%rsp)
        movq    %rsi, 64(%rsp)
        movq    %r12, 104(%rsp)
        cfi_offset_rel_rsp (12, 104)
        movb    %dl, %r12b
        movq    %r13, 96(%rsp)
        cfi_offset_rel_rsp (13, 96)
        movl    %eax, %r13d
        movq    %r14, 88(%rsp)
        cfi_offset_rel_rsp (14, 88)
        movl    %ecx, %r14d
        movq    %r15, 80(%rsp)
        cfi_offset_rel_rsp (15, 80)
        movq    %rbx, 72(%rsp)
        movq    %rdi, %rbx
        cfi_remember_state

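/* Walk the mask in %r14d two bits per iteration: %r13d indexes the
   even bit of the current pair, %r12b counts pairs.  A set bit
   dispatches to the scalar handler for that lane (.LBL_1_13 for the
   even lane, .LBL_1_10 for the odd one).  */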
.LBL_1_6:
        btl     %r13d, %r14d
        jc      .LBL_1_13

.LBL_1_7:
        lea     1(%r13), %esi
        btl     %esi, %r14d
        jc      .LBL_1_10

.LBL_1_8:
        incb    %r12b
        addl    $2, %r13d
        cmpb    $16, %r12b
        jb      .LBL_1_6

        movups  48(%rsp), %xmm8
        movq    %rbx, %rdi
        movups  32(%rsp), %xmm9
        movups  16(%rsp), %xmm14
        movups  (%rsp), %xmm15
        movq    64(%rsp), %rsi
        movq    104(%rsp), %r12
        cfi_restore (%r12)
        movq    96(%rsp), %r13
        cfi_restore (%r13)
        movq    88(%rsp), %r14
        cfi_restore (%r14)
        movq    80(%rsp), %r15
        cfi_restore (%r15)
        movq    72(%rsp), %rbx
        movups  192(%rsp), %xmm13
        movups  256(%rsp), %xmm1
        jmp     .LBL_1_2

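/* Scalar handlers.  %r15 = %r12b * 16 addresses the current pair in
   the spill area; the odd lane sits 8 bytes higher.  Each handler
   reloads the original argument, calls scalar sin and cos, and
   stores the results into the sine (192(%rsp)) and cosine
   (256(%rsp)) spill slots that the restore sequence above reloads
   before returning.  */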
.LBL_1_10:
        cfi_restore_state
        movzbl  %r12b, %r15d
        shlq    $4, %r15
        movsd   136(%rsp,%r15), %xmm0

        call    JUMPTARGET(sin)

        movsd   %xmm0, 200(%rsp,%r15)
        movsd   136(%rsp,%r15), %xmm0

        call    JUMPTARGET(cos)

        movsd   %xmm0, 264(%rsp,%r15)
        jmp     .LBL_1_8

.LBL_1_13:
        movzbl  %r12b, %r15d
        shlq    $4, %r15
        movsd   128(%rsp,%r15), %xmm0

        call    JUMPTARGET(sin)

        movsd   %xmm0, 192(%rsp,%r15)
        movsd   128(%rsp,%r15), %xmm0

        call    JUMPTARGET(cos)

        movsd   %xmm0, 256(%rsp,%r15)
        jmp     .LBL_1_7
END (_ZGVbN2vl8l8_sincos_sse4)
libmvec_hidden_def(_ZGVbN2vl8l8_sincos_sse4)

/* vvv version implemented with wrapper to vl8l8 variant.  */
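/* _ZGVbN2vvv_sincos receives the arguments in %xmm0 and two vectors
   of destination pointers, %xmm1 for sine and %xmm2 for cosine.
   The wrapper saves the pointer vectors, points %rdi/%rsi at stack
   buffers, calls the vl8l8 variant above, and scatters the two
   result vectors through the saved pointers.  */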
ENTRY (_ZGVbN2vvv_sincos_sse4)
#ifndef __ILP32__
        subq    $72, %rsp
        .cfi_def_cfa_offset 80
        movdqu  %xmm1, 32(%rsp)
        lea     (%rsp), %rdi
        movdqu  %xmm2, 48(%rdi)
        lea     16(%rsp), %rsi
        call    HIDDEN_JUMPTARGET(_ZGVbN2vl8l8_sincos_sse4)
        movq    32(%rsp), %rdx
        movq    48(%rsp), %rsi
        movq    40(%rsp), %r8
        movq    56(%rsp), %r10
        movq    (%rsp), %rax
        movq    16(%rsp), %rcx
        movq    8(%rsp), %rdi
        movq    24(%rsp), %r9
        movq    %rax, (%rdx)
        movq    %rcx, (%rsi)
        movq    %rdi, (%r8)
        movq    %r9, (%r10)
        addq    $72, %rsp
        .cfi_def_cfa_offset 8
        ret
#else
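/* Under x32 (__ILP32__) pointers are 32 bits wide, so each pointer
   vector packs two 32-bit destinations into the low 64 bits of its
   %xmm register; movq extracts the first pointer and pextrd $1 the
   second before the scalar stores.  */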
        subl    $72, %esp
        .cfi_def_cfa_offset 80
        leal    48(%rsp), %esi
        movaps  %xmm1, 16(%esp)
        leal    32(%rsp), %edi
        movaps  %xmm2, (%esp)
        call    HIDDEN_JUMPTARGET(_ZGVbN2vl8l8_sincos_sse4)
        movdqa  16(%esp), %xmm1
        movsd   32(%esp), %xmm0
        movq    %xmm1, %rax
        movdqa  (%esp), %xmm2
        movsd   %xmm0, (%eax)
        movsd   40(%esp), %xmm0
        pextrd  $1, %xmm1, %eax
        movsd   %xmm0, (%eax)
        movsd   48(%esp), %xmm0
        movq    %xmm2, %rax
        movsd   %xmm0, (%eax)
        movsd   56(%esp), %xmm0
        pextrd  $1, %xmm2, %eax
        movsd   %xmm0, (%eax)
        addl    $72, %esp
        .cfi_def_cfa_offset 8
        ret
#endif
END (_ZGVbN2vvv_sincos_sse4)