]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / x86_64 / fpu / multiarch / svml_s_sincosf4_core_sse4.S
CommitLineData
a6336cc4 1/* Function sincosf vectorized with SSE4.
f7a9f785 2 Copyright (C) 2014-2016 Free Software Foundation, Inc.
a6336cc4
AS
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
18
19#include <sysdep.h>
36870482 20#include "svml_s_trig_data.h"
a6336cc4
AS
21
22 .text
23ENTRY (_ZGVbN4vvv_sincosf_sse4)
24/*
25 ALGORITHM DESCRIPTION:
26
27 1) Range reduction to [-Pi/4; +Pi/4] interval
28 a) Grab sign from source argument and save it.
29 b) Remove sign using AND operation
30 c) Getting octant Y by 2/Pi multiplication
31 d) Add "Right Shifter" value
32 e) Treat obtained value as integer S for destination sign setting.
33 SS = ((S-S&1)&2)<<30; For sin part
34 SC = ((S+S&1)&2)<<30; For cos part
35 f) Change destination sign if source sign is negative
36 using XOR operation.
37 g) Subtract "Right Shifter" (0x4B000000) value
38 h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 4 parts:
39 X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
40 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
41 a) Calculate X^2 = X * X
42 b) Calculate 2 polynomials for sin and cos:
43 RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
44 RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4))));
45 c) Swap RS & RC if if first bit of obtained value after
46 Right Shifting is set to 1. Using And, Andnot & Or operations.
47 3) Destination sign setting
48 a) Set shifted destination sign using XOR operation:
49 R1 = XOR( RS, SS );
50 R2 = XOR( RC, SC ). */
51
52 pushq %rbp
53 cfi_adjust_cfa_offset (8)
54 cfi_rel_offset (%rbp, 0)
55 movq %rsp, %rbp
56 cfi_def_cfa_register (%rbp)
57 andq $-64, %rsp
58 subq $320, %rsp
36870482 59 movq __svml_s_trig_data@GOTPCREL(%rip), %rax
a6336cc4
AS
60 movups %xmm12, 176(%rsp)
61 movups %xmm9, 160(%rsp)
62 movups __sAbsMask(%rax), %xmm12
63
64/* Absolute argument computation */
65 movaps %xmm12, %xmm5
66 andnps %xmm0, %xmm12
67 movups __sInvPI(%rax), %xmm7
68 andps %xmm0, %xmm5
69
70/* c) Getting octant Y by 2/Pi multiplication
71 d) Add "Right Shifter" value. */
72 mulps %xmm5, %xmm7
73 movups %xmm10, 144(%rsp)
74 movups __sPI1(%rax), %xmm10
75
76/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 3 parts:
77 X = X - Y*PI1 - Y*PI2 - Y*PI3. */
78 movaps %xmm10, %xmm1
79 addps __sRShifter(%rax), %xmm7
80
81/* e) Treat obtained value as integer S for destination sign setting */
82 movaps %xmm7, %xmm9
83
84/* g) Subtract "Right Shifter" (0x4B000000) value */
85 subps __sRShifter(%rax), %xmm7
86 mulps %xmm7, %xmm1
87 pslld $31, %xmm9
88 movups __sPI2(%rax), %xmm6
89 movups %xmm13, 112(%rsp)
90 movaps %xmm5, %xmm13
91 movaps %xmm6, %xmm2
92 subps %xmm1, %xmm13
93 mulps %xmm7, %xmm2
94 movups __sSignMask(%rax), %xmm3
95 movaps %xmm5, %xmm1
96 movups __sOneHalf(%rax), %xmm4
97 subps %xmm2, %xmm13
98 cmpnleps __sRangeReductionVal(%rax), %xmm5
99 movaps %xmm3, %xmm2
100 andps %xmm13, %xmm2
101 xorps %xmm2, %xmm4
102
103/* Result sign calculations */
104 xorps %xmm2, %xmm3
105 xorps %xmm9, %xmm3
106
107/* Add correction term 0.5 for cos() part */
108 addps %xmm7, %xmm4
109 movmskps %xmm5, %ecx
110 mulps %xmm4, %xmm10
111 mulps %xmm4, %xmm6
112 subps %xmm10, %xmm1
113 movups __sPI3(%rax), %xmm10
114 subps %xmm6, %xmm1
115 movaps %xmm10, %xmm6
116 mulps %xmm7, %xmm6
117 mulps %xmm4, %xmm10
118 subps %xmm6, %xmm13
119 subps %xmm10, %xmm1
120 movups __sPI4(%rax), %xmm6
121 mulps %xmm6, %xmm7
122 mulps %xmm6, %xmm4
123 subps %xmm7, %xmm13
124 subps %xmm4, %xmm1
125 xorps %xmm9, %xmm13
126 xorps %xmm3, %xmm1
127 movaps %xmm13, %xmm4
128 movaps %xmm1, %xmm2
129 mulps %xmm13, %xmm4
130 mulps %xmm1, %xmm2
131 movups __sA9(%rax), %xmm7
132
133/* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
134 a) Calculate X^2 = X * X
135 b) Calculate 2 polynomials for sin and cos:
136 RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
137 RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */
138 movaps %xmm7, %xmm3
139 mulps %xmm4, %xmm3
140 mulps %xmm2, %xmm7
141 addps __sA7(%rax), %xmm3
142 addps __sA7(%rax), %xmm7
143 mulps %xmm4, %xmm3
144 mulps %xmm2, %xmm7
145 addps __sA5(%rax), %xmm3
146 addps __sA5(%rax), %xmm7
147 mulps %xmm4, %xmm3
148 mulps %xmm2, %xmm7
149 addps __sA3(%rax), %xmm3
150 addps __sA3(%rax), %xmm7
151 mulps %xmm3, %xmm4
152 mulps %xmm7, %xmm2
153 mulps %xmm13, %xmm4
154 mulps %xmm1, %xmm2
155 addps %xmm4, %xmm13
156 addps %xmm2, %xmm1
157 xorps %xmm12, %xmm13
158 testl %ecx, %ecx
159 jne .LBL_1_3
160
161.LBL_1_2:
162 cfi_remember_state
163 movups 160(%rsp), %xmm9
164 movaps %xmm13, (%rdi)
165 movups 144(%rsp), %xmm10
166 movups 176(%rsp), %xmm12
167 movups 112(%rsp), %xmm13
168 movups %xmm1, (%rsi)
169 movq %rbp, %rsp
170 cfi_def_cfa_register (%rsp)
171 popq %rbp
172 cfi_adjust_cfa_offset (-8)
173 cfi_restore (%rbp)
174 ret
175
176.LBL_1_3:
177 cfi_restore_state
178 movups %xmm0, 128(%rsp)
179 movups %xmm13, 192(%rsp)
180 movups %xmm1, 256(%rsp)
181 je .LBL_1_2
182
183 xorb %dl, %dl
184 xorl %eax, %eax
185 movups %xmm8, 48(%rsp)
186 movups %xmm11, 32(%rsp)
187 movups %xmm14, 16(%rsp)
188 movups %xmm15, (%rsp)
189 movq %rsi, 64(%rsp)
190 movq %r12, 104(%rsp)
191 cfi_offset_rel_rsp (12, 104)
192 movb %dl, %r12b
193 movq %r13, 96(%rsp)
194 cfi_offset_rel_rsp (13, 96)
195 movl %eax, %r13d
196 movq %r14, 88(%rsp)
197 cfi_offset_rel_rsp (14, 88)
198 movl %ecx, %r14d
199 movq %r15, 80(%rsp)
200 cfi_offset_rel_rsp (15, 80)
201 movq %rbx, 72(%rsp)
202 movq %rdi, %rbx
203 cfi_remember_state
204
205.LBL_1_6:
206 btl %r13d, %r14d
207 jc .LBL_1_13
208
209.LBL_1_7:
210 lea 1(%r13), %esi
211 btl %esi, %r14d
212 jc .LBL_1_10
213
214.LBL_1_8:
215 incb %r12b
216 addl $2, %r13d
217 cmpb $16, %r12b
218 jb .LBL_1_6
219
220 movups 48(%rsp), %xmm8
221 movq %rbx, %rdi
222 movups 32(%rsp), %xmm11
223 movups 16(%rsp), %xmm14
224 movups (%rsp), %xmm15
225 movq 64(%rsp), %rsi
226 movq 104(%rsp), %r12
227 cfi_restore (%r12)
228 movq 96(%rsp), %r13
229 cfi_restore (%r13)
230 movq 88(%rsp), %r14
231 cfi_restore (%r14)
232 movq 80(%rsp), %r15
233 cfi_restore (%r15)
234 movq 72(%rsp), %rbx
235 movups 192(%rsp), %xmm13
236 movups 256(%rsp), %xmm1
237 jmp .LBL_1_2
238
239.LBL_1_10:
240 cfi_restore_state
241 movzbl %r12b, %r15d
242 movss 132(%rsp,%r15,8), %xmm0
243
244 call sinf@PLT
245
246 movss %xmm0, 196(%rsp,%r15,8)
247 movss 132(%rsp,%r15,8), %xmm0
248
249 call cosf@PLT
250
251 movss %xmm0, 260(%rsp,%r15,8)
252 jmp .LBL_1_8
253
254.LBL_1_13:
255 movzbl %r12b, %r15d
256 movss 128(%rsp,%r15,8), %xmm0
257
258 call sinf@PLT
259
260 movss %xmm0, 192(%rsp,%r15,8)
261 movss 128(%rsp,%r15,8), %xmm0
262
263 call cosf@PLT
264
265 movss %xmm0, 256(%rsp,%r15,8)
266 jmp .LBL_1_7
267
268END (_ZGVbN4vvv_sincosf_sse4)