]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S
Prefer https to http for gnu.org and fsf.org URLs
[thirdparty/glibc.git] / sysdeps / x86_64 / fpu / multiarch / svml_s_sinf4_core_sse4.S
1 /* Function sinf vectorized with SSE4.
2 Copyright (C) 2014-2019 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19
20 #include <sysdep.h>
21 #include "svml_s_trig_data.h"
22
23 .text
24 ENTRY(_ZGVbN4v_sinf_sse4)
/*
   Four-lane single-precision sine, SSE4 variant.

   In:   xmm0 = four packed floats.
   Out:  xmm0 = four packed results.

   Fast path: branch-free range reduction followed by an odd polynomial,
   with every constant read from the __svml_s_trig_data table.
   Slow path (.LBL_1_3): each lane whose absolute value fails the
   __sRangeReductionVal comparison is recomputed with a call to scalar
   sinf and patched into the vector result.

   Frame layout used by the slow path (rsp is 64-byte aligned, 320 bytes):
     0..127    saved xmm15 .. xmm8 (16 bytes each, xmm15 at offset 0)
     128, 136  saved rdi, rsi
     144..168  saved r15, r14, r13, r12
     192..207  copy of the input vector
     256..271  result vector (fast-path values, then per-lane fix-ups)
*/
25 /*
26 ALGORITHM DESCRIPTION:
27
28 1) Range reduction to [-Pi/2; +Pi/2] interval
29 a) Grab sign from source argument and save it.
30 b) Remove sign using AND operation
31 c) Getting octant Y by 1/Pi multiplication
32 d) Add "Right Shifter" value
33 e) Treat obtained value as integer for destination sign setting.
34 Shift first bit of this value to the last (sign) position
35 f) Change destination sign if source sign is negative
36 using XOR operation.
37 g) Subtract "Right Shifter" value
38 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
39 X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
40 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
41 a) Calculate X^2 = X * X
42 b) Calculate polynomial:
43 R = X + X * X^2 * (A3 + x^2 * (A5 + ......
44 3) Destination sign setting
45 a) Set shifted destination sign using XOR operation:
46 R = XOR( R, S );
47 */
48 pushq %rbp
49 cfi_adjust_cfa_offset (8)
50 cfi_rel_offset (%rbp, 0)
51 movq %rsp, %rbp
52 cfi_def_cfa_register (%rbp)
53 andq $-64, %rsp  /* round the stack pointer down to a 64-byte boundary */
54 subq $320, %rsp  /* scratch area, see the frame layout above */
55 movaps %xmm0, %xmm5  /* xmm5 = unmodified input, kept for the slow path */
56 movq __svml_s_trig_data@GOTPCREL(%rip), %rax  /* rax = address of the constant table, via the GOT */
57 movups __sAbsMask(%rax), %xmm2
58
59 /* b) Remove sign using AND operation */
60 movaps %xmm2, %xmm4
61
62 /*
63 f) Change destination sign if source sign is negative
64 using XOR operation.
65 */
66 andnps %xmm5, %xmm2  /* xmm2 = input AND NOT __sAbsMask, the bits the mask clears. XORed into the result at the end */
67 movups __sInvPI(%rax), %xmm1
68 andps %xmm5, %xmm4  /* xmm4 = input AND __sAbsMask */
69
70 /* c) Getting octant Y by 1/Pi multiplication
71 d) Add "Right Shifter" value */
72 mulps %xmm4, %xmm1  /* xmm1 = masked input * __sInvPI */
73
74 /* h) Subtract Y*PI from X argument, where PI divided to 4 parts:
75 X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4 */
76 movaps %xmm4, %xmm0  /* xmm0 = masked input, becomes the reduced argument X */
77
78 /* Check for large and special values */
79 cmpnleps __sRangeReductionVal(%rax), %xmm4  /* per lane all-ones where NOT (masked input <= limit) */
80 movups __sRShifter(%rax), %xmm6
81 movups __sPI1(%rax), %xmm7
82 addps %xmm6, %xmm1
83 movmskps %xmm4, %ecx  /* ecx = one bit per lane (bits 0..3) flagging lanes for the slow path */
84
85 /* e) Treat obtained value as integer for destination sign setting.
86 Shift first bit of this value to the last (sign) position */
87 movaps %xmm1, %xmm3  /* keep the shifted value, its bit 0 is moved to bit 31 below */
88
89 /* g) Subtract "Right Shifter" value */
90 subps %xmm6, %xmm1  /* xmm1 = Y */
91 mulps %xmm1, %xmm7  /* xmm7 = Y*PI1 */
92 pslld $31, %xmm3  /* xmm3 = bit 0 of the shifted value, now in the sign position */
93 movups __sPI2(%rax), %xmm6
94 subps %xmm7, %xmm0  /* X -= Y*PI1 */
95 mulps %xmm1, %xmm6  /* xmm6 = Y*PI2 */
96 movups __sPI3(%rax), %xmm7
97 subps %xmm6, %xmm0  /* X -= Y*PI2 */
98 mulps %xmm1, %xmm7  /* xmm7 = Y*PI3 */
99 movups __sPI4(%rax), %xmm6
100 subps %xmm7, %xmm0  /* X -= Y*PI3 */
101 mulps %xmm6, %xmm1  /* xmm1 = Y*PI4, Y itself is no longer needed */
102 subps %xmm1, %xmm0  /* X -= Y*PI4, xmm0 now holds the reduced argument */
103
104 /* 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
105 a) Calculate X^2 = X * X
106 b) Calculate polynomial:
107 R = X + X * X^2 * (A3 + x^2 * (A5 + ...... */
108 movaps %xmm0, %xmm1
109 mulps %xmm0, %xmm1  /* xmm1 = X^2 */
110 xorps %xmm3, %xmm0  /* flip the sign of X where the shifted bit is set. X^2 was taken first and is unaffected */
111 movups __sA9(%rax), %xmm3  /* Horner evaluation in X^2, highest coefficient first */
112 mulps %xmm1, %xmm3
113 addps __sA7(%rax), %xmm3
114 mulps %xmm1, %xmm3
115 addps __sA5(%rax), %xmm3
116 mulps %xmm1, %xmm3
117 addps __sA3(%rax), %xmm3
118 mulps %xmm3, %xmm1  /* xmm1 = X^2 * (A3 + X^2 * (A5 + ...)) */
119 mulps %xmm0, %xmm1  /* xmm1 = X * X^2 * (...) */
120 addps %xmm1, %xmm0  /* xmm0 = R = X + X * X^2 * (...) */
121
122 /* 3) Destination sign setting
123 a) Set shifted destination sign using XOR operation:
124 R = XOR( R, S ); */
125 xorps %xmm2, %xmm0
126 testl %ecx, %ecx  /* any lane flagged by the range check? */
127 jne .LBL_1_3
128
/* Common exit: xmm0 holds the final vector. Tear down the frame and return. */
129 .LBL_1_2:
130 cfi_remember_state
131 movq %rbp, %rsp
132 cfi_def_cfa_register (%rsp)
133 popq %rbp
134 cfi_adjust_cfa_offset (-8)
135 cfi_restore (%rbp)
136 ret
137
/* Slow path. This label is entered only through the jne above, so the
   lane mask in ecx is known to be nonzero and needs no second test. */
138 .LBL_1_3:
139 cfi_restore_state
140 movups %xmm5, 192(%rsp)  /* spill the input vector so lanes can be read as scalars */
141 movups %xmm0, 256(%rsp)  /* spill the fast-path result, flagged lanes are overwritten below */
143
144 xorb %dl, %dl
145 xorl %eax, %eax
146 movups %xmm8, 112(%rsp)  /* preserve xmm8 .. xmm15 across the scalar calls */
147 movups %xmm9, 96(%rsp)
148 movups %xmm10, 80(%rsp)
149 movups %xmm11, 64(%rsp)
150 movups %xmm12, 48(%rsp)
151 movups %xmm13, 32(%rsp)
152 movups %xmm14, 16(%rsp)
153 movups %xmm15, (%rsp)
154 movq %rsi, 136(%rsp)
155 movq %rdi, 128(%rsp)
156 movq %r12, 168(%rsp)
157 cfi_offset_rel_rsp (12, 168)
158 movb %dl, %r12b  /* r12b = lane-pair counter, starts at 0 */
159 movq %r13, 160(%rsp)
160 cfi_offset_rel_rsp (13, 160)
161 movl %ecx, %r13d  /* r13d = lane mask, held in a callee-saved register across the calls */
162 movq %r14, 152(%rsp)
163 cfi_offset_rel_rsp (14, 152)
164 movl %eax, %r14d  /* r14d = mask bit index of the even lane of the current pair, starts at 0 */
165 movq %r15, 144(%rsp)
166 cfi_offset_rel_rsp (15, 144)
167 cfi_remember_state
168
/* Fix-up loop: each pass examines two adjacent lanes, even lane first. */
169 .LBL_1_6:
170 btl %r14d, %r13d  /* even lane flagged? */
171 jc .LBL_1_12
172
173 .LBL_1_7:
174 lea 1(%r14), %esi
175 btl %esi, %r13d  /* odd lane flagged? */
176 jc .LBL_1_10
177
178 .LBL_1_8:
179 incb %r12b
180 addl $2, %r14d
/* The mask was produced by movmskps on a four-lane register, so only
   bits 0..3 can be set. Two passes of two lanes each cover every lane. */
181 cmpb $2, %r12b
182 jb .LBL_1_6
183
184 movups 112(%rsp), %xmm8
185 movups 96(%rsp), %xmm9
186 movups 80(%rsp), %xmm10
187 movups 64(%rsp), %xmm11
188 movups 48(%rsp), %xmm12
189 movups 32(%rsp), %xmm13
190 movups 16(%rsp), %xmm14
191 movups (%rsp), %xmm15
192 movq 136(%rsp), %rsi
193 movq 128(%rsp), %rdi
194 movq 168(%rsp), %r12
195 cfi_restore (%r12)
196 movq 160(%rsp), %r13
197 cfi_restore (%r13)
198 movq 152(%rsp), %r14
199 cfi_restore (%r14)
200 movq 144(%rsp), %r15
201 cfi_restore (%r15)
202 movups 256(%rsp), %xmm0  /* reload the patched result vector */
203 jmp .LBL_1_2
204
/* Odd lane of pair r12b: recompute with scalar sinf. */
205 .LBL_1_10:
206 cfi_restore_state
207 movzbl %r12b, %r15d  /* r15 = pair index, zero-extended for addressing */
208 movss 196(%rsp,%r15,8), %xmm0  /* input lane 2*pair + 1 (192 + 4 + 8*pair) */
209
210 call JUMPTARGET(sinf)
211
212 movss %xmm0, 260(%rsp,%r15,8)  /* overwrite the same lane of the result (256 + 4 + 8*pair) */
213 jmp .LBL_1_8
214
/* Even lane of pair r12b: recompute with scalar sinf. */
215 .LBL_1_12:
216 movzbl %r12b, %r15d  /* r15 = pair index, zero-extended for addressing */
217 movss 192(%rsp,%r15,8), %xmm0  /* input lane 2*pair (192 + 8*pair) */
218
219 call JUMPTARGET(sinf)
220
221 movss %xmm0, 256(%rsp,%r15,8)  /* overwrite the same lane of the result (256 + 8*pair) */
222 jmp .LBL_1_7
223
224 END(_ZGVbN4v_sinf_sse4)