]>
Commit | Line | Data |
---|---|---|
04f496d6 | 1 | /* Wrapper implementations of vector math functions. |
04277e02 | 2 | Copyright (C) 2014-2019 Free Software Foundation, Inc. |
04f496d6 AS |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with the GNU C Library; if not, see | |
17 | <http://www.gnu.org/licenses/>. */ | |
18 | ||
/* SSE2 ISA version as wrapper to scalar.
   Calls \callee once for each of the four floats held in %xmm0 (the
   lane value is passed in the low element of %xmm0) and returns the
   four results packed, in lane order, in %xmm0.

   Frame layout (40 bytes):
      0(%rsp) .. 15(%rsp)   copy of the input vector
     16(%rsp) .. 27(%rsp)   results for lanes 0, 1 and 2
   The lane 3 result is consumed straight from %xmm0, so it needs no
   stack slot.  Clobbers %xmm1-%xmm3 in addition to whatever \callee
   clobbers.  */
.macro WRAPPER_IMPL_SSE2 callee
	subq	$40, %rsp
	cfi_adjust_cfa_offset (40)
	movaps	%xmm0, (%rsp)
	/* Lane 0: its input is already the low element of %xmm0.  */
	call	JUMPTARGET(\callee)
	movss	%xmm0, 16(%rsp)
	/* Lane 1.  */
	movss	4(%rsp), %xmm0
	call	JUMPTARGET(\callee)
	movss	%xmm0, 20(%rsp)
	/* Lane 2.  */
	movss	8(%rsp), %xmm0
	call	JUMPTARGET(\callee)
	movss	%xmm0, 24(%rsp)
	/* Lane 3.  */
	movss	12(%rsp), %xmm0
	call	JUMPTARGET(\callee)
	/* Reassemble: %xmm3 = r0, %xmm2 = r1, %xmm1 = r2, %xmm0 = r3.  */
	movss	16(%rsp), %xmm3
	movss	20(%rsp), %xmm2
	movss	24(%rsp), %xmm1
	unpcklps	%xmm1, %xmm3	/* %xmm3 = { r0, r2, ... }  */
	unpcklps	%xmm0, %xmm2	/* %xmm2 = { r1, r3, ... }  */
	unpcklps	%xmm2, %xmm3	/* %xmm3 = { r0, r1, r2, r3 }  */
	movaps	%xmm3, %xmm0
	addq	$40, %rsp
	cfi_adjust_cfa_offset (-40)
	ret
.endm
46 | ||
8aa92022 AS |
/* 2 argument SSE2 ISA version as wrapper to scalar.
   Calls \callee once per lane with the matching floats of %xmm0 and
   %xmm1 in the low elements of %xmm0 and %xmm1, and returns the four
   results packed, in lane order, in %xmm0.

   Frame layout (56 bytes):
      0(%rsp) .. 15(%rsp)   copy of the first input vector
     16(%rsp) .. 31(%rsp)   copy of the second input vector
     32(%rsp) .. 43(%rsp)   results for lanes 0, 1 and 2
   The lane 3 result is consumed straight from %xmm0, so it needs no
   stack slot.  Clobbers %xmm1-%xmm3 in addition to whatever \callee
   clobbers.  */
.macro WRAPPER_IMPL_SSE2_ff callee
	subq	$56, %rsp
	cfi_adjust_cfa_offset (56)
	movaps	%xmm0, (%rsp)
	movaps	%xmm1, 16(%rsp)
	/* Lane 0: both inputs are already the low elements.  */
	call	JUMPTARGET(\callee)
	movss	%xmm0, 32(%rsp)
	/* Lane 1.  */
	movss	4(%rsp), %xmm0
	movss	20(%rsp), %xmm1
	call	JUMPTARGET(\callee)
	movss	%xmm0, 36(%rsp)
	/* Lane 2.  */
	movss	8(%rsp), %xmm0
	movss	24(%rsp), %xmm1
	call	JUMPTARGET(\callee)
	movss	%xmm0, 40(%rsp)
	/* Lane 3.  */
	movss	12(%rsp), %xmm0
	movss	28(%rsp), %xmm1
	call	JUMPTARGET(\callee)
	/* Reassemble: %xmm3 = r0, %xmm2 = r1, %xmm1 = r2, %xmm0 = r3.  */
	movss	32(%rsp), %xmm3
	movss	36(%rsp), %xmm2
	movss	40(%rsp), %xmm1
	unpcklps	%xmm1, %xmm3	/* %xmm3 = { r0, r2, ... }  */
	unpcklps	%xmm0, %xmm2	/* %xmm2 = { r1, r3, ... }  */
	unpcklps	%xmm2, %xmm3	/* %xmm3 = { r0, r1, r2, r3 }  */
	movaps	%xmm3, %xmm0
	addq	$56, %rsp
	cfi_adjust_cfa_offset (-56)
	ret
.endm
78 | ||
a6336cc4 AS |
/* 3 argument SSE2 ISA version as wrapper to scalar.
   On entry %xmm0 holds four floats and %rdi / %rsi hold two output
   pointers.  \callee is called once per lane with that lane's float in
   the low element of %xmm0, %rdi = 28(%rsp) and %rsi = 24(%rsp).  After
   each call the float found at 28(%rsp) is stored to lane N of the
   array originally passed in %rdi, and the float found at 24(%rsp) to
   lane N of the array originally passed in %rsi.

   %rbp and %rbx (both saved and restored here) carry the two output
   pointers across the calls; 0(%rsp)..15(%rsp) keeps the input vector.  */
.macro WRAPPER_IMPL_SSE2_fFF callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	pushq	%rbx
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbx, 0)
	movq	%rdi, %rbp		/* %rbp = first output array.  */
	movq	%rsi, %rbx		/* %rbx = second output array.  */
	subq	$40, %rsp
	cfi_adjust_cfa_offset (40)

	/* Lane 0: its input is already the low element of %xmm0.  */
	leaq	24(%rsp), %rsi
	leaq	28(%rsp), %rdi
	movaps	%xmm0, (%rsp)
	call	JUMPTARGET(\callee)

	/* Copy out lane 0, then call for lane 1.  */
	leaq	24(%rsp), %rsi
	leaq	28(%rsp), %rdi
	movss	28(%rsp), %xmm0
	movss	%xmm0, 0(%rbp)
	movaps	(%rsp), %xmm1
	movss	24(%rsp), %xmm0
	movss	%xmm0, (%rbx)
	movaps	%xmm1, %xmm0
	shufps	$85, %xmm1, %xmm0	/* Every selector is 1: element 1.  */
	call	JUMPTARGET(\callee)

	/* Copy out lane 1, then call for lane 2.  */
	movss	28(%rsp), %xmm0
	leaq	24(%rsp), %rsi
	movss	%xmm0, 4(%rbp)
	leaq	28(%rsp), %rdi
	movaps	(%rsp), %xmm1
	movss	24(%rsp), %xmm0
	movss	%xmm0, 4(%rbx)
	movaps	%xmm1, %xmm0
	unpckhps	%xmm1, %xmm0	/* Low element becomes element 2.  */
	call	JUMPTARGET(\callee)

	/* Copy out lane 2, then call for lane 3.  */
	movaps	(%rsp), %xmm1
	leaq	24(%rsp), %rsi
	leaq	28(%rsp), %rdi
	movss	28(%rsp), %xmm0
	shufps	$255, %xmm1, %xmm1	/* Every selector is 3: element 3.  */
	movss	%xmm0, 8(%rbp)
	movss	24(%rsp), %xmm0
	movss	%xmm0, 8(%rbx)
	movaps	%xmm1, %xmm0
	call	JUMPTARGET(\callee)

	/* Copy out lane 3.  */
	movss	28(%rsp), %xmm0
	movss	%xmm0, 12(%rbp)
	movss	24(%rsp), %xmm0
	movss	%xmm0, 12(%rbx)

	addq	$40, %rsp
	cfi_adjust_cfa_offset (-40)
	popq	%rbx
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbx)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
139 | ||
04f496d6 AS |
/* AVX/AVX2 ISA version as wrapper to SSE ISA version.
   Splits the 256-bit input in %ymm0 into two 128-bit halves, calls
   \callee on each half (argument and result in %xmm0) and returns the
   two results joined in %ymm0, low half first.

   Frame (32 bytes, %rsp aligned down to 32; %rbp is the frame pointer):
      0(%rsp) .. 15(%rsp)   upper half of the input
     16(%rsp) .. 31(%rsp)   result for the lower half  */
.macro WRAPPER_IMPL_AVX callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-32, %rsp
	subq	$32, %rsp

	/* Park the upper half; the lower half is already in %xmm0.  */
	vextractf128	$1, %ymm0, (%rsp)
	vzeroupper
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	%xmm0, 16(%rsp)

	/* Upper half.  */
	vmovaps	(%rsp), %xmm0
	call	HIDDEN_JUMPTARGET(\callee)

	/* %ymm0 = { lower result, upper result }.  */
	vmovaps	%xmm0, %xmm1
	vmovaps	16(%rsp), %xmm0
	vinsertf128	$1, %xmm1, %ymm0, %ymm0

	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
165 | ||
8aa92022 AS |
/* 2 argument AVX/AVX2 ISA version as wrapper to SSE ISA version.
   Splits the 256-bit inputs in %ymm0 and %ymm1 into 128-bit halves,
   calls \callee on each pair of halves (arguments in %xmm0 / %xmm1,
   result in %xmm0) and returns the two results joined in %ymm0, low
   half first.

   Frame (64 bytes, %rsp aligned down to 32; %rbp is the frame pointer):
      0(%rsp) .. 15(%rsp)   upper half of the second input
     16(%rsp) .. 31(%rsp)   upper half of the first input
     32(%rsp) .. 47(%rsp)   result for the lower halves  */
.macro WRAPPER_IMPL_AVX_ff callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-32, %rsp
	subq	$64, %rsp

	/* Park both upper halves; the lower halves are already in
	   %xmm0 / %xmm1.  */
	vextractf128	$1, %ymm0, 16(%rsp)
	vextractf128	$1, %ymm1, (%rsp)
	vzeroupper
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	%xmm0, 32(%rsp)

	/* Upper halves.  */
	vmovaps	16(%rsp), %xmm0
	vmovaps	(%rsp), %xmm1
	call	HIDDEN_JUMPTARGET(\callee)

	/* %ymm0 = { lower result, upper result }.  */
	vmovaps	%xmm0, %xmm1
	vmovaps	32(%rsp), %xmm0
	vinsertf128	$1, %xmm1, %ymm0, %ymm0

	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
193 | ||
a6336cc4 AS |
/* 3 argument AVX/AVX2 ISA version as wrapper to SSE ISA version.
   On entry %ymm0 holds the 256-bit input and %rdi / %rsi hold two
   output pointers.  \callee is called twice:
     - on the lower 128 bits, with the caller's %rdi / %rsi passed
       through unchanged;
     - on the upper 128 bits, with %rdi / %rsi pointing at two 16-byte
       stack buffers, whose contents are then copied to byte offset 16
       of the caller's first and second output arrays.

   %r13 / %r14 (saved and restored here) keep the caller's output
   pointers across the calls.  They are pushed before %rsp is realigned
   so that their save slots sit at fixed offsets from %rbp and can be
   described in the CFI.  Once cfi_def_cfa_register (%rbp) is in effect
   the CFA no longer depends on %rsp, so pushes must not adjust the CFA
   offset.

   Frame below the saved registers (64 bytes, %rsp aligned down to 32):
      0(%rsp) .. 31(%rsp)   input vector; reused as the two 16-byte
			    output buffers for the second call
     32(%rsp) .. 47(%rsp)   copy of the upper half of the input  */
.macro WRAPPER_IMPL_AVX_fFF callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	pushq	%r13
	cfi_rel_offset (%r13, -8)
	pushq	%r14
	cfi_rel_offset (%r14, -16)
	andq	$-32, %rsp
	subq	$64, %rsp
	movq	%rsi, %r14
	vmovaps	%ymm0, (%rsp)
	movq	%rdi, %r13
	/* Move the upper half out of the way of the output buffers.  */
	vmovaps	16(%rsp), %xmm1
	vmovaps	%xmm1, 32(%rsp)
	vzeroupper
	/* Lower half, written by the callee through the caller's pointers.  */
	vmovaps	(%rsp), %xmm0
	call	HIDDEN_JUMPTARGET(\callee)
	/* Upper half, written into the local buffers.  */
	vmovaps	32(%rsp), %xmm0
	lea	(%rsp), %rdi
	lea	16(%rsp), %rsi
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	(%rsp), %xmm0
	vmovaps	16(%rsp), %xmm1
	/* Nothing visible here guarantees that the caller's arrays are
	   16-byte aligned, so use the unaligned store form.  */
	vmovups	%xmm0, 16(%r13)
	vmovups	%xmm1, 16(%r14)
	movq	-16(%rbp), %r14
	cfi_restore (%r14)
	movq	-8(%rbp), %r13
	cfi_restore (%r13)
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
239 | ||
04f496d6 AS |
/* AVX512 ISA version as wrapper to AVX2 ISA version.
   Splits the 512-bit input in %zmm0 into two 256-bit halves, calls
   \callee on each half (argument and result in %ymm0) and returns the
   two results joined in %zmm0, low half first.

   Frame (128 bytes, %rsp aligned down to 64; %rbp is the frame pointer):
      0(%rsp) ..  63(%rsp)   copy of the input
     64(%rsp) ..  95(%rsp)   result for the lower half
     96(%rsp) .. 127(%rsp)   result for the upper half  */
.macro WRAPPER_IMPL_AVX512 callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	subq	$128, %rsp
	vmovups	%zmm0, (%rsp)

	/* Lower half.  */
	vmovupd	(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)
	vmovupd	%ymm0, 64(%rsp)

	/* Upper half.  */
	vmovupd	32(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)
	vmovupd	%ymm0, 96(%rsp)

	/* Reload both results as one 512-bit vector.  */
	vmovups	64(%rsp), %zmm0

	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
8aa92022 AS |
264 | |
/* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version.
   Splits the 512-bit inputs in %zmm0 and %zmm1 into 256-bit halves,
   calls \callee on each pair of halves (arguments in %ymm0 / %ymm1,
   result in %ymm0) and returns the two results joined in %zmm0, low
   half first.

   Frame (192 bytes, %rsp aligned down to 64; %rbp is the frame pointer):
       0(%rsp) ..  63(%rsp)   copy of the first input
      64(%rsp) .. 127(%rsp)   copy of the second input
     128(%rsp) .. 159(%rsp)   result for the lower halves
     160(%rsp) .. 191(%rsp)   result for the upper halves  */
.macro WRAPPER_IMPL_AVX512_ff callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	subq	$192, %rsp
	vmovups	%zmm0, (%rsp)
	vmovups	%zmm1, 64(%rsp)

	/* Lower halves.  */
	vmovups	(%rsp), %ymm0
	vmovups	64(%rsp), %ymm1
	call	HIDDEN_JUMPTARGET(\callee)
	vmovups	%ymm0, 128(%rsp)

	/* Upper halves.  */
	vmovups	32(%rsp), %ymm0
	vmovups	96(%rsp), %ymm1
	call	HIDDEN_JUMPTARGET(\callee)
	vmovups	%ymm0, 160(%rsp)

	/* Reload both results as one 512-bit vector.  */
	vmovups	128(%rsp), %zmm0

	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
292 | ||
/* 3 argument AVX512 ISA version as wrapper to AVX2 ISA version.
   On entry %zmm0 holds the 512-bit input and %rdi / %rsi hold two
   output pointers.  \callee is called twice:
     - on the lower 256 bits, with the caller's %rdi / %rsi passed
       through unchanged;
     - on the upper 256 bits, with %rdi / %rsi pointing at two 32-byte
       stack buffers, whose contents are then copied to byte offset 32
       of the caller's first and second output arrays.

   %r12 / %r13 (saved and restored here) keep the caller's output
   pointers across the calls.  They are pushed before %rsp is realigned
   so that their save slots sit at fixed offsets from %rbp and can be
   described in the CFI; the CFA itself is tracked through %rbp and is
   not affected by the pushes.

   Frame below the saved registers (128 bytes, %rsp aligned down to 64):
      0(%rsp) ..  63(%rsp)   copy of the input
     64(%rsp) ..  95(%rsp)   first output buffer for the second call
     96(%rsp) .. 127(%rsp)   second output buffer for the second call  */
.macro WRAPPER_IMPL_AVX512_fFF callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	pushq	%r12
	cfi_rel_offset (%r12, -8)
	pushq	%r13
	cfi_rel_offset (%r13, -16)
	andq	$-64, %rsp
	subq	$128, %rsp
	movq	%rsi, %r13
	vmovaps	%zmm0, (%rsp)
	movq	%rdi, %r12
	/* Lower half, written by the callee through the caller's pointers.  */
	vmovaps	(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)
	/* Upper half, written into the local buffers.  */
	vmovaps	32(%rsp), %ymm0
	lea	64(%rsp), %rdi
	lea	96(%rsp), %rsi
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	64(%rsp), %ymm0
	vmovaps	96(%rsp), %ymm1
	/* Nothing visible here guarantees that the caller's arrays are
	   32-byte aligned, so use the unaligned store form.  */
	vmovups	%ymm0, 32(%r12)
	vmovups	%ymm1, 32(%r13)
	movq	-16(%rbp), %r13
	cfi_restore (%r13)
	movq	-8(%rbp), %r12
	cfi_restore (%r12)
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm