]>
Commit | Line | Data |
---|---|---|
04f496d6 AS |
1 | /* Wrapper implementations of vector math functions. |
2 | Copyright (C) 2014-2015 Free Software Foundation, Inc. | |
3 | This file is part of the GNU C Library. | |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with the GNU C Library; if not, see | |
17 | <http://www.gnu.org/licenses/>. */ | |
18 | ||
/* SSE2 ISA version as wrapper to scalar.

   Expands to a function body that calls \callee (through the PLT) once
   for each of the four single-precision lanes of %xmm0 and returns the
   four results packed, in lane order, in %xmm0.

   Stack frame (40 bytes):
      0(%rsp) .. 15(%rsp)   copy of the input vector
     16(%rsp) .. 31(%rsp)   one result slot per lane
     32(%rsp) .. 39(%rsp)   padding
   The movaps accesses below require %rsp to be 16-byte aligned after
   the adjustment, i.e. the standard x86-64 alignment at function entry
   is assumed.  */
.macro WRAPPER_IMPL_SSE2 callee
        subq      $40, %rsp
        cfi_adjust_cfa_offset(40)
        movaps    %xmm0, (%rsp)

        /* Lane 0: the argument is already the low element of %xmm0.  */
        call      \callee@PLT
        movss     %xmm0, 16(%rsp)

        /* Lane 1.  */
        movss     4(%rsp), %xmm0
        call      \callee@PLT
        movss     %xmm0, 20(%rsp)

        /* Lane 2.  */
        movss     8(%rsp), %xmm0
        call      \callee@PLT
        movss     %xmm0, 24(%rsp)

        /* Lane 3.  */
        movss     12(%rsp), %xmm0
        call      \callee@PLT
        movss     %xmm0, 28(%rsp)

        /* The four results are now contiguous at 16(%rsp); fetch them
           back as one vector.  */
        movaps    16(%rsp), %xmm0

        addq      $40, %rsp
        cfi_adjust_cfa_offset(-40)
        ret
.endm
46 | ||
8aa92022 AS |
/* 2 argument SSE2 ISA version as wrapper to scalar.

   Expands to a function body that calls \callee (through the PLT) once
   for each pair of corresponding single-precision lanes of %xmm0 and
   %xmm1, and returns the four results packed, in lane order, in %xmm0.

   Stack frame (56 bytes):
      0(%rsp) .. 15(%rsp)   copy of the first input vector
     16(%rsp) .. 31(%rsp)   copy of the second input vector
     32(%rsp) .. 47(%rsp)   one result slot per lane
     48(%rsp) .. 55(%rsp)   padding
   The movaps accesses below require %rsp to be 16-byte aligned after
   the adjustment, i.e. the standard x86-64 alignment at function entry
   is assumed.  */
.macro WRAPPER_IMPL_SSE2_ff callee
        subq      $56, %rsp
        cfi_adjust_cfa_offset(56)
        movaps    %xmm0, (%rsp)
        movaps    %xmm1, 16(%rsp)

        /* Lane 0: both arguments are already the low elements of
           %xmm0 and %xmm1.  */
        call      \callee@PLT
        movss     %xmm0, 32(%rsp)

        /* Lane 1.  */
        movss     4(%rsp), %xmm0
        movss     20(%rsp), %xmm1
        call      \callee@PLT
        movss     %xmm0, 36(%rsp)

        /* Lane 2.  */
        movss     8(%rsp), %xmm0
        movss     24(%rsp), %xmm1
        call      \callee@PLT
        movss     %xmm0, 40(%rsp)

        /* Lane 3.  */
        movss     12(%rsp), %xmm0
        movss     28(%rsp), %xmm1
        call      \callee@PLT
        movss     %xmm0, 44(%rsp)

        /* The four results are now contiguous at 32(%rsp); fetch them
           back as one vector.  */
        movaps    32(%rsp), %xmm0

        addq      $56, %rsp
        cfi_adjust_cfa_offset(-56)
        ret
.endm
78 | ||
04f496d6 AS |
/* AVX/AVX2 ISA version as wrapper to SSE ISA version.

   Expands to a function body that splits %ymm0 into its two 128-bit
   halves, calls \callee on each half (argument and result in %xmm0)
   and returns the two results joined in %ymm0.

   %rbp is used as a frame pointer so that %rsp can be realigned to
   32 bytes.  Scratch buffer layout after the adjustment:
      0(%rsp) .. 15(%rsp)   result for the lower half
     16(%rsp) .. 31(%rsp)   upper input half, later its result  */
.macro WRAPPER_IMPL_AVX callee
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-32, %rsp
        subq      $32, %rsp

        /* Set the upper half aside; the lower half is already the
           %xmm0 argument of the first call.  */
        vextractf128 $1, %ymm0, 16(%rsp)
        vzeroupper
        call      HIDDEN_JUMPTARGET(\callee)
        vmovaps   %xmm0, (%rsp)

        /* Second call, on the upper half.  Its result replaces the
           input that has just been consumed.  */
        vmovaps   16(%rsp), %xmm0
        call      HIDDEN_JUMPTARGET(\callee)
        vmovaps   %xmm0, 16(%rsp)

        /* Both halves of the result are contiguous and 32-byte
           aligned; load them as one vector.  */
        vmovaps   (%rsp), %ymm0

        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret
.endm
104 | ||
8aa92022 AS |
/* 2 argument AVX/AVX2 ISA version as wrapper to SSE ISA version.

   Expands to a function body that splits %ymm0 and %ymm1 into their
   128-bit halves, calls \callee on each pair of corresponding halves
   (arguments in %xmm0 and %xmm1, result in %xmm0) and returns the two
   results joined in %ymm0.

   %rbp is used as a frame pointer so that %rsp can be realigned to
   32 bytes.  Scratch buffer layout after the adjustment:
      0(%rsp) .. 15(%rsp)   upper half of the first argument
     16(%rsp) .. 31(%rsp)   upper half of the second argument
     32(%rsp) .. 47(%rsp)   result for the lower halves
     48(%rsp) .. 63(%rsp)   result for the upper halves  */
.macro WRAPPER_IMPL_AVX_ff callee
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-32, %rsp
        subq      $64, %rsp

        /* Set the upper halves aside; the lower halves are already
           the %xmm0/%xmm1 arguments of the first call.  */
        vextractf128 $1, %ymm0, (%rsp)
        vextractf128 $1, %ymm1, 16(%rsp)
        vzeroupper
        call      HIDDEN_JUMPTARGET(\callee)
        vmovaps   %xmm0, 32(%rsp)

        /* Second call, on the upper halves.  */
        vmovaps   (%rsp), %xmm0
        vmovaps   16(%rsp), %xmm1
        call      HIDDEN_JUMPTARGET(\callee)
        vmovaps   %xmm0, 48(%rsp)

        /* Both halves of the result are contiguous and 32-byte
           aligned; load them as one vector.  */
        vmovaps   32(%rsp), %ymm0

        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret
.endm
132 | ||
04f496d6 AS |
/* AVX512 ISA version as wrapper to AVX2 ISA version.

   Expands to a function body that splits %zmm0 into its two 256-bit
   halves, calls \callee on each half (argument and result in %ymm0)
   and returns the two results joined in %zmm0.

   The result of the first call has to be saved before %ymm0 is
   reloaded for the second call, and both results have to be merged
   into %zmm0 before returning; otherwise the first result is lost and
   only the low half of the return value is meaningful.

   %rbp is used as a frame pointer so that %rsp can be realigned to
   64 bytes.  Scratch buffer layout after the adjustment:
      0(%rsp) ..  63(%rsp)   copy of the input vector
     64(%rsp) ..  95(%rsp)   result for the lower half
     96(%rsp) .. 127(%rsp)   result for the upper half

   Instructions using %zmm registers are emitted as raw bytes; the
   256-bit moves use ordinary AVX mnemonics.  */
.macro WRAPPER_IMPL_AVX512 callee
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $128, %rsp
/* Below is encoding for vmovaps %zmm0, (%rsp).  */
        .byte   0x62
        .byte   0xf1
        .byte   0x7c
        .byte   0x48
        .byte   0x29
        .byte   0x04
        .byte   0x24
        /* First call, on the lower half.  */
        vmovaps   (%rsp), %ymm0
        call      HIDDEN_JUMPTARGET(\callee)
        vmovaps   %ymm0, 64(%rsp)
        /* Second call, on the upper half.  */
        vmovaps   32(%rsp), %ymm0
        call      HIDDEN_JUMPTARGET(\callee)
        vmovaps   %ymm0, 96(%rsp)
/* Below is encoding for vmovaps 64(%rsp), %zmm0.  The displacement
   byte is scaled by the 64-byte vector size, so 0x01 means 64.  */
        .byte   0x62
        .byte   0xf1
        .byte   0x7c
        .byte   0x48
        .byte   0x28
        .byte   0x44
        .byte   0x24
        .byte   0x01
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret
.endm
8aa92022 AS |
172 | |
/* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version.

   Expands to a function body that splits %zmm0 and %zmm1 into their
   256-bit halves, calls \callee on each pair of corresponding halves
   (arguments in %ymm0 and %ymm1, result in %ymm0) and returns the two
   results joined in %zmm0.

   The result of the first call has to be saved before %ymm0 is
   reloaded for the second call, and both results have to be merged
   into %zmm0 before returning; otherwise the first result is lost and
   only the low half of the return value is meaningful.

   %rbp is used as a frame pointer so that %rsp can be realigned to
   64 bytes.  Scratch buffer layout after the adjustment:
       0(%rsp) ..  63(%rsp)   copy of the first input vector
      64(%rsp) .. 127(%rsp)   copy of the second input vector
     128(%rsp) .. 159(%rsp)   result for the lower halves
     160(%rsp) .. 191(%rsp)   result for the upper halves

   Instructions using %zmm registers are emitted as raw bytes; the
   256-bit moves use ordinary AVX mnemonics.  In the raw encodings the
   displacement byte is scaled by the 64-byte vector size, so 0x01
   means 64 and 0x02 means 128.  */
.macro WRAPPER_IMPL_AVX512_ff callee
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $192, %rsp
/* Below is encoding for vmovaps %zmm0, (%rsp).  */
        .byte   0x62
        .byte   0xf1
        .byte   0x7c
        .byte   0x48
        .byte   0x29
        .byte   0x04
        .byte   0x24
/* Below is encoding for vmovaps %zmm1, 64(%rsp).  ModRM 0x4c selects
   the disp8 form, so the trailing displacement byte is mandatory.  */
        .byte   0x62
        .byte   0xf1
        .byte   0x7c
        .byte   0x48
        .byte   0x29
        .byte   0x4c
        .byte   0x24
        .byte   0x01
        /* First call, on the lower halves.  */
        vmovaps   (%rsp), %ymm0
        vmovaps   64(%rsp), %ymm1
        call      HIDDEN_JUMPTARGET(\callee)
        vmovaps   %ymm0, 128(%rsp)
        /* Second call, on the upper halves.  */
        vmovaps   32(%rsp), %ymm0
        vmovaps   96(%rsp), %ymm1
        call      HIDDEN_JUMPTARGET(\callee)
        vmovaps   %ymm0, 160(%rsp)
/* Below is encoding for vmovaps 128(%rsp), %zmm0.  */
        .byte   0x62
        .byte   0xf1
        .byte   0x7c
        .byte   0x48
        .byte   0x28
        .byte   0x44
        .byte   0x24
        .byte   0x02
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret
.endm