/* Macro library used to help during conversion of scalar math functions to
   vectorized SIMD equivalents on AMD GCN.

   Copyright (C) 2023-2024 Free Software Foundation, Inc.
   Contributed by Siemens.

   This file is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by the
   Free Software Foundation; either version 3, or (at your option) any
   later version.

   This file is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* Union overlaying all the vector types used by this library, so the same
   register contents can be reinterpreted as a different element type or
   element count without any conversion instructions.  The member naming
   convention is t_<mode> for GCC vector mode <mode> (sf = float,
   df = double, qi/hi/si/di = 8/16/32/64-bit integer; the leading vN is
   the lane count).  */
typedef union {
  v2sf t_v2sf;
  v4sf t_v4sf;
  v8sf t_v8sf;
  v16sf t_v16sf;
  v32sf t_v32sf;
  v64sf t_v64sf;

  v2df t_v2df;
  v4df t_v4df;
  v8df t_v8df;
  v16df t_v16df;
  v32df t_v32df;
  v64df t_v64df;

  v64qi t_v64qi;
  v64hi t_v64hi;

  v2si t_v2si;
  v4si t_v4si;
  v8si t_v8si;
  v16si t_v16si;
  v32si t_v32si;
  v64si t_v64si;

  v64usi t_v64usi;

  v2di t_v2di;
  v4di t_v4di;
  v8di t_v8di;
  v16di t_v16di;
  v32di t_v32di;
  v64di t_v64di;
} vector_union;
60 | ||
/* Cast between vectors with a different number of elements, or type.
   The empty asm with a matching "0" constraint ties FROM and the result
   to the same VGPRs, so the register contents are simply reinterpreted
   as TO_T without emitting any instructions.  */

#define VGPR_CAST(to_t, from) \
({ \
  to_t __res; \
  __asm__ ("" : "=v"(__res) : "0"(from)); \
  __res; \
})
69 | ||
/* Combine two 32-bit-element vectors into the low (%L0) and high (%H0)
   halves of a 64-bit-element vector.  The "e"(-1L) input sets the EXEC
   lane selector to all-ones so every lane is moved; the output is
   earlyclobber ("=&v") so HIGH is not clobbered before it is read.  */
#define PACK_SI_PAIR(low, high) \
({ \
  v64udi __res; \
  asm ("v_mov_b32\t%L0, %1\n\t" \
       "v_mov_b32\t%H0, %2" \
       : "=&v"(__res) : "v0"(low), "v"(high), "e"(-1L)); \
  __res; \
})
78 | ||
/* Extract the low 32 bits of each 64-bit element: a plain register
   reinterpretation suffices, since the low half is stored first.  */
#define UNPACK_SI_LOW(to_t, pair) VGPR_CAST(to_t, pair)
/* Extract the high 32 bits (%H1) of each 64-bit element into a
   32-bit-element vector; EXEC is set to all lanes via "e"(-1L).  */
#define UNPACK_SI_HIGH(to_t, pair) \
({ \
  to_t __res; \
  asm ("v_mov_b32\t%0, %H1" : "=v"(__res) : "v"(pair), "e"(-1L)); \
  __res; \
})
86 | ||
/* Combine two 64-bit-element vectors into a 128-bit-element vector:
   %L0/%H0 receive the two halves of LOW and %J0/%K0 the two halves of
   HIGH.  EXEC is set to all lanes; the output is earlyclobber so HIGH
   is not overwritten before its halves are copied.  */
#define PACK_DI_PAIR(low, high) \
({ \
  v64uti __res; \
  asm ("v_mov_b32\t%L0, %L1\n\t" \
       "v_mov_b32\t%H0, %H1\n\t" \
       "v_mov_b32\t%J0, %L2\n\t" \
       "v_mov_b32\t%K0, %H2" \
       : "=&v"(__res) : "v0"(low), "v"(high), "e"(-1L)); \
  __res; \
})
97 | ||
/* Extract the low 64 bits of each 128-bit element by register
   reinterpretation (the low half is stored first).  */
#define UNPACK_DI_LOW(to_t, pair) VGPR_CAST(to_t, pair)
/* Extract the high 64 bits (%J1/%K1) of each 128-bit element into a
   64-bit-element vector.  */
#define UNPACK_DI_HIGH(to_t, pair) \
({ \
  to_t __res; \
  asm ("v_mov_b32\t%L0, %J1\n\t" \
       "v_mov_b32\t%H0, %K1" : "=v"(__res) : "v"(pair), "e"(-1L)); \
  __res; \
})
106 | ||
/* Convenience name: pass the enclosing function's lane mask as the
   condition, i.e. "no additional condition".  */
#define NO_COND __mask

/* Note - __mask is _not_ accounted for in VECTOR_MERGE!
   Lane-wise select: returns VEC1 in lanes where COND is all-ones and
   VEC2 where it is zero (COND is expected to be a -1/0 lane mask, as
   produced by vector comparisons).  The operands are viewed through a
   union as an integer vector of the same total size, so the select is a
   plain bitwise blend regardless of element type; __builtin_choose_expr
   picks the integer width matching sizeof (vec1) at compile time.  */
#define VECTOR_MERGE(vec1, vec2, cond) \
({ \
  _Static_assert (__builtin_types_compatible_p (typeof (vec1), typeof (vec2))); \
  union { \
    typeof (vec1) val; \
    v64qi t_v64qi; \
    v64hi t_v64hi; \
    v64si t_v64si; \
    v64di t_v64di; \
  } __vec1, __vec2, __res; \
  __vec1.val = (vec1); \
  __vec2.val = (vec2); \
  __builtin_choose_expr ( \
        sizeof (vec1) == sizeof (v64si), \
        ({ \
          v64si __bitmask = __builtin_convertvector ((cond), v64si); \
          __res.t_v64si = (__vec1.t_v64si & __bitmask) \
                          | (__vec2.t_v64si & ~__bitmask); \
        }), \
        __builtin_choose_expr ( \
              sizeof (vec1) == sizeof (v64hi), \
              ({ \
                v64hi __bitmask = __builtin_convertvector ((cond), v64hi); \
                __res.t_v64hi = (__vec1.t_v64hi & __bitmask) \
                                | (__vec2.t_v64hi & ~__bitmask); \
              }), \
              __builtin_choose_expr ( \
                    sizeof (vec1) == sizeof (v64qi), \
                    ({ \
                      v64qi __bitmask = __builtin_convertvector ((cond), v64qi); \
                      __res.t_v64qi = (__vec1.t_v64qi & __bitmask) \
                                      | (__vec2.t_v64qi & ~__bitmask); \
                    }), \
                    ({ \
                      v64di __bitmask = __builtin_convertvector ((cond), v64di); \
                      __res.t_v64di = (__vec1.t_v64di & __bitmask) \
                                      | (__vec2.t_v64di & ~__bitmask); \
                    })))); \
  __res.val; \
})
150 | ||
/* Assign VAL into VAR only in lanes where COND holds, additionally
   gated by the enclosing function's lane mask __mask (which must be in
   scope).  Unlike VECTOR_MERGE used directly, inactive lanes keep
   their previous VAR value.  */
#define VECTOR_COND_MOVE(var, val, cond) \
do { \
  _Static_assert (__builtin_types_compatible_p (typeof (var), typeof (val))); \
  __auto_type __cond = __builtin_convertvector ((cond), typeof (__mask)); \
  var = VECTOR_MERGE ((val), var, __cond & __mask); \
} while (0)
157 | ||
/* Vectorized if/else-if/else construct.  Usage pattern:

     VECTOR_IF (cond, cond_var)
       ... VECTOR_COND_MOVE (x, y, cond_var); ...
     VECTOR_ELSEIF (cond2, cond_var)
       ...
     VECTOR_ELSE (cond_var)
       ...
     VECTOR_ENDIF

   Each branch body is entered only if some lane is active (the scalar
   "if" is an optimization — skipping a branch no lane takes); COND_VAR
   holds the per-lane condition for the branch and must be used to
   predicate every assignment inside via VECTOR_COND_MOVE.  __inv_cond
   accumulates the lanes not yet taken by an earlier branch.  Note the
   macros open/close braces across invocations, so they must be matched
   exactly as in the pattern above.  */
#define VECTOR_IF(cond, cond_var) \
{ \
  __auto_type cond_var = (cond); \
  __auto_type __inv_cond __attribute__((unused)) = ~cond_var; \
  if (!ALL_ZEROES_P (cond_var)) \
  {

#define VECTOR_ELSEIF(cond, cond_var) \
  } \
  cond_var = __inv_cond & (cond); \
  __inv_cond &= ~(cond); \
  if (!ALL_ZEROES_P (cond_var)) \
  {

#define VECTOR_ELSE(cond_var) \
  } \
  cond_var = __inv_cond; \
  if (!ALL_ZEROES_P (cond_var)) \
  {

/* Nested variants: PREV_COND_VAR is the active-lane mask of the
   enclosing VECTOR_IF branch, so inner branches only enable lanes that
   were already active outside.  */
#define VECTOR_IF2(cond, cond_var, prev_cond_var) \
{ \
  __auto_type cond_var = (cond) & __builtin_convertvector (prev_cond_var, typeof (cond)); \
  __auto_type __inv_cond __attribute__((unused)) = ~cond_var; \
  if (!ALL_ZEROES_P (cond_var)) \
  {

#define VECTOR_ELSEIF2(cond, cond_var, prev_cond_var) \
  } \
  cond_var = (cond) & __inv_cond & __builtin_convertvector (prev_cond_var, typeof (cond)); \
  __inv_cond &= ~(cond); \
  if (!ALL_ZEROES_P (cond_var)) \
  {

#define VECTOR_ELSE2(cond_var, prev_cond_var) \
  } \
  cond_var = __inv_cond & __builtin_convertvector (prev_cond_var, typeof (__inv_cond)); \
  if (!ALL_ZEROES_P (cond_var)) \
  {


#define VECTOR_ENDIF \
  } \
}
202 | ||
/* Broadcast scalar X into all 64 lanes of a vector of the given TYPE.
   X is evaluated exactly once (into __e) to avoid the usual macro
   multiple-evaluation hazard.  */
#define VECTOR_INIT_AUX(x, type) \
({ \
  typeof (x) __e = (x); \
  type __tmp = { \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e }; \
  __tmp; \
})
217 | ||
/* Type-generic broadcast: splat scalar X into the 64-lane vector type
   matching X's scalar type, selected at compile time via _Generic.  */
#define VECTOR_INIT(x) \
  (_Generic ((x), int: VECTOR_INIT_AUX ((x), v64si), \
                  unsigned: VECTOR_INIT_AUX ((x), v64usi), \
                  char: VECTOR_INIT_AUX ((x), v64qi), \
                  unsigned char: VECTOR_INIT_AUX ((x), v64uqi), \
                  short: VECTOR_INIT_AUX ((x), v64hi), \
                  unsigned short: VECTOR_INIT_AUX ((x), v64uhi), \
                  long: VECTOR_INIT_AUX ((x), v64di), \
                  unsigned long: VECTOR_INIT_AUX ((x), v64udi), \
                  float: VECTOR_INIT_AUX ((x), v64sf), \
                  double: VECTOR_INIT_AUX ((x), v64df)))
229 | ||
230 | ||
/* CDNA3_PLUS is 1 on ISAs newer than the ones listed below; it is used
   (in VECTOR_INIT_MASK) to decide whether a 64-bit-element mask can be
   written with a single v_mov%B0 instruction rather than two 32-bit
   moves.  NOTE(review): presumably CDNA3+ supports the wider move —
   confirm against the target ISA manual if adding new targets.  */
#if defined (__GCN3__) || defined (__GCN5__) \
    || defined (__CDNA1__) || defined (__CDNA2__) \
    || defined (__RDNA2__)
#define CDNA3_PLUS 0
#else
#define CDNA3_PLUS 1
#endif
238 | ||
/* Build a lane mask of type MASKMODE (which the including file must
   define, e.g. v64si or v64di) with the low COUNT lanes all-ones and
   the remaining lanes zero.  The "e" input constraint supplies the EXEC
   lane selector, so each v_mov only writes the selected lanes: first
   all lanes are zeroed under a full EXEC, then -1 is written under an
   EXEC of the low COUNT bits.  For 512-byte (64-bit element) masks on
   pre-CDNA3 ISAs each 32-bit half (%L0/%H0) is written separately.  */
#define VECTOR_INIT_MASK(COUNT) \
({ \
  MASKMODE __mask; \
  int count = (COUNT); \
  if (count == 64) \
    { \
      if (sizeof (MASKMODE) < 512 || CDNA3_PLUS) \
        asm ("v_mov%B0\t%0, -1" : "=v"(__mask) : "e"(-1L)); \
      else \
        asm ("v_mov_b32\t%L0, -1\n\t" \
             "v_mov_b32\t%H0, -1" : "=v"(__mask) : "e"(-1L)); \
    } \
  else \
    { \
      /* Shift 1L, not 1: a 32-bit shift by count >= 32 (e.g. the \
         COUNT == 32 variant) is undefined behaviour.  */ \
      long bitmask = (count == 64 ? -1 : (1L << count) - 1); \
      if (sizeof (MASKMODE) < 512 || CDNA3_PLUS) \
        { \
          asm ("v_mov%B0\t%0, 0" : "=v"(__mask) : "e"(-1L)); \
          asm ("v_mov%B0\t%0, -1" : "+v"(__mask) : "e"(bitmask)); \
        } \
      else \
        { \
          asm ("v_mov_b32\t%L0, 0\n\t" \
               "v_mov_b32\t%H0, 0" : "=v"(__mask) : "e"(-1L)); \
          asm ("v_mov_b32\t%L0, -1\n\t" \
               "v_mov_b32\t%H0, -1" : "+v"(__mask) : "e"(bitmask)); \
        } \
    } \
  __mask; \
})
269 | ||
/* True when no lane of X (restricted to the active lanes of the
   enclosing __mask) is non-zero.  */
#define ALL_ZEROES_P(x) (COND_TO_BITMASK(x) == 0)

/* Reduce a per-lane condition vector X to a 64-bit scalar bitmask, one
   bit per lane, using v_cmp_ne against zero.  X is first ANDed with the
   enclosing __mask (which must be in scope), so inactive lanes always
   contribute 0.  The element width of the compare (u32 vs u64) is
   chosen from sizeof (__mask): 512 bytes means 64-bit elements.  */
#define COND_TO_BITMASK(x) \
({ \
  long __tmp = 0; \
  __auto_type __x = __builtin_convertvector((x), typeof (__mask)) & __mask; \
  __builtin_choose_expr (sizeof (__mask) != 512, \
                         ({ asm ("v_cmp_ne_u32_e64 %0, %1, 0" \
                                 : "=Sg" (__tmp) \
                                 : "v" (__x)); }), \
                         ({ asm ("v_cmp_ne_u64_e64 %0, %1, 0" \
                                 : "=Sg" (__tmp) \
                                 : "v" (__x)); })); \
  __tmp; \
})
285 | ||
/* Vectorized while loop: iterate while any lane of COND (within the
   lanes active in PREV_COND_VAR) remains true.  COND_VAR narrows
   monotonically — once a lane's condition fails it stays inactive —
   and must be used to predicate assignments in the body.  Must be
   closed with VECTOR_ENDWHILE (the braces span both macros).  */
#define VECTOR_WHILE(cond, cond_var, prev_cond_var) \
{ \
  __auto_type cond_var = prev_cond_var; \
  for (;;) { \
    cond_var &= (cond); \
    if (ALL_ZEROES_P (cond_var)) \
      break;

#define VECTOR_ENDWHILE \
  } \
}
297 | ||
/* Define one public entry point FUN##v##COUNT##SUFFIX operating on
   COUNT-lane vectors: the arguments are widened (register-reinterpreted)
   to 64 lanes, a mask enabling only the low COUNT lanes is built, the
   64-lane worker FUN##v64##SUFFIX##_aux does the computation, and the
   result is narrowed back.  OTYPE/TYPE are the output/input element-type
   suffixes (e.g. sf, df).  */
#define DEF_VARIANT(FUN, SUFFIX, OTYPE, TYPE, COUNT) \
v##COUNT##OTYPE \
FUN##v##COUNT##SUFFIX (v##COUNT##TYPE __arg1, v##COUNT##TYPE __arg2) \
{ \
  __auto_type __upsized_arg1 = VGPR_CAST (v64##TYPE, __arg1); \
  __auto_type __upsized_arg2 = VGPR_CAST (v64##TYPE, __arg2); \
  __auto_type __mask = VECTOR_INIT_MASK (COUNT); \
  __auto_type __result = FUN##v64##SUFFIX##_aux (__upsized_arg1, __upsized_arg2, __mask); \
  return VGPR_CAST (v##COUNT##OTYPE, __result); \
}
308 | ||
/* Instantiate entry points for every supported lane count (2..64) where
   the output element type equals the input element type.  */
#define DEF_VARIANTS(FUN, SUFFIX, TYPE) \
  DEF_VARIANT (FUN, SUFFIX, TYPE, TYPE, 2) \
  DEF_VARIANT (FUN, SUFFIX, TYPE, TYPE, 4) \
  DEF_VARIANT (FUN, SUFFIX, TYPE, TYPE, 8) \
  DEF_VARIANT (FUN, SUFFIX, TYPE, TYPE, 16) \
  DEF_VARIANT (FUN, SUFFIX, TYPE, TYPE, 32) \
  DEF_VARIANT (FUN, SUFFIX, TYPE, TYPE, 64)

/* As DEF_VARIANTS, but for functions whose output element type (OTYPE)
   differs from the input element type (TYPE).  */
#define DEF_VARIANTS_B(FUN, SUFFIX, OTYPE, TYPE) \
  DEF_VARIANT (FUN, SUFFIX, OTYPE, TYPE, 2) \
  DEF_VARIANT (FUN, SUFFIX, OTYPE, TYPE, 4) \
  DEF_VARIANT (FUN, SUFFIX, OTYPE, TYPE, 8) \
  DEF_VARIANT (FUN, SUFFIX, OTYPE, TYPE, 16) \
  DEF_VARIANT (FUN, SUFFIX, OTYPE, TYPE, 32) \
  DEF_VARIANT (FUN, SUFFIX, OTYPE, TYPE, 64)