1/*
2 * Generic vector operation expansion
3 *
4 * Copyright (c) 2018 Linaro
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20#include "qemu/osdep.h"
21#include "qemu-common.h"
22#include "tcg.h"
23#include "tcg-op.h"
24#include "tcg-op-gvec.h"
25#include "tcg-gvec-desc.h"
26
27#define MAX_UNROLL 4
28
29/* Verify vector size and alignment rules. OFS should be the OR of all
30 of the operand offsets so that we can check them all at once. */
31static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
32{
33 uint32_t opr_align = oprsz >= 16 ? 15 : 7;
34 uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7;
35 tcg_debug_assert(oprsz > 0);
36 tcg_debug_assert(oprsz <= maxsz);
37 tcg_debug_assert((oprsz & opr_align) == 0);
38 tcg_debug_assert((maxsz & max_align) == 0);
39 tcg_debug_assert((ofs & max_align) == 0);
40}
41
42/* Verify vector overlap rules for two operands. */
43static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
44{
45 tcg_debug_assert(d == a || d + s <= a || a + s <= d);
46}
47
48/* Verify vector overlap rules for three operands. */
49static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
50{
51 check_overlap_2(d, a, s);
52 check_overlap_2(d, b, s);
53 check_overlap_2(a, b, s);
54}
55
56/* Verify vector overlap rules for four operands. */
57static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
58 uint32_t c, uint32_t s)
59{
60 check_overlap_2(d, a, s);
61 check_overlap_2(d, b, s);
62 check_overlap_2(d, c, s);
63 check_overlap_2(a, b, s);
64 check_overlap_2(a, c, s);
65 check_overlap_2(b, c, s);
66}
67
68/* Create a descriptor from components. */
69uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
70{
71 uint32_t desc = 0;
72
73 assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
74 assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
75 assert(data == sextract32(data, 0, SIMD_DATA_BITS));
76
77 oprsz = (oprsz / 8) - 1;
78 maxsz = (maxsz / 8) - 1;
79 desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
80 desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
81 desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
82
83 return desc;
84}
85
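/* A worked example of the packing above: simd_desc(16, 16, 0) stores
 * (16 / 8) - 1 = 1 in both the OPRSZ and MAXSZ fields and 0 in the DATA
 * field.  The out-of-line helpers recover the byte sizes (see
 * tcg-gvec-desc.h), undoing the divide-by-8 and the -1 bias.
 */
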
86/* Generate a call to a gvec-style helper with two vector operands. */
87void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
88 uint32_t oprsz, uint32_t maxsz, int32_t data,
89 gen_helper_gvec_2 *fn)
90{
91 TCGv_ptr a0, a1;
92 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
93
94 a0 = tcg_temp_new_ptr();
95 a1 = tcg_temp_new_ptr();
96
97 tcg_gen_addi_ptr(a0, cpu_env, dofs);
98 tcg_gen_addi_ptr(a1, cpu_env, aofs);
99
100 fn(a0, a1, desc);
101
102 tcg_temp_free_ptr(a0);
103 tcg_temp_free_ptr(a1);
104 tcg_temp_free_i32(desc);
105}
106
107/* Generate a call to a gvec-style helper with two vector operands
108 and one scalar operand. */
109void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
110 uint32_t oprsz, uint32_t maxsz, int32_t data,
111 gen_helper_gvec_2i *fn)
112{
113 TCGv_ptr a0, a1;
114 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
115
116 a0 = tcg_temp_new_ptr();
117 a1 = tcg_temp_new_ptr();
118
119 tcg_gen_addi_ptr(a0, cpu_env, dofs);
120 tcg_gen_addi_ptr(a1, cpu_env, aofs);
121
122 fn(a0, a1, c, desc);
123
124 tcg_temp_free_ptr(a0);
125 tcg_temp_free_ptr(a1);
126 tcg_temp_free_i32(desc);
127}
128
129/* Generate a call to a gvec-style helper with three vector operands. */
130void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
131 uint32_t oprsz, uint32_t maxsz, int32_t data,
132 gen_helper_gvec_3 *fn)
133{
134 TCGv_ptr a0, a1, a2;
135 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
136
137 a0 = tcg_temp_new_ptr();
138 a1 = tcg_temp_new_ptr();
139 a2 = tcg_temp_new_ptr();
140
141 tcg_gen_addi_ptr(a0, cpu_env, dofs);
142 tcg_gen_addi_ptr(a1, cpu_env, aofs);
143 tcg_gen_addi_ptr(a2, cpu_env, bofs);
144
145 fn(a0, a1, a2, desc);
146
147 tcg_temp_free_ptr(a0);
148 tcg_temp_free_ptr(a1);
149 tcg_temp_free_ptr(a2);
150 tcg_temp_free_i32(desc);
151}
152
153/* Generate a call to a gvec-style helper with four vector operands. */
154void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
155 uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
156 int32_t data, gen_helper_gvec_4 *fn)
157{
158 TCGv_ptr a0, a1, a2, a3;
159 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
160
161 a0 = tcg_temp_new_ptr();
162 a1 = tcg_temp_new_ptr();
163 a2 = tcg_temp_new_ptr();
164 a3 = tcg_temp_new_ptr();
165
166 tcg_gen_addi_ptr(a0, cpu_env, dofs);
167 tcg_gen_addi_ptr(a1, cpu_env, aofs);
168 tcg_gen_addi_ptr(a2, cpu_env, bofs);
169 tcg_gen_addi_ptr(a3, cpu_env, cofs);
170
171 fn(a0, a1, a2, a3, desc);
172
173 tcg_temp_free_ptr(a0);
174 tcg_temp_free_ptr(a1);
175 tcg_temp_free_ptr(a2);
176 tcg_temp_free_ptr(a3);
177 tcg_temp_free_i32(desc);
178}
179
180/* Generate a call to a gvec-style helper with five vector operands. */
181void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
182 uint32_t cofs, uint32_t xofs, uint32_t oprsz,
183 uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
184{
185 TCGv_ptr a0, a1, a2, a3, a4;
186 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
187
188 a0 = tcg_temp_new_ptr();
189 a1 = tcg_temp_new_ptr();
190 a2 = tcg_temp_new_ptr();
191 a3 = tcg_temp_new_ptr();
192 a4 = tcg_temp_new_ptr();
193
194 tcg_gen_addi_ptr(a0, cpu_env, dofs);
195 tcg_gen_addi_ptr(a1, cpu_env, aofs);
196 tcg_gen_addi_ptr(a2, cpu_env, bofs);
197 tcg_gen_addi_ptr(a3, cpu_env, cofs);
198 tcg_gen_addi_ptr(a4, cpu_env, xofs);
199
200 fn(a0, a1, a2, a3, a4, desc);
201
202 tcg_temp_free_ptr(a0);
203 tcg_temp_free_ptr(a1);
204 tcg_temp_free_ptr(a2);
205 tcg_temp_free_ptr(a3);
206 tcg_temp_free_ptr(a4);
207 tcg_temp_free_i32(desc);
208}
209
210/* Generate a call to a gvec-style helper with two vector operands
211 and an extra pointer operand. */
212void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
213 TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
214 int32_t data, gen_helper_gvec_2_ptr *fn)
215{
216 TCGv_ptr a0, a1;
217 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
218
219 a0 = tcg_temp_new_ptr();
220 a1 = tcg_temp_new_ptr();
221
222 tcg_gen_addi_ptr(a0, cpu_env, dofs);
223 tcg_gen_addi_ptr(a1, cpu_env, aofs);
224
225 fn(a0, a1, ptr, desc);
226
227 tcg_temp_free_ptr(a0);
228 tcg_temp_free_ptr(a1);
229 tcg_temp_free_i32(desc);
230}
231
232/* Generate a call to a gvec-style helper with three vector operands
233 and an extra pointer operand. */
234void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
235 TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
236 int32_t data, gen_helper_gvec_3_ptr *fn)
237{
238 TCGv_ptr a0, a1, a2;
239 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
240
241 a0 = tcg_temp_new_ptr();
242 a1 = tcg_temp_new_ptr();
243 a2 = tcg_temp_new_ptr();
244
245 tcg_gen_addi_ptr(a0, cpu_env, dofs);
246 tcg_gen_addi_ptr(a1, cpu_env, aofs);
247 tcg_gen_addi_ptr(a2, cpu_env, bofs);
248
249 fn(a0, a1, a2, ptr, desc);
250
251 tcg_temp_free_ptr(a0);
252 tcg_temp_free_ptr(a1);
253 tcg_temp_free_ptr(a2);
254 tcg_temp_free_i32(desc);
255}
256
257/* Generate a call to a gvec-style helper with four vector operands
258 and an extra pointer operand. */
259void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
260 uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
261 uint32_t maxsz, int32_t data,
262 gen_helper_gvec_4_ptr *fn)
263{
264 TCGv_ptr a0, a1, a2, a3;
265 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
266
267 a0 = tcg_temp_new_ptr();
268 a1 = tcg_temp_new_ptr();
269 a2 = tcg_temp_new_ptr();
270 a3 = tcg_temp_new_ptr();
271
272 tcg_gen_addi_ptr(a0, cpu_env, dofs);
273 tcg_gen_addi_ptr(a1, cpu_env, aofs);
274 tcg_gen_addi_ptr(a2, cpu_env, bofs);
275 tcg_gen_addi_ptr(a3, cpu_env, cofs);
276
277 fn(a0, a1, a2, a3, ptr, desc);
278
279 tcg_temp_free_ptr(a0);
280 tcg_temp_free_ptr(a1);
281 tcg_temp_free_ptr(a2);
282 tcg_temp_free_ptr(a3);
283 tcg_temp_free_i32(desc);
284}
285
286/* Return true if we want to implement something of OPRSZ bytes
287 in units of LNSZ. This limits the expansion of inline code. */
288static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
289{
290 if (oprsz % lnsz == 0) {
291 uint32_t lnct = oprsz / lnsz;
292 return lnct >= 1 && lnct <= MAX_UNROLL;
293 }
294 return false;
295}
296
297static void expand_clr(uint32_t dofs, uint32_t maxsz);
298
299/* Duplicate C as per VECE. */
300uint64_t (dup_const)(unsigned vece, uint64_t c)
301{
302 switch (vece) {
303 case MO_8:
304 return 0x0101010101010101ull * (uint8_t)c;
305 case MO_16:
306 return 0x0001000100010001ull * (uint16_t)c;
307 case MO_32:
308 return 0x0000000100000001ull * (uint32_t)c;
309 case MO_64:
310 return c;
311 default:
312 g_assert_not_reached();
313 }
314}
315
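/* The multiplications above replicate the low element across the i64:
 * multiplying a zero-extended byte by 0x0101010101010101 copies it into
 * every byte, e.g. dup_const(MO_8, 0x2a) == 0x2a2a2a2a2a2a2a2a, and
 * likewise for the 16-bit and 32-bit patterns.
 */
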
316/* Duplicate IN into OUT as per VECE. */
317static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
318{
319 switch (vece) {
320 case MO_8:
321 tcg_gen_ext8u_i32(out, in);
322 tcg_gen_muli_i32(out, out, 0x01010101);
323 break;
324 case MO_16:
325 tcg_gen_deposit_i32(out, in, in, 16, 16);
326 break;
327 case MO_32:
328 tcg_gen_mov_i32(out, in);
329 break;
330 default:
331 g_assert_not_reached();
332 }
333}
334
335static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
336{
337 switch (vece) {
338 case MO_8:
339 tcg_gen_ext8u_i64(out, in);
340 tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
341 break;
342 case MO_16:
343 tcg_gen_ext16u_i64(out, in);
344 tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
345 break;
346 case MO_32:
347 tcg_gen_deposit_i64(out, in, in, 32, 32);
348 break;
349 case MO_64:
350 tcg_gen_mov_i64(out, in);
351 break;
352 default:
353 g_assert_not_reached();
354 }
355}
356
357/* Select a supported vector type for implementing an operation on SIZE
358 * bytes. If OP is 0, assume that the real operation to be performed is
359 * required by all backends. Otherwise, make sure that OP can be performed
360 * on elements of size VECE in the selected type. Do not select V64 if
361 * PREFER_I64 is true. Return 0 if no vector type is selected.
362 */
363static TCGType choose_vector_type(TCGOpcode op, unsigned vece, uint32_t size,
364 bool prefer_i64)
365{
366 if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
367 if (op == 0) {
368 return TCG_TYPE_V256;
369 }
370 /* Recall that ARM SVE allows vector sizes that are not a
371 * power of 2, but always a multiple of 16. The intent is
372 * that e.g. size == 80 would be expanded with 2x32 + 1x16.
373 * It is hard to imagine a case in which v256 is supported
374 * but v128 is not, but check anyway.
375 */
376 if (tcg_can_emit_vec_op(op, TCG_TYPE_V256, vece)
377 && (size % 32 == 0
378 || tcg_can_emit_vec_op(op, TCG_TYPE_V128, vece))) {
379 return TCG_TYPE_V256;
380 }
381 }
382 if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
383 && (op == 0 || tcg_can_emit_vec_op(op, TCG_TYPE_V128, vece))) {
384 return TCG_TYPE_V128;
385 }
386 if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
387 && (op == 0 || tcg_can_emit_vec_op(op, TCG_TYPE_V64, vece))) {
388 return TCG_TYPE_V64;
389 }
390 return 0;
391}
392
393/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
394 * Only one of IN_32 or IN_64 may be set;
395 * IN_C is used if IN_32 and IN_64 are unset.
396 */
397static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
398 uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
399 uint64_t in_c)
400{
401 TCGType type;
402 TCGv_i64 t_64;
403 TCGv_i32 t_32, t_desc;
404 TCGv_ptr t_ptr;
405 uint32_t i;
406
407 assert(vece <= (in_32 ? MO_32 : MO_64));
408 assert(in_32 == NULL || in_64 == NULL);
409
410 /* If we're storing 0, expand oprsz to maxsz. */
411 if (in_32 == NULL && in_64 == NULL) {
412 in_c = dup_const(vece, in_c);
413 if (in_c == 0) {
414 oprsz = maxsz;
415 }
416 }
417
418 /* Implement inline with a vector type, if possible.
419 * Prefer integer when 64-bit host and no variable dup.
420 */
421 type = choose_vector_type(0, vece, oprsz,
422 (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
423 && (in_64 == NULL || vece == MO_64)));
424 if (type != 0) {
425 TCGv_vec t_vec = tcg_temp_new_vec(type);
426
427 if (in_32) {
428 tcg_gen_dup_i32_vec(vece, t_vec, in_32);
429 } else if (in_64) {
430 tcg_gen_dup_i64_vec(vece, t_vec, in_64);
431 } else {
432 switch (vece) {
433 case MO_8:
434 tcg_gen_dup8i_vec(t_vec, in_c);
435 break;
436 case MO_16:
437 tcg_gen_dup16i_vec(t_vec, in_c);
438 break;
439 case MO_32:
440 tcg_gen_dup32i_vec(t_vec, in_c);
441 break;
442 default:
443 tcg_gen_dup64i_vec(t_vec, in_c);
444 break;
445 }
446 }
447
448 i = 0;
449 switch (type) {
450 case TCG_TYPE_V256:
451 /* Recall that ARM SVE allows vector sizes that are not a
452 * power of 2, but always a multiple of 16. The intent is
453 * that e.g. size == 80 would be expanded with 2x32 + 1x16.
454 */
455 for (; i + 32 <= oprsz; i += 32) {
456 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
457 }
458 /* fallthru */
459 case TCG_TYPE_V128:
460 for (; i + 16 <= oprsz; i += 16) {
461 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
462 }
463 break;
464 case TCG_TYPE_V64:
465 for (; i < oprsz; i += 8) {
466 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
467 }
468 break;
469 default:
470 g_assert_not_reached();
471 }
472
473 tcg_temp_free_vec(t_vec);
474 goto done;
475 }
476
477 /* Otherwise, inline with an integer type, unless "large". */
478 if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
479 t_64 = NULL;
480 t_32 = NULL;
481
482 if (in_32) {
483 /* We are given a 32-bit variable input. For a 64-bit host,
484 use a 64-bit operation unless the 32-bit operation would
485 be simple enough. */
486 if (TCG_TARGET_REG_BITS == 64
487 && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
488 t_64 = tcg_temp_new_i64();
489 tcg_gen_extu_i32_i64(t_64, in_32);
490 gen_dup_i64(vece, t_64, t_64);
491 } else {
492 t_32 = tcg_temp_new_i32();
493 gen_dup_i32(vece, t_32, in_32);
494 }
495 } else if (in_64) {
496 /* We are given a 64-bit variable input. */
497 t_64 = tcg_temp_new_i64();
498 gen_dup_i64(vece, t_64, in_64);
499 } else {
500 /* We are given a constant input. */
501 /* For 64-bit hosts, use 64-bit constants for "simple" constants
502 or when we'd need too many 32-bit stores, or when a 64-bit
503 constant is really required. */
504 if (vece == MO_64
505 || (TCG_TARGET_REG_BITS == 64
506 && (in_c == 0 || in_c == -1
507 || !check_size_impl(oprsz, 4)))) {
508 t_64 = tcg_const_i64(in_c);
509 } else {
510 t_32 = tcg_const_i32(in_c);
511 }
512 }
513
514 /* Implement inline if we picked an implementation size above. */
515 if (t_32) {
516 for (i = 0; i < oprsz; i += 4) {
517 tcg_gen_st_i32(t_32, cpu_env, dofs + i);
518 }
519 tcg_temp_free_i32(t_32);
520 goto done;
521 }
522 if (t_64) {
523 for (i = 0; i < oprsz; i += 8) {
524 tcg_gen_st_i64(t_64, cpu_env, dofs + i);
525 }
526 tcg_temp_free_i64(t_64);
527 goto done;
528 }
529 }
530
531 /* Otherwise implement out of line. */
532 t_ptr = tcg_temp_new_ptr();
533 tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
534 t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));
535
536 if (vece == MO_64) {
537 if (in_64) {
538 gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
539 } else {
540 t_64 = tcg_const_i64(in_c);
541 gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
542 tcg_temp_free_i64(t_64);
543 }
544 } else {
545 typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
546 static dup_fn * const fns[3] = {
547 gen_helper_gvec_dup8,
548 gen_helper_gvec_dup16,
549 gen_helper_gvec_dup32
550 };
551
552 if (in_32) {
553 fns[vece](t_ptr, t_desc, in_32);
554 } else {
555 t_32 = tcg_temp_new_i32();
556 if (in_64) {
557 tcg_gen_extrl_i64_i32(t_32, in_64);
558 } else if (vece == MO_8) {
559 tcg_gen_movi_i32(t_32, in_c & 0xff);
560 } else if (vece == MO_16) {
561 tcg_gen_movi_i32(t_32, in_c & 0xffff);
562 } else {
563 tcg_gen_movi_i32(t_32, in_c);
564 }
565 fns[vece](t_ptr, t_desc, t_32);
566 tcg_temp_free_i32(t_32);
567 }
568 }
569
570 tcg_temp_free_ptr(t_ptr);
571 tcg_temp_free_i32(t_desc);
572 return;
573
574 done:
575 if (oprsz < maxsz) {
576 expand_clr(dofs + oprsz, maxsz - oprsz);
577 }
578}
579
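/* Summary of the strategy in do_dup above: build the replicated value once
 * in a host vector register and store it repeatedly if a suitable vector
 * type exists; otherwise, for small regions, replicate into an i32/i64
 * temporary and emit integer stores; otherwise fall back to one of the
 * gen_helper_gvec_dup* out-of-line helpers.  When the value being stored
 * is zero, oprsz is widened to maxsz up front, since the tail would have
 * been cleared to zero anyway.
 */
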
580/* Likewise, but with zero. */
581static void expand_clr(uint32_t dofs, uint32_t maxsz)
582{
583 do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
584}
585
586/* Expand OPSZ bytes worth of two-operand operations using i32 elements. */
587static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
588 void (*fni)(TCGv_i32, TCGv_i32))
589{
590 TCGv_i32 t0 = tcg_temp_new_i32();
591 uint32_t i;
592
593 for (i = 0; i < oprsz; i += 4) {
594 tcg_gen_ld_i32(t0, cpu_env, aofs + i);
595 fni(t0, t0);
596 tcg_gen_st_i32(t0, cpu_env, dofs + i);
597 }
598 tcg_temp_free_i32(t0);
599}
600
601static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
602 int32_t c, bool load_dest,
603 void (*fni)(TCGv_i32, TCGv_i32, int32_t))
604{
605 TCGv_i32 t0 = tcg_temp_new_i32();
606 TCGv_i32 t1 = tcg_temp_new_i32();
607 uint32_t i;
608
609 for (i = 0; i < oprsz; i += 4) {
610 tcg_gen_ld_i32(t0, cpu_env, aofs + i);
611 if (load_dest) {
612 tcg_gen_ld_i32(t1, cpu_env, dofs + i);
613 }
614 fni(t1, t0, c);
615 tcg_gen_st_i32(t1, cpu_env, dofs + i);
616 }
617 tcg_temp_free_i32(t0);
618 tcg_temp_free_i32(t1);
619}
620
621static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
622 TCGv_i32 c, bool scalar_first,
623 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
624{
625 TCGv_i32 t0 = tcg_temp_new_i32();
626 TCGv_i32 t1 = tcg_temp_new_i32();
627 uint32_t i;
628
629 for (i = 0; i < oprsz; i += 4) {
630 tcg_gen_ld_i32(t0, cpu_env, aofs + i);
631 if (scalar_first) {
632 fni(t1, c, t0);
633 } else {
634 fni(t1, t0, c);
635 }
636 tcg_gen_st_i32(t1, cpu_env, dofs + i);
637 }
638 tcg_temp_free_i32(t0);
639 tcg_temp_free_i32(t1);
640}
641
642/* Expand OPSZ bytes worth of three-operand operations using i32 elements. */
643static void expand_3_i32(uint32_t dofs, uint32_t aofs,
644 uint32_t bofs, uint32_t oprsz, bool load_dest,
645 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
646{
647 TCGv_i32 t0 = tcg_temp_new_i32();
648 TCGv_i32 t1 = tcg_temp_new_i32();
649 TCGv_i32 t2 = tcg_temp_new_i32();
650 uint32_t i;
651
652 for (i = 0; i < oprsz; i += 4) {
653 tcg_gen_ld_i32(t0, cpu_env, aofs + i);
654 tcg_gen_ld_i32(t1, cpu_env, bofs + i);
655 if (load_dest) {
656 tcg_gen_ld_i32(t2, cpu_env, dofs + i);
657 }
658 fni(t2, t0, t1);
659 tcg_gen_st_i32(t2, cpu_env, dofs + i);
660 }
661 tcg_temp_free_i32(t2);
662 tcg_temp_free_i32(t1);
663 tcg_temp_free_i32(t0);
664}
665
666/* Expand OPSZ bytes worth of four-operand operations using i32 elements. */
667static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
668 uint32_t cofs, uint32_t oprsz,
669 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
670{
671 TCGv_i32 t0 = tcg_temp_new_i32();
672 TCGv_i32 t1 = tcg_temp_new_i32();
673 TCGv_i32 t2 = tcg_temp_new_i32();
674 TCGv_i32 t3 = tcg_temp_new_i32();
675 uint32_t i;
676
677 for (i = 0; i < oprsz; i += 4) {
678 tcg_gen_ld_i32(t1, cpu_env, aofs + i);
679 tcg_gen_ld_i32(t2, cpu_env, bofs + i);
680 tcg_gen_ld_i32(t3, cpu_env, cofs + i);
681 fni(t0, t1, t2, t3);
682 tcg_gen_st_i32(t0, cpu_env, dofs + i);
683 }
684 tcg_temp_free_i32(t3);
685 tcg_temp_free_i32(t2);
686 tcg_temp_free_i32(t1);
687 tcg_temp_free_i32(t0);
688}
689
690/* Expand OPSZ bytes worth of two-operand operations using i64 elements. */
691static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
692 void (*fni)(TCGv_i64, TCGv_i64))
693{
694 TCGv_i64 t0 = tcg_temp_new_i64();
695 uint32_t i;
696
697 for (i = 0; i < oprsz; i += 8) {
698 tcg_gen_ld_i64(t0, cpu_env, aofs + i);
699 fni(t0, t0);
700 tcg_gen_st_i64(t0, cpu_env, dofs + i);
701 }
702 tcg_temp_free_i64(t0);
703}
704
705static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
706 int64_t c, bool load_dest,
707 void (*fni)(TCGv_i64, TCGv_i64, int64_t))
708{
709 TCGv_i64 t0 = tcg_temp_new_i64();
710 TCGv_i64 t1 = tcg_temp_new_i64();
711 uint32_t i;
712
713 for (i = 0; i < oprsz; i += 8) {
714 tcg_gen_ld_i64(t0, cpu_env, aofs + i);
715 if (load_dest) {
716 tcg_gen_ld_i64(t1, cpu_env, dofs + i);
717 }
718 fni(t1, t0, c);
719 tcg_gen_st_i64(t1, cpu_env, dofs + i);
720 }
721 tcg_temp_free_i64(t0);
722 tcg_temp_free_i64(t1);
723}
724
725static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
726 TCGv_i64 c, bool scalar_first,
727 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
728{
729 TCGv_i64 t0 = tcg_temp_new_i64();
730 TCGv_i64 t1 = tcg_temp_new_i64();
731 uint32_t i;
732
733 for (i = 0; i < oprsz; i += 8) {
734 tcg_gen_ld_i64(t0, cpu_env, aofs + i);
735 if (scalar_first) {
736 fni(t1, c, t0);
737 } else {
738 fni(t1, t0, c);
739 }
740 tcg_gen_st_i64(t1, cpu_env, dofs + i);
741 }
742 tcg_temp_free_i64(t0);
743 tcg_temp_free_i64(t1);
744}
745
746/* Expand OPSZ bytes worth of three-operand operations using i64 elements. */
747static void expand_3_i64(uint32_t dofs, uint32_t aofs,
748 uint32_t bofs, uint32_t oprsz, bool load_dest,
749 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
750{
751 TCGv_i64 t0 = tcg_temp_new_i64();
752 TCGv_i64 t1 = tcg_temp_new_i64();
753 TCGv_i64 t2 = tcg_temp_new_i64();
754 uint32_t i;
755
756 for (i = 0; i < oprsz; i += 8) {
757 tcg_gen_ld_i64(t0, cpu_env, aofs + i);
758 tcg_gen_ld_i64(t1, cpu_env, bofs + i);
759 if (load_dest) {
760 tcg_gen_ld_i64(t2, cpu_env, dofs + i);
761 }
762 fni(t2, t0, t1);
763 tcg_gen_st_i64(t2, cpu_env, dofs + i);
764 }
765 tcg_temp_free_i64(t2);
766 tcg_temp_free_i64(t1);
767 tcg_temp_free_i64(t0);
768}
769
770/* Expand OPSZ bytes worth of four-operand operations using i64 elements. */
771static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
772 uint32_t cofs, uint32_t oprsz,
773 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
774{
775 TCGv_i64 t0 = tcg_temp_new_i64();
776 TCGv_i64 t1 = tcg_temp_new_i64();
777 TCGv_i64 t2 = tcg_temp_new_i64();
778 TCGv_i64 t3 = tcg_temp_new_i64();
779 uint32_t i;
780
781 for (i = 0; i < oprsz; i += 8) {
782 tcg_gen_ld_i64(t1, cpu_env, aofs + i);
783 tcg_gen_ld_i64(t2, cpu_env, bofs + i);
784 tcg_gen_ld_i64(t3, cpu_env, cofs + i);
785 fni(t0, t1, t2, t3);
786 tcg_gen_st_i64(t0, cpu_env, dofs + i);
787 }
788 tcg_temp_free_i64(t3);
789 tcg_temp_free_i64(t2);
790 tcg_temp_free_i64(t1);
791 tcg_temp_free_i64(t0);
792}
793
794/* Expand OPSZ bytes worth of two-operand operations using host vectors. */
795static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
796 uint32_t oprsz, uint32_t tysz, TCGType type,
797 void (*fni)(unsigned, TCGv_vec, TCGv_vec))
798{
799 TCGv_vec t0 = tcg_temp_new_vec(type);
800 uint32_t i;
801
802 for (i = 0; i < oprsz; i += tysz) {
803 tcg_gen_ld_vec(t0, cpu_env, aofs + i);
804 fni(vece, t0, t0);
805 tcg_gen_st_vec(t0, cpu_env, dofs + i);
806 }
807 tcg_temp_free_vec(t0);
808}
809
810/* Expand OPSZ bytes worth of two-vector operands and an immediate operand
811 using host vectors. */
812static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
813 uint32_t oprsz, uint32_t tysz, TCGType type,
814 int64_t c, bool load_dest,
815 void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
816{
817 TCGv_vec t0 = tcg_temp_new_vec(type);
818 TCGv_vec t1 = tcg_temp_new_vec(type);
819 uint32_t i;
820
821 for (i = 0; i < oprsz; i += tysz) {
822 tcg_gen_ld_vec(t0, cpu_env, aofs + i);
823 if (load_dest) {
824 tcg_gen_ld_vec(t1, cpu_env, dofs + i);
825 }
826 fni(vece, t1, t0, c);
827 tcg_gen_st_vec(t1, cpu_env, dofs + i);
828 }
829 tcg_temp_free_vec(t0);
830 tcg_temp_free_vec(t1);
831}
832
833static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
834 uint32_t oprsz, uint32_t tysz, TCGType type,
835 TCGv_vec c, bool scalar_first,
836 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
837{
838 TCGv_vec t0 = tcg_temp_new_vec(type);
839 TCGv_vec t1 = tcg_temp_new_vec(type);
840 uint32_t i;
841
842 for (i = 0; i < oprsz; i += tysz) {
843 tcg_gen_ld_vec(t0, cpu_env, aofs + i);
844 if (scalar_first) {
845 fni(vece, t1, c, t0);
846 } else {
847 fni(vece, t1, t0, c);
848 }
849 tcg_gen_st_vec(t1, cpu_env, dofs + i);
850 }
851 tcg_temp_free_vec(t0);
852 tcg_temp_free_vec(t1);
853}
854
855/* Expand OPSZ bytes worth of three-operand operations using host vectors. */
856static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
857 uint32_t bofs, uint32_t oprsz,
858 uint32_t tysz, TCGType type, bool load_dest,
859 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
860{
861 TCGv_vec t0 = tcg_temp_new_vec(type);
862 TCGv_vec t1 = tcg_temp_new_vec(type);
863 TCGv_vec t2 = tcg_temp_new_vec(type);
864 uint32_t i;
865
866 for (i = 0; i < oprsz; i += tysz) {
867 tcg_gen_ld_vec(t0, cpu_env, aofs + i);
868 tcg_gen_ld_vec(t1, cpu_env, bofs + i);
869 if (load_dest) {
870 tcg_gen_ld_vec(t2, cpu_env, dofs + i);
871 }
872 fni(vece, t2, t0, t1);
873 tcg_gen_st_vec(t2, cpu_env, dofs + i);
874 }
875 tcg_temp_free_vec(t2);
876 tcg_temp_free_vec(t1);
877 tcg_temp_free_vec(t0);
878}
879
880/* Expand OPSZ bytes worth of four-operand operations using host vectors. */
881static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
882 uint32_t bofs, uint32_t cofs, uint32_t oprsz,
883 uint32_t tysz, TCGType type,
884 void (*fni)(unsigned, TCGv_vec, TCGv_vec,
885 TCGv_vec, TCGv_vec))
886{
887 TCGv_vec t0 = tcg_temp_new_vec(type);
888 TCGv_vec t1 = tcg_temp_new_vec(type);
889 TCGv_vec t2 = tcg_temp_new_vec(type);
890 TCGv_vec t3 = tcg_temp_new_vec(type);
891 uint32_t i;
892
893 for (i = 0; i < oprsz; i += tysz) {
894 tcg_gen_ld_vec(t1, cpu_env, aofs + i);
895 tcg_gen_ld_vec(t2, cpu_env, bofs + i);
896 tcg_gen_ld_vec(t3, cpu_env, cofs + i);
897 fni(vece, t0, t1, t2, t3);
898 tcg_gen_st_vec(t0, cpu_env, dofs + i);
899 }
900 tcg_temp_free_vec(t3);
901 tcg_temp_free_vec(t2);
902 tcg_temp_free_vec(t1);
903 tcg_temp_free_vec(t0);
904}
905
906/* Expand a vector two-operand operation. */
907void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
908 uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
909{
910 TCGType type;
911 uint32_t some;
912
913 check_size_align(oprsz, maxsz, dofs | aofs);
914 check_overlap_2(dofs, aofs, maxsz);
915
916 type = 0;
917 if (g->fniv) {
918 type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
919 }
920 switch (type) {
921 case TCG_TYPE_V256:
922 /* Recall that ARM SVE allows vector sizes that are not a
923 * power of 2, but always a multiple of 16. The intent is
924 * that e.g. size == 80 would be expanded with 2x32 + 1x16.
925 */
926 some = QEMU_ALIGN_DOWN(oprsz, 32);
927 expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
928 if (some == oprsz) {
929 break;
930 }
931 dofs += some;
932 aofs += some;
933 oprsz -= some;
934 maxsz -= some;
935 /* fallthru */
936 case TCG_TYPE_V128:
937 expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
938 break;
939 case TCG_TYPE_V64:
940 expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
941 break;
942
943 case 0:
944 if (g->fni8 && check_size_impl(oprsz, 8)) {
945 expand_2_i64(dofs, aofs, oprsz, g->fni8);
946 } else if (g->fni4 && check_size_impl(oprsz, 4)) {
947 expand_2_i32(dofs, aofs, oprsz, g->fni4);
948 } else {
949 assert(g->fno != NULL);
950 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
951 return;
952 }
953 break;
954
955 default:
956 g_assert_not_reached();
957 }
958
959 if (oprsz < maxsz) {
960 expand_clr(dofs + oprsz, maxsz - oprsz);
961 }
962}
963
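/* The expansion above illustrates the pattern shared by the gvec entry
 * points that follow: try the host vector expander (V256 in 32-byte chunks
 * with a V128 tail for SVE-style sizes, then V128, then V64), fall back to
 * the 64-bit integer expander fni8, then the 32-bit expander fni4, and
 * finally to the out-of-line helper fno.  Whatever remains between oprsz
 * and maxsz is cleared with expand_clr.
 */
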
964/* Expand a vector operation with two vectors and an immediate. */
965void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
966 uint32_t maxsz, int64_t c, const GVecGen2i *g)
967{
968 TCGType type;
969 uint32_t some;
970
971 check_size_align(oprsz, maxsz, dofs | aofs);
972 check_overlap_2(dofs, aofs, maxsz);
973
974 type = 0;
975 if (g->fniv) {
976 type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
977 }
978 switch (type) {
979 case TCG_TYPE_V256:
980 /* Recall that ARM SVE allows vector sizes that are not a
981 * power of 2, but always a multiple of 16. The intent is
982 * that e.g. size == 80 would be expanded with 2x32 + 1x16.
983 */
984 some = QEMU_ALIGN_DOWN(oprsz, 32);
985 expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
986 c, g->load_dest, g->fniv);
987 if (some == oprsz) {
988 break;
989 }
990 dofs += some;
991 aofs += some;
992 oprsz -= some;
993 maxsz -= some;
994 /* fallthru */
995 case TCG_TYPE_V128:
996 expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
997 c, g->load_dest, g->fniv);
998 break;
999 case TCG_TYPE_V64:
1000 expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1001 c, g->load_dest, g->fniv);
1002 break;
1003
1004 case 0:
1005 if (g->fni8 && check_size_impl(oprsz, 8)) {
1006 expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
1007 } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1008 expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
1009 } else {
1010 if (g->fno) {
1011 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
1012 } else {
1013 TCGv_i64 tcg_c = tcg_const_i64(c);
1014 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
1015 maxsz, c, g->fnoi);
1016 tcg_temp_free_i64(tcg_c);
1017 }
1018 return;
1019 }
1020 break;
1021
1022 default:
1023 g_assert_not_reached();
1024 }
1025
1026 if (oprsz < maxsz) {
1027 expand_clr(dofs + oprsz, maxsz - oprsz);
1028 }
1029}
1030
1031/* Expand a vector operation with two vectors and a scalar. */
1032void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1033 uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
1034{
1035 TCGType type;
1036
1037 check_size_align(oprsz, maxsz, dofs | aofs);
1038 check_overlap_2(dofs, aofs, maxsz);
1039
1040 type = 0;
1041 if (g->fniv) {
1042 type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
1043 }
1044 if (type != 0) {
1045 TCGv_vec t_vec = tcg_temp_new_vec(type);
1046 uint32_t some;
1047
1048 tcg_gen_dup_i64_vec(g->vece, t_vec, c);
1049
1050 switch (type) {
1051 case TCG_TYPE_V256:
1052 /* Recall that ARM SVE allows vector sizes that are not a
1053 * power of 2, but always a multiple of 16. The intent is
1054 * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1055 */
1056 some = QEMU_ALIGN_DOWN(oprsz, 32);
1057 expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1058 t_vec, g->scalar_first, g->fniv);
1059 if (some == oprsz) {
1060 break;
1061 }
1062 dofs += some;
1063 aofs += some;
1064 oprsz -= some;
1065 maxsz -= some;
1066 /* fallthru */
1067
1068 case TCG_TYPE_V128:
1069 expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1070 t_vec, g->scalar_first, g->fniv);
1071 break;
1072
1073 case TCG_TYPE_V64:
1074 expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1075 t_vec, g->scalar_first, g->fniv);
1076 break;
1077
1078 default:
1079 g_assert_not_reached();
1080 }
1081 tcg_temp_free_vec(t_vec);
1082 } else if (g->fni8 && check_size_impl(oprsz, 8)) {
1083 TCGv_i64 t64 = tcg_temp_new_i64();
1084
1085 gen_dup_i64(g->vece, t64, c);
1086 expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
1087 tcg_temp_free_i64(t64);
1088 } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1089 TCGv_i32 t32 = tcg_temp_new_i32();
1090
1091 tcg_gen_extrl_i64_i32(t32, c);
1092 gen_dup_i32(g->vece, t32, t32);
1093 expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
1094 tcg_temp_free_i32(t32);
1095 } else {
1096 tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
1097 return;
1098 }
1099
1100 if (oprsz < maxsz) {
1101 expand_clr(dofs + oprsz, maxsz - oprsz);
1102 }
1103}
1104
1105/* Expand a vector three-operand operation. */
1106void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1107 uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
1108{
1109 TCGType type;
1110 uint32_t some;
1111
1112 check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1113 check_overlap_3(dofs, aofs, bofs, maxsz);
1114
1115 type = 0;
1116 if (g->fniv) {
1117 type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
1118 }
1119 switch (type) {
1120 case TCG_TYPE_V256:
1121 /* Recall that ARM SVE allows vector sizes that are not a
1122 * power of 2, but always a multiple of 16. The intent is
1123 * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1124 */
1125 some = QEMU_ALIGN_DOWN(oprsz, 32);
1126 expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1127 g->load_dest, g->fniv);
1128 if (some == oprsz) {
1129 break;
1130 }
1131 dofs += some;
1132 aofs += some;
1133 bofs += some;
1134 oprsz -= some;
1135 maxsz -= some;
1136 /* fallthru */
1137 case TCG_TYPE_V128:
1138 expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1139 g->load_dest, g->fniv);
1140 break;
1141 case TCG_TYPE_V64:
1142 expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1143 g->load_dest, g->fniv);
1144 break;
1145
1146 case 0:
1147 if (g->fni8 && check_size_impl(oprsz, 8)) {
1148 expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
1149 } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1150 expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
1151 } else {
1152 assert(g->fno != NULL);
1153 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
1154 maxsz, g->data, g->fno);
1155 return;
1156 }
1157 break;
1158
1159 default:
1160 g_assert_not_reached();
1161 }
1162
1163 if (oprsz < maxsz) {
1164 expand_clr(dofs + oprsz, maxsz - oprsz);
1165 }
1166}
1167
1168/* Expand a vector four-operand operation. */
1169void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1170 uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
1171{
1172 TCGType type;
1173 uint32_t some;
1174
1175 check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1176 check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1177
1178 type = 0;
1179 if (g->fniv) {
1180 type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
1181 }
1182 switch (type) {
1183 case TCG_TYPE_V256:
1184 /* Recall that ARM SVE allows vector sizes that are not a
1185 * power of 2, but always a multiple of 16. The intent is
1186 * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1187 */
1188 some = QEMU_ALIGN_DOWN(oprsz, 32);
1189 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
1190 32, TCG_TYPE_V256, g->fniv);
1191 if (some == oprsz) {
1192 break;
1193 }
1194 dofs += some;
1195 aofs += some;
1196 bofs += some;
1197 cofs += some;
1198 oprsz -= some;
1199 maxsz -= some;
1200 /* fallthru */
1201 case TCG_TYPE_V128:
1202 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1203 16, TCG_TYPE_V128, g->fniv);
1204 break;
1205 case TCG_TYPE_V64:
1206 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1207 8, TCG_TYPE_V64, g->fniv);
1208 break;
1209
1210 case 0:
1211 if (g->fni8 && check_size_impl(oprsz, 8)) {
1212 expand_4_i64(dofs, aofs, bofs, cofs, oprsz, g->fni8);
1213 } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1214 expand_4_i32(dofs, aofs, bofs, cofs, oprsz, g->fni4);
1215 } else {
1216 assert(g->fno != NULL);
1217 tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1218 oprsz, maxsz, g->data, g->fno);
1219 return;
1220 }
1221 break;
1222
1223 default:
1224 g_assert_not_reached();
1225 }
1226
1227 if (oprsz < maxsz) {
1228 expand_clr(dofs + oprsz, maxsz - oprsz);
1229 }
1230}
1231
1232/*
1233 * Expand specific vector operations.
1234 */
1235
1236static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
1237{
1238 tcg_gen_mov_vec(a, b);
1239}
1240
1241void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
1242 uint32_t oprsz, uint32_t maxsz)
1243{
1244 static const GVecGen2 g = {
1245 .fni8 = tcg_gen_mov_i64,
1246 .fniv = vec_mov2,
1247 .fno = gen_helper_gvec_mov,
1248 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1249 };
1250 if (dofs != aofs) {
1251 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1252 } else {
1253 check_size_align(oprsz, maxsz, dofs);
1254 if (oprsz < maxsz) {
1255 expand_clr(dofs + oprsz, maxsz - oprsz);
1256 }
1257 }
1258}
1259
1260void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
1261 uint32_t maxsz, TCGv_i32 in)
1262{
1263 check_size_align(oprsz, maxsz, dofs);
1264 tcg_debug_assert(vece <= MO_32);
1265 do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1266}
1267
1268void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
1269 uint32_t maxsz, TCGv_i64 in)
1270{
1271 check_size_align(oprsz, maxsz, dofs);
1272 tcg_debug_assert(vece <= MO_64);
1273 do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1274}
1275
1276void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
1277 uint32_t oprsz, uint32_t maxsz)
1278{
1279 if (vece <= MO_32) {
1280 TCGv_i32 in = tcg_temp_new_i32();
1281 switch (vece) {
1282 case MO_8:
1283 tcg_gen_ld8u_i32(in, cpu_env, aofs);
1284 break;
1285 case MO_16:
1286 tcg_gen_ld16u_i32(in, cpu_env, aofs);
1287 break;
1288 case MO_32:
1289 tcg_gen_ld_i32(in, cpu_env, aofs);
1290 break;
1291 }
1292 tcg_gen_gvec_dup_i32(vece, dofs, oprsz, maxsz, in);
1293 tcg_temp_free_i32(in);
1294 } else if (vece == MO_64) {
1295 TCGv_i64 in = tcg_temp_new_i64();
1296 tcg_gen_ld_i64(in, cpu_env, aofs);
1297 tcg_gen_gvec_dup_i64(MO_64, dofs, oprsz, maxsz, in);
1298 tcg_temp_free_i64(in);
1299 } else {
1300 /* 128-bit duplicate. */
1301 /* ??? Dup to 256-bit vector. */
1302 int i;
1303
1304 tcg_debug_assert(vece == 4);
1305 tcg_debug_assert(oprsz >= 16);
1306 if (TCG_TARGET_HAS_v128) {
1307 TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
1308
1309 tcg_gen_ld_vec(in, cpu_env, aofs);
1310 for (i = 0; i < oprsz; i += 16) {
1311 tcg_gen_st_vec(in, cpu_env, dofs + i);
1312 }
1313 tcg_temp_free_vec(in);
1314 } else {
1315 TCGv_i64 in0 = tcg_temp_new_i64();
1316 TCGv_i64 in1 = tcg_temp_new_i64();
1317
1318 tcg_gen_ld_i64(in0, cpu_env, aofs);
1319 tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
1320 for (i = 0; i < oprsz; i += 16) {
1321 tcg_gen_st_i64(in0, cpu_env, dofs + i);
1322 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
1323 }
1324 tcg_temp_free_i64(in0);
1325 tcg_temp_free_i64(in1);
1326 }
1327 }
1328}
1329
1330void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
1331 uint32_t maxsz, uint64_t x)
1332{
1333 check_size_align(oprsz, maxsz, dofs);
1334 do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
1335}
1336
1337void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
1338 uint32_t maxsz, uint32_t x)
1339{
1340 check_size_align(oprsz, maxsz, dofs);
1341 do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
1342}
1343
1344void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
1345 uint32_t maxsz, uint16_t x)
1346{
1347 check_size_align(oprsz, maxsz, dofs);
1348 do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
1349}
1350
1351void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
1352 uint32_t maxsz, uint8_t x)
1353{
1354 check_size_align(oprsz, maxsz, dofs);
1355 do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
1356}
1357
1358void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
1359 uint32_t oprsz, uint32_t maxsz)
1360{
1361 static const GVecGen2 g = {
1362 .fni8 = tcg_gen_not_i64,
1363 .fniv = tcg_gen_not_vec,
1364 .fno = gen_helper_gvec_not,
1365 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1366 };
1367 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1368}
1369
1370/* Perform a vector addition using normal addition and a mask. The mask
1371 should be the sign bit of each lane. This 6-operation form is more
1372 efficient than separate additions when there are 4 or more lanes in
1373 the 64-bit operation. */
1374static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1375{
1376 TCGv_i64 t1 = tcg_temp_new_i64();
1377 TCGv_i64 t2 = tcg_temp_new_i64();
1378 TCGv_i64 t3 = tcg_temp_new_i64();
1379
1380 tcg_gen_andc_i64(t1, a, m);
1381 tcg_gen_andc_i64(t2, b, m);
1382 tcg_gen_xor_i64(t3, a, b);
1383 tcg_gen_add_i64(d, t1, t2);
1384 tcg_gen_and_i64(t3, t3, m);
1385 tcg_gen_xor_i64(d, d, t3);
1386
1387 tcg_temp_free_i64(t1);
1388 tcg_temp_free_i64(t2);
1389 tcg_temp_free_i64(t3);
1390}
1391
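/* Why gen_addv_mask works: with M holding only the per-lane sign bits,
 * t1 and t2 are A and B with those bits cleared, so the single 64-bit add
 * can never carry across a lane boundary.  The bit produced at each lane's
 * sign position is then exactly the carry into that position, and the true
 * sign bit of each lane is sign(A) ^ sign(B) ^ carry, which the final xor
 * with (A ^ B) & M restores.
 */
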
1392void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1393{
1394 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1395 gen_addv_mask(d, a, b, m);
1396 tcg_temp_free_i64(m);
1397}
1398
1399void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1400{
1401 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1402 gen_addv_mask(d, a, b, m);
1403 tcg_temp_free_i64(m);
1404}
1405
1406void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1407{
1408 TCGv_i64 t1 = tcg_temp_new_i64();
1409 TCGv_i64 t2 = tcg_temp_new_i64();
1410
1411 tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1412 tcg_gen_add_i64(t2, a, b);
1413 tcg_gen_add_i64(t1, t1, b);
1414 tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1415
1416 tcg_temp_free_i64(t1);
1417 tcg_temp_free_i64(t2);
1418}
1419
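/* In tcg_gen_vec_add32_i64 above, t2 computes the full 64-bit sum, whose
 * low 32 bits are already correct.  t1 is A with its low half cleared, so
 * adding B to it cannot receive a carry from the low lane, and its high 32
 * bits are the correct high-lane sum; the deposit then stitches the two
 * halves together.
 */
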
1420void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
1421 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1422{
1423 static const GVecGen3 g[4] = {
1424 { .fni8 = tcg_gen_vec_add8_i64,
1425 .fniv = tcg_gen_add_vec,
1426 .fno = gen_helper_gvec_add8,
1427 .opc = INDEX_op_add_vec,
1428 .vece = MO_8 },
1429 { .fni8 = tcg_gen_vec_add16_i64,
1430 .fniv = tcg_gen_add_vec,
1431 .fno = gen_helper_gvec_add16,
1432 .opc = INDEX_op_add_vec,
1433 .vece = MO_16 },
1434 { .fni4 = tcg_gen_add_i32,
1435 .fniv = tcg_gen_add_vec,
1436 .fno = gen_helper_gvec_add32,
1437 .opc = INDEX_op_add_vec,
1438 .vece = MO_32 },
1439 { .fni8 = tcg_gen_add_i64,
1440 .fniv = tcg_gen_add_vec,
1441 .fno = gen_helper_gvec_add64,
1442 .opc = INDEX_op_add_vec,
1443 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1444 .vece = MO_64 },
1445 };
1446
1447 tcg_debug_assert(vece <= MO_64);
1448 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1449}
1450
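/* Illustrative use (not from this file): a target front end that keeps its
 * guest vector registers in CPUArchState could emit a 128-bit add of
 * 32-bit lanes with
 *
 *     tcg_gen_gvec_add(MO_32, dofs, aofs, bofs, 16, 16);
 *
 * where dofs/aofs/bofs are the cpu_env-relative offsets of the destination
 * and source registers (hypothetical names here).
 */
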
1451void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1452 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1453{
1454 static const GVecGen2s g[4] = {
1455 { .fni8 = tcg_gen_vec_add8_i64,
1456 .fniv = tcg_gen_add_vec,
1457 .fno = gen_helper_gvec_adds8,
1458 .opc = INDEX_op_add_vec,
1459 .vece = MO_8 },
1460 { .fni8 = tcg_gen_vec_add16_i64,
1461 .fniv = tcg_gen_add_vec,
1462 .fno = gen_helper_gvec_adds16,
1463 .opc = INDEX_op_add_vec,
1464 .vece = MO_16 },
1465 { .fni4 = tcg_gen_add_i32,
1466 .fniv = tcg_gen_add_vec,
1467 .fno = gen_helper_gvec_adds32,
1468 .opc = INDEX_op_add_vec,
1469 .vece = MO_32 },
1470 { .fni8 = tcg_gen_add_i64,
1471 .fniv = tcg_gen_add_vec,
1472 .fno = gen_helper_gvec_adds64,
1473 .opc = INDEX_op_add_vec,
1474 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1475 .vece = MO_64 },
1476 };
1477
1478 tcg_debug_assert(vece <= MO_64);
1479 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1480}
1481
1482void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
1483 int64_t c, uint32_t oprsz, uint32_t maxsz)
1484{
1485 TCGv_i64 tmp = tcg_const_i64(c);
1486 tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
1487 tcg_temp_free_i64(tmp);
1488}
1489
1490void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
1491 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1492{
1493 static const GVecGen2s g[4] = {
1494 { .fni8 = tcg_gen_vec_sub8_i64,
1495 .fniv = tcg_gen_sub_vec,
1496 .fno = gen_helper_gvec_subs8,
1497 .opc = INDEX_op_sub_vec,
1498 .vece = MO_8 },
1499 { .fni8 = tcg_gen_vec_sub16_i64,
1500 .fniv = tcg_gen_sub_vec,
1501 .fno = gen_helper_gvec_subs16,
1502 .opc = INDEX_op_sub_vec,
1503 .vece = MO_16 },
1504 { .fni4 = tcg_gen_sub_i32,
1505 .fniv = tcg_gen_sub_vec,
1506 .fno = gen_helper_gvec_subs32,
1507 .opc = INDEX_op_sub_vec,
1508 .vece = MO_32 },
1509 { .fni8 = tcg_gen_sub_i64,
1510 .fniv = tcg_gen_sub_vec,
1511 .fno = gen_helper_gvec_subs64,
1512 .opc = INDEX_op_sub_vec,
1513 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1514 .vece = MO_64 },
1515 };
1516
1517 tcg_debug_assert(vece <= MO_64);
1518 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1519}
1520
1521/* Perform a vector subtraction using normal subtraction and a mask.
1522 Compare gen_addv_mask above. */
1523static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1524{
1525 TCGv_i64 t1 = tcg_temp_new_i64();
1526 TCGv_i64 t2 = tcg_temp_new_i64();
1527 TCGv_i64 t3 = tcg_temp_new_i64();
1528
1529 tcg_gen_or_i64(t1, a, m);
1530 tcg_gen_andc_i64(t2, b, m);
1531 tcg_gen_eqv_i64(t3, a, b);
1532 tcg_gen_sub_i64(d, t1, t2);
1533 tcg_gen_and_i64(t3, t3, m);
1534 tcg_gen_xor_i64(d, d, t3);
1535
1536 tcg_temp_free_i64(t1);
1537 tcg_temp_free_i64(t2);
1538 tcg_temp_free_i64(t3);
1539}
1540
1541void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1542{
1543 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1544 gen_subv_mask(d, a, b, m);
1545 tcg_temp_free_i64(m);
1546}
1547
1548void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1549{
1550 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1551 gen_subv_mask(d, a, b, m);
1552 tcg_temp_free_i64(m);
1553}
1554
1555void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1556{
1557 TCGv_i64 t1 = tcg_temp_new_i64();
1558 TCGv_i64 t2 = tcg_temp_new_i64();
1559
1560 tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1561 tcg_gen_sub_i64(t2, a, b);
1562 tcg_gen_sub_i64(t1, a, t1);
1563 tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1564
1565 tcg_temp_free_i64(t1);
1566 tcg_temp_free_i64(t2);
1567}
1568
1569void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
1570 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1571{
1572 static const GVecGen3 g[4] = {
1573 { .fni8 = tcg_gen_vec_sub8_i64,
1574 .fniv = tcg_gen_sub_vec,
1575 .fno = gen_helper_gvec_sub8,
1576 .opc = INDEX_op_sub_vec,
1577 .vece = MO_8 },
1578 { .fni8 = tcg_gen_vec_sub16_i64,
1579 .fniv = tcg_gen_sub_vec,
1580 .fno = gen_helper_gvec_sub16,
1581 .opc = INDEX_op_sub_vec,
1582 .vece = MO_16 },
1583 { .fni4 = tcg_gen_sub_i32,
1584 .fniv = tcg_gen_sub_vec,
1585 .fno = gen_helper_gvec_sub32,
1586 .opc = INDEX_op_sub_vec,
1587 .vece = MO_32 },
1588 { .fni8 = tcg_gen_sub_i64,
1589 .fniv = tcg_gen_sub_vec,
1590 .fno = gen_helper_gvec_sub64,
1591 .opc = INDEX_op_sub_vec,
1592 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1593 .vece = MO_64 },
1594 };
1595
1596 tcg_debug_assert(vece <= MO_64);
1597 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1598}
1599
1600void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
1601 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1602{
1603 static const GVecGen3 g[4] = {
1604 { .fniv = tcg_gen_mul_vec,
1605 .fno = gen_helper_gvec_mul8,
1606 .opc = INDEX_op_mul_vec,
1607 .vece = MO_8 },
1608 { .fniv = tcg_gen_mul_vec,
1609 .fno = gen_helper_gvec_mul16,
1610 .opc = INDEX_op_mul_vec,
1611 .vece = MO_16 },
1612 { .fni4 = tcg_gen_mul_i32,
1613 .fniv = tcg_gen_mul_vec,
1614 .fno = gen_helper_gvec_mul32,
1615 .opc = INDEX_op_mul_vec,
1616 .vece = MO_32 },
1617 { .fni8 = tcg_gen_mul_i64,
1618 .fniv = tcg_gen_mul_vec,
1619 .fno = gen_helper_gvec_mul64,
1620 .opc = INDEX_op_mul_vec,
1621 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1622 .vece = MO_64 },
1623 };
1624
1625 tcg_debug_assert(vece <= MO_64);
1626 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1627}
1628
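/* Note that the MO_8 and MO_16 entries above have no .fni8/.fni4 expander:
 * there is no cheap way to multiply packed 8- or 16-bit lanes with plain
 * integer ops, so those element sizes use the out-of-line helpers unless
 * the backend supports INDEX_op_mul_vec at that width.
 */
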
1629void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
1630 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1631{
1632 static const GVecGen2s g[4] = {
1633 { .fniv = tcg_gen_mul_vec,
1634 .fno = gen_helper_gvec_muls8,
1635 .opc = INDEX_op_mul_vec,
1636 .vece = MO_8 },
1637 { .fniv = tcg_gen_mul_vec,
1638 .fno = gen_helper_gvec_muls16,
1639 .opc = INDEX_op_mul_vec,
1640 .vece = MO_16 },
1641 { .fni4 = tcg_gen_mul_i32,
1642 .fniv = tcg_gen_mul_vec,
1643 .fno = gen_helper_gvec_muls32,
1644 .opc = INDEX_op_mul_vec,
1645 .vece = MO_32 },
1646 { .fni8 = tcg_gen_mul_i64,
1647 .fniv = tcg_gen_mul_vec,
1648 .fno = gen_helper_gvec_muls64,
1649 .opc = INDEX_op_mul_vec,
1650 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1651 .vece = MO_64 },
1652 };
1653
1654 tcg_debug_assert(vece <= MO_64);
1655 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1656}
1657
1658void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
1659 int64_t c, uint32_t oprsz, uint32_t maxsz)
1660{
1661 TCGv_i64 tmp = tcg_const_i64(c);
1662 tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
1663 tcg_temp_free_i64(tmp);
1664}
1665
1666void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
1667 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1668{
1669 static const GVecGen3 g[4] = {
1670 { .fno = gen_helper_gvec_ssadd8, .vece = MO_8 },
1671 { .fno = gen_helper_gvec_ssadd16, .vece = MO_16 },
1672 { .fno = gen_helper_gvec_ssadd32, .vece = MO_32 },
1673 { .fno = gen_helper_gvec_ssadd64, .vece = MO_64 }
1674 };
1675 tcg_debug_assert(vece <= MO_64);
1676 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1677}
1678
1679void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
1680 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1681{
1682 static const GVecGen3 g[4] = {
1683 { .fno = gen_helper_gvec_sssub8, .vece = MO_8 },
1684 { .fno = gen_helper_gvec_sssub16, .vece = MO_16 },
1685 { .fno = gen_helper_gvec_sssub32, .vece = MO_32 },
1686 { .fno = gen_helper_gvec_sssub64, .vece = MO_64 }
1687 };
1688 tcg_debug_assert(vece <= MO_64);
1689 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1690}
1691
1692static void tcg_gen_vec_usadd32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1693{
1694 TCGv_i32 max = tcg_const_i32(-1);
1695 tcg_gen_add_i32(d, a, b);
1696 tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
1697 tcg_temp_free_i32(max);
1698}
1699
1700static void tcg_gen_vec_usadd32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1701{
1702 TCGv_i64 max = tcg_const_i64(-1);
1703 tcg_gen_add_i64(d, a, b);
1704 tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
1705 tcg_temp_free_i64(max);
1706}
1707
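/* The two helpers above implement unsigned saturating addition with a
 * single add plus a movcond: after d = a + b, an unsigned overflow has
 * occurred exactly when the result is smaller than an operand (d < a),
 * in which case the result is clamped to all-ones.
 */
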
1708void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
1709 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1710{
1711 static const GVecGen3 g[4] = {
1712 { .fno = gen_helper_gvec_usadd8, .vece = MO_8 },
1713 { .fno = gen_helper_gvec_usadd16, .vece = MO_16 },
1714 { .fni4 = tcg_gen_vec_usadd32_i32,
1715 .fno = gen_helper_gvec_usadd32,
1716 .vece = MO_32 },
1717 { .fni8 = tcg_gen_vec_usadd32_i64,
1718 .fno = gen_helper_gvec_usadd64,
1719 .vece = MO_64 }
1720 };
1721 tcg_debug_assert(vece <= MO_64);
1722 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1723}
1724
1725static void tcg_gen_vec_ussub32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1726{
1727 TCGv_i32 min = tcg_const_i32(0);
1728 tcg_gen_sub_i32(d, a, b);
1729 tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
1730 tcg_temp_free_i32(min);
1731}
1732
1733static void tcg_gen_vec_ussub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1734{
1735 TCGv_i64 min = tcg_const_i64(0);
1736 tcg_gen_sub_i64(d, a, b);
1737 tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
1738 tcg_temp_free_i64(min);
1739}
1740
1741void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
1742 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1743{
1744 static const GVecGen3 g[4] = {
1745 { .fno = gen_helper_gvec_ussub8, .vece = MO_8 },
1746 { .fno = gen_helper_gvec_ussub16, .vece = MO_16 },
1747 { .fni4 = tcg_gen_vec_ussub32_i32,
1748 .fno = gen_helper_gvec_ussub32,
1749 .vece = MO_32 },
1750 { .fni8 = tcg_gen_vec_ussub32_i64,
1751 .fno = gen_helper_gvec_ussub64,
1752 .vece = MO_64 }
1753 };
1754 tcg_debug_assert(vece <= MO_64);
1755 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1756}
1757
1758/* Perform a vector negation using normal negation and a mask.
1759 Compare gen_subv_mask above. */
1760static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
1761{
1762 TCGv_i64 t2 = tcg_temp_new_i64();
1763 TCGv_i64 t3 = tcg_temp_new_i64();
1764
1765 tcg_gen_andc_i64(t3, m, b);
1766 tcg_gen_andc_i64(t2, b, m);
1767 tcg_gen_sub_i64(d, m, t2);
1768 tcg_gen_xor_i64(d, d, t3);
1769
1770 tcg_temp_free_i64(t2);
1771 tcg_temp_free_i64(t3);
1772}
1773
1774void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
1775{
1776 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1777 gen_negv_mask(d, b, m);
1778 tcg_temp_free_i64(m);
1779}
1780
1781void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
1782{
1783 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1784 gen_negv_mask(d, b, m);
1785 tcg_temp_free_i64(m);
1786}
1787
1788void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
1789{
1790 TCGv_i64 t1 = tcg_temp_new_i64();
1791 TCGv_i64 t2 = tcg_temp_new_i64();
1792
1793 tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1794 tcg_gen_neg_i64(t2, b);
1795 tcg_gen_neg_i64(t1, t1);
1796 tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1797
1798 tcg_temp_free_i64(t1);
1799 tcg_temp_free_i64(t2);
1800}
1801
1802void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
1803 uint32_t oprsz, uint32_t maxsz)
1804{
1805 static const GVecGen2 g[4] = {
1806 { .fni8 = tcg_gen_vec_neg8_i64,
1807 .fniv = tcg_gen_neg_vec,
1808 .fno = gen_helper_gvec_neg8,
1809 .opc = INDEX_op_neg_vec,
1810 .vece = MO_8 },
1811 { .fni8 = tcg_gen_vec_neg16_i64,
1812 .fniv = tcg_gen_neg_vec,
1813 .fno = gen_helper_gvec_neg16,
1814 .opc = INDEX_op_neg_vec,
1815 .vece = MO_16 },
1816 { .fni4 = tcg_gen_neg_i32,
1817 .fniv = tcg_gen_neg_vec,
1818 .fno = gen_helper_gvec_neg32,
1819 .opc = INDEX_op_neg_vec,
1820 .vece = MO_32 },
1821 { .fni8 = tcg_gen_neg_i64,
1822 .fniv = tcg_gen_neg_vec,
1823 .fno = gen_helper_gvec_neg64,
1824 .opc = INDEX_op_neg_vec,
1825 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1826 .vece = MO_64 },
1827 };
1828
1829 tcg_debug_assert(vece <= MO_64);
1830 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
1831}
1832
1833void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
1834 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1835{
1836 static const GVecGen3 g = {
1837 .fni8 = tcg_gen_and_i64,
1838 .fniv = tcg_gen_and_vec,
1839 .fno = gen_helper_gvec_and,
1840 .opc = INDEX_op_and_vec,
1841 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1842 };
1843 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
1844}
1845
1846void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
1847 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1848{
1849 static const GVecGen3 g = {
1850 .fni8 = tcg_gen_or_i64,
1851 .fniv = tcg_gen_or_vec,
1852 .fno = gen_helper_gvec_or,
1853 .opc = INDEX_op_or_vec,
1854 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1855 };
1856 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
1857}
1858
1859void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
1860 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1861{
1862 static const GVecGen3 g = {
1863 .fni8 = tcg_gen_xor_i64,
1864 .fniv = tcg_gen_xor_vec,
1865 .fno = gen_helper_gvec_xor,
1866 .opc = INDEX_op_xor_vec,
1867 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1868 };
1869 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
1870}
1871
1872void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
1873 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1874{
1875 static const GVecGen3 g = {
1876 .fni8 = tcg_gen_andc_i64,
1877 .fniv = tcg_gen_andc_vec,
1878 .fno = gen_helper_gvec_andc,
1879 .opc = INDEX_op_andc_vec,
1880 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1881 };
1882 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
1883}
1884
1885void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
1886 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1887{
1888 static const GVecGen3 g = {
1889 .fni8 = tcg_gen_orc_i64,
1890 .fniv = tcg_gen_orc_vec,
1891 .fno = gen_helper_gvec_orc,
1892 .opc = INDEX_op_orc_vec,
1893 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1894 };
1895 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
1896}
1897
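/* Vector-by-scalar forms.  The i64 operand of GVecGen2s holds the
   scalar already replicated to the element size: gen_dup_i64 does the
   replication at run time for the *_s entry points, and dup_const does
   it at translation time for the *_i variants below. */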
1898static const GVecGen2s gop_ands = {
1899 .fni8 = tcg_gen_and_i64,
1900 .fniv = tcg_gen_and_vec,
1901 .fno = gen_helper_gvec_ands,
1902 .opc = INDEX_op_and_vec,
1903 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1904 .vece = MO_64
1905};
1906
1907void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
1908 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1909{
1910 TCGv_i64 tmp = tcg_temp_new_i64();
1911 gen_dup_i64(vece, tmp, c);
1912 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
1913 tcg_temp_free_i64(tmp);
1914}
1915
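/* For example, tcg_gen_gvec_andi(MO_8, dofs, aofs, 0x7f, oprsz, maxsz)
   builds dup_const(MO_8, 0x7f) == 0x7f7f7f7f7f7f7f7full and so clears
   the top bit of every byte of the vector. */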
1916void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
1917 int64_t c, uint32_t oprsz, uint32_t maxsz)
1918{
1919 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
1920 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
1921 tcg_temp_free_i64(tmp);
1922}
1923
1924static const GVecGen2s gop_xors = {
1925 .fni8 = tcg_gen_xor_i64,
1926 .fniv = tcg_gen_xor_vec,
1927 .fno = gen_helper_gvec_xors,
1928 .opc = INDEX_op_xor_vec,
1929 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1930 .vece = MO_64
1931};
1932
1933void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
1934 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1935{
1936 TCGv_i64 tmp = tcg_temp_new_i64();
1937 gen_dup_i64(vece, tmp, c);
1938 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
1939 tcg_temp_free_i64(tmp);
1940}
1941
1942void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
1943 int64_t c, uint32_t oprsz, uint32_t maxsz)
1944{
1945 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
1946 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
1947 tcg_temp_free_i64(tmp);
1948}
1949
1950static const GVecGen2s gop_ors = {
1951 .fni8 = tcg_gen_or_i64,
1952 .fniv = tcg_gen_or_vec,
1953 .fno = gen_helper_gvec_ors,
1954 .opc = INDEX_op_or_vec,
1955 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1956 .vece = MO_64
1957};
1958
1959void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
1960 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1961{
1962 TCGv_i64 tmp = tcg_temp_new_i64();
1963 gen_dup_i64(vece, tmp, c);
1964 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
1965 tcg_temp_free_i64(tmp);
1966}
1967
1968void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
1969 int64_t c, uint32_t oprsz, uint32_t maxsz)
1970{
1971 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
1972 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
1973 tcg_temp_free_i64(tmp);
1974}
1975
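/* Immediate shifts on sub-word lanes: shift the whole 64-bit value,
   then mask away the bits that crossed a lane boundary.  The mask is
   what shifting an all-ones lane by the same count would leave. */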
1976void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
1977{
1978 uint64_t mask = dup_const(MO_8, 0xff << c);
1979 tcg_gen_shli_i64(d, a, c);
1980 tcg_gen_andi_i64(d, d, mask);
1981}
1982
1983void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
1984{
1985 uint64_t mask = dup_const(MO_16, 0xffff << c);
1986 tcg_gen_shli_i64(d, a, c);
1987 tcg_gen_andi_i64(d, d, mask);
1988}
1989
1990void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
1991 int64_t shift, uint32_t oprsz, uint32_t maxsz)
1992{
1993 static const GVecGen2i g[4] = {
1994 { .fni8 = tcg_gen_vec_shl8i_i64,
1995 .fniv = tcg_gen_shli_vec,
1996 .fno = gen_helper_gvec_shl8i,
1997 .opc = INDEX_op_shli_vec,
1998 .vece = MO_8 },
1999 { .fni8 = tcg_gen_vec_shl16i_i64,
2000 .fniv = tcg_gen_shli_vec,
2001 .fno = gen_helper_gvec_shl16i,
2002 .opc = INDEX_op_shli_vec,
2003 .vece = MO_16 },
2004 { .fni4 = tcg_gen_shli_i32,
2005 .fniv = tcg_gen_shli_vec,
2006 .fno = gen_helper_gvec_shl32i,
2007 .opc = INDEX_op_shli_vec,
2008 .vece = MO_32 },
2009 { .fni8 = tcg_gen_shli_i64,
2010 .fniv = tcg_gen_shli_vec,
2011 .fno = gen_helper_gvec_shl64i,
2012 .opc = INDEX_op_shli_vec,
2013 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2014 .vece = MO_64 },
2015 };
2016
2017 tcg_debug_assert(vece <= MO_64);
2018 tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2019 if (shift == 0) {
2020 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2021 } else {
2022 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2023 }
2024}
2025
2026void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2027{
2028 uint64_t mask = dup_const(MO_8, 0xff >> c);
2029 tcg_gen_shri_i64(d, a, c);
2030 tcg_gen_andi_i64(d, d, mask);
2031}
2032
2033void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2034{
2035 uint64_t mask = dup_const(MO_16, 0xffff >> c);
2036 tcg_gen_shri_i64(d, a, c);
2037 tcg_gen_andi_i64(d, d, mask);
2038}
2039
2040void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2041 int64_t shift, uint32_t oprsz, uint32_t maxsz)
2042{
2043 static const GVecGen2i g[4] = {
2044 { .fni8 = tcg_gen_vec_shr8i_i64,
2045 .fniv = tcg_gen_shri_vec,
2046 .fno = gen_helper_gvec_shr8i,
2047 .opc = INDEX_op_shri_vec,
2048 .vece = MO_8 },
2049 { .fni8 = tcg_gen_vec_shr16i_i64,
2050 .fniv = tcg_gen_shri_vec,
2051 .fno = gen_helper_gvec_shr16i,
2052 .opc = INDEX_op_shri_vec,
2053 .vece = MO_16 },
2054 { .fni4 = tcg_gen_shri_i32,
2055 .fniv = tcg_gen_shri_vec,
2056 .fno = gen_helper_gvec_shr32i,
2057 .opc = INDEX_op_shri_vec,
2058 .vece = MO_32 },
2059 { .fni8 = tcg_gen_shri_i64,
2060 .fniv = tcg_gen_shri_vec,
2061 .fno = gen_helper_gvec_shr64i,
2062 .opc = INDEX_op_shri_vec,
2063 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2064 .vece = MO_64 },
2065 };
2066
2067 tcg_debug_assert(vece <= MO_64);
2068 tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2069 if (shift == 0) {
2070 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2071 } else {
2072 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2073 }
2074}
2075
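/* Arithmetic right shift on sub-word lanes: do a logical shift of the
   whole word, isolate each lane's shifted sign bit, and multiply by
   (2 << c) - 2 to smear it across the vacated high bits before merging.
   E.g. for MO_8 with c == 3, the sign lands at bit 4 (s_mask 0x10) and
   0x10 * 14 == 0xe0 fills bits 7:5 with copies of the sign. */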
2076void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2077{
2078 uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2079 uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2080 TCGv_i64 s = tcg_temp_new_i64();
2081
2082 tcg_gen_shri_i64(d, a, c);
2083 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */
2084 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2085 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */
2086 tcg_gen_or_i64(d, d, s); /* include sign extension */
2087 tcg_temp_free_i64(s);
2088}
2089
2090void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2091{
2092 uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2093 uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2094 TCGv_i64 s = tcg_temp_new_i64();
2095
2096 tcg_gen_shri_i64(d, a, c);
2097 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */
2098 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */
2099 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2100 tcg_gen_or_i64(d, d, s); /* include sign extension */
2101 tcg_temp_free_i64(s);
2102}
2103
2104void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
2105 int64_t shift, uint32_t oprsz, uint32_t maxsz)
2106{
2107 static const GVecGen2i g[4] = {
2108 { .fni8 = tcg_gen_vec_sar8i_i64,
2109 .fniv = tcg_gen_sari_vec,
2110 .fno = gen_helper_gvec_sar8i,
2111 .opc = INDEX_op_sari_vec,
2112 .vece = MO_8 },
2113 { .fni8 = tcg_gen_vec_sar16i_i64,
2114 .fniv = tcg_gen_sari_vec,
2115 .fno = gen_helper_gvec_sar16i,
2116 .opc = INDEX_op_sari_vec,
2117 .vece = MO_16 },
2118 { .fni4 = tcg_gen_sari_i32,
2119 .fniv = tcg_gen_sari_vec,
2120 .fno = gen_helper_gvec_sar32i,
2121 .opc = INDEX_op_sari_vec,
2122 .vece = MO_32 },
2123 { .fni8 = tcg_gen_sari_i64,
2124 .fniv = tcg_gen_sari_vec,
2125 .fno = gen_helper_gvec_sar64i,
2126 .opc = INDEX_op_sari_vec,
2127 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2128 .vece = MO_64 },
2129 };
2130
2131 tcg_debug_assert(vece <= MO_64);
2132 tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2133 if (shift == 0) {
2134 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2135 } else {
2136 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2137 }
2138}
2139
2140/* Expand OPRSZ bytes worth of three-operand comparisons using i32 elements. */
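/* setcond_i32 produces 0 or 1; negating that yields the all-zeros or
   all-ones element that gvec comparisons are defined to produce. */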
2141static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
2142 uint32_t oprsz, TCGCond cond)
2143{
2144 TCGv_i32 t0 = tcg_temp_new_i32();
2145 TCGv_i32 t1 = tcg_temp_new_i32();
2146 uint32_t i;
2147
2148 for (i = 0; i < oprsz; i += 4) {
2149 tcg_gen_ld_i32(t0, cpu_env, aofs + i);
2150 tcg_gen_ld_i32(t1, cpu_env, bofs + i);
2151 tcg_gen_setcond_i32(cond, t0, t0, t1);
2152 tcg_gen_neg_i32(t0, t0);
2153 tcg_gen_st_i32(t0, cpu_env, dofs + i);
2154 }
2155 tcg_temp_free_i32(t1);
2156 tcg_temp_free_i32(t0);
2157}
2158
2159static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
2160 uint32_t oprsz, TCGCond cond)
2161{
2162 TCGv_i64 t0 = tcg_temp_new_i64();
2163 TCGv_i64 t1 = tcg_temp_new_i64();
2164 uint32_t i;
2165
2166 for (i = 0; i < oprsz; i += 8) {
2167 tcg_gen_ld_i64(t0, cpu_env, aofs + i);
2168 tcg_gen_ld_i64(t1, cpu_env, bofs + i);
2169 tcg_gen_setcond_i64(cond, t0, t0, t1);
2170 tcg_gen_neg_i64(t0, t0);
2171 tcg_gen_st_i64(t0, cpu_env, dofs + i);
2172 }
2173 tcg_temp_free_i64(t1);
2174 tcg_temp_free_i64(t0);
2175}
2176
2177static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
2178 uint32_t bofs, uint32_t oprsz, uint32_t tysz,
2179 TCGType type, TCGCond cond)
2180{
2181 TCGv_vec t0 = tcg_temp_new_vec(type);
2182 TCGv_vec t1 = tcg_temp_new_vec(type);
2183 uint32_t i;
2184
2185 for (i = 0; i < oprsz; i += tysz) {
2186 tcg_gen_ld_vec(t0, cpu_env, aofs + i);
2187 tcg_gen_ld_vec(t1, cpu_env, bofs + i);
2188 tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
2189 tcg_gen_st_vec(t0, cpu_env, dofs + i);
2190 }
2191 tcg_temp_free_vec(t1);
2192 tcg_temp_free_vec(t0);
2193}
2194
2195void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
2196 uint32_t aofs, uint32_t bofs,
2197 uint32_t oprsz, uint32_t maxsz)
2198{
2199 static gen_helper_gvec_3 * const eq_fn[4] = {
2200 gen_helper_gvec_eq8, gen_helper_gvec_eq16,
2201 gen_helper_gvec_eq32, gen_helper_gvec_eq64
2202 };
2203 static gen_helper_gvec_3 * const ne_fn[4] = {
2204 gen_helper_gvec_ne8, gen_helper_gvec_ne16,
2205 gen_helper_gvec_ne32, gen_helper_gvec_ne64
2206 };
2207 static gen_helper_gvec_3 * const lt_fn[4] = {
2208 gen_helper_gvec_lt8, gen_helper_gvec_lt16,
2209 gen_helper_gvec_lt32, gen_helper_gvec_lt64
2210 };
2211 static gen_helper_gvec_3 * const le_fn[4] = {
2212 gen_helper_gvec_le8, gen_helper_gvec_le16,
2213 gen_helper_gvec_le32, gen_helper_gvec_le64
2214 };
2215 static gen_helper_gvec_3 * const ltu_fn[4] = {
2216 gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
2217 gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
2218 };
2219 static gen_helper_gvec_3 * const leu_fn[4] = {
2220 gen_helper_gvec_leu8, gen_helper_gvec_leu16,
2221 gen_helper_gvec_leu32, gen_helper_gvec_leu64
2222 };
2223 static gen_helper_gvec_3 * const * const fns[16] = {
2224 [TCG_COND_EQ] = eq_fn,
2225 [TCG_COND_NE] = ne_fn,
2226 [TCG_COND_LT] = lt_fn,
2227 [TCG_COND_LE] = le_fn,
2228 [TCG_COND_LTU] = ltu_fn,
2229 [TCG_COND_LEU] = leu_fn,
2230 };
2231 TCGType type;
2232 uint32_t some;
2233
2234 check_size_align(oprsz, maxsz, dofs | aofs | bofs);
2235 check_overlap_3(dofs, aofs, bofs, maxsz);
2236
2237 if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
2238 do_dup(MO_8, dofs, oprsz, maxsz,
2239 NULL, NULL, -(cond == TCG_COND_ALWAYS));
2240 return;
2241 }
2242
2243 /* Implement inline with a vector type, if possible.
2244 * Prefer integer when 64-bit host and 64-bit comparison.
2245 */
2246 type = choose_vector_type(INDEX_op_cmp_vec, vece, oprsz,
2247 TCG_TARGET_REG_BITS == 64 && vece == MO_64);
2248 switch (type) {
2249 case TCG_TYPE_V256:
2250 /* Recall that ARM SVE allows vector sizes that are not a
2251 * power of 2, but always a multiple of 16. The intent is
2252 * that e.g. size == 80 would be expanded with 2x32 + 1x16.
2253 */
2254 some = QEMU_ALIGN_DOWN(oprsz, 32);
2255 expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
2256 if (some == oprsz) {
2257 break;
2258 }
2259 dofs += some;
2260 aofs += some;
2261 bofs += some;
2262 oprsz -= some;
2263 maxsz -= some;
2264 /* fallthru */
2265 case TCG_TYPE_V128:
2266 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
2267 break;
2268 case TCG_TYPE_V64:
2269 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
2270 break;
2271
2272 case 0:
2273 if (vece == MO_64 && check_size_impl(oprsz, 8)) {
2274 expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
2275 } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
2276 expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
2277 } else {
2278 gen_helper_gvec_3 * const *fn = fns[cond];
2279
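/* fns[] only has entries for EQ/NE/LT/LE/LTU/LEU; for the remaining
   conditions swap the operands and the condition to reach one of
   those entries. */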
2280 if (fn == NULL) {
2281 uint32_t tmp;
2282 tmp = aofs, aofs = bofs, bofs = tmp;
2283 cond = tcg_swap_cond(cond);
2284 fn = fns[cond];
2285 assert(fn != NULL);
2286 }
2287 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
2288 return;
2289 }
2290 break;
2291
2292 default:
2293 g_assert_not_reached();
2294 }
2295
2296 if (oprsz < maxsz) {
2297 expand_clr(dofs + oprsz, maxsz - oprsz);
2298 }
2299}
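/* A minimal usage sketch, not part of this file: byte-wise equality of
   two 16-byte vectors, writing 0xff into equal lanes and 0x00 into the
   rest.  The env offsets and the size are hypothetical. */
static void gen_example_cmeq8(uint32_t dofs, uint32_t aofs, uint32_t bofs)
{
    tcg_gen_gvec_cmp(TCG_COND_EQ, MO_8, dofs, aofs, bofs, 16, 16);
}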