1 /* Subroutines used for code generation for RISC-V 'V' Extension for
2 GNU compiler.
3 Copyright (C) 2022-2024 Free Software Foundation, Inc.
4 Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 GCC is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #define IN_TARGET_CODE 1
23
24 /* We have a maximum of 11 operands for RVV instruction patterns according to
25 vector.md. */
26 #define RVV_INSN_OPERANDS_MAX 11
27
28 #include "config.h"
29 #include "system.h"
30 #include "coretypes.h"
31 #include "tm.h"
32 #include "backend.h"
33 #include "rtl.h"
34 #include "insn-config.h"
35 #include "insn-attr.h"
36 #include "recog.h"
37 #include "alias.h"
38 #include "tree.h"
39 #include "stringpool.h"
40 #include "attribs.h"
41 #include "explow.h"
42 #include "memmodel.h"
43 #include "emit-rtl.h"
44 #include "tm_p.h"
45 #include "target.h"
46 #include "targhooks.h"
47 #include "expr.h"
48 #include "optabs.h"
49 #include "tm-constrs.h"
50 #include "rtx-vector-builder.h"
51 #include "targhooks.h"
52 #include "predict.h"
53
54 using namespace riscv_vector;
55
56 namespace riscv_vector {
57
58 /* Return true if NUNITS <= 31 so that we can use an immediate AVL in vsetivli. */
59 bool
60 imm_avl_p (machine_mode mode)
61 {
62 poly_uint64 nunits = GET_MODE_NUNITS (mode);
63
64 return nunits.is_constant ()
65 /* The vsetivli can only encode an immediate AVL in the range 0~31. */
66 ? (IN_RANGE (nunits.to_constant (), 0, 31))
67 /* Only allowed in VLS-VLMAX mode. */
68 : false;
69 }
70
71 /* Return true if LEN is equal to NUNITS, which may be outside the range [0, 31]. */
72 static bool
73 is_vlmax_len_p (machine_mode mode, rtx len)
74 {
75 poly_int64 value;
76 return poly_int_rtx_p (len, &value)
77 && known_eq (value, GET_MODE_NUNITS (mode));
78 }
79
80 /* Helper functions for insn_flags and insn_types. */
81
82 /* Return true if the caller needs to pass a mask operand for an insn pattern
83 with INSN_FLAGS. */
84
85 static bool
86 need_mask_operand_p (unsigned insn_flags)
87 {
88 return (insn_flags & HAS_MASK_P)
89 && !(insn_flags & (USE_ONE_TRUE_MASK_P | USE_ALL_TRUES_MASK_P));
90 }
91
92 template <int MAX_OPERANDS> class insn_expander
93 {
94 public:
95 insn_expander () = delete;
96
97 insn_expander (unsigned insn_flags, bool vlmax_p)
98 : m_insn_flags (insn_flags), m_opno (0), m_vlmax_p (vlmax_p),
99 m_vl_op (NULL_RTX)
100 {
101 check_insn_flags ();
102 }
103
104 void check_insn_flags () const
105 {
106 if (m_insn_flags & USE_ONE_TRUE_MASK_P)
107 /* USE_ONE_TRUE_MASK_P is dependent on HAS_MASK_P. */
108 gcc_assert ((m_insn_flags & HAS_MASK_P));
109
110 if (m_insn_flags & USE_ALL_TRUES_MASK_P)
111 /* USE_ALL_TRUES_MASK_P is dependent on HAS_MASK_P. */
112 gcc_assert ((m_insn_flags & HAS_MASK_P));
113
114 /* USE_ONE_TRUE_MASK_P and USE_ALL_TRUES_MASK_P are mutually exclusive. */
115 gcc_assert (!((m_insn_flags & USE_ONE_TRUE_MASK_P)
116 && (m_insn_flags & USE_ALL_TRUES_MASK_P)));
117
118 if (m_insn_flags & USE_VUNDEF_MERGE_P)
119 /* USE_VUNDEF_MERGE_P is dependent on HAS_MERGE_P. */
120 gcc_assert ((m_insn_flags & HAS_MERGE_P));
121
122 /* TU_POLICY_P and TDEFAULT_POLICY_P are mutually exclusive. */
123 gcc_assert (
124 !((m_insn_flags & TU_POLICY_P) && (m_insn_flags & TDEFAULT_POLICY_P)));
125
126 /* MU_POLICY_P and MDEFAULT_POLICY_P are mutually exclusive. */
127 gcc_assert (
128 !((m_insn_flags & MU_POLICY_P) && (m_insn_flags & MDEFAULT_POLICY_P)));
129
130 /* NULLARY_OP_P, UNARY_OP_P, BINARY_OP_P, TERNARY_OP_P are mutually
131 exclusive. */
132 gcc_assert (
133 !((m_insn_flags & NULLARY_OP_P)
134 && ((m_insn_flags & UNARY_OP_P) || (m_insn_flags & BINARY_OP_P)
135 || (m_insn_flags & TERNARY_OP_P))));
136 gcc_assert (
137 !((m_insn_flags & UNARY_OP_P)
138 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & BINARY_OP_P)
139 || (m_insn_flags & TERNARY_OP_P))));
140 gcc_assert (
141 !((m_insn_flags & BINARY_OP_P)
142 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & UNARY_OP_P)
143 || (m_insn_flags & TERNARY_OP_P))));
144 gcc_assert (
145 !((m_insn_flags & TERNARY_OP_P)
146 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & UNARY_OP_P)
147 || (m_insn_flags & BINARY_OP_P))));
148 }
149
150 void set_vl (rtx vl) { m_vl_op = vl; }
151
152 void add_output_operand (rtx x, machine_mode mode)
153 {
154 create_output_operand (&m_ops[m_opno++], x, mode);
155 gcc_assert (m_opno <= MAX_OPERANDS);
156 }
157 void add_input_operand (rtx x, machine_mode mode)
158 {
159 create_input_operand (&m_ops[m_opno++], x, mode);
160 gcc_assert (m_opno <= MAX_OPERANDS);
161 }
162 void add_all_one_mask_operand (machine_mode mask_mode)
163 {
164 add_input_operand (CONSTM1_RTX (mask_mode), mask_mode);
165 }
166 void add_first_one_true_mask_operand (machine_mode mask_mode)
167 {
168 add_input_operand (gen_scalar_move_mask (mask_mode), mask_mode);
169 }
170 void add_vundef_operand (machine_mode dest_mode)
171 {
172 add_input_operand (RVV_VUNDEF (dest_mode), dest_mode);
173 }
174 void add_policy_operand ()
175 {
176 if (m_insn_flags & TU_POLICY_P)
177 {
178 rtx tail_policy_rtx = gen_int_mode (TAIL_UNDISTURBED, Pmode);
179 add_input_operand (tail_policy_rtx, Pmode);
180 }
181 else if (m_insn_flags & TDEFAULT_POLICY_P)
182 {
183 rtx tail_policy_rtx = gen_int_mode (get_prefer_tail_policy (), Pmode);
184 add_input_operand (tail_policy_rtx, Pmode);
185 }
186
187 if (m_insn_flags & MU_POLICY_P)
188 {
189 rtx mask_policy_rtx = gen_int_mode (MASK_UNDISTURBED, Pmode);
190 add_input_operand (mask_policy_rtx, Pmode);
191 }
192 else if (m_insn_flags & MDEFAULT_POLICY_P)
193 {
194 rtx mask_policy_rtx = gen_int_mode (get_prefer_mask_policy (), Pmode);
195 add_input_operand (mask_policy_rtx, Pmode);
196 }
197 }
198 void add_avl_type_operand (avl_type type)
199 {
200 add_input_operand (gen_int_mode (type, Pmode), Pmode);
201 }
202
203 void
204 add_rounding_mode_operand (enum floating_point_rounding_mode rounding_mode)
205 {
206 rtx frm_rtx = gen_int_mode (rounding_mode, Pmode);
207 add_input_operand (frm_rtx, Pmode);
208 }
209
210 /* Return the vtype mode based on insn_flags.
211 The vtype mode is the mode that the vsetvl insn sets. */
212 machine_mode
213 get_vtype_mode (rtx *ops)
214 {
215 machine_mode vtype_mode;
216 if (m_insn_flags & VTYPE_MODE_FROM_OP1_P)
217 vtype_mode = GET_MODE (ops[1]);
218 else
219 vtype_mode = GET_MODE (ops[0]);
220 return vtype_mode;
221 }
222
223 void emit_insn (enum insn_code icode, rtx *ops)
224 {
225 int opno = 0;
226 int num_ops;
227 /* True if any operand is a memory operand. */
228 bool any_mem_p = false;
229
230 machine_mode vtype_mode = get_vtype_mode (ops);
231 machine_mode mask_mode = get_mask_mode (vtype_mode);
232
233 /* Add dest operand. */
234 if (m_insn_flags & HAS_DEST_P)
235 {
236 rtx op = ops[opno++];
237 any_mem_p |= MEM_P (op);
238 add_output_operand (op, GET_MODE (op));
239 }
240
241 /* Add mask operand. */
242 if (m_insn_flags & USE_ONE_TRUE_MASK_P)
243 add_first_one_true_mask_operand (mask_mode);
244 else if (m_insn_flags & USE_ALL_TRUES_MASK_P)
245 add_all_one_mask_operand (mask_mode);
246 else if (m_insn_flags & HAS_MASK_P)
247 {
248 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
249 gcc_assert (mode != VOIDmode);
250 add_input_operand (ops[opno++], mode);
251 }
252
253 /* Add merge operand. */
254 if (m_insn_flags & USE_VUNDEF_MERGE_P)
255 /* Same as dest operand. */
256 add_vundef_operand (GET_MODE (ops[0]));
257 else if (m_insn_flags & HAS_MERGE_P)
258 {
259 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
260 gcc_assert (mode != VOIDmode);
261 add_input_operand (ops[opno++], mode);
262 }
263
264 if (m_insn_flags & NULLARY_OP_P)
265 num_ops = 0;
266 else if (m_insn_flags & UNARY_OP_P)
267 num_ops = 1;
268 else if (m_insn_flags & BINARY_OP_P)
269 num_ops = 2;
270 else if (m_insn_flags & TERNARY_OP_P)
271 num_ops = 3;
272 else
273 gcc_unreachable ();
274
275 /* Add the remaining operands. */
276 for (; num_ops; num_ops--, opno++)
277 {
278 any_mem_p |= MEM_P (ops[opno]);
279 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
280 /* create_input_operand doesn't allow VOIDmode.
281 According to vector.md, some patterns do not specify an explicit
282 machine mode for the operand. Such operands are
283 always Pmode. */
284 if (mode == VOIDmode)
285 mode = Pmode;
286 else
287 /* Early assertion ensures same mode since maybe_legitimize_operand
288 will check this. */
289 gcc_assert (GET_MODE (ops[opno]) == VOIDmode
290 || GET_MODE (ops[opno]) == mode);
291
292 add_input_operand (ops[opno], mode);
293 }
294
295 /* Add vl operand. */
296 rtx len = m_vl_op;
297 bool vls_p = false;
298 if (m_vlmax_p)
299 {
300 if (riscv_v_ext_vls_mode_p (vtype_mode))
301 {
302 /* VLS modes always set VSETVL by
303 "vsetvl zero, rs1/imm". */
304 poly_uint64 nunits = GET_MODE_NUNITS (vtype_mode);
305 len = gen_int_mode (nunits, Pmode);
306 vls_p = true;
307 }
308 else if (can_create_pseudo_p ())
309 {
310 len = gen_reg_rtx (Pmode);
311 emit_vlmax_vsetvl (vtype_mode, len);
312 }
313 }
314
315 gcc_assert (len != NULL_RTX);
316 add_input_operand (len, Pmode);
317
318 /* Add tail and mask policy operands. */
319 add_policy_operand ();
320
321 /* Add avl_type operand. */
322 add_avl_type_operand (
323 vls_p ? avl_type::VLS
324 : (m_vlmax_p ? avl_type::VLMAX : avl_type::NONVLMAX));
325
326 /* Add rounding mode operand. */
327 if (m_insn_flags & FRM_DYN_P)
328 add_rounding_mode_operand (FRM_DYN);
329 else if (m_insn_flags & FRM_RUP_P)
330 add_rounding_mode_operand (FRM_RUP);
331 else if (m_insn_flags & FRM_RDN_P)
332 add_rounding_mode_operand (FRM_RDN);
333 else if (m_insn_flags & FRM_RMM_P)
334 add_rounding_mode_operand (FRM_RMM);
335 else if (m_insn_flags & FRM_RNE_P)
336 add_rounding_mode_operand (FRM_RNE);
337
338 gcc_assert (insn_data[(int) icode].n_operands == m_opno);
339 expand (icode, any_mem_p);
340 }
341
342 void expand (enum insn_code icode, bool temporary_volatile_p = false)
343 {
344 if (temporary_volatile_p)
345 {
346 temporary_volatile_ok v (true);
347 expand_insn (icode, m_opno, m_ops);
348 }
349 else
350 expand_insn (icode, m_opno, m_ops);
351 }
352
353 private:
354 unsigned m_insn_flags;
355 int m_opno;
356 bool m_vlmax_p;
357 rtx m_vl_op;
358 expand_operand m_ops[MAX_OPERANDS];
359 };
360
361 /* Emit an RVV insn with a vector length that equals the number of units of the
362 vector mode. For VLA modes this corresponds to VLMAX.
363
364 Unless the vector length can be encoded in the vsetivli instruction, this
365 function must only be used as long as we can create pseudo registers. This is
366 because it will set a pseudo register to VLMAX using vsetvl and use this as
367 the definition for the vector length. */
368 void
369 emit_vlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops)
370 {
371 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
372 gcc_assert (can_create_pseudo_p () || imm_avl_p (e.get_vtype_mode (ops)));
373
374 e.emit_insn ((enum insn_code) icode, ops);
375 }
376
377 /* Like emit_vlmax_insn but must only be used when we cannot create pseudo
378 registers anymore. This function, however, takes a predefined vector length
379 from the value in VL. */
380 void
381 emit_vlmax_insn_lra (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
382 {
383 gcc_assert (!can_create_pseudo_p ());
384 machine_mode mode = GET_MODE (ops[0]);
385
386 if (imm_avl_p (mode))
387 {
388 /* Even though VL is a hard register that has already been allocated
389 (we are post-RA here), we still benefit from emitting
390 vsetivli zero, imm instead of vsetvli VL, zero, because it
391 gives us more flexibility in post-RA instruction scheduling. */
392 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
393 e.set_vl (gen_int_mode (GET_MODE_NUNITS (mode), Pmode));
394 e.emit_insn ((enum insn_code) icode, ops);
395 }
396 else
397 {
398 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
399 e.set_vl (vl);
400 e.emit_insn ((enum insn_code) icode, ops);
401 }
402 }
403
404 /* Emit an RVV insn with a predefined vector length. Contrary to
405 emit_vlmax_insn the instruction's vector length is not deduced from its mode
406 but taken from the value in VL. */
407 void
408 emit_nonvlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
409 {
410 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
411 e.set_vl (vl);
412 e.emit_insn ((enum insn_code) icode, ops);
413 }
414
415 class rvv_builder : public rtx_vector_builder
416 {
417 public:
418 rvv_builder () : rtx_vector_builder () {}
419 rvv_builder (machine_mode mode, unsigned int npatterns,
420 unsigned int nelts_per_pattern)
421 : rtx_vector_builder (mode, npatterns, nelts_per_pattern)
422 {
423 m_inner_mode = GET_MODE_INNER (mode);
424 m_inner_bits_size = GET_MODE_BITSIZE (m_inner_mode);
425 m_inner_bytes_size = GET_MODE_SIZE (m_inner_mode);
426 m_mask_mode = get_mask_mode (mode);
427
428 gcc_assert (
429 int_mode_for_size (inner_bits_size (), 0).exists (&m_inner_int_mode));
430 m_int_mode
431 = get_vector_mode (m_inner_int_mode, GET_MODE_NUNITS (mode)).require ();
432 }
433
434 bool can_duplicate_repeating_sequence_p ();
435 rtx get_merged_repeating_sequence ();
436
437 bool repeating_sequence_use_merge_profitable_p ();
438 bool combine_sequence_use_slideup_profitable_p ();
439 bool combine_sequence_use_merge_profitable_p ();
440 rtx get_merge_scalar_mask (unsigned int, machine_mode) const;
441
442 bool single_step_npatterns_p () const;
443 bool npatterns_all_equal_p () const;
444 bool interleaved_stepped_npatterns_p () const;
445 bool npatterns_vid_diff_repeated_p () const;
446
447 machine_mode new_mode () const { return m_new_mode; }
448 scalar_mode inner_mode () const { return m_inner_mode; }
449 scalar_int_mode inner_int_mode () const { return m_inner_int_mode; }
450 machine_mode mask_mode () const { return m_mask_mode; }
451 machine_mode int_mode () const { return m_int_mode; }
452 unsigned int inner_bits_size () const { return m_inner_bits_size; }
453 unsigned int inner_bytes_size () const { return m_inner_bytes_size; }
454
455 private:
456 scalar_mode m_inner_mode;
457 scalar_int_mode m_inner_int_mode;
458 machine_mode m_new_mode;
459 scalar_int_mode m_new_inner_mode;
460 machine_mode m_mask_mode;
461 machine_mode m_int_mode;
462 unsigned int m_inner_bits_size;
463 unsigned int m_inner_bytes_size;
464 };
465
466 /* Return true if the vector can be duplicated from a super element which is
467 the fusion of consecutive elements.
468
469 v = { a, b, a, b } super element = ab, v = { ab, ab } */
470 bool
471 rvv_builder::can_duplicate_repeating_sequence_p ()
472 {
473 poly_uint64 new_size = exact_div (full_nelts (), npatterns ());
474 unsigned int new_inner_size = m_inner_bits_size * npatterns ();
475 if (!int_mode_for_size (new_inner_size, 0).exists (&m_new_inner_mode)
476 || GET_MODE_SIZE (m_new_inner_mode) > UNITS_PER_WORD
477 || !get_vector_mode (m_new_inner_mode, new_size).exists (&m_new_mode))
478 return false;
479 if (full_nelts ().is_constant ())
480 return repeating_sequence_p (0, full_nelts ().to_constant (), npatterns ());
481 return nelts_per_pattern () == 1;
482 }
483
484 /* Return true if it is a repeating sequence for which the
485 merge approach generates better code than the default
486 approach (slide1down).
487
488 Sequence A:
489 {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
490
491 nelts = 16
492 npatterns = 2
493
494 for merging a we need mask 101010....
495 for merging b we need mask 010101....
496
497 For each element in the pattern, we need to build a mask in a scalar register.
498 Mostly we need 3 instructions (i.e. COST = 3), which consist of 2 scalar
499 instructions and 1 scalar move to the v0 register. Finally we need a vector
500 merge to merge them.
501
502 lui a5, #imm
503 add a5, #imm
504 vmov.s.x v0, a5
505 vmerge.vxm v9, v9, a1, v0
506
507 So the overall (roughly) COST of Sequence A = (3 + 1) * npatterns = 8.
508 If we use slide1down, the COST = nelts = 16 > 8 (COST of merge).
509 So return true in this case as it is profitable.
510
511 Sequence B:
512 {a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h}
513
514 nelts = 16
515 npatterns = 8
516
517 COST of merge approach = (3 + 1) * npatterns = 32
518 COST of slide1down approach = nelts = 16
519 Return false in this case as the merge approach is NOT profitable.
520 */
521 bool
522 rvv_builder::repeating_sequence_use_merge_profitable_p ()
523 {
524 if (inner_bytes_size () > UNITS_PER_WORD)
525 return false;
526
527 unsigned int nelts = full_nelts ().to_constant ();
528
529 if (!repeating_sequence_p (0, nelts, npatterns ()))
530 return false;
531
532 unsigned int merge_cost = 1;
533 unsigned int build_merge_mask_cost = 3;
534 unsigned int slide1down_cost = nelts;
535
536 return (build_merge_mask_cost + merge_cost) * npatterns () < slide1down_cost;
537 }
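
/* A minimal standalone sketch of the cost comparison above, assuming the
   illustrative per-pattern costs from the comment (3 insns to build each
   mask plus 1 vmerge) against one slide1down per element.  The helper name
   is hypothetical and mirrors the heuristic for illustration only.  */
static bool
sketch_merge_profitable_p (unsigned nelts, unsigned npatterns)
{
  const unsigned build_merge_mask_cost = 3; /* lui/addi + vmv.s.x to v0.  */
  const unsigned merge_cost = 1;            /* one vmerge.vxm.  */
  unsigned merge_total = (build_merge_mask_cost + merge_cost) * npatterns;
  unsigned slide1down_total = nelts;        /* one vslide1down per element.  */
  /* Sequence A: (3 + 1) * 2 = 8 < 16; Sequence B: (3 + 1) * 8 = 32 >= 16.  */
  return merge_total < slide1down_total;
}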
538
539 /* Return true if it's worthwhile to use slideup to combine 2 vectors. */
540 bool
541 rvv_builder::combine_sequence_use_slideup_profitable_p ()
542 {
543 int nelts = full_nelts ().to_constant ();
544 int leading_ndups = this->count_dups (0, nelts - 1, 1);
545 int trailing_ndups = this->count_dups (nelts - 1, -1, -1);
546
547 /* ??? The current heuristic is to combine 2 vectors
548 by slideup when:
549 1. # of leading identical elements is equal to # of trailing identical elements.
550 2. Both of the above are equal to nelts / 2.
551 Otherwise, it is not profitable. */
552 return leading_ndups == trailing_ndups && trailing_ndups == nelts / 2;
553 }
554
555 /* Return true if it's worthwhile to use merge to combine a vector with a scalar. */
556 bool
557 rvv_builder::combine_sequence_use_merge_profitable_p ()
558 {
559 int nelts = full_nelts ().to_constant ();
560 int leading_ndups = this->count_dups (0, nelts - 1, 1);
561 int trailing_ndups = this->count_dups (nelts - 1, -1, -1);
562 int nregs = riscv_get_v_regno_alignment (int_mode ());
563
564 if (leading_ndups + trailing_ndups != nelts)
565 return false;
566
567 /* If the number of leading elements is > 255, which exceeds the maximum
568 value of QImode, we will need to use HImode. */
569 machine_mode mode;
570 if (leading_ndups > 255 || nregs > 2)
571 {
572 if (!get_vector_mode (HImode, nelts).exists (&mode))
573 return false;
574 /* We will need one more AVL/VL toggling vsetvl instruction. */
575 return leading_ndups > 4 && trailing_ndups > 4;
576 }
577
578 /* { a, a, a, b, b, ... , b } and { b, b, b, a, a, ... , a }
579 consume 3 slide instructions. */
580 return leading_ndups > 3 && trailing_ndups > 3;
581 }
582
583 /* Merge the repeating sequence into a single element and return the RTX. */
584 rtx
585 rvv_builder::get_merged_repeating_sequence ()
586 {
587 scalar_int_mode mode = Pmode;
588 rtx target = gen_reg_rtx (mode);
589 emit_move_insn (target, const0_rtx);
590 rtx imm = gen_int_mode ((1ULL << m_inner_bits_size) - 1, mode);
591 /* { a, b, a, b }: Generate duplicate element = b << bits | a. */
592 for (unsigned int i = 0; i < npatterns (); i++)
593 {
594 unsigned int loc = m_inner_bits_size * i;
595 rtx shift = gen_int_mode (loc, mode);
596 rtx ele = gen_lowpart (mode, elt (i));
597 rtx tmp = expand_simple_binop (mode, AND, ele, imm, NULL_RTX, false,
598 OPTAB_DIRECT);
599 rtx tmp2 = expand_simple_binop (mode, ASHIFT, tmp, shift, NULL_RTX, false,
600 OPTAB_DIRECT);
601 rtx tmp3 = expand_simple_binop (mode, IOR, tmp2, target, NULL_RTX, false,
602 OPTAB_DIRECT);
603 emit_move_insn (target, tmp3);
604 }
605 if (GET_MODE_SIZE (m_new_inner_mode) < UNITS_PER_WORD)
606 return gen_lowpart (m_new_inner_mode, target);
607 return target;
608 }
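
/* A minimal host-side sketch of the packing performed above, assuming 8-bit
   elements and a hypothetical helper name; it mirrors the AND/shift/IOR loop
   for illustration only.  */
static unsigned long long
sketch_merge_repeating_sequence (const unsigned char *elts, unsigned npatterns)
{
  unsigned long long merged = 0;
  for (unsigned i = 0; i < npatterns; i++)
    /* Element i lands at bit position i * 8, e.g. {a, b} -> (b << 8) | a.  */
    merged |= (unsigned long long) elts[i] << (i * 8);
  return merged;
}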
609
610 /* Get the mask for the merge approach.
611
612 Consider the following case:
613 {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
614 To merge "a", the mask should be 1010....
615 To merge "b", the mask should be 0101....
616 */
617 rtx
618 rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern,
619 machine_mode inner_mode) const
620 {
621 unsigned HOST_WIDE_INT mask = 0;
622 unsigned HOST_WIDE_INT base_mask = (1ULL << index_in_pattern);
623 /* Here we construct a mask pattern that will later be broadcast
624 to a vector register. The maximum broadcast size for vmv.v.x/vmv.s.x
625 is determined by the length of a vector element (ELEN) and not by
626 XLEN so make sure we do not exceed it. One example is -march=zve32*
627 which mandates ELEN == 32 but can be combined with -march=rv64
628 with XLEN == 64. */
629 unsigned int elen = TARGET_VECTOR_ELEN_64 ? 64 : 32;
630
631 gcc_assert (elen % npatterns () == 0);
632
633 int limit = elen / npatterns ();
634
635 for (int i = 0; i < limit; i++)
636 mask |= base_mask << (i * npatterns ());
637
638 return gen_int_mode (mask, inner_mode);
639 }
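
/* A minimal host-side sketch of the mask construction above, assuming
   ELEN == 64 and a hypothetical helper name.  For NPATTERNS == 2 it yields
   0x5555555555555555 for index 0 and 0xaaaaaaaaaaaaaaaa for index 1.  */
static unsigned long long
sketch_merge_scalar_mask (unsigned index_in_pattern, unsigned npatterns)
{
  const unsigned elen = 64;
  unsigned long long mask = 0;
  unsigned long long base_mask = 1ULL << index_in_pattern;
  /* Repeat the per-pattern bit every NPATTERNS positions, ELEN bits total.  */
  for (unsigned i = 0; i < elen / npatterns; i++)
    mask |= base_mask << (i * npatterns);
  return mask;
}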
640
641 /* Return true if the variable-length vector is single step.
642 Single step means the steps of all patterns in NPATTERNS are equal.
643 Consider the following cases:
644
645 CASE 1: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
646 { 0, 2, 2, 4, 4, 6, ... }
647 First pattern: step1 = 2 - 0 = 2
648 step2 = 4 - 2 = 2
649 Second pattern: step1 = 4 - 2 = 2
650 step2 = 6 - 4 = 2
651 Since all steps of NPATTERNS are equal step = 2.
652 Return true in this case.
653
654 CASE 2: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
655 { 0, 1, 2, 4, 4, 7, ... }
656 First pattern: step1 = 2 - 0 = 2
657 step2 = 4 - 2 = 2
658 Second pattern: step1 = 4 - 1 = 3
659 step2 = 7 - 4 = 3
660 Since not all steps are equal, return false. */
661 bool
662 rvv_builder::single_step_npatterns_p () const
663 {
664 if (nelts_per_pattern () != 3)
665 return false;
666
667 poly_int64 step
668 = rtx_to_poly_int64 (elt (npatterns ())) - rtx_to_poly_int64 (elt (0));
669 for (unsigned int i = 0; i < npatterns (); i++)
670 {
671 poly_int64 ele0 = rtx_to_poly_int64 (elt (i));
672 poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i));
673 poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i));
674 poly_int64 diff1 = ele1 - ele0;
675 poly_int64 diff2 = ele2 - ele1;
676 if (maybe_ne (step, diff1) || maybe_ne (step, diff2))
677 return false;
678 }
679 return true;
680 }
681
682 /* Return true if the diff between const vector and vid sequence
683 is repeated. For example, see the cases below.
684 The diff means the const vector minus vid.
685 CASE 1:
686 CONST VECTOR: {3, 2, 1, 0, 7, 6, 5, 4, ... }
687 VID : {0, 1, 2, 3, 4, 5, 6, 7, ... }
688 DIFF(MINUS) : {3, 1,-1,-3, 3, 1,-1,-3, ... }
689 The diff sequence {3, 1,-1,-3} is repeated within the pattern, so
690 return TRUE for case 1.
691
692 CASE 2:
693 CONST VECTOR: {-4, 4,-3, 5,-2, 6,-1, 7, ...}
694 VID : { 0, 1, 2, 3, 4, 5, 6, 7, ... }
695 DIFF(MINUS) : {-4, 3,-5, 2,-6, 1,-7, 0, ... }
696 The diff sequence {-4, 3} is not repeated within the pattern, so
697 return FALSE for case 2. */
698 bool
699 rvv_builder::npatterns_vid_diff_repeated_p () const
700 {
701 if (nelts_per_pattern () != 3)
702 return false;
703 else if (npatterns () == 0)
704 return false;
705
706 for (unsigned i = 0; i < npatterns (); i++)
707 {
708 poly_int64 diff_0 = rtx_to_poly_int64 (elt (i)) - i;
709 poly_int64 diff_1
710 = rtx_to_poly_int64 (elt (npatterns () + i)) - npatterns () - i;
711
712 if (maybe_ne (diff_0, diff_1))
713 return false;
714 }
715
716 return true;
717 }
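
/* A minimal scalar sketch of the check above, with a hypothetical helper
   name: the constant vector minus vid must give the same value at position
   i and at position i + NPATTERNS for every i in the first pattern.  */
static bool
sketch_vid_diff_repeated_p (const long *elts, unsigned npatterns)
{
  for (unsigned i = 0; i < npatterns; i++)
    {
      long diff0 = elts[i] - (long) i;
      long diff1 = elts[npatterns + i] - (long) (npatterns + i);
      if (diff0 != diff1)
        return false;
    }
  return true;
}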
718
719 /* Return true if the permutation consists of two
720 interleaved patterns with a constant step each.
721 TODO: We currently only support NPATTERNS = 2. */
722 bool
723 rvv_builder::interleaved_stepped_npatterns_p () const
724 {
725 if (npatterns () != 2 || nelts_per_pattern () != 3)
726 return false;
727 for (unsigned int i = 0; i < npatterns (); i++)
728 {
729 poly_int64 ele0 = rtx_to_poly_int64 (elt (i));
730 poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i));
731 poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i));
732 poly_int64 diff1 = ele1 - ele0;
733 poly_int64 diff2 = ele2 - ele1;
734 if (maybe_ne (diff1, diff2))
735 return false;
736 }
737 return true;
738 }
739
740 /* Return true if all elements of NPATTERNS are equal.
741
742 E.g. NPATTERNS = 4:
743 { 2, 2, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, ... }
744 E.g. NPATTERNS = 8:
745 { 2, 2, 2, 2, 2, 2, 2, 2, 8, 8, 8, 8, 8, 8, 8, 8, ... }
746 We only check whether ele[0] ~ ele[NPATTERNS - 1] are the same.
747 We don't need to check elements[n] with n >= NPATTERNS since
748 they don't belong to the same pattern.
749 */
750 bool
751 rvv_builder::npatterns_all_equal_p () const
752 {
753 poly_int64 ele0 = rtx_to_poly_int64 (elt (0));
754 for (unsigned int i = 1; i < npatterns (); i++)
755 {
756 poly_int64 ele = rtx_to_poly_int64 (elt (i));
757 if (!known_eq (ele, ele0))
758 return false;
759 }
760 return true;
761 }
762
763 static unsigned
764 get_sew (machine_mode mode)
765 {
766 unsigned int sew = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
767 ? 8
768 : GET_MODE_BITSIZE (GET_MODE_INNER (mode));
769 return sew;
770 }
771
772 /* Return true if X is a const_vector whose elements are all duplicates of one
773 value that is in the range [MINVAL, MAXVAL]. */
774 bool
775 const_vec_all_same_in_range_p (rtx x, HOST_WIDE_INT minval,
776 HOST_WIDE_INT maxval)
777 {
778 rtx elt;
779 return (const_vec_duplicate_p (x, &elt) && CONST_INT_P (elt)
780 && IN_RANGE (INTVAL (elt), minval, maxval));
781 }
782
783 /* Return true if VEC is a constant in which every element is in the range
784 [MINVAL, MAXVAL]. The elements do not need to have the same value.
785
786 This function also exists for aarch64; we may unify it in the middle-end in
787 the future. */
788
789 static bool
790 const_vec_all_in_range_p (rtx vec, poly_int64 minval, poly_int64 maxval)
791 {
792 if (!CONST_VECTOR_P (vec)
793 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
794 return false;
795
796 int nunits;
797 if (!CONST_VECTOR_STEPPED_P (vec))
798 nunits = const_vector_encoded_nelts (vec);
799 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
800 return false;
801
802 for (int i = 0; i < nunits; i++)
803 {
804 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
805 poly_int64 value;
806 if (!poly_int_rtx_p (vec_elem, &value)
807 || maybe_lt (value, minval)
808 || maybe_gt (value, maxval))
809 return false;
810 }
811 return true;
812 }
813
814 /* Return a const vector of VAL. The VAL can be either const_int or
815 const_poly_int. */
816
817 static rtx
818 gen_const_vector_dup (machine_mode mode, poly_int64 val)
819 {
820 scalar_mode smode = GET_MODE_INNER (mode);
821 rtx c = gen_int_mode (val, smode);
822 if (!val.is_constant () && GET_MODE_SIZE (smode) > GET_MODE_SIZE (Pmode))
823 {
824 /* When VAL is a const_poly_int value, we need to explicitly broadcast
825 it into a vector using an RVV broadcast instruction. */
826 return expand_vector_broadcast (mode, c);
827 }
828 return gen_const_vec_duplicate (mode, c);
829 }
830
831 /* Emit a vlmax vsetvl instruction. This should only be used when
832 optimization is disabled or after vsetvl insertion pass. */
833 void
834 emit_hard_vlmax_vsetvl (machine_mode vmode, rtx vl)
835 {
836 unsigned int sew = get_sew (vmode);
837 emit_insn (gen_vsetvl (Pmode, vl, RVV_VLMAX, gen_int_mode (sew, Pmode),
838 gen_int_mode (get_vlmul (vmode), Pmode), const0_rtx,
839 const0_rtx));
840 }
841
842 void
843 emit_vlmax_vsetvl (machine_mode vmode, rtx vl)
844 {
845 unsigned int sew = get_sew (vmode);
846 enum vlmul_type vlmul = get_vlmul (vmode);
847 unsigned int ratio = calculate_ratio (sew, vlmul);
848
849 if (!optimize)
850 emit_hard_vlmax_vsetvl (vmode, vl);
851 else
852 emit_insn (gen_vlmax_avl (Pmode, vl, gen_int_mode (ratio, Pmode)));
853 }
854
855 /* Calculate SEW/LMUL ratio. */
856 unsigned int
857 calculate_ratio (unsigned int sew, enum vlmul_type vlmul)
858 {
859 unsigned int ratio;
860 switch (vlmul)
861 {
862 case LMUL_1:
863 ratio = sew;
864 break;
865 case LMUL_2:
866 ratio = sew / 2;
867 break;
868 case LMUL_4:
869 ratio = sew / 4;
870 break;
871 case LMUL_8:
872 ratio = sew / 8;
873 break;
874 case LMUL_F8:
875 ratio = sew * 8;
876 break;
877 case LMUL_F4:
878 ratio = sew * 4;
879 break;
880 case LMUL_F2:
881 ratio = sew * 2;
882 break;
883 default:
884 gcc_unreachable ();
885 }
886 return ratio;
887 }
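
/* A minimal usage sketch of calculate_ratio, with a hypothetical function
   name: the SEW/LMUL ratio identifies compatible vector configurations,
   e.g. SEW = 32 with LMUL = 2, SEW = 16 with LMUL = 1 and SEW = 8 with
   LMUL = 1/2 all share ratio 16.  Illustration only.  */
static void
sketch_ratio_examples ()
{
  unsigned r1 = calculate_ratio (32, LMUL_2); /* 32 / 2 = 16.  */
  unsigned r2 = calculate_ratio (16, LMUL_1); /* 16 / 1 = 16.  */
  unsigned r3 = calculate_ratio (8, LMUL_F2); /* 8 * 2 = 16.  */
  gcc_assert (r1 == 16 && r2 == 16 && r3 == 16);
}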
888
889 /* SCALABLE means that the vector length is agnostic (run-time invariant and
890 compile-time unknown). FIXED means that the vector length is specific
891 (compile-time known). Both RVV_SCALABLE and RVV_FIXED_VLMAX do
892 auto-vectorization using the VLMAX vsetvl configuration. */
893 static bool
894 autovec_use_vlmax_p (void)
895 {
896 return (riscv_autovec_preference == RVV_SCALABLE
897 || riscv_autovec_preference == RVV_FIXED_VLMAX);
898 }
899
900 /* This function emits a VLMAX vrgather instruction. Emit vrgather.vx/vi when
901 sel is a const duplicate vector. Otherwise, emit vrgather.vv. */
902 static void
903 emit_vlmax_gather_insn (rtx target, rtx op, rtx sel)
904 {
905 rtx elt;
906 insn_code icode;
907 machine_mode data_mode = GET_MODE (target);
908 machine_mode sel_mode = GET_MODE (sel);
909 if (const_vec_duplicate_p (sel, &elt))
910 {
911 icode = code_for_pred_gather_scalar (data_mode);
912 sel = elt;
913 }
914 else if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
915 icode = code_for_pred_gatherei16 (data_mode);
916 else
917 icode = code_for_pred_gather (data_mode);
918 rtx ops[] = {target, op, sel};
919 emit_vlmax_insn (icode, BINARY_OP, ops);
920 }
921
922 static void
923 emit_vlmax_masked_gather_mu_insn (rtx target, rtx op, rtx sel, rtx mask)
924 {
925 rtx elt;
926 insn_code icode;
927 machine_mode data_mode = GET_MODE (target);
928 machine_mode sel_mode = GET_MODE (sel);
929 if (const_vec_duplicate_p (sel, &elt))
930 {
931 icode = code_for_pred_gather_scalar (data_mode);
932 sel = elt;
933 }
934 else if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
935 icode = code_for_pred_gatherei16 (data_mode);
936 else
937 icode = code_for_pred_gather (data_mode);
938 rtx ops[] = {target, mask, target, op, sel};
939 emit_vlmax_insn (icode, BINARY_OP_TAMU, ops);
940 }
941
942 /* According to RVV ISA spec (16.5.1. Synthesizing vdecompress):
943 https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc
944
945 There is no inverse vdecompress provided, as this operation can be readily
946 synthesized using iota and a masked vrgather:
947
948 Desired functionality of 'vdecompress'
949 7 6 5 4 3 2 1 0 # vid
950
951 e d c b a # packed vector of 5 elements
952 1 0 0 1 1 1 0 1 # mask vector of 8 elements
953 p q r s t u v w # destination register before vdecompress
954
955 e q r d c b v a # result of vdecompress
956 # v0 holds mask
957 # v1 holds packed data
958 # v11 holds input expanded vector and result
959 viota.m v10, v0 # Calc iota from mask in v0
960 vrgather.vv v11, v1, v10, v0.t # Expand into destination
961 p q r s t u v w # v11 destination register
962 e d c b a # v1 source vector
963 1 0 0 1 1 1 0 1 # v0 mask vector
964
965 4 4 4 3 2 1 1 0 # v10 result of viota.m
966 e q r d c b v a # v11 destination after vrgather using viota.m under mask
967 */
968 static void
969 emit_vlmax_decompress_insn (rtx target, rtx op0, rtx op1, rtx mask)
970 {
971 machine_mode data_mode = GET_MODE (target);
972 machine_mode sel_mode = related_int_vector_mode (data_mode).require ();
973 if (GET_MODE_INNER (data_mode) == QImode)
974 sel_mode = get_vector_mode (HImode, GET_MODE_NUNITS (data_mode)).require ();
975
976 rtx sel = gen_reg_rtx (sel_mode);
977 rtx iota_ops[] = {sel, mask};
978 emit_vlmax_insn (code_for_pred_iota (sel_mode), UNARY_OP, iota_ops);
979 emit_vlmax_gather_insn (target, op0, sel);
980 emit_vlmax_masked_gather_mu_insn (target, op1, sel, mask);
981 }
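
/* A minimal scalar model of the decompress sequence above (viota.m followed
   by a masked vrgather), with hypothetical names and plain C types for
   illustration only.  */
static void
sketch_decompress (const int *packed, const bool *mask, int *dest, int n)
{
  int iota = 0; /* Running count of set mask bits, as viota.m computes.  */
  for (int i = 0; i < n; i++)
    {
      if (mask[i])
        {
          /* Masked vrgather: active elements read packed[iota].  */
          dest[i] = packed[iota++];
        }
      /* Inactive elements keep their previous value (mask-undisturbed).  */
    }
}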
982
983 /* Emit merge instruction. */
984
985 static machine_mode
986 get_repeating_sequence_dup_machine_mode (const rvv_builder &builder,
987 machine_mode mask_bit_mode)
988 {
989 unsigned mask_precision = GET_MODE_PRECISION (mask_bit_mode).to_constant ();
990 unsigned mask_scalar_size = mask_precision > builder.inner_bits_size ()
991 ? builder.inner_bits_size () : mask_precision;
992
993 scalar_mode inner_mode;
994 unsigned minimal_bits_size;
995
996 switch (mask_scalar_size)
997 {
998 case 8:
999 inner_mode = QImode;
1000 minimal_bits_size = TARGET_MIN_VLEN / 8; /* AKA RVVMF8. */
1001 break;
1002 case 16:
1003 inner_mode = HImode;
1004 minimal_bits_size = TARGET_MIN_VLEN / 4; /* AKA RVVMF4. */
1005 break;
1006 case 32:
1007 inner_mode = SImode;
1008 minimal_bits_size = TARGET_MIN_VLEN / 2; /* AKA RVVMF2. */
1009 break;
1010 case 64:
1011 inner_mode = DImode;
1012 minimal_bits_size = TARGET_MIN_VLEN / 1; /* AKA RVVM1. */
1013 break;
1014 default:
1015 gcc_unreachable ();
1016 break;
1017 }
1018
1019 gcc_assert (mask_precision % mask_scalar_size == 0);
1020
1021 uint64_t dup_nunit = mask_precision > mask_scalar_size
1022 ? mask_precision / mask_scalar_size : minimal_bits_size / mask_scalar_size;
1023
1024 return get_vector_mode (inner_mode, dup_nunit).require ();
1025 }
1026
1027 /* Expand a series const vector. If VID is NULL_RTX, we use the vid.v
1028 instruction to generate the sequence for VID:
1029
1030 VID = { 0, 1, 2, 3, ... }
1031
1032 Otherwise, we use the VID argument directly. */
1033
1034 void
1035 expand_vec_series (rtx dest, rtx base, rtx step, rtx vid)
1036 {
1037 machine_mode mode = GET_MODE (dest);
1038 poly_int64 nunits_m1 = GET_MODE_NUNITS (mode) - 1;
1039 poly_int64 value;
1040 rtx result = register_operand (dest, mode) ? dest : gen_reg_rtx (mode);
1041
1042 /* VECT_IV = BASE + I * STEP. */
1043
1044 /* Step 1: Generate I = { 0, 1, 2, ... } by vid.v. */
1045 bool reverse_p = !vid && rtx_equal_p (step, constm1_rtx)
1046 && poly_int_rtx_p (base, &value)
1047 && known_eq (nunits_m1, value);
1048 if (!vid)
1049 {
1050 vid = gen_reg_rtx (mode);
1051 rtx op[] = {vid};
1052 emit_vlmax_insn (code_for_pred_series (mode), NULLARY_OP, op);
1053 }
1054
1055 rtx step_adj;
1056 if (reverse_p)
1057 {
1058 /* Special case:
1059 {nunits - 1, nunits - 2, ... , 0}.
1060 nunits can be either const_int or const_poly_int.
1061
1062 Code sequence:
1063 vid.v v
1064 vrsub nunits - 1, v. */
1065 rtx ops[]
1066 = {result, vid, gen_int_mode (nunits_m1, GET_MODE_INNER (mode))};
1067 insn_code icode = code_for_pred_sub_reverse_scalar (mode);
1068 emit_vlmax_insn (icode, BINARY_OP, ops);
1069 }
1070 else
1071 {
1072 /* Step 2: Generate I * STEP.
1073 - STEP is 1, we don't emit any instructions.
1074 - STEP is power of 2, we use vsll.vi/vsll.vx.
1075 - STEP is non-power of 2, we use vmul.vx. */
1076 if (rtx_equal_p (step, const1_rtx))
1077 step_adj = vid;
1078 else
1079 {
1080 step_adj = gen_reg_rtx (mode);
1081 if (CONST_INT_P (step) && pow2p_hwi (INTVAL (step)))
1082 {
1083 /* Emit logical left shift operation. */
1084 int shift = exact_log2 (INTVAL (step));
1085 rtx shift_amount = gen_int_mode (shift, Pmode);
1086 insn_code icode = code_for_pred_scalar (ASHIFT, mode);
1087 rtx ops[] = {step_adj, vid, shift_amount};
1088 emit_vlmax_insn (icode, BINARY_OP, ops);
1089 }
1090 else
1091 {
1092 insn_code icode = code_for_pred_scalar (MULT, mode);
1093 rtx ops[] = {step_adj, vid, step};
1094 emit_vlmax_insn (icode, BINARY_OP, ops);
1095 }
1096 }
1097
1098 /* Step 3: Generate BASE + I * STEP.
1099 - BASE is 0, use result of vid.
1100 - BASE is not 0, we use vadd.vx/vadd.vi. */
1101 if (rtx_equal_p (base, const0_rtx))
1102 emit_move_insn (result, step_adj);
1103 else
1104 {
1105 insn_code icode = code_for_pred_scalar (PLUS, mode);
1106 rtx ops[] = {result, step_adj, base};
1107 emit_vlmax_insn (icode, BINARY_OP, ops);
1108 }
1109 }
1110
1111 if (result != dest)
1112 emit_move_insn (dest, result);
1113 }
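
/* A minimal scalar model of the series expansion above, with a hypothetical
   helper name: VECT_IV[i] = BASE + I * STEP, matching the vid.v,
   shift-or-multiply, and add steps.  */
static void
sketch_vec_series (long base, long step, long *out, int n)
{
  for (int i = 0; i < n; i++)
    {
      long vid = i;             /* Step 1: vid.v.  */
      long scaled = vid * step; /* Step 2: vsll for powers of 2, else vmul.vx.  */
      out[i] = base + scaled;   /* Step 3: vadd.vx/vadd.vi, skipped if BASE == 0.  */
    }
}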
1114
1115 static void
1116 expand_const_vector (rtx target, rtx src)
1117 {
1118 machine_mode mode = GET_MODE (target);
1119 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
1120 {
1121 rtx elt;
1122 gcc_assert (
1123 const_vec_duplicate_p (src, &elt)
1124 && (rtx_equal_p (elt, const0_rtx) || rtx_equal_p (elt, const1_rtx)));
1125 rtx ops[] = {target, src};
1126 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_MASK_OP, ops);
1127 return;
1128 }
1129
1130 rtx elt;
1131 if (const_vec_duplicate_p (src, &elt))
1132 {
1133 rtx tmp = register_operand (target, mode) ? target : gen_reg_rtx (mode);
1134 /* For an integer element in the range -16 ~ 15 or a 0.0 floating-point
1135 element, we use the vmv.v.i instruction. */
1136 if (satisfies_constraint_vi (src) || satisfies_constraint_Wc0 (src))
1137 {
1138 rtx ops[] = {tmp, src};
1139 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP, ops);
1140 }
1141 else
1142 {
1143 /* Emit the vec_duplicate<mode> split pattern before RA so that
1144 we have better optimization opportunities in LICM,
1145 which will hoist vmv.v.x outside the loop, and in fwprop and combine,
1146 which will transform 'vv' into 'vx' instructions.
1147
1148 The reason we don't emit the vec_duplicate<mode> split pattern during
1149 RA is that the split stage after RA is too late to generate an
1150 RVV instruction which needs an additional register (we can't
1151 allocate a new register after RA) for the VL operand of the vsetvl
1152 instruction (vsetvl a5, zero). */
1153 if (lra_in_progress)
1154 {
1155 rtx ops[] = {tmp, elt};
1156 emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops);
1157 }
1158 else
1159 {
1160 struct expand_operand ops[2];
1161 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
1162 gcc_assert (icode != CODE_FOR_nothing);
1163 create_output_operand (&ops[0], tmp, mode);
1164 create_input_operand (&ops[1], elt, GET_MODE_INNER (mode));
1165 expand_insn (icode, 2, ops);
1166 tmp = ops[0].value;
1167 }
1168 }
1169
1170 if (tmp != target)
1171 emit_move_insn (target, tmp);
1172 return;
1173 }
1174
1175 /* Support scalable const series vector. */
1176 rtx base, step;
1177 if (const_vec_series_p (src, &base, &step))
1178 {
1179 expand_vec_series (target, base, step);
1180 return;
1181 }
1182
1183 /* Handle variable-length vector. */
1184 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
1185 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
1186 rvv_builder builder (mode, npatterns, nelts_per_pattern);
1187 for (unsigned int i = 0; i < nelts_per_pattern; i++)
1188 {
1189 for (unsigned int j = 0; j < npatterns; j++)
1190 builder.quick_push (CONST_VECTOR_ELT (src, i * npatterns + j));
1191 }
1192 builder.finalize ();
1193
1194 if (CONST_VECTOR_DUPLICATE_P (src))
1195 {
1196 /* Handle the case of a repeating sequence with NELTS_PER_PATTERN = 1.
1197 E.g. NPATTERNS = 4, v = { 0, 2, 6, 7, ... }
1198 NPATTERNS = 8, v = { 0, 2, 6, 7, 19, 20, 8, 7 ... }
1199 The elements within NPATTERNS are not necessarily regular. */
1200 if (builder.can_duplicate_repeating_sequence_p ())
1201 {
1202 /* We handle the case where we can find a vector container to hold
1203 element bitsize = NPATTERNS * ele_bitsize.
1204
1205 NPATTERNS = 8, element width = 8
1206 v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
1207 In this case, we can combine NPATTERNS elements into a larger
1208 element. Use element width = 64 and broadcast a vector with
1209 all elements equal to 0x0706050403020100. */
1210 rtx ele = builder.get_merged_repeating_sequence ();
1211 rtx dup = expand_vector_broadcast (builder.new_mode (), ele);
1212 emit_move_insn (target, gen_lowpart (mode, dup));
1213 }
1214 else
1215 {
1216 /* We handle the case where we can't find a vector container to hold
1217 element bitsize = NPATTERNS * ele_bitsize.
1218
1219 NPATTERNS = 8, element width = 16
1220 v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
1221 Since NPATTERNS * element width = 128, we can't find a container
1222 to hold it.
1223
1224 In this case, we use NPATTERNS merge operations to generate such a
1225 vector. */
1226 unsigned int nbits = npatterns - 1;
1227
1228 /* Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
1229 rtx vid = gen_reg_rtx (builder.int_mode ());
1230 rtx op[] = {vid};
1231 emit_vlmax_insn (code_for_pred_series (builder.int_mode ()),
1232 NULLARY_OP, op);
1233
1234 /* Generate vid_repeat = { 0, 1, ... nbits, ... } */
1235 rtx vid_repeat = gen_reg_rtx (builder.int_mode ());
1236 rtx and_ops[] = {vid_repeat, vid,
1237 gen_int_mode (nbits, builder.inner_int_mode ())};
1238 emit_vlmax_insn (code_for_pred_scalar (AND, builder.int_mode ()),
1239 BINARY_OP, and_ops);
1240
1241 rtx tmp = gen_reg_rtx (builder.mode ());
1242 rtx dup_ops[] = {tmp, builder.elt (0)};
1243 emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()), UNARY_OP,
1244 dup_ops);
1245 for (unsigned int i = 1; i < builder.npatterns (); i++)
1246 {
1247 /* Generate mask according to i. */
1248 rtx mask = gen_reg_rtx (builder.mask_mode ());
1249 rtx const_vec = gen_const_vector_dup (builder.int_mode (), i);
1250 expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
1251
1252 /* Merge scalar to each i. */
1253 rtx tmp2 = gen_reg_rtx (builder.mode ());
1254 rtx merge_ops[] = {tmp2, tmp, builder.elt (i), mask};
1255 insn_code icode = code_for_pred_merge_scalar (builder.mode ());
1256 emit_vlmax_insn (icode, MERGE_OP, merge_ops);
1257 tmp = tmp2;
1258 }
1259 emit_move_insn (target, tmp);
1260 }
1261 }
1262 else if (CONST_VECTOR_STEPPED_P (src))
1263 {
1264 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
1265 if (builder.single_step_npatterns_p ())
1266 {
1267 /* Describe the case by choosing NPATTERNS = 4 as an example. */
1268 insn_code icode;
1269
1270 /* Step 1: Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
1271 rtx vid = gen_reg_rtx (builder.mode ());
1272 rtx vid_ops[] = {vid};
1273 icode = code_for_pred_series (builder.mode ());
1274 emit_vlmax_insn (icode, NULLARY_OP, vid_ops);
1275
1276 if (builder.npatterns_all_equal_p ())
1277 {
1278 /* Generate the variable-length vector following this rule:
1279 { a, a, a + step, a + step, a + step * 2, a + step * 2, ...}
1280 E.g. { 0, 0, 8, 8, 16, 16, ... } */
1281 /* We want to create a pattern where value[ix] = floor (ix /
1282 NPATTERNS). As NPATTERNS is always a power of two we can
1283 rewrite this as = ix & -NPATTERNS. */
1284 /* Step 2: VID AND -NPATTERNS:
1285 { 0&-4, 1&-4, 2&-4, 3 &-4, 4 &-4, 5 &-4, 6 &-4, 7 &-4, ... }
1286 */
1287 rtx imm
1288 = gen_int_mode (-builder.npatterns (), builder.inner_mode ());
1289 rtx tmp = gen_reg_rtx (builder.mode ());
1290 rtx and_ops[] = {tmp, vid, imm};
1291 icode = code_for_pred_scalar (AND, builder.mode ());
1292 emit_vlmax_insn (icode, BINARY_OP, and_ops);
1293 HOST_WIDE_INT init_val = INTVAL (builder.elt (0));
1294 if (init_val == 0)
1295 emit_move_insn (target, tmp);
1296 else
1297 {
1298 rtx dup = gen_const_vector_dup (builder.mode (), init_val);
1299 rtx add_ops[] = {target, tmp, dup};
1300 icode = code_for_pred (PLUS, builder.mode ());
1301 emit_vlmax_insn (icode, BINARY_OP, add_ops);
1302 }
1303 }
1304 else
1305 {
1306 /* Generate the variable-length vector following this rule:
1307 { a, b, a + step, b + step, a + step*2, b + step*2, ... } */
1308
1309 if (builder.npatterns_vid_diff_repeated_p ())
1310 {
1311 /* Case 1: For example as below:
1312 {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8... }
1313 We have 3 - 0 = 3 equal to 7 - 4 = 3, so the sequence
1314 repeats as below after subtracting vid:
1315 {3, 1, -1, -3, 3, 1, -1, -3...}
1316 Then we can simplify the diff code gen to at most
1317 npatterns(). */
1318 rvv_builder v (builder.mode (), builder.npatterns (), 1);
1319
1320 /* Step 1: Generate diff = TARGET - VID. */
1321 for (unsigned int i = 0; i < v.npatterns (); ++i)
1322 {
1323 poly_int64 diff = rtx_to_poly_int64 (builder.elt (i)) - i;
1324 v.quick_push (gen_int_mode (diff, v.inner_mode ()));
1325 }
1326
1327 /* Step 2: Generate result = VID + diff. */
1328 rtx vec = v.build ();
1329 rtx add_ops[] = {target, vid, vec};
1330 emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()),
1331 BINARY_OP, add_ops);
1332 }
1333 else
1334 {
1335 /* Case 2: For example as below:
1336 { -4, 4, -4 + 1, 4 + 1, -4 + 2, 4 + 2, -4 + 3, 4 + 3, ... }
1337 */
1338 rvv_builder v (builder.mode (), builder.npatterns (), 1);
1339
1340 /* Step 1: Generate { a, b, a, b, ... } */
1341 for (unsigned int i = 0; i < v.npatterns (); ++i)
1342 v.quick_push (builder.elt (i));
1343 rtx new_base = v.build ();
1344
1345 /* Step 2: Generate tmp = VID >> LOG2 (NPATTERNS).  */
1346 rtx shift_count
1347 = gen_int_mode (exact_log2 (builder.npatterns ()),
1348 builder.inner_mode ());
1349 rtx tmp = expand_simple_binop (builder.mode (), LSHIFTRT,
1350 vid, shift_count, NULL_RTX,
1351 false, OPTAB_DIRECT);
1352
1353 /* Step 3: Generate tmp2 = tmp * step.  */
1354 rtx tmp2 = gen_reg_rtx (builder.mode ());
1355 rtx step
1356 = simplify_binary_operation (MINUS, builder.inner_mode (),
1357 builder.elt (v.npatterns()),
1358 builder.elt (0));
1359 expand_vec_series (tmp2, const0_rtx, step, tmp);
1360
1361 /* Step 4: Generate target = tmp2 + new_base.  */
1362 rtx add_ops[] = {target, tmp2, new_base};
1363 emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()),
1364 BINARY_OP, add_ops);
1365 }
1366 }
1367 }
1368 else if (builder.interleaved_stepped_npatterns_p ())
1369 {
1370 rtx base1 = builder.elt (0);
1371 rtx base2 = builder.elt (1);
1372 poly_int64 step1
1373 = rtx_to_poly_int64 (builder.elt (builder.npatterns ()))
1374 - rtx_to_poly_int64 (base1);
1375 poly_int64 step2
1376 = rtx_to_poly_int64 (builder.elt (builder.npatterns () + 1))
1377 - rtx_to_poly_int64 (base2);
1378
1379 /* For { 1, 0, 2, 0, ... , n - 1, 0 }, we can use a larger EEW
1380 integer vector mode to generate such a vector efficiently.
1381
1382 E.g. EEW = 16, { 2, 0, 4, 0, ... }
1383
1384 can be reinterpreted as:
1385
1386 EEW = 32, { 2, 4, ... } */
1387 unsigned int new_smode_bitsize = builder.inner_bits_size () * 2;
1388 scalar_int_mode new_smode;
1389 machine_mode new_mode;
1390 poly_uint64 new_nunits
1391 = exact_div (GET_MODE_NUNITS (builder.mode ()), 2);
1392 if (int_mode_for_size (new_smode_bitsize, 0).exists (&new_smode)
1393 && get_vector_mode (new_smode, new_nunits).exists (&new_mode))
1394 {
1395 rtx tmp = gen_reg_rtx (new_mode);
1396 base1 = gen_int_mode (rtx_to_poly_int64 (base1), new_smode);
1397 expand_vec_series (tmp, base1, gen_int_mode (step1, new_smode));
1398
1399 if (rtx_equal_p (base2, const0_rtx) && known_eq (step2, 0))
1400 /* { 1, 0, 2, 0, ... }. */
1401 emit_move_insn (target, gen_lowpart (mode, tmp));
1402 else if (known_eq (step2, 0))
1403 {
1404 /* { 1, 1, 2, 1, ... }. */
1405 rtx scalar = expand_simple_binop (
1406 new_smode, ASHIFT,
1407 gen_int_mode (rtx_to_poly_int64 (base2), new_smode),
1408 gen_int_mode (builder.inner_bits_size (), new_smode),
1409 NULL_RTX, false, OPTAB_DIRECT);
1410 rtx tmp2 = gen_reg_rtx (new_mode);
1411 rtx ior_ops[] = {tmp2, tmp, scalar};
1412 emit_vlmax_insn (code_for_pred_scalar (IOR, new_mode),
1413 BINARY_OP, ior_ops);
1414 emit_move_insn (target, gen_lowpart (mode, tmp2));
1415 }
1416 else
1417 {
1418 /* { 1, 3, 2, 6, ... }. */
1419 rtx tmp2 = gen_reg_rtx (new_mode);
1420 base2 = gen_int_mode (rtx_to_poly_int64 (base2), new_smode);
1421 expand_vec_series (tmp2, base2,
1422 gen_int_mode (step2, new_smode));
1423 rtx shifted_tmp2 = expand_simple_binop (
1424 new_mode, ASHIFT, tmp2,
1425 gen_int_mode (builder.inner_bits_size (), Pmode), NULL_RTX,
1426 false, OPTAB_DIRECT);
1427 rtx tmp3 = gen_reg_rtx (new_mode);
1428 rtx ior_ops[] = {tmp3, tmp, shifted_tmp2};
1429 emit_vlmax_insn (code_for_pred (IOR, new_mode), BINARY_OP,
1430 ior_ops);
1431 emit_move_insn (target, gen_lowpart (mode, tmp3));
1432 }
1433 }
1434 else
1435 {
1436 rtx vid = gen_reg_rtx (mode);
1437 expand_vec_series (vid, const0_rtx, const1_rtx);
1438 /* Transform into { 0, 0, 1, 1, 2, 2, ... }. */
1439 rtx shifted_vid
1440 = expand_simple_binop (mode, LSHIFTRT, vid, const1_rtx,
1441 NULL_RTX, false, OPTAB_DIRECT);
1442 rtx tmp1 = gen_reg_rtx (mode);
1443 rtx tmp2 = gen_reg_rtx (mode);
1444 expand_vec_series (tmp1, base1,
1445 gen_int_mode (step1, builder.inner_mode ()),
1446 shifted_vid);
1447 expand_vec_series (tmp2, base2,
1448 gen_int_mode (step2, builder.inner_mode ()),
1449 shifted_vid);
1450
1451 /* Transform into { 0, 1, 0, 1, 0, 1, ... }. */
1452 rtx and_vid = gen_reg_rtx (mode);
1453 rtx and_ops[] = {and_vid, vid, const1_rtx};
1454 emit_vlmax_insn (code_for_pred_scalar (AND, mode), BINARY_OP,
1455 and_ops);
1456 rtx mask = gen_reg_rtx (builder.mask_mode ());
1457 expand_vec_cmp (mask, EQ, and_vid, CONST1_RTX (mode));
1458
1459 rtx ops[] = {target, tmp1, tmp2, mask};
1460 emit_vlmax_insn (code_for_pred_merge (mode), MERGE_OP, ops);
1461 }
1462 }
1463 else if (npatterns == 1 && nelts_per_pattern == 3)
1464 {
1465 /* Generate the following CONST_VECTOR:
1466 { base0, base1, base1 + step, base1 + step * 2, ... } */
1467 rtx base0 = builder.elt (0);
1468 rtx base1 = builder.elt (1);
1469 rtx base2 = builder.elt (2);
1470
1471 rtx step = simplify_binary_operation (MINUS, builder.inner_mode (),
1472 base2, base1);
1473
1474 /* Step 1 - { base1, base1 + step, base1 + step * 2, ... } */
1475 rtx tmp = gen_reg_rtx (mode);
1476 expand_vec_series (tmp, base1, step);
1477 /* Step 2 - { base0, base1, base1 + step, base1 + step * 2, ... } */
1478 if (!rtx_equal_p (base0, const0_rtx))
1479 base0 = force_reg (builder.inner_mode (), base0);
1480
1481 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
1482 gcc_assert (icode != CODE_FOR_nothing);
1483 emit_insn (GEN_FCN (icode) (target, tmp, base0));
1484 }
1485 else
1486 /* TODO: We will enable more variable-length vectors in the future. */
1487 gcc_unreachable ();
1488 }
1489 else
1490 gcc_unreachable ();
1491 }
1492
1493 /* Get the frm mode for the given CONST_INT rtx; the default mode is
1494 FRM_DYN. */
1495 enum floating_point_rounding_mode
1496 get_frm_mode (rtx operand)
1497 {
1498 gcc_assert (CONST_INT_P (operand));
1499
1500 switch (INTVAL (operand))
1501 {
1502 case FRM_RNE:
1503 return FRM_RNE;
1504 case FRM_RTZ:
1505 return FRM_RTZ;
1506 case FRM_RDN:
1507 return FRM_RDN;
1508 case FRM_RUP:
1509 return FRM_RUP;
1510 case FRM_RMM:
1511 return FRM_RMM;
1512 case FRM_DYN:
1513 return FRM_DYN;
1514 default:
1515 gcc_unreachable ();
1516 }
1517
1518 gcc_unreachable ();
1519 }
1520
1521 /* Expand a pre-RA RVV data move from SRC to DEST.
1522 It expands moves for RVV fractional vector modes.
1523 Return true if the move has already been emitted. */
1524 bool
1525 legitimize_move (rtx dest, rtx *srcp)
1526 {
1527 rtx src = *srcp;
1528 machine_mode mode = GET_MODE (dest);
1529 if (CONST_VECTOR_P (src))
1530 {
1531 expand_const_vector (dest, src);
1532 return true;
1533 }
1534
1535 if (riscv_v_ext_vls_mode_p (mode))
1536 {
1537 if (GET_MODE_NUNITS (mode).to_constant () <= 31)
1538 {
1539 /* For NUNITS <= 31 VLS modes, we don't need extra
1540 scalar registers, so we apply the naive (set (op0) (op1)) pattern. */
1541 if (can_create_pseudo_p ())
1542 {
1543 /* Need to force register if mem <- !reg. */
1544 if (MEM_P (dest) && !REG_P (src))
1545 *srcp = force_reg (mode, src);
1546
1547 return false;
1548 }
1549 }
1550 else if (GET_MODE_NUNITS (mode).to_constant () > 31 && lra_in_progress)
1551 {
1552 emit_insn (gen_mov_lra (mode, Pmode, dest, src));
1553 return true;
1554 }
1555 }
1556 else
1557 {
1558 /* In order to decrease the memory traffic, we don't use whole register
1559 * load/store for LMUL less than 1 and mask modes, so those cases will
1560 * require one extra general purpose register, but that's not allowed during
1561 * the LRA process, so we have a special move pattern used for LRA, which will
1562 * defer the expansion until after LRA. */
1563 if ((known_lt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
1564 || GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
1565 && lra_in_progress)
1566 {
1567 emit_insn (gen_mov_lra (mode, Pmode, dest, src));
1568 return true;
1569 }
1570
1571 if (known_ge (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
1572 && GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL)
1573 {
1574 /* Need to force register if mem <- !reg. */
1575 if (MEM_P (dest) && !REG_P (src))
1576 *srcp = force_reg (mode, src);
1577
1578 return false;
1579 }
1580 }
1581
1582 if (register_operand (src, mode) && register_operand (dest, mode))
1583 {
1584 emit_insn (gen_rtx_SET (dest, src));
1585 return true;
1586 }
1587
1588 unsigned insn_flags
1589 = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL ? UNARY_MASK_OP : UNARY_OP;
1590 if (!register_operand (src, mode) && !register_operand (dest, mode))
1591 {
1592 rtx tmp = gen_reg_rtx (mode);
1593 if (MEM_P (src))
1594 {
1595 rtx ops[] = {tmp, src};
1596 emit_vlmax_insn (code_for_pred_mov (mode), insn_flags, ops);
1597 }
1598 else
1599 emit_move_insn (tmp, src);
1600 src = tmp;
1601 }
1602
1603 if (satisfies_constraint_vu (src))
1604 return false;
1605
1606 rtx ops[] = {dest, src};
1607 emit_vlmax_insn (code_for_pred_mov (mode), insn_flags, ops);
1608 return true;
1609 }
1610
1611 /* VTYPE information for machine_mode. */
1612 struct mode_vtype_group
1613 {
1614 enum vlmul_type vlmul[NUM_MACHINE_MODES];
1615 uint8_t ratio[NUM_MACHINE_MODES];
1616 machine_mode subpart_mode[NUM_MACHINE_MODES];
1617 uint8_t nf[NUM_MACHINE_MODES];
1618 mode_vtype_group ()
1619 {
1620 #define ENTRY(MODE, REQUIREMENT, VLMUL, RATIO) \
1621 vlmul[MODE##mode] = VLMUL; \
1622 ratio[MODE##mode] = RATIO;
1623 #define TUPLE_ENTRY(MODE, REQUIREMENT, SUBPART_MODE, NF, VLMUL, RATIO) \
1624 subpart_mode[MODE##mode] = SUBPART_MODE##mode; \
1625 nf[MODE##mode] = NF; \
1626 vlmul[MODE##mode] = VLMUL; \
1627 ratio[MODE##mode] = RATIO;
1628 #include "riscv-vector-switch.def"
1629 #undef ENTRY
1630 #undef TUPLE_ENTRY
1631 }
1632 };
1633
1634 static mode_vtype_group mode_vtype_infos;
1635
1636 /* Get vlmul field value by comparing LMUL with BYTES_PER_RISCV_VECTOR. */
1637 enum vlmul_type
1638 get_vlmul (machine_mode mode)
1639 {
1640 /* For VLS modes, the vlmul should be dynamically
1641 calculated since we need to adjust VLMUL according
1642 to TARGET_MIN_VLEN. */
1643 if (riscv_v_ext_vls_mode_p (mode))
1644 {
1645 int size = GET_MODE_BITSIZE (mode).to_constant ();
1646 int inner_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
1647 if (size < TARGET_MIN_VLEN)
1648 {
1649 int factor = TARGET_MIN_VLEN / size;
1650 if (inner_size == 8)
1651 factor = MIN (factor, 8);
1652 else if (inner_size == 16)
1653 factor = MIN (factor, 4);
1654 else if (inner_size == 32)
1655 factor = MIN (factor, 2);
1656 else if (inner_size == 64)
1657 factor = MIN (factor, 1);
1658 else
1659 gcc_unreachable ();
1660
1661 switch (factor)
1662 {
1663 case 1:
1664 return LMUL_1;
1665 case 2:
1666 return LMUL_F2;
1667 case 4:
1668 return LMUL_F4;
1669 case 8:
1670 return LMUL_F8;
1671
1672 default:
1673 gcc_unreachable ();
1674 }
1675 }
1676 else
1677 {
1678 int factor = size / TARGET_MIN_VLEN;
1679 switch (factor)
1680 {
1681 case 1:
1682 return LMUL_1;
1683 case 2:
1684 return LMUL_2;
1685 case 4:
1686 return LMUL_4;
1687 case 8:
1688 return LMUL_8;
1689
1690 default:
1691 gcc_unreachable ();
1692 }
1693 }
1694 }
1695 return mode_vtype_infos.vlmul[mode];
1696 }
1697
1698 /* Return the VLMAX rtx of vector mode MODE. */
1699 rtx
1700 get_vlmax_rtx (machine_mode mode)
1701 {
1702 gcc_assert (riscv_v_ext_vector_mode_p (mode));
1703 return gen_int_mode (GET_MODE_NUNITS (mode), Pmode);
1704 }
1705
1706 /* Return the NF value of the corresponding mode. */
1707 unsigned int
1708 get_nf (machine_mode mode)
1709 {
1710 /* We don't allow non-tuple modes to go through this function. */
1711 gcc_assert (riscv_v_ext_tuple_mode_p (mode));
1712 return mode_vtype_infos.nf[mode];
1713 }
1714
1715 /* Return the subpart mode of the tuple mode. For RVVM2x2SImode,
1716 the subpart mode is RVVM2SImode. This will help to build
1717 array/struct type in builtins. */
1718 machine_mode
1719 get_subpart_mode (machine_mode mode)
1720 {
1721 /* We don't allow non-tuple modes to go through this function. */
1722 gcc_assert (riscv_v_ext_tuple_mode_p (mode));
1723 return mode_vtype_infos.subpart_mode[mode];
1724 }
1725
1726 /* Get ratio according to machine mode. */
1727 unsigned int
1728 get_ratio (machine_mode mode)
1729 {
1730 if (riscv_v_ext_vls_mode_p (mode))
1731 {
1732 unsigned int sew = get_sew (mode);
1733 vlmul_type vlmul = get_vlmul (mode);
1734 switch (vlmul)
1735 {
1736 case LMUL_1:
1737 return sew;
1738 case LMUL_2:
1739 return sew / 2;
1740 case LMUL_4:
1741 return sew / 4;
1742 case LMUL_8:
1743 return sew / 8;
1744 case LMUL_F8:
1745 return sew * 8;
1746 case LMUL_F4:
1747 return sew * 4;
1748 case LMUL_F2:
1749 return sew * 2;
1750
1751 default:
1752 gcc_unreachable ();
1753 }
1754 }
1755 return mode_vtype_infos.ratio[mode];
1756 }
1757
1758 /* Get ta according to operand[tail_op_idx]. */
1759 int
1760 get_ta (rtx ta)
1761 {
1762 if (INTVAL (ta) == TAIL_ANY)
1763 return INVALID_ATTRIBUTE;
1764 return INTVAL (ta);
1765 }
1766
1767 /* Get ma according to operand[mask_op_idx]. */
1768 int
1769 get_ma (rtx ma)
1770 {
1771 if (INTVAL (ma) == MASK_ANY)
1772 return INVALID_ATTRIBUTE;
1773 return INTVAL (ma);
1774 }
1775
1776 /* Get prefer tail policy. */
1777 enum tail_policy
1778 get_prefer_tail_policy ()
1779 {
1780 /* TODO: By default, we choose TAIL_ANY, which allows the
1781 compiler to pick either agnostic or undisturbed.  Maybe we
1782 will add a compile option like -mprefer=agnostic to set
1783 this value in the future. */
1784 return TAIL_ANY;
1785 }
1786
1787 /* Get prefer mask policy. */
1788 enum mask_policy
1789 get_prefer_mask_policy ()
1790 {
1791 /* TODO: By default, we choose MASK_ANY, which allows the
1792 compiler to pick either agnostic or undisturbed.  Maybe we
1793 will add a compile option like -mprefer=agnostic to set
1794 this value in the future. */
1795 return MASK_ANY;
1796 }
1797
1798 /* Get avl_type rtx. */
1799 rtx
1800 get_avl_type_rtx (enum avl_type type)
1801 {
1802 return gen_int_mode (type, Pmode);
1803 }
1804
1805 /* Return the appropriate mask mode for MODE. */
1806
1807 machine_mode
1808 get_mask_mode (machine_mode mode)
1809 {
1810 poly_int64 nunits = GET_MODE_NUNITS (mode);
1811 if (riscv_v_ext_tuple_mode_p (mode))
1812 {
1813 unsigned int nf = get_nf (mode);
1814 nunits = exact_div (nunits, nf);
1815 }
1816 return get_vector_mode (BImode, nunits).require ();
1817 }
1818
1819 /* Return the appropriate M1 mode for MODE. */
1820
1821 static opt_machine_mode
1822 get_m1_mode (machine_mode mode)
1823 {
1824 scalar_mode smode = GET_MODE_INNER (mode);
1825 unsigned int bytes = GET_MODE_SIZE (smode);
1826 poly_uint64 m1_nunits = exact_div (BYTES_PER_RISCV_VECTOR, bytes);
1827 return get_vector_mode (smode, m1_nunits);
1828 }
1829
1830 /* Return the RVV vector mode that has NUNITS elements of mode INNER_MODE.
1831 This function is not only used by builtins, but also will be used by
1832 auto-vectorization in the future. */
1833 opt_machine_mode
1834 get_vector_mode (scalar_mode inner_mode, poly_uint64 nunits)
1835 {
1836 enum mode_class mclass;
1837 if (inner_mode == E_BImode)
1838 mclass = MODE_VECTOR_BOOL;
1839 else if (FLOAT_MODE_P (inner_mode))
1840 mclass = MODE_VECTOR_FLOAT;
1841 else
1842 mclass = MODE_VECTOR_INT;
1843 machine_mode mode;
1844 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1845 if (inner_mode == GET_MODE_INNER (mode)
1846 && known_eq (nunits, GET_MODE_NUNITS (mode))
1847 && (riscv_v_ext_vector_mode_p (mode)
1848 || riscv_v_ext_vls_mode_p (mode)))
1849 return mode;
1850 return opt_machine_mode ();
1851 }
1852
1853 /* Return the RVV tuple mode if we can find the legal tuple mode for the
1854 corresponding subpart mode and NF. */
1855 opt_machine_mode
1856 get_tuple_mode (machine_mode subpart_mode, unsigned int nf)
1857 {
1858 poly_uint64 nunits = GET_MODE_NUNITS (subpart_mode) * nf;
1859 scalar_mode inner_mode = GET_MODE_INNER (subpart_mode);
1860 enum mode_class mclass = GET_MODE_CLASS (subpart_mode);
1861 machine_mode mode;
1862 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1863 if (inner_mode == GET_MODE_INNER (mode)
1864 && known_eq (nunits, GET_MODE_NUNITS (mode))
1865 && riscv_v_ext_tuple_mode_p (mode)
1866 && get_subpart_mode (mode) == subpart_mode)
1867 return mode;
1868 return opt_machine_mode ();
1869 }
1870
1871 bool
1872 simm5_p (rtx x)
1873 {
1874 if (!CONST_INT_P (x))
1875 return false;
1876 return IN_RANGE (INTVAL (x), -16, 15);
1877 }
1878
1879 bool
1880 neg_simm5_p (rtx x)
1881 {
1882 if (!CONST_INT_P (x))
1883 return false;
1884 return IN_RANGE (INTVAL (x), -15, 16);
1885 }
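
/* neg_simm5_p accepts X when its negation fits in a signed 5-bit
   immediate: if -X must lie in [-16, 15], then X must lie in [-15, 16].
   has_vi_variant_p below relies on this for codes such as MINUS, whose
   .vi form is obtained from an adjusted (e.g. negated) constant.  */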
1886
1887 bool
1888 has_vi_variant_p (rtx_code code, rtx x)
1889 {
1890 switch (code)
1891 {
1892 case PLUS:
1893 case AND:
1894 case IOR:
1895 case XOR:
1896 case SS_PLUS:
1897 case US_PLUS:
1898 case EQ:
1899 case NE:
1900 case LE:
1901 case LEU:
1902 case GT:
1903 case GTU:
1904 return simm5_p (x);
1905
1906 case LT:
1907 case LTU:
1908 case GE:
1909 case GEU:
1910 case MINUS:
1911 case SS_MINUS:
1912 return neg_simm5_p (x);
1913
1914 default:
1915 return false;
1916 }
1917 }
1918
1919 bool
1920 sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
1921 machine_mode vector_mode, bool has_vi_variant_p,
1922 void (*emit_vector_func) (rtx *, rtx), enum avl_type type)
1923 {
1924 machine_mode scalar_mode = GET_MODE_INNER (vector_mode);
1925 if (has_vi_variant_p)
1926 {
1927 *scalar_op = force_reg (scalar_mode, *scalar_op);
1928 return false;
1929 }
1930
1931 if (TARGET_64BIT)
1932 {
1933 if (!rtx_equal_p (*scalar_op, const0_rtx))
1934 *scalar_op = force_reg (scalar_mode, *scalar_op);
1935 return false;
1936 }
1937
1938 if (immediate_operand (*scalar_op, Pmode))
1939 {
1940 if (!rtx_equal_p (*scalar_op, const0_rtx))
1941 *scalar_op = force_reg (Pmode, *scalar_op);
1942
1943 *scalar_op = gen_rtx_SIGN_EXTEND (scalar_mode, *scalar_op);
1944 return false;
1945 }
1946
1947 if (CONST_INT_P (*scalar_op))
1948 {
1949 if (maybe_gt (GET_MODE_SIZE (scalar_mode), GET_MODE_SIZE (Pmode)))
1950 *scalar_op = force_const_mem (scalar_mode, *scalar_op);
1951 else
1952 *scalar_op = force_reg (scalar_mode, *scalar_op);
1953 }
1954
1955 rtx tmp = gen_reg_rtx (vector_mode);
1956 rtx ops[] = {tmp, *scalar_op};
1957 if (type == VLMAX)
1958 emit_vlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops);
1959 else
1960 emit_nonvlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops,
1961 vl);
1962 emit_vector_func (operands, tmp);
1963
1964 return true;
1965 }
1966
1967 /* Get the scalar-move mask { 1, 0, 0, ..., 0 }, i.e. only the first element set. */
1968 rtx
1969 gen_scalar_move_mask (machine_mode mode)
1970 {
1971 rtx_vector_builder builder (mode, 1, 2);
1972 builder.quick_push (const1_rtx);
1973 builder.quick_push (const0_rtx);
1974 return builder.build ();
1975 }
1976
1977 static unsigned
1978 compute_vlmax (unsigned vector_bits, unsigned elt_size, unsigned min_size)
1979 {
1980 // Original equation:
1981 // VLMAX = (VectorBits / EltSize) * LMUL
1982 // where LMUL = MinSize / TARGET_MIN_VLEN
1983 // The following equations have been reordered to prevent loss of precision
1984 // when calculating fractional LMUL.
1985 return ((vector_bits / elt_size) * min_size) / TARGET_MIN_VLEN;
1986 }
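
/* As a purely illustrative example of the reordering above: with
   vector_bits = 256, elt_size = 32, min_size = 64 and TARGET_MIN_VLEN = 128
   we get ((256 / 32) * 64) / 128 = 4, whereas evaluating
   LMUL = 64 / 128 first would truncate to 0 in integer arithmetic and
   lose the fractional LMUL entirely.  */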
1987
1988 static unsigned
1989 get_unknown_min_value (machine_mode mode)
1990 {
1991 enum vlmul_type vlmul = get_vlmul (mode);
1992 switch (vlmul)
1993 {
1994 case LMUL_1:
1995 return TARGET_MIN_VLEN;
1996 case LMUL_2:
1997 return TARGET_MIN_VLEN * 2;
1998 case LMUL_4:
1999 return TARGET_MIN_VLEN * 4;
2000 case LMUL_8:
2001 return TARGET_MIN_VLEN * 8;
2002 default:
2003 gcc_unreachable ();
2004 }
2005 }
2006
2007 static rtx
2008 force_vector_length_operand (rtx vl)
2009 {
2010 if (CONST_INT_P (vl) && !satisfies_constraint_K (vl))
2011 return force_reg (Pmode, vl);
2012 return vl;
2013 }
2014
2015 rtx
2016 gen_no_side_effects_vsetvl_rtx (machine_mode vmode, rtx vl, rtx avl)
2017 {
2018 unsigned int sew = get_sew (vmode);
2019 rtx tail_policy = gen_int_mode (get_prefer_tail_policy (), Pmode);
2020 rtx mask_policy = gen_int_mode (get_prefer_mask_policy (), Pmode);
2021 return gen_vsetvl_no_side_effects (Pmode, vl, avl, gen_int_mode (sew, Pmode),
2022 gen_int_mode (get_vlmul (vmode), Pmode),
2023 tail_policy, mask_policy);
2024 }
2025
2026 /* Get the VL * 2 rtx. */
2027 static rtx
2028 get_vl_x2_rtx (rtx avl, machine_mode mode, machine_mode demote_mode)
2029 {
2030 rtx i32vl = NULL_RTX;
2031 if (CONST_INT_P (avl))
2032 {
2033 unsigned elt_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
2034 unsigned min_size = get_unknown_min_value (mode);
2035 unsigned vlen_max = RVV_65536;
2036 unsigned vlmax_max = compute_vlmax (vlen_max, elt_size, min_size);
2037 unsigned vlen_min = TARGET_MIN_VLEN;
2038 unsigned vlmax_min = compute_vlmax (vlen_min, elt_size, min_size);
2039
2040 unsigned HOST_WIDE_INT avl_int = INTVAL (avl);
2041 if (avl_int <= vlmax_min)
2042 i32vl = gen_int_mode (2 * avl_int, Pmode);
2043 else if (avl_int >= 2 * vlmax_max)
2044 {
2045 // Just set i32vl to VLMAX in this situation
2046 i32vl = gen_reg_rtx (Pmode);
2047 emit_insn (
2048 gen_no_side_effects_vsetvl_rtx (demote_mode, i32vl, RVV_VLMAX));
2049 }
2050 else
2051 {
2052 // For AVL between (MinVLMAX, 2 * MaxVLMAX), the actual working vl
2053 // is related to the hardware implementation.
2054 // So let the following code handle it.
2055 }
2056 }
2057 if (!i32vl)
2058 {
2059 // Use a vsetvli instruction to get the actually used length, which depends
2060 // on the hardware implementation.
2061 rtx i64vl = gen_reg_rtx (Pmode);
2062 emit_insn (
2063 gen_no_side_effects_vsetvl_rtx (mode, i64vl, force_reg (Pmode, avl)));
2064 // Multiply by 2 to get the VL for 32-bit elements.
2065 i32vl = gen_reg_rtx (Pmode);
2066 emit_insn (
2067 gen_rtx_SET (i32vl, gen_rtx_ASHIFT (Pmode, i64vl, const1_rtx)));
2068 }
2069
2070 return force_vector_length_operand (i32vl);
2071 }
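
/* An illustration of the constant-AVL cases above (the numbers are
   assumptions): with TARGET_MIN_VLEN = 128, a SEW = 64, LMUL = 1 mode has
   vlmax_min = 2 and, with the architectural maximum VLEN of 65536,
   vlmax_max = 1024.  An AVL of 2 therefore yields i32vl = 4 directly, an
   AVL of 2048 or more uses the demoted-mode VLMAX, and anything in
   between falls back to the vsetvli + shift sequence.  */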
2072
2073 bool
2074 slide1_sew64_helper (int unspec, machine_mode mode, machine_mode demote_mode,
2075 machine_mode demote_mask_mode, rtx *ops)
2076 {
2077 rtx scalar_op = ops[4];
2078 rtx avl = ops[5];
2079 machine_mode scalar_mode = GET_MODE_INNER (mode);
2080 if (rtx_equal_p (scalar_op, const0_rtx))
2081 {
2082 ops[5] = force_vector_length_operand (ops[5]);
2083 return false;
2084 }
2085
2086 if (TARGET_64BIT)
2087 {
2088 ops[4] = force_reg (scalar_mode, scalar_op);
2089 ops[5] = force_vector_length_operand (ops[5]);
2090 return false;
2091 }
2092
2093 if (immediate_operand (scalar_op, Pmode))
2094 {
2095 ops[4] = gen_rtx_SIGN_EXTEND (scalar_mode, force_reg (Pmode, scalar_op));
2096 ops[5] = force_vector_length_operand (ops[5]);
2097 return false;
2098 }
2099
2100 if (CONST_INT_P (scalar_op))
2101 scalar_op = force_reg (scalar_mode, scalar_op);
2102
2103 rtx vl_x2 = get_vl_x2_rtx (avl, mode, demote_mode);
2104
2105 rtx demote_scalar_op1, demote_scalar_op2;
2106 if (unspec == UNSPEC_VSLIDE1UP)
2107 {
2108 demote_scalar_op1 = gen_highpart (Pmode, scalar_op);
2109 demote_scalar_op2 = gen_lowpart (Pmode, scalar_op);
2110 }
2111 else
2112 {
2113 demote_scalar_op1 = gen_lowpart (Pmode, scalar_op);
2114 demote_scalar_op2 = gen_highpart (Pmode, scalar_op);
2115 }
2116
2117 rtx temp = gen_reg_rtx (demote_mode);
2118 rtx ta = gen_int_mode (get_prefer_tail_policy (), Pmode);
2119 rtx ma = gen_int_mode (get_prefer_mask_policy (), Pmode);
2120 rtx merge = RVV_VUNDEF (demote_mode);
2121 /* Handle vslide1<ud>_tu. */
2122 if (register_operand (ops[2], mode)
2123 && rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1]))))
2124 {
2125 merge = gen_lowpart (demote_mode, ops[2]);
2126 ta = ops[6];
2127 ma = ops[7];
2128 }
2129
2130 emit_insn (gen_pred_slide (unspec, demote_mode, temp,
2131 CONSTM1_RTX (demote_mask_mode), merge,
2132 gen_lowpart (demote_mode, ops[3]),
2133 demote_scalar_op1, vl_x2, ta, ma, ops[8]));
2134 emit_insn (gen_pred_slide (unspec, demote_mode,
2135 gen_lowpart (demote_mode, ops[0]),
2136 CONSTM1_RTX (demote_mask_mode), merge, temp,
2137 demote_scalar_op2, vl_x2, ta, ma, ops[8]));
2138
2139 if (!rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1])))
2140 && !rtx_equal_p (ops[2], RVV_VUNDEF (GET_MODE (ops[2]))))
2141 emit_insn (gen_pred_merge (mode, ops[0], ops[2], ops[2], ops[0], ops[1],
2142 force_vector_length_operand (ops[5]), ops[6],
2143 ops[8]));
2144 return true;
2145 }
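
/* The two gen_pred_slide calls above implement a 64-bit vslide1up/down on
   RV32 by working in the demoted 32-bit element mode: the first slide
   inserts one half of the scalar, the second inserts the other half, and
   the VL is doubled so the same number of 64-bit elements is covered.
   Roughly, a vslide1up.vx with a 64-bit scalar becomes:

     vsetvli ...                      # 2 * VL, SEW = 32
     vslide1up.vx vtmp, vsrc, hi32
     vslide1up.vx vdest, vtmp, lo32

   where hi32/lo32 denote the two GPR halves of the scalar.  */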
2146
2147 rtx
2148 gen_avl_for_scalar_move (rtx avl)
2149 {
2150 /* AVL for a scalar move behaves differently for 0 and for values larger than 0. */
2151 if (CONST_INT_P (avl))
2152 {
2153 /* So we could just set AVL to 1 for any constant other than 0. */
2154 if (rtx_equal_p (avl, const0_rtx))
2155 return const0_rtx;
2156 else
2157 return const1_rtx;
2158 }
2159 else
2160 {
2161 /* For a non-constant value, we set any non-zero value to 1 by
2162 `sgtu new_avl,input_avl,zero` + `vsetvli`. */
2163 rtx tmp = gen_reg_rtx (Pmode);
2164 emit_insn (
2165 gen_rtx_SET (tmp, gen_rtx_fmt_ee (GTU, Pmode, avl, const0_rtx)));
2166 return tmp;
2167 }
2168 }
2169
2170 /* Expand data movement for tuple modes. */
2171 void
2172 expand_tuple_move (rtx *ops)
2173 {
2174 unsigned int i;
2175 machine_mode tuple_mode = GET_MODE (ops[0]);
2176 machine_mode subpart_mode = get_subpart_mode (tuple_mode);
2177 poly_int64 subpart_size = GET_MODE_SIZE (subpart_mode);
2178 unsigned int nf = get_nf (tuple_mode);
2179 bool fractional_p = known_lt (subpart_size, BYTES_PER_RISCV_VECTOR);
2180
2181 if (REG_P (ops[0]) && CONST_VECTOR_P (ops[1]))
2182 {
2183 rtx val;
2184 gcc_assert (can_create_pseudo_p ()
2185 && const_vec_duplicate_p (ops[1], &val));
2186 for (i = 0; i < nf; ++i)
2187 {
2188 poly_int64 offset = i * subpart_size;
2189 rtx subreg
2190 = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
2191 rtx dup = gen_const_vec_duplicate (subpart_mode, val);
2192 emit_move_insn (subreg, dup);
2193 }
2194 }
2195 else if (REG_P (ops[0]) && REG_P (ops[1]))
2196 {
2197 for (i = 0; i < nf; ++i)
2198 {
2199 int index = i;
2200
2201 /* Take NF = 2 and LMUL = 1 for example:
2202
2203 - move v8 to v9:
2204 vmv1r v10,v9
2205 vmv1r v9,v8
2206
2207 - move v8 to v7:
2208 vmv1r v7,v8
2209 vmv1r v8,v9 */
2210 if (REGNO (ops[0]) > REGNO (ops[1]))
2211 index = nf - 1 - i;
2212 poly_int64 offset = index * subpart_size;
2213 rtx dst_subreg
2214 = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
2215 rtx src_subreg
2216 = simplify_gen_subreg (subpart_mode, ops[1], tuple_mode, offset);
2217 emit_insn (gen_rtx_SET (dst_subreg, src_subreg));
2218 }
2219 }
2220 else
2221 {
2222 /* Expand tuple memory data movement. */
2223 gcc_assert (MEM_P (ops[0]) || MEM_P (ops[1]));
2224 rtx offset = gen_int_mode (subpart_size, Pmode);
2225 if (!subpart_size.is_constant ())
2226 {
2227 emit_move_insn (ops[2], gen_int_mode (BYTES_PER_RISCV_VECTOR, Pmode));
2228 if (fractional_p)
2229 {
2230 unsigned int factor
2231 = exact_div (BYTES_PER_RISCV_VECTOR, subpart_size)
2232 .to_constant ();
2233 rtx pat
2234 = gen_rtx_ASHIFTRT (Pmode, ops[2],
2235 gen_int_mode (exact_log2 (factor), Pmode));
2236 emit_insn (gen_rtx_SET (ops[2], pat));
2237 }
2238
2239 if (known_gt (subpart_size, BYTES_PER_RISCV_VECTOR))
2240 {
2241 unsigned int factor
2242 = exact_div (subpart_size, BYTES_PER_RISCV_VECTOR)
2243 .to_constant ();
2244 rtx pat
2245 = gen_rtx_ASHIFT (Pmode, ops[2],
2246 gen_int_mode (exact_log2 (factor), Pmode));
2247 emit_insn (gen_rtx_SET (ops[2], pat));
2248 }
2249 offset = ops[2];
2250 }
2251
2252 /* Non-fractional LMUL has whole register moves that don't require a
2253 vsetvl for VLMAX. */
2254 if (fractional_p)
2255 emit_vlmax_vsetvl (subpart_mode, ops[4]);
2256 if (MEM_P (ops[1]))
2257 {
2258 /* Load operations. */
2259 emit_move_insn (ops[3], XEXP (ops[1], 0));
2260 for (i = 0; i < nf; i++)
2261 {
2262 rtx subreg = simplify_gen_subreg (subpart_mode, ops[0],
2263 tuple_mode, i * subpart_size);
2264 if (i != 0)
2265 {
2266 rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
2267 emit_insn (gen_rtx_SET (ops[3], new_addr));
2268 }
2269 rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);
2270
2271 if (fractional_p)
2272 {
2273 rtx operands[] = {subreg, mem};
2274 emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode),
2275 UNARY_OP, operands, ops[4]);
2276 }
2277 else
2278 emit_move_insn (subreg, mem);
2279 }
2280 }
2281 else
2282 {
2283 /* Store operations. */
2284 emit_move_insn (ops[3], XEXP (ops[0], 0));
2285 for (i = 0; i < nf; i++)
2286 {
2287 rtx subreg = simplify_gen_subreg (subpart_mode, ops[1],
2288 tuple_mode, i * subpart_size);
2289 if (i != 0)
2290 {
2291 rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
2292 emit_insn (gen_rtx_SET (ops[3], new_addr));
2293 }
2294 rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);
2295
2296 if (fractional_p)
2297 {
2298 rtx operands[] = {mem, subreg};
2299 emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode),
2300 UNARY_OP, operands, ops[4]);
2301 }
2302 else
2303 emit_move_insn (mem, subreg);
2304 }
2305 }
2306 }
2307 }
2308
2309 /* Return the vectorization machine mode for RVV according to LMUL. */
2310 machine_mode
2311 preferred_simd_mode (scalar_mode mode)
2312 {
2313 if (autovec_use_vlmax_p ())
2314 {
2315 /* We use LMUL = 1 as the base byte size, which is BYTES_PER_RISCV_VECTOR,
2316 and riscv_autovec_lmul as the multiply factor to calculate the NUNITS of
2317 the auto-vectorization mode. */
2318 poly_uint64 nunits;
2319 poly_uint64 vector_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;
2320 poly_uint64 scalar_size = GET_MODE_SIZE (mode);
2321 /* Disable vectorization when we can't find a RVV mode for it.
2322 E.g. -march=rv64gc_zve32x doesn't have a vector mode to vectorize
2323 a double (DFmode) type. */
2324 if (!multiple_p (vector_size, scalar_size, &nunits))
2325 return word_mode;
2326 machine_mode rvv_mode;
2327 if (get_vector_mode (mode, nunits).exists (&rvv_mode))
2328 return rvv_mode;
2329 }
2330 return word_mode;
2331 }
2332
2333 /* Subroutine of riscv_vector_expand_vector_init.
2334 Works as follows:
2335 (a) Initialize TARGET by broadcasting element 0 of BUILDER.
2336 (b) Skip the leading elements of BUILDER that duplicate element 0.
2337 (c) Insert the remaining elements into TARGET in order using
2338 vslide1down. */
2339
2340 static void
2341 expand_vector_init_insert_elems (rtx target, const rvv_builder &builder,
2342 int nelts_reqd)
2343 {
2344 machine_mode mode = GET_MODE (target);
2345 rtx dup = expand_vector_broadcast (mode, builder.elt (0));
2346 emit_move_insn (target, dup);
2347 int ndups = builder.count_dups (0, nelts_reqd - 1, 1);
2348 for (int i = ndups; i < nelts_reqd; i++)
2349 {
2350 unsigned int unspec
2351 = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1DOWN : UNSPEC_VSLIDE1DOWN;
2352 insn_code icode = code_for_pred_slide (unspec, mode);
2353 rtx ops[] = {target, target, builder.elt (i)};
2354 emit_vlmax_insn (icode, BINARY_OP, ops);
2355 }
2356 }
2357
2358 /* Use merge approach to initialize the vector with repeating sequence.
2359 v = {a, b, a, b, a, b, a, b}.
2360
2361 v = broadcast (a).
2362 mask = 0b01010101....
2363 v = merge (v, b, mask)
2364 */
2365 static void
2366 expand_vector_init_merge_repeating_sequence (rtx target,
2367 const rvv_builder &builder)
2368 {
2369 /* We can't use the BIT mode (BI) directly to generate the mask = 0b01010...
2370 since we don't have such an instruction in RVV.
2371 Instead, we use an INT mode (QI/HI/SI/DI) with an integer move
2372 instruction to generate the mask data we want. */
2373 machine_mode mask_bit_mode = get_mask_mode (builder.mode ());
2374 machine_mode mask_int_mode
2375 = get_repeating_sequence_dup_machine_mode (builder, mask_bit_mode);
2376 uint64_t full_nelts = builder.full_nelts ().to_constant ();
2377
2378 /* Step 1: Broadcast the first pattern. */
2379 rtx ops[] = {target, force_reg (builder.inner_mode (), builder.elt (0))};
2380 emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()),
2381 UNARY_OP, ops);
2382 /* Step 2: Merge the rest iteration of pattern. */
2383 for (unsigned int i = 1; i < builder.npatterns (); i++)
2384 {
2385 /* Step 2-1: Generate mask register v0 for each merge. */
2386 rtx merge_mask
2387 = builder.get_merge_scalar_mask (i, GET_MODE_INNER (mask_int_mode));
2388 rtx mask = gen_reg_rtx (mask_bit_mode);
2389 rtx dup = gen_reg_rtx (mask_int_mode);
2390
2391 if (full_nelts <= builder.inner_bits_size ()) /* vmv.s.x. */
2392 {
2393 rtx ops[] = {dup, merge_mask};
2394 emit_nonvlmax_insn (code_for_pred_broadcast (GET_MODE (dup)),
2395 SCALAR_MOVE_OP, ops, CONST1_RTX (Pmode));
2396 }
2397 else /* vmv.v.x. */
2398 {
2399 rtx ops[] = {dup,
2400 force_reg (GET_MODE_INNER (mask_int_mode), merge_mask)};
2401 rtx vl = gen_int_mode (CEIL (full_nelts, builder.inner_bits_size ()),
2402 Pmode);
2403 emit_nonvlmax_insn (code_for_pred_broadcast (mask_int_mode), UNARY_OP,
2404 ops, vl);
2405 }
2406
2407 emit_move_insn (mask, gen_lowpart (mask_bit_mode, dup));
2408
2409 /* Step 2-2: Merge pattern according to the mask. */
2410 rtx ops[] = {target, target, builder.elt (i), mask};
2411 emit_vlmax_insn (code_for_pred_merge_scalar (GET_MODE (target)),
2412 MERGE_OP, ops);
2413 }
2414 }
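
/* A concrete illustration of the merge approach above (sizes are
   assumptions): for v = { a, b, a, b, a, b, a, b } with 8 elements the
   loop runs once, for pattern i = 1, building a scalar merge mask with one
   bit per element, here 0b10101010 (set where the element should come
   from b).  That scalar is broadcast into an integer vector, reinterpreted
   as the mask { 0, 1, 0, 1, 0, 1, 0, 1 }, and the final merge then
   replaces the odd elements of broadcast (a) with b.  */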
2415
2416 /* Use slideup approach to combine the vectors.
2417 v = {a, a, a, a, b, b, b, b}
2418
2419 First:
2420 v1 = {a, a, a, a, a, a, a, a}
2421 v2 = {b, b, b, b, b, b, b, b}
2422 v = slideup (v1, v2, nelt / 2)
2423 */
2424 static void
2425 expand_vector_init_slideup_combine_sequence (rtx target,
2426 const rvv_builder &builder)
2427 {
2428 machine_mode mode = GET_MODE (target);
2429 int nelts = builder.full_nelts ().to_constant ();
2430 rtx first_elt = builder.elt (0);
2431 rtx last_elt = builder.elt (nelts - 1);
2432 rtx low = expand_vector_broadcast (mode, first_elt);
2433 rtx high = expand_vector_broadcast (mode, last_elt);
2434 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, mode);
2435 rtx ops[] = {target, low, high, gen_int_mode (nelts / 2, Pmode)};
2436 emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
2437 }
2438
2439 /* Use merge approach to merge a scalar into a vector.
2440 v = {a, a, a, a, a, a, b, b}
2441
2442 v1 = {a, a, a, a, a, a, a, a}
2443 scalar = b
2444 mask = {0, 0, 0, 0, 0, 0, 1, 1}
2445 */
2446 static void
2447 expand_vector_init_merge_combine_sequence (rtx target,
2448 const rvv_builder &builder)
2449 {
2450 machine_mode mode = GET_MODE (target);
2451 machine_mode imode = builder.int_mode ();
2452 machine_mode mmode = builder.mask_mode ();
2453 int nelts = builder.full_nelts ().to_constant ();
2454 int leading_ndups = builder.count_dups (0, nelts - 1, 1);
2455 if ((leading_ndups > 255 && GET_MODE_INNER (imode) == QImode)
2456 || riscv_get_v_regno_alignment (imode) > 1)
2457 imode = get_vector_mode (HImode, nelts).require ();
2458
2459 /* Generate vid = { 0, 1, 2, ..., n }. */
2460 rtx vid = gen_reg_rtx (imode);
2461 expand_vec_series (vid, const0_rtx, const1_rtx);
2462
2463 /* Generate mask. */
2464 rtx mask = gen_reg_rtx (mmode);
2465 insn_code icode = code_for_pred_cmp_scalar (imode);
2466 rtx index = gen_int_mode (leading_ndups - 1, builder.inner_int_mode ());
2467 rtx dup_rtx = gen_rtx_VEC_DUPLICATE (imode, index);
2468 /* vmsgtu.vi/vmsgtu.vx. */
2469 rtx cmp = gen_rtx_fmt_ee (GTU, mmode, vid, dup_rtx);
2470 rtx sel = builder.elt (nelts - 1);
2471 rtx mask_ops[] = {mask, cmp, vid, index};
2472 emit_vlmax_insn (icode, COMPARE_OP, mask_ops);
2473
2474 /* Duplicate the first elements. */
2475 rtx dup = expand_vector_broadcast (mode, builder.elt (0));
2476 /* Merge scalar into vector according to mask. */
2477 rtx merge_ops[] = {target, dup, sel, mask};
2478 icode = code_for_pred_merge_scalar (mode);
2479 emit_vlmax_insn (icode, MERGE_OP, merge_ops);
2480 }
2481
2482 /* Subroutine of expand_vec_init to handle case
2483 when all trailing elements of builder are same.
2484 This works as follows:
2485 (a) Use expand_insn interface to broadcast last vector element in TARGET.
2486 (b) Insert the remaining elements into TARGET using vslide1up.
2487
2488 ??? The heuristic used is to do the above if the number of identical
2489 trailing elements is greater than LEADING_NDUPS, loosely based on the
2490 heuristic from mostly_zeros_p.  May need fine-tuning. */
2491
2492 static bool
2493 expand_vector_init_trailing_same_elem (rtx target,
2494 const rtx_vector_builder &builder,
2495 int nelts_reqd)
2496 {
2497 int leading_ndups = builder.count_dups (0, nelts_reqd - 1, 1);
2498 int trailing_ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
2499 machine_mode mode = GET_MODE (target);
2500
2501 if (trailing_ndups > leading_ndups)
2502 {
2503 rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1));
2504 for (int i = nelts_reqd - trailing_ndups - 1; i >= 0; i--)
2505 {
2506 unsigned int unspec
2507 = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
2508 insn_code icode = code_for_pred_slide (unspec, mode);
2509 rtx tmp = gen_reg_rtx (mode);
2510 rtx ops[] = {tmp, dup, builder.elt (i)};
2511 emit_vlmax_insn (icode, BINARY_OP, ops);
2512 /* slide1up needs source and dest to be different REGs. */
2513 dup = tmp;
2514 }
2515
2516 emit_move_insn (target, dup);
2517 return true;
2518 }
2519
2520 return false;
2521 }
2522
2523 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
2524
2525 void
2526 expand_vec_init (rtx target, rtx vals)
2527 {
2528 machine_mode mode = GET_MODE (target);
2529 int nelts = XVECLEN (vals, 0);
2530
2531 rvv_builder v (mode, nelts, 1);
2532 for (int i = 0; i < nelts; i++)
2533 v.quick_push (XVECEXP (vals, 0, i));
2534 v.finalize ();
2535
2536 if (nelts > 3)
2537 {
2538 /* Case 1: Convert v = { a, b, a, b } into v = { ab, ab }. */
2539 if (v.can_duplicate_repeating_sequence_p ())
2540 {
2541 rtx ele = v.get_merged_repeating_sequence ();
2542 rtx dup = expand_vector_broadcast (v.new_mode (), ele);
2543 emit_move_insn (target, gen_lowpart (mode, dup));
2544 return;
2545 }
2546
2547 /* Case 2: Optimize repeating sequence cases that Case 1 cannot
2548 handle, when it is profitable.  For example:
2549 ELEMENT BITSIZE = 64.
2550 v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}.
2551 We can't find a vector mode for "ab" which will be combined into
2552 128-bit element to duplicate. */
2553 if (v.repeating_sequence_use_merge_profitable_p ())
2554 {
2555 expand_vector_init_merge_repeating_sequence (target, v);
2556 return;
2557 }
2558
2559 /* Case 3: Optimize combine sequence.
2560 E.g. v = {a, a, a, a, a, a, a, a, b, b, b, b, b, b, b, b}.
2561 We can combine:
2562 v1 = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}.
2563 and
2564 v2 = {b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b}.
2565 by slideup. */
2566 if (v.combine_sequence_use_slideup_profitable_p ())
2567 {
2568 expand_vector_init_slideup_combine_sequence (target, v);
2569 return;
2570 }
2571
2572 /* Case 4: Optimize combine sequence.
2573 E.g. v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}.
2574
2575 Generate vector:
2576 v = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}.
2577
2578 Generate mask:
2579 mask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}.
2580
2581 Merge b into v by mask:
2582 v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}. */
2583 if (v.combine_sequence_use_merge_profitable_p ())
2584 {
2585 expand_vector_init_merge_combine_sequence (target, v);
2586 return;
2587 }
2588 }
2589
2590 /* Optimize trailing same elements sequence:
2591 v = {y, y2, y3, y4, y5, x, x, x, x, x, x, x, x, x, x, x}; */
2592 if (!expand_vector_init_trailing_same_elem (target, v, nelts))
2593 /* Handle common situation by vslide1down. This function can handle any
2594 situation of vec_init<mode>. Only the cases that are not optimized above
2595 will fall through here. */
2596 expand_vector_init_insert_elems (target, v, nelts);
2597 }
2598
2599 /* Get insn code for corresponding comparison. */
2600
2601 static insn_code
2602 get_cmp_insn_code (rtx_code code, machine_mode mode)
2603 {
2604 insn_code icode;
2605 switch (code)
2606 {
2607 case EQ:
2608 case NE:
2609 case LE:
2610 case LEU:
2611 case GT:
2612 case GTU:
2613 case LTGT:
2614 icode = code_for_pred_cmp (mode);
2615 break;
2616 case LT:
2617 case LTU:
2618 case GE:
2619 case GEU:
2620 if (FLOAT_MODE_P (mode))
2621 icode = code_for_pred_cmp (mode);
2622 else
2623 icode = code_for_pred_ltge (mode);
2624 break;
2625 default:
2626 gcc_unreachable ();
2627 }
2628 return icode;
2629 }
2630
2631 /* This hook gives the vectorizer more vector mode options. We want it to not
2632 only try modes with the maximum number of units a full vector can hold but
2633 for example also half the number of units for a smaller element size.
2634 Such vectors can be promoted to a full vector of widened elements
2635 (still with the same number of elements, essentially vectorizing at a
2636 fixed number of units rather than a fixed number of bytes). */
2637 unsigned int
2638 autovectorize_vector_modes (vector_modes *modes, bool)
2639 {
2640 if (autovec_use_vlmax_p ())
2641 {
2642 poly_uint64 full_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;
2643
2644 /* Start with a RVV<LMUL>QImode where LMUL is the number of units that
2645 fit a whole vector.
2646 Then try LMUL = nunits / 2, nunits / 4 and nunits / 8 which
2647 is guided by the extensions we have available (vf2, vf4 and vf8).
2648
2649 - full_size: Try using full vectors for all element types.
2650 - full_size / 2:
2651 Try using 16-bit containers for 8-bit elements and full vectors
2652 for wider elements.
2653 - full_size / 4:
2654 Try using 32-bit containers for 8-bit and 16-bit elements and
2655 full vectors for wider elements.
2656 - full_size / 8:
2657 Try using 64-bit containers for all element types. */
2658 static const int rvv_factors[] = {1, 2, 4, 8, 16, 32, 64};
2659 for (unsigned int i = 0; i < sizeof (rvv_factors) / sizeof (int); i++)
2660 {
2661 poly_uint64 units;
2662 machine_mode mode;
2663 if (can_div_trunc_p (full_size, rvv_factors[i], &units)
2664 && get_vector_mode (QImode, units).exists (&mode))
2665 modes->safe_push (mode);
2666 }
2667 }
2668 /* Push all VLSmodes according to TARGET_MIN_VLEN. */
2669 unsigned int i = 0;
2670 unsigned int base_size = TARGET_MIN_VLEN * TARGET_MAX_LMUL / 8;
2671 unsigned int size = base_size;
2672 machine_mode mode;
2673 while (size > 0 && get_vector_mode (QImode, size).exists (&mode))
2674 {
2675 if (vls_mode_valid_p (mode))
2676 modes->safe_push (mode);
2677
2678 i++;
2679 size = base_size / (1U << i);
2680 }
2681 /* Enable LOOP_VINFO comparison in COST model. */
2682 return VECT_COMPARE_COSTS;
2683 }
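
/* As an illustration of the VLS loop above (TARGET_MIN_VLEN = 128 and
   TARGET_MAX_LMUL = 8 are assumed values): base_size = 128 * 8 / 8 = 128,
   so the loop tries QImode VLS modes of 128, 64, 32, ..., 1 bytes and
   pushes each one that vls_mode_valid_p accepts.  */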
2684
2685 /* Return true if we can find the related MODE according to default LMUL. */
2686 static bool
2687 can_find_related_mode_p (machine_mode vector_mode, scalar_mode element_mode,
2688 poly_uint64 *nunits)
2689 {
2690 if (!autovec_use_vlmax_p ())
2691 return false;
2692 if (riscv_v_ext_vector_mode_p (vector_mode)
2693 && multiple_p (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL,
2694 GET_MODE_SIZE (element_mode), nunits))
2695 return true;
2696 if (riscv_v_ext_vls_mode_p (vector_mode)
2697 && multiple_p (TARGET_MIN_VLEN * TARGET_MAX_LMUL,
2698 GET_MODE_SIZE (element_mode), nunits))
2699 return true;
2700 return false;
2701 }
2702
2703 /* If the given VECTOR_MODE is an RVV mode, first get the largest number
2704 of units that fit into a full vector at the given ELEMENT_MODE.
2705 We will have the vectorizer call us with a successively decreasing
2706 number of units (as specified in autovectorize_vector_modes).
2707 The starting mode is always the one specified by preferred_simd_mode. */
2708 opt_machine_mode
2709 vectorize_related_mode (machine_mode vector_mode, scalar_mode element_mode,
2710 poly_uint64 nunits)
2711 {
2712 /* TODO: We will support RVV VLS auto-vectorization mode in the future. */
2713 poly_uint64 min_units;
2714 if (can_find_related_mode_p (vector_mode, element_mode, &min_units))
2715 {
2716 machine_mode rvv_mode;
2717 if (maybe_ne (nunits, 0U))
2718 {
2719 /* If we were given a number of units NUNITS, try to find an
2720 RVV vector mode of inner mode ELEMENT_MODE with the same
2721 number of units. */
2722 if (multiple_p (min_units, nunits)
2723 && get_vector_mode (element_mode, nunits).exists (&rvv_mode))
2724 return rvv_mode;
2725 }
2726 else
2727 {
2728 /* Look for a vector mode with the same number of units as the
2729 VECTOR_MODE we were given. We keep track of the minimum
2730 number of units so far which determines the smallest necessary
2731 but largest possible, suitable mode for vectorization. */
2732 min_units = ordered_min (min_units, GET_MODE_SIZE (vector_mode));
2733 if (get_vector_mode (element_mode, min_units).exists (&rvv_mode))
2734 return rvv_mode;
2735 }
2736 }
2737
2738 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
2739 }
2740
2741 /* Expand an RVV comparison. */
2742
2743 void
2744 expand_vec_cmp (rtx target, rtx_code code, rtx op0, rtx op1)
2745 {
2746 machine_mode mask_mode = GET_MODE (target);
2747 machine_mode data_mode = GET_MODE (op0);
2748 insn_code icode = get_cmp_insn_code (code, data_mode);
2749
2750 if (code == LTGT)
2751 {
2752 rtx lt = gen_reg_rtx (mask_mode);
2753 rtx gt = gen_reg_rtx (mask_mode);
2754 expand_vec_cmp (lt, LT, op0, op1);
2755 expand_vec_cmp (gt, GT, op0, op1);
2756 icode = code_for_pred (IOR, mask_mode);
2757 rtx ops[] = {target, lt, gt};
2758 emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
2759 return;
2760 }
2761
2762 rtx cmp = gen_rtx_fmt_ee (code, mask_mode, op0, op1);
2763 rtx ops[] = {target, cmp, op0, op1};
2764 emit_vlmax_insn (icode, COMPARE_OP, ops);
2765 }
2766
2767 void
2768 expand_vec_cmp (rtx target, rtx_code code, rtx mask, rtx maskoff, rtx op0,
2769 rtx op1)
2770 {
2771 machine_mode mask_mode = GET_MODE (target);
2772 machine_mode data_mode = GET_MODE (op0);
2773 insn_code icode = get_cmp_insn_code (code, data_mode);
2774
2775 if (code == LTGT)
2776 {
2777 rtx lt = gen_reg_rtx (mask_mode);
2778 rtx gt = gen_reg_rtx (mask_mode);
2779 expand_vec_cmp (lt, LT, mask, maskoff, op0, op1);
2780 expand_vec_cmp (gt, GT, mask, maskoff, op0, op1);
2781 icode = code_for_pred (IOR, mask_mode);
2782 rtx ops[] = {target, lt, gt};
2783 emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
2784 return;
2785 }
2786
2787 rtx cmp = gen_rtx_fmt_ee (code, mask_mode, op0, op1);
2788 rtx ops[] = {target, mask, maskoff, cmp, op0, op1};
2789 emit_vlmax_insn (icode, COMPARE_OP_MU, ops);
2790 }
2791
2792 /* Expand an RVV floating-point comparison:
2793
2794 If CAN_INVERT_P is true, the caller can also handle inverted results;
2795 return true if the result is in fact inverted. */
2796
2797 bool
2798 expand_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1,
2799 bool can_invert_p)
2800 {
2801 machine_mode mask_mode = GET_MODE (target);
2802 machine_mode data_mode = GET_MODE (op0);
2803
2804 /* If can_invert_p = true:
2805 It suffices to implement a u>= b as !(a < b) but with the NaNs masked off:
2806
2807 vmfeq.vv v0, va, va
2808 vmfeq.vv v1, vb, vb
2809 vmand.mm v0, v0, v1
2810 vmflt.vv v0, va, vb, v0.t
2811 vmnot.m v0, v0
2812
2813 And, if !HONOR_SNANS, then you can remove the vmand.mm by masking the
2814 second vmfeq.vv:
2815
2816 vmfeq.vv v0, va, va
2817 vmfeq.vv v0, vb, vb, v0.t
2818 vmflt.vv v0, va, vb, v0.t
2819 vmnot.m v0, v0
2820
2821 If can_invert_p = false:
2822
2823 # Example of implementing isgreater()
2824 vmfeq.vv v0, va, va # Only set where A is not NaN.
2825 vmfeq.vv v1, vb, vb # Only set where B is not NaN.
2826 vmand.mm v0, v0, v1 # Only set where A and B are ordered,
2827 vmfgt.vv v0, va, vb, v0.t # so only set flags on ordered values.
2828 */
2829
2830 rtx eq0 = gen_reg_rtx (mask_mode);
2831 rtx eq1 = gen_reg_rtx (mask_mode);
2832 switch (code)
2833 {
2834 case EQ:
2835 case NE:
2836 case LT:
2837 case LE:
2838 case GT:
2839 case GE:
2840 case LTGT:
2841 /* There is native support for the comparison. */
2842 expand_vec_cmp (target, code, op0, op1);
2843 return false;
2844 case UNEQ:
2845 case ORDERED:
2846 case UNORDERED:
2847 case UNLT:
2848 case UNLE:
2849 case UNGT:
2850 case UNGE:
2851 /* vmfeq.vv v0, va, va */
2852 expand_vec_cmp (eq0, EQ, op0, op0);
2853 if (HONOR_SNANS (data_mode))
2854 {
2855 /*
2856 vmfeq.vv v1, vb, vb
2857 vmand.mm v0, v0, v1
2858 */
2859 expand_vec_cmp (eq1, EQ, op1, op1);
2860 insn_code icode = code_for_pred (AND, mask_mode);
2861 rtx ops[] = {eq0, eq0, eq1};
2862 emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
2863 }
2864 else
2865 {
2866 /* vmfeq.vv v0, vb, vb, v0.t */
2867 expand_vec_cmp (eq0, EQ, eq0, eq0, op1, op1);
2868 }
2869 break;
2870 default:
2871 gcc_unreachable ();
2872 }
2873
2874 if (code == ORDERED)
2875 {
2876 emit_move_insn (target, eq0);
2877 return false;
2878 }
2879
2880 /* There is native support for the inverse comparison. */
2881 code = reverse_condition_maybe_unordered (code);
2882 if (code == ORDERED)
2883 emit_move_insn (target, eq0);
2884 else
2885 expand_vec_cmp (eq0, code, eq0, eq0, op0, op1);
2886
2887 if (can_invert_p)
2888 {
2889 emit_move_insn (target, eq0);
2890 return true;
2891 }
2892
2893 /* We use one_cmpl<mode>2 so that the combine pass can combine mask instructions
2894 into vmand.mm/vmnand.mm/vmnor.mm/vmxnor.mm. */
2895 emit_insn (gen_rtx_SET (target, gen_rtx_NOT (mask_mode, eq0)));
2896 return false;
2897 }
2898
2899 /* Modulo all SEL indices to ensure they are all in the range [0, MAX_SEL].
2900 MAX_SEL is nunits - 1 if rtx_equal_p (op0, op1). Otherwise, it is
2901 2 * nunits - 1. */
2902 static rtx
2903 modulo_sel_indices (rtx op0, rtx op1, rtx sel)
2904 {
2905 rtx sel_mod;
2906 machine_mode sel_mode = GET_MODE (sel);
2907 poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
2908 poly_uint64 max_sel = rtx_equal_p (op0, op1) ? nunits - 1 : 2 * nunits - 1;
2909 /* If SEL is a variable-length CONST_VECTOR, we don't need to modulo it.
2910 Likewise, if SEL is constant-length with all indices within [0, MAX_SEL],
2911 there is no need to modulo the indices. */
2912 if (CONST_VECTOR_P (sel)
2913 && (!nunits.is_constant () || const_vec_all_in_range_p (sel, 0, max_sel)))
2914 sel_mod = sel;
2915 else
2916 {
2917 rtx mod = gen_const_vector_dup (sel_mode, max_sel);
2918 sel_mod
2919 = expand_simple_binop (sel_mode, AND, sel, mod, NULL, 0, OPTAB_DIRECT);
2920 }
2921 return sel_mod;
2922 }
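
/* The AND above is a cheap modulo because MAX_SEL + 1 (either nunits or
   2 * nunits) is a power of two for RVV modes.  E.g. with nunits = 4 and
   op0 != op1, MAX_SEL = 7 and a selector { 9, 2, 12, 5 } becomes
   { 1, 2, 4, 5 }, which matches the wrapping semantics vec_perm
   requires.  */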
2923
2924 /* Implement vec_perm<mode>. */
2925
2926 void
2927 expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
2928 {
2929 machine_mode data_mode = GET_MODE (target);
2930 machine_mode sel_mode = GET_MODE (sel);
2931 poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
2932
2933 /* Check if SEL only references the first values vector: if each select
2934 index is in the range [0, nunits - 1], a single vrgather instruction is
2935 enough.  Since we will use vrgatherei16.vv for variable-length vectors,
2936 it is never out of range and we don't need to modulo the index. */
2937 if (nunits.is_constant () && const_vec_all_in_range_p (sel, 0, nunits - 1))
2938 {
2939 emit_vlmax_gather_insn (target, op0, sel);
2940 return;
2941 }
2942
2943 /* Check if all the indices are the same. */
2944 rtx elt;
2945 if (const_vec_duplicate_p (sel, &elt))
2946 {
2947 poly_uint64 value = rtx_to_poly_int64 (elt);
2948 rtx op = op0;
2949 if (maybe_gt (value, nunits - 1))
2950 {
2951 sel = gen_const_vector_dup (sel_mode, value - nunits);
2952 op = op1;
2953 }
2954 emit_vlmax_gather_insn (target, op, sel);
2955 }
2956
2957 /* Note: vec_perm indices are supposed to wrap when they go beyond the
2958 size of the two value vectors, i.e. the upper bits of the indices
2959 are effectively ignored. RVV vrgather instead produces 0 for any
2960 out-of-range indices, so we need to modulo all the vec_perm indices
2961 to ensure they are all in range of [0, nunits - 1] when op0 == op1
2962 or all in range of [0, 2 * nunits - 1] when op0 != op1. */
2963 rtx sel_mod = modulo_sel_indices (op0, op1, sel);
2964
2965 /* Check if the two values vectors are the same. */
2966 if (rtx_equal_p (op0, op1))
2967 {
2968 emit_vlmax_gather_insn (target, op0, sel_mod);
2969 return;
2970 }
2971
2972 /* The following sequence handles the case of
2973 __builtin_shufflevector (vec1, vec2, index...), where each index can be
2974 any value in the range [0, 2 * nunits - 1]. */
2975 machine_mode mask_mode;
2976 mask_mode = get_mask_mode (data_mode);
2977 rtx mask = gen_reg_rtx (mask_mode);
2978 rtx max_sel = gen_const_vector_dup (sel_mode, nunits);
2979
2980 /* Step 1: Generate a mask that selects everything >= nunits into the
2981 mask. */
2982 expand_vec_cmp (mask, GEU, sel_mod, max_sel);
2983
2984 /* Step 2: Gather every op0 value indexed by sel into target;
2985 we don't need to care about the result of the element
2986 whose index >= nunits. */
2987 emit_vlmax_gather_insn (target, op0, sel_mod);
2988
2989 /* Step 3: Shift the range from (nunits, max_of_mode] to
2990 [0, max_of_mode - nunits]. */
2991 rtx tmp = gen_reg_rtx (sel_mode);
2992 rtx ops[] = {tmp, sel_mod, max_sel};
2993 emit_vlmax_insn (code_for_pred (MINUS, sel_mode), BINARY_OP, ops);
2994
2995 /* Step 4: Gather those into the previously masked-out elements
2996 of target. */
2997 emit_vlmax_masked_gather_mu_insn (target, op1, tmp, mask);
2998 }
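
/* Putting the steps above together for an assumed nunits = 4 and
   sel = { 0, 5, 2, 7 }:

     mask              = { 0, 1, 0, 1 }   (sel_mod >= 4)
     gather from op0   = { op0[0], ?, op0[2], ? }
     sel_mod - 4       = { ?, 1, ?, 3 }   (only the masked lanes matter)
     masked gather mu  = { op0[0], op1[1], op0[2], op1[3] }  */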
2999
3000 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST for RVV. */
3001
3002 /* vec_perm support. */
3003
3004 struct expand_vec_perm_d
3005 {
3006 rtx target, op0, op1;
3007 vec_perm_indices perm;
3008 machine_mode vmode;
3009 machine_mode op_mode;
3010 bool one_vector_p;
3011 bool testing_p;
3012 };
3013
3014 /* Return the appropriate index mode for gather instructions. */
3015 opt_machine_mode
3016 get_gather_index_mode (struct expand_vec_perm_d *d)
3017 {
3018 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
3019 poly_uint64 nunits = GET_MODE_NUNITS (d->vmode);
3020
3021 if (GET_MODE_INNER (d->vmode) == QImode)
3022 {
3023 if (nunits.is_constant ())
3024 {
3025 /* If the indices form an LMUL8 CONST_VECTOR and any element value
3026 exceeds the range 0 ~ 255, forbid such a permutation
3027 since we would need a vector HImode to hold such indices and
3028 we don't have it. */
3029 if (!d->perm.all_in_range_p (0, 255)
3030 && !get_vector_mode (HImode, nunits).exists (&sel_mode))
3031 return opt_machine_mode ();
3032 }
3033 else
3034 {
3035 /* Permuting two SEW8 variable-length vectors needs vrgatherei16.vv.
3036 Otherwise, it could overflow the index range. */
3037 if (!get_vector_mode (HImode, nunits).exists (&sel_mode))
3038 return opt_machine_mode ();
3039 }
3040 }
3041 else if (riscv_get_v_regno_alignment (sel_mode) > 1
3042 && GET_MODE_INNER (sel_mode) != HImode)
3043 sel_mode = get_vector_mode (HImode, nunits).require ();
3044 return sel_mode;
3045 }
3046
3047 /* Recognize the patterns where we can use a merge operation to shuffle the
3048 vectors.  The value of each element (index i) in the selector can only be
3049 either i or nunits + i.  We will check that the pattern is actually monotonic.
3050
3051 E.g.
3052 v = VEC_PERM_EXPR (v0, v1, selector),
3053 selector = { 0, nunits + 1, 2, nunits + 3, 4, nunits + 5, ... }
3054
3055 We can transform such pattern into:
3056
3057 v = vcond_mask (v0, v1, mask),
3058 mask = { 0, 1, 0, 1, 0, 1, ... }. */
3059
3060 static bool
3061 shuffle_merge_patterns (struct expand_vec_perm_d *d)
3062 {
3063 machine_mode vmode = d->vmode;
3064 machine_mode sel_mode = related_int_vector_mode (vmode).require ();
3065 int n_patterns = d->perm.encoding ().npatterns ();
3066 poly_int64 vec_len = d->perm.length ();
3067
3068 for (int i = 0; i < n_patterns; ++i)
3069 if (!known_eq (d->perm[i], i) && !known_eq (d->perm[i], vec_len + i))
3070 return false;
3071
3072 /* Check the pattern is monotonic here, otherwise, return false. */
3073 for (int i = n_patterns; i < n_patterns * 2; i++)
3074 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
3075 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
3076 return false;
3077
3078 /* We need to use a precomputed mask for such a situation, and such a mask
3079 can only be computed for modes whose size is known at compile time. */
3080 bool indices_fit_selector_p
3081 = GET_MODE_BITSIZE (GET_MODE_INNER (vmode)) > 8 || known_lt (vec_len, 256);
3082 if (!indices_fit_selector_p && !vec_len.is_constant ())
3083 return false;
3084
3085 if (d->testing_p)
3086 return true;
3087
3088 machine_mode mask_mode = get_mask_mode (vmode);
3089 rtx mask = gen_reg_rtx (mask_mode);
3090
3091 if (indices_fit_selector_p)
3092 {
3093 /* MASK = SELECTOR < NUNITS ? 1 : 0. */
3094 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
3095 rtx x = gen_int_mode (vec_len, GET_MODE_INNER (sel_mode));
3096 insn_code icode = code_for_pred_cmp_scalar (sel_mode);
3097 rtx cmp = gen_rtx_fmt_ee (LTU, mask_mode, sel, x);
3098 rtx ops[] = {mask, cmp, sel, x};
3099 emit_vlmax_insn (icode, COMPARE_OP, ops);
3100 }
3101 else
3102 {
3103 /* For EEW8, NUNITS may be larger than 255, so we can't use vmsltu
3104 directly to generate the selector mask; instead, we can only use a
3105 precomputed mask.
3106
3107 E.g. for selector = <0, 257, 2, 259> on an EEW8 vector with NUNITS = 256,
3108 we don't have a QImode scalar register to hold a value larger than 255.
3109 We also cannot hold that in a vector QImode register if LMUL = 8, and,
3110 since there is no larger HImode vector, we cannot create a larger
3111 selector.
3112
3113 As the mask is a simple {0, 1, ...} pattern and the length is known we
3114 can store it in a scalar register and broadcast it to a mask register.
3115 */
3116 gcc_assert (vec_len.is_constant ());
3117 int size = CEIL (GET_MODE_NUNITS (mask_mode).to_constant (), 8);
3118 machine_mode mode = get_vector_mode (QImode, size).require ();
3119 rtx tmp = gen_reg_rtx (mode);
3120 rvv_builder v (mode, 1, size);
3121 for (int i = 0; i < vec_len.to_constant () / 8; i++)
3122 {
3123 uint8_t value = 0;
3124 for (int j = 0; j < 8; j++)
3125 {
3126 int index = i * 8 + j;
3127 if (known_lt (d->perm[index], 256))
3128 value |= 1 << j;
3129 }
3130 v.quick_push (gen_int_mode (value, QImode));
3131 }
3132 emit_move_insn (tmp, v.build ());
3133 emit_move_insn (mask, gen_lowpart (mask_mode, tmp));
3134 }
3135
3136 /* TARGET = MASK ? OP0 : OP1. */
3137 /* Swap op0 and op1 since the operand order is opposite to pred_merge. */
3138 rtx ops2[] = {d->target, d->op1, d->op0, mask};
3139 emit_vlmax_insn (code_for_pred_merge (vmode), MERGE_OP, ops2);
3140 return true;
3141 }
3142
3143 /* Recognize the consecutive index patterns where we can use a single
3144 vrgather.v[x|i] to shuffle the vectors.
3145
3146 e.g. short[8] = VEC_PERM_EXPR <a, a, {0,1,0,1,0,1,0,1}>
3147 Use SEW = 32, index = 1 vrgather.vi to get the result. */
3148 static bool
3149 shuffle_consecutive_patterns (struct expand_vec_perm_d *d)
3150 {
3151 machine_mode vmode = d->vmode;
3152 scalar_mode smode = GET_MODE_INNER (vmode);
3153 poly_int64 vec_len = d->perm.length ();
3154 HOST_WIDE_INT elt;
3155
3156 if (!vec_len.is_constant () || !d->perm[0].is_constant (&elt))
3157 return false;
3158 int vlen = vec_len.to_constant ();
3159
3160 /* Compute the last element index of consecutive pattern from the leading
3161 consecutive elements. */
3162 int last_consecutive_idx = -1;
3163 int consecutive_num = -1;
3164 for (int i = 1; i < vlen; i++)
3165 {
3166 if (maybe_ne (d->perm[i], d->perm[i - 1] + 1))
3167 break;
3168 last_consecutive_idx = i;
3169 consecutive_num = last_consecutive_idx + 1;
3170 }
3171
3172 int new_vlen = vlen / consecutive_num;
3173 if (last_consecutive_idx < 0 || consecutive_num == vlen
3174 || !pow2p_hwi (consecutive_num) || !pow2p_hwi (new_vlen))
3175 return false;
3176 /* VEC_PERM <..., (index, index + 1, ... index + consecutive_num - 1)>.
3177 All of the elements index, index + 1, ... index + consecutive_num - 1 should
3178 be located in the same vector. */
3179 if (maybe_ge (d->perm[0], vec_len)
3180 != maybe_ge (d->perm[last_consecutive_idx], vec_len))
3181 return false;
3182 /* If a vector has 8 elements, we allow optimizations on consecutive
3183 patterns, e.g. <0, 1, 2, 3, 0, 1, 2, 3> or <4, 5, 6, 7, 4, 5, 6, 7>.
3184 Other patterns like <2, 3, 4, 5, 2, 3, 4, 5> are not feasible patterns
3185 to optimize. */
3186 if (d->perm[0].to_constant () % consecutive_num != 0)
3187 return false;
3188 unsigned int container_bits = consecutive_num * GET_MODE_BITSIZE (smode);
3189 if (container_bits > 64)
3190 return false;
3191 else if (container_bits == 64)
3192 {
3193 if (!TARGET_VECTOR_ELEN_64)
3194 return false;
3195 else if (FLOAT_MODE_P (smode) && !TARGET_VECTOR_ELEN_FP_64)
3196 return false;
3197 }
3198
3199 /* Check that the rest of the elements follow the same consecutive pattern. */
3200 for (int i = consecutive_num; i < vlen; i++)
3201 if (maybe_ne (d->perm[i], d->perm[i % consecutive_num]))
3202 return false;
3203
3204 if (FLOAT_MODE_P (smode))
3205 smode = float_mode_for_size (container_bits).require ();
3206 else
3207 smode = int_mode_for_size (container_bits, 0).require ();
3208 if (!get_vector_mode (smode, new_vlen).exists (&vmode))
3209 return false;
3210 machine_mode sel_mode = related_int_vector_mode (vmode).require ();
3211
3212 /* Success! */
3213 if (d->testing_p)
3214 return true;
3215
3216 int index = elt / consecutive_num;
3217 if (index >= new_vlen)
3218 index = index - new_vlen;
3219 rtx sel = gen_const_vector_dup (sel_mode, index);
3220 rtx op = elt >= vlen ? d->op0 : d->op1;
3221 emit_vlmax_gather_insn (gen_lowpart (vmode, d->target),
3222 gen_lowpart (vmode, op), sel);
3223 return true;
3224 }
3225
3226 /* Recognize the patterns where we can use a compress operation to shuffle the
3227 vectors.  The perm selector of a compress pattern is divided into 2 parts:
3228 The first part is arbitrary index numbers < NUNITS.
3229 The second part is the consecutive last N index numbers >= NUNITS.
3230
3231 E.g.
3232 v = VEC_PERM_EXPR (v0, v1, selector),
3233 selector = { 0, 2, 6, 7 }
3234
3235 We can transform such pattern into:
3236
3237 op1 = vcompress (op0, mask)
3238 mask = { 1, 0, 1, 0 }
3239 v = op1. */
3240
3241 static bool
3242 shuffle_compress_patterns (struct expand_vec_perm_d *d)
3243 {
3244 machine_mode vmode = d->vmode;
3245 poly_int64 vec_len = d->perm.length ();
3246
3247 if (!vec_len.is_constant ())
3248 return false;
3249
3250 int vlen = vec_len.to_constant ();
3251
3252 /* The compress pattern is not worthwhile when it has fewer than 4 elements,
3253 and we can't modulo indices for the compress pattern. */
3254 if (known_ge (d->perm[vlen - 1], vlen * 2) || vlen < 4)
3255 return false;
3256
3257 /* Compress pattern doesn't work for one vector. */
3258 if (d->one_vector_p)
3259 return false;
3260
3261 /* The compress point is the point at which all selector values with index
3262 i >= compress point form a consecutive increasing series, with
3263 each selector value >= NUNITS.  In this case, we could compress all elements
3264 with i < compress point into op1. */
3265 int compress_point = -1;
3266 for (int i = 0; i < vlen; i++)
3267 {
3268 if (compress_point < 0 && known_ge (d->perm[i], vec_len))
3269 {
3270 compress_point = i;
3271 break;
3272 }
3273 }
3274
3275 /* We don't apply compress approach if we can't find the compress point. */
3276 if (compress_point < 0)
3277 return false;
3278
3279 /* We can only apply compress approach when all index values from 0 to
3280 compress point are increasing. */
3281 for (int i = 1; i < compress_point; i++)
3282 if (maybe_le (d->perm[i], d->perm[i - 1]))
3283 return false;
3284
3285 /* It must be a consecutively increasing series from the compress point. */
3286 for (int i = 1 + compress_point; i < vlen; i++)
3287 if (maybe_ne (d->perm[i], d->perm[i - 1] + 1))
3288 return false;
3289
3290 /* Success! */
3291 if (d->testing_p)
3292 return true;
3293
3294 /* Check whether we need to slide op1 up to apply the compress approach.
3295
3296 E.g. for index = { 0, 2, 6, 7 }, the last index d->perm[vlen - 1] = 7,
3297 which is 2 * NUNITS - 1, so we don't need to slide up.
3298
3299 For index = { 0, 2, 5, 6 }, we need to slide op1 up before
3300 we apply the compress approach. */
3301 bool need_slideup_p = maybe_ne (d->perm[vlen - 1], 2 * vec_len - 1)
3302 && !const_vec_duplicate_p (d->op1);
3303
3304 /* If we leave it to be handled directly by the general gather,
3305 the code sequence will be:
3306 VECTOR LOAD selector
3307 GEU mask, selector, NUNITS
3308 GATHER dest, op0, selector
3309 SUB selector, selector, NUNITS
3310 GATHER dest, op1, selector, mask
3311 Each ALU operation is considered as COST = 1 and VECTOR LOAD is considered
3312 as COST = 4. So, we consider the general gather handling COST = 9.
3313 TODO: This cost is not accurate, we can adjust it by tune info. */
3314 int general_cost = 9;
3315
3316 /* If we can use the compress approach, the code sequence will be:
3317 MASK LOAD mask
3318 COMPRESS op1, op0, mask
3319 If it needs slide up, it will be:
3320 MASK LOAD mask
3321 SLIDEUP op1
3322 COMPRESS op1, op0, mask
3323 By default, mask load COST = 2.
3324 TODO: This cost is not accurate, we can adjust it by tune info. */
3325 int compress_cost = 4;
3326
3327 if (general_cost <= compress_cost)
3328 return false;
3329
3330 /* Build a mask that is true for each op0 element selected before the compress point. */
3331 machine_mode mask_mode = get_mask_mode (vmode);
3332 rvv_builder builder (mask_mode, vlen, 1);
3333 for (int i = 0; i < vlen; i++)
3334 {
3335 bool is_compress_index = false;
3336 for (int j = 0; j < compress_point; j++)
3337 {
3338 if (known_eq (d->perm[j], i))
3339 {
3340 is_compress_index = true;
3341 break;
3342 }
3343 }
3344 if (is_compress_index)
3345 builder.quick_push (CONST1_RTX (BImode));
3346 else
3347 builder.quick_push (CONST0_RTX (BImode));
3348 }
3349 rtx mask = force_reg (mask_mode, builder.build ());
3350
3351 rtx merge = d->op1;
3352 if (need_slideup_p)
3353 {
3354 int slideup_cnt = vlen - (d->perm[vlen - 1].to_constant () % vlen) - 1;
3355 merge = gen_reg_rtx (vmode);
3356 rtx ops[] = {merge, d->op1, gen_int_mode (slideup_cnt, Pmode)};
3357 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
3358 emit_vlmax_insn (icode, BINARY_OP, ops);
3359 }
3360
3361 insn_code icode = code_for_pred_compress (vmode);
3362 rtx ops[] = {d->target, merge, d->op0, mask};
3363 emit_vlmax_insn (icode, COMPRESS_OP_MERGE, ops);
3364 return true;
3365 }
3366
3367 /* Recognize decompress patterns:
3368
3369 1. VEC_PERM_EXPR op0 and op1
3370 with isel = { 0, nunits, 1, nunits + 1, ... }.
3371 Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
3372
3373 2. VEC_PERM_EXPR op0 and op1
3374 with isel = { 1/2 nunits, 3/2 nunits, 1/2 nunits+1, 3/2 nunits+1,... }.
3375 Slide down op0 and op1 with OFFSET = 1/2 nunits.
3376 Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
3377 */
3378 static bool
3379 shuffle_decompress_patterns (struct expand_vec_perm_d *d)
3380 {
3381 poly_uint64 nelt = d->perm.length ();
3382 machine_mode mask_mode = get_mask_mode (d->vmode);
3383
3384 /* For constant-size indices, we don't need to handle them here.
3385 Just leave it to vec_perm<mode>. */
3386 if (d->perm.length ().is_constant ())
3387 return false;
3388
3389 poly_uint64 first = d->perm[0];
3390 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
3391 || !d->perm.series_p (0, 2, first, 1)
3392 || !d->perm.series_p (1, 2, first + nelt, 1))
3393 return false;
3394
3395 /* Permuting two SEW8 variable-length vectors needs vrgatherei16.vv.
3396 Otherwise, it could overflow the index range. */
3397 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
3398 if (GET_MODE_INNER (d->vmode) == QImode
3399 && !get_vector_mode (HImode, nelt).exists (&sel_mode))
3400 return false;
3401
3402 /* Success! */
3403 if (d->testing_p)
3404 return true;
3405
3406 rtx op0, op1;
3407 if (known_eq (first, 0U))
3408 {
3409 op0 = d->op0;
3410 op1 = d->op1;
3411 }
3412 else
3413 {
3414 op0 = gen_reg_rtx (d->vmode);
3415 op1 = gen_reg_rtx (d->vmode);
3416 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, d->vmode);
3417 rtx ops0[] = {op0, d->op0, gen_int_mode (first, Pmode)};
3418 rtx ops1[] = {op1, d->op1, gen_int_mode (first, Pmode)};
3419 emit_vlmax_insn (icode, BINARY_OP, ops0);
3420 emit_vlmax_insn (icode, BINARY_OP, ops1);
3421 }
3422 /* Generate the { 0, 1, 0, 1, ... } mask. */
3423 rtx vid = gen_reg_rtx (sel_mode);
3424 rtx vid_repeat = gen_reg_rtx (sel_mode);
3425 expand_vec_series (vid, const0_rtx, const1_rtx);
3426 rtx and_ops[] = {vid_repeat, vid, const1_rtx};
3427 emit_vlmax_insn (code_for_pred_scalar (AND, sel_mode), BINARY_OP, and_ops);
3428 rtx const_vec = gen_const_vector_dup (sel_mode, 1);
3429 rtx mask = gen_reg_rtx (mask_mode);
3430 expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
3431 emit_vlmax_decompress_insn (d->target, op0, op1, mask);
3432 return true;
3433 }
3434
3435 static bool
3436 shuffle_bswap_pattern (struct expand_vec_perm_d *d)
3437 {
3438 HOST_WIDE_INT diff;
3439 unsigned i, size, step;
3440
3441 if (!d->one_vector_p || !d->perm[0].is_constant (&diff) || !diff)
3442 return false;
3443
3444 step = diff + 1;
3445 size = step * GET_MODE_UNIT_BITSIZE (d->vmode);
3446
3447 switch (size)
3448 {
3449 case 16:
3450 break;
3451 case 32:
3452 case 64:
3453 /* We will have a VEC_PERM_EXPR after RTL expansion when invoking
3454 __builtin_bswap.  It will generate about 9 instructions in
3455 a loop as below, no matter whether it is bswap16, bswap32 or bswap64.
3456 .L2:
3457 1 vle16.v v4,0(a0)
3458 2 vmv.v.x v2,a7
3459 3 vand.vv v2,v6,v2
3460 4 slli a2,a5,1
3461 5 vrgatherei16.vv v1,v4,v2
3462 6 sub a4,a4,a5
3463 7 vse16.v v1,0(a3)
3464 8 add a0,a0,a2
3465 9 add a3,a3,a2
3466 bne a4,zero,.L2
3467
3468 But for bswap16 we may have an even simpler code sequence, which
3469 has only 7 instructions in the loop as below.
3470 .L5
3471 1 vle8.v v2,0(a5)
3472 2 addi a5,a5,32
3473 3 vsrl.vi v4,v2,8
3474 4 vsll.vi v2,v2,8
3475 5 vor.vv v4,v4,v2
3476 6 vse8.v v4,0(a4)
3477 7 addi a4,a4,32
3478 bne a5,a6,.L5
3479
3480 Unfortunately, the instructions in the loop will grow to 13 and 24
3481 for bswap32 and bswap64.  Thus, we will leverage vrgather (9 insns)
3482 for both bswap64 and bswap32, but take shift and or (7 insns)
3483 for bswap16.
3484 */
3485 default:
3486 return false;
3487 }
3488
3489 for (i = 0; i < step; i++)
3490 if (!d->perm.series_p (i, step, diff - i, step))
3491 return false;
3492
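  /* Illustrative example (not from the original source): for bswap16 on a
     QImode vector we have diff = 1, step = 2 and the selector
	{ 1, 0, 3, 2, 5, 4, ... },
     i.e. each pair of adjacent bytes is swapped.  */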
3493 /* Disable this when nunits < 4 since the generic approach tried later
3494 is more profitable for BSWAP. */
3495 if (!known_gt (GET_MODE_NUNITS (d->vmode), 2))
3496 return false;
3497
3498 if (d->testing_p)
3499 return true;
3500
3501 machine_mode vhi_mode;
3502 poly_uint64 vhi_nunits = exact_div (GET_MODE_NUNITS (d->vmode), 2);
3503
3504 if (!get_vector_mode (HImode, vhi_nunits).exists (&vhi_mode))
3505 return false;
3506
3507 /* Step-1: Move op0 to src with VHI mode. */
3508 rtx src = gen_reg_rtx (vhi_mode);
3509 emit_move_insn (src, gen_lowpart (vhi_mode, d->op0));
3510
3511 /* Step-2: Shift right 8 bits to dest. */
3512 rtx dest = expand_binop (vhi_mode, lshr_optab, src, gen_int_mode (8, Pmode),
3513 NULL_RTX, 0, OPTAB_DIRECT);
3514
3515 /* Step-3: Shift left 8 bits to src. */
3516 src = expand_binop (vhi_mode, ashl_optab, src, gen_int_mode (8, Pmode),
3517 NULL_RTX, 0, OPTAB_DIRECT);
3518
3519 /* Step-4: Logic Or dest and src to dest. */
3520 dest = expand_binop (vhi_mode, ior_optab, dest, src,
3521 NULL_RTX, 0, OPTAB_DIRECT);
3522
3523 /* Step-5: Move dest back to the target with VQI mode. */
3524 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
3525
3526 return true;
3527 }
3528
3529 /* Recognize the patterns that can be shuffled by the vec_extract and
3530 slide1up approach. */
3531
3532 static bool
3533 shuffle_extract_and_slide1up_patterns (struct expand_vec_perm_d *d)
3534 {
3535 poly_int64 nunits = GET_MODE_NUNITS (d->vmode);
3536
3537 /* Recognize { nunits - 1, nunits, nunits + 1, ... }. */
3538 if (!d->perm.series_p (0, 2, nunits - 1, 2)
3539 || !d->perm.series_p (1, 2, nunits, 2))
3540 return false;
3541
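  /* Illustrative example (values chosen for illustration only): with
     nunits = 4 the selector is { 3, 4, 5, 6 }, i.e. the last element of
     op0 followed by the first three elements of op1; it is expanded below
     as a vec_extract of op0[3] plus a slide1up of op1 with that scalar.  */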
3542 /* Disable this when nunits < 4 since the generic approach tried later
3543 is more profitable for indices = { nunits - 1, nunits }. */
3544 if (!known_gt (nunits, 2))
3545 return false;
3546
3547 /* Success! */
3548 if (d->testing_p)
3549 return true;
3550
3551 /* Extract the last element of the first vector. */
3552 scalar_mode smode = GET_MODE_INNER (d->vmode);
3553 rtx tmp = gen_reg_rtx (smode);
3554 emit_vec_extract (tmp, d->op0, gen_int_mode (nunits - 1, Pmode));
3555
3556 /* Insert the scalar into element 0. */
3557 unsigned int unspec
3558 = FLOAT_MODE_P (d->vmode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
3559 insn_code icode = code_for_pred_slide (unspec, d->vmode);
3560 rtx ops[] = {d->target, d->op1, tmp};
3561 emit_vlmax_insn (icode, BINARY_OP, ops);
3562 return true;
3563 }
3564
3565 static bool
3566 shuffle_series_patterns (struct expand_vec_perm_d *d)
3567 {
3568 if (!d->one_vector_p || d->perm.encoding ().npatterns () != 1)
3569 return false;
3570
3571 poly_int64 el1 = d->perm[0];
3572 poly_int64 el2 = d->perm[1];
3573 poly_int64 el3 = d->perm[2];
3574
3575 poly_int64 step1 = el2 - el1;
3576 poly_int64 step2 = el3 - el2;
3577
3578 bool need_insert = false;
3579 bool have_series = false;
3580
3581 /* Check for a full series. */
3582 if (known_ne (step1, 0) && d->perm.series_p (0, 1, el1, step1))
3583 have_series = true;
3584
3585 /* Check for a series starting at the second element. */
3586 else if (known_ne (step2, 0) && d->perm.series_p (1, 1, el2, step2))
3587 {
3588 have_series = true;
3589 need_insert = true;
3590 }
3591
3592 if (!have_series)
3593 return false;
3594
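  /* Illustrative example (values chosen for illustration only): the selector
     { 0, 1, 3, 5 } is not a single series (step1 = 1, step2 = 2), but the
     sub-series starting at the second element is, so we build the series
     { 1, 3, 5, 7 }, slide1up-insert the leading 0 and then gather with the
     resulting indices.  */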
3595 /* Disable shuffle if we can't find an appropriate integer index mode for
3596 gather. */
3597 machine_mode sel_mode;
3598 if (!get_gather_index_mode (d).exists (&sel_mode))
3599 return false;
3600
3601 /* Success! */
3602 if (d->testing_p)
3603 return true;
3604
3605 /* Create the series. */
3606 machine_mode eltmode = Pmode;
3607 rtx series = gen_reg_rtx (sel_mode);
3608 expand_vec_series (series, gen_int_mode (need_insert ? el2 : el1, eltmode),
3609 gen_int_mode (need_insert ? step2 : step1, eltmode));
3610
3611 /* Insert the remaining element if necessary. */
3612 if (need_insert)
3613 {
3614 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDE1UP, sel_mode);
3615 rtx ops[]
3616 = {series, series, gen_int_mode (el1, GET_MODE_INNER (sel_mode))};
3617 emit_vlmax_insn (icode, BINARY_OP, ops);
3618 }
3619
3620 emit_vlmax_gather_insn (d->target, d->op0, series);
3621
3622 return true;
3623 }
3624
3625 /* Recognize the patterns that can be shuffled by the generic approach. */
3626
3627 static bool
3628 shuffle_generic_patterns (struct expand_vec_perm_d *d)
3629 {
3630 machine_mode sel_mode;
3631
3632 /* We don't enable SLP for non-power of 2 NPATTERNS. */
3633 if (!pow2p_hwi (d->perm.encoding().npatterns ()))
3634 return false;
3635
3636 /* Disable shuffle if we can't find an appropriate integer index mode for
3637 gather. */
3638 if (!get_gather_index_mode (d).exists (&sel_mode))
3639 return false;
3640
3641 /* Success! */
3642 if (d->testing_p)
3643 return true;
3644
3645 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
3646 /* Some FIXED-VLMAX/VLS vector permutation situations call the target hook
3647 instead of expanding vec_perm<mode>, so we handle them directly. */
3648 expand_vec_perm (d->target, d->op0, d->op1, sel);
3649 return true;
3650 }
3651
3652 /* This function recognizes and supports different permutation patterns
3653 and enables VLA SLP auto-vectorization. */
3654 static bool
3655 expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
3656 {
3657 gcc_assert (d->op_mode != E_VOIDmode);
3658
3659 /* The pattern matching functions above are written to look for a small
3660 number to begin the sequence (0, 1, N/2). If we begin with an index
3661 from the second operand, we can swap the operands. */
3662 poly_int64 nelt = d->perm.length ();
3663 if (known_ge (d->perm[0], nelt))
3664 {
3665 d->perm.rotate_inputs (1);
3666 std::swap (d->op0, d->op1);
3667 }
3668
3669 if (known_gt (nelt, 1))
3670 {
3671 if (d->vmode == d->op_mode)
3672 {
3673 if (shuffle_merge_patterns (d))
3674 return true;
3675 if (shuffle_consecutive_patterns (d))
3676 return true;
3677 if (shuffle_compress_patterns (d))
3678 return true;
3679 if (shuffle_decompress_patterns (d))
3680 return true;
3681 if (shuffle_bswap_pattern (d))
3682 return true;
3683 if (shuffle_extract_and_slide1up_patterns (d))
3684 return true;
3685 if (shuffle_series_patterns (d))
3686 return true;
3687 if (shuffle_generic_patterns (d))
3688 return true;
3689 return false;
3690 }
3691 else
3692 return false;
3693 }
3694 return false;
3695 }
3696
3697 /* This function implements TARGET_VECTORIZE_VEC_PERM_CONST by using RVV
3698 instructions. */
3699 bool
3700 expand_vec_perm_const (machine_mode vmode, machine_mode op_mode, rtx target,
3701 rtx op0, rtx op1, const vec_perm_indices &sel)
3702 {
3703 /* RVV doesn't have mask-type pack/unpack instructions and we don't use
3704 a mask to do the iteration loop control. Just disable it directly. */
3705 if (GET_MODE_CLASS (vmode) == MODE_VECTOR_BOOL)
3706 return false;
3707 /* FIXME: Explicitly disable VLA interleaved SLP vectorization since we
3708 may encounter an ICE for poly size (1, 1) vectors in the loop vectorizer.
3709 Ideally, the middle-end loop vectorizer should be able to disable it
3710 itself; we can remove the code here once the middle-end is able
3711 to disable VLA SLP vectorization for a poly size (1, 1) VF. */
3712 if (!BYTES_PER_RISCV_VECTOR.is_constant ()
3713 && maybe_lt (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL,
3714 poly_int64 (16, 16)))
3715 return false;
3716
3717 struct expand_vec_perm_d d;
3718
3719 /* Check whether the mask can be applied to a single vector. */
3720 if (sel.ninputs () == 1 || (op0 && rtx_equal_p (op0, op1)))
3721 d.one_vector_p = true;
3722 else if (sel.all_from_input_p (0))
3723 {
3724 d.one_vector_p = true;
3725 op1 = op0;
3726 }
3727 else if (sel.all_from_input_p (1))
3728 {
3729 d.one_vector_p = true;
3730 op0 = op1;
3731 }
3732 else
3733 d.one_vector_p = false;
3734
3735 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
3736 sel.nelts_per_input ());
3737 d.vmode = vmode;
3738 d.op_mode = op_mode;
3739 d.target = target;
3740 d.op0 = op0;
3741 if (op0 == op1)
3742 d.op1 = d.op0;
3743 else
3744 d.op1 = op1;
3745 d.testing_p = !target;
3746
3747 if (!d.testing_p)
3748 return expand_vec_perm_const_1 (&d);
3749
3750 rtx_insn *last = get_last_insn ();
3751 bool ret = expand_vec_perm_const_1 (&d);
3752 gcc_assert (last == get_last_insn ());
3753
3754 return ret;
3755 }
3756
3757 /* Generate a vsetvl with no side effects to get the vector length. */
3758 void
3759 expand_select_vl (rtx *ops)
3760 {
3761 poly_int64 nunits = rtx_to_poly_int64 (ops[2]);
3762 if (CONST_INT_P (ops[1]) && known_le (INTVAL (ops[1]), nunits))
3763 {
3764 /* If the length is known to be <= VF, we just use the length directly
3765 instead of using vsetvli.
3766
3767 E.g. _255 = .SELECT_VL (3, POLY_INT_CST [4, 4]);
3768 We move 3 into _255 instead of using an explicit vsetvl. */
3769 emit_move_insn (ops[0], ops[1]);
3770 return;
3771 }
3772 /* We arbitrarily pick QImode as the inner scalar mode to get a vector mode,
3773 since vsetvl only demands the SEW/LMUL ratio. We let the VSETVL pass optimize it. */
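  /* For instance (illustrative only; the final SEW/LMUL is whatever the
     VSETVL pass settles on, assuming a 128-bit minimum VLEN): with ops[1]
     in a1 and a VF of POLY_INT_CST [16, 16] this emits something along the
     lines of
	vsetvli a0, a1, e8, m1, ta, ma
     where only the VL result and the SEW/LMUL ratio actually matter.  */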
3774 scalar_int_mode mode = QImode;
3775 machine_mode rvv_mode = get_vector_mode (mode, nunits).require ();
3776 emit_insn (gen_no_side_effects_vsetvl_rtx (rvv_mode, ops[0], ops[1]));
3777 }
3778
3779 /* Expand MASK_LEN_{LOAD,STORE}. */
3780 void
3781 expand_load_store (rtx *ops, bool is_load)
3782 {
3783 poly_int64 value;
3784 rtx mask = ops[2];
3785 rtx len = ops[3];
3786 machine_mode mode = GET_MODE (ops[0]);
3787
3788 if (is_vlmax_len_p (mode, len))
3789 {
3790 /* If the length operand is equal to VF, it is VLMAX load/store. */
3791 if (is_load)
3792 {
3793 rtx m_ops[] = {ops[0], mask, ops[1]};
3794 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP_TAMA, m_ops);
3795 }
3796 else
3797 {
3798 len = gen_reg_rtx (Pmode);
3799 emit_vlmax_vsetvl (mode, len);
3800 emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len,
3801 get_avl_type_rtx (VLMAX)));
3802 }
3803 }
3804 else
3805 {
3806 if (!satisfies_constraint_K (len))
3807 len = force_reg (Pmode, len);
3808 if (is_load)
3809 {
3810 rtx m_ops[] = {ops[0], mask, ops[1]};
3811 emit_nonvlmax_insn (code_for_pred_mov (mode), UNARY_OP_TAMA, m_ops,
3812 len);
3813 }
3814 else
3815 emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len,
3816 get_avl_type_rtx (NONVLMAX)));
3817 }
3818 }
3819
3820
3821 /* Return true if the operation is a floating-point operation that needs FRM. */
3822 static bool
3823 needs_fp_rounding (unsigned icode, machine_mode mode)
3824 {
3825 if (!FLOAT_MODE_P (mode))
3826 return false;
3827
3828 return icode != maybe_code_for_pred (SMIN, mode)
3829 && icode != maybe_code_for_pred (UNSPEC_VFMIN, mode)
3830 && icode != maybe_code_for_pred (SMAX, mode)
3831 && icode != maybe_code_for_pred (UNSPEC_VFMAX, mode)
3832 && icode != maybe_code_for_pred (NEG, mode)
3833 && icode != maybe_code_for_pred (ABS, mode)
3834 /* narrower-FP -> FP */
3835 && icode != maybe_code_for_pred_extend (mode)
3836 /* narrower-INT -> FP */
3837 && icode != maybe_code_for_pred_widen (FLOAT, mode)
3838 && icode != maybe_code_for_pred_widen (UNSIGNED_FLOAT, mode)
3839 /* vfsgnj */
3840 && icode != maybe_code_for_pred (UNSPEC_VCOPYSIGN, mode)
3841 && icode != maybe_code_for_pred_mov (mode);
3842 }
3843
3844 /* Subroutine to expand COND_LEN_* patterns. */
3845 static void
3846 expand_cond_len_op (unsigned icode, insn_flags op_type, rtx *ops, rtx len)
3847 {
3848 rtx dest = ops[0];
3849 rtx mask = ops[1];
3850 machine_mode mode = GET_MODE (dest);
3851 machine_mode mask_mode = GET_MODE (mask);
3852 poly_int64 value;
3853 bool is_dummy_mask = rtx_equal_p (mask, CONSTM1_RTX (mask_mode));
3854 bool is_vlmax_len = is_vlmax_len_p (mode, len);
3855
3856 unsigned insn_flags = HAS_DEST_P | HAS_MASK_P | HAS_MERGE_P | op_type;
3857 /* FIXME: We don't support simplification of COND_LEN_NEG (..., dummy len,
3858 dummy mask) into NEG_EXPR in GIMPLE FOLD yet. So, we do such
3859 simplification in the RISC-V backend and may do it in the middle-end in the
3860 future. */
3861 if (is_dummy_mask && is_vlmax_len)
3862 insn_flags |= TDEFAULT_POLICY_P | MDEFAULT_POLICY_P;
3863 else if (is_dummy_mask)
3864 insn_flags |= TU_POLICY_P | MDEFAULT_POLICY_P;
3865 else if (is_vlmax_len)
3866 insn_flags |= TDEFAULT_POLICY_P | MU_POLICY_P;
3867 else
3868 insn_flags |= TU_POLICY_P | MU_POLICY_P;
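  /* For example (summary of the chain above): a dummy all-ones mask together
     with a VLMAX length uses the default tail/mask policies, whereas a real
     mask combined with a partial length uses tail-undisturbed and
     mask-undisturbed, so elements beyond LEN and inactive elements are taken
     from the merge operand.  */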
3869
3870 if (needs_fp_rounding (icode, mode))
3871 insn_flags |= FRM_DYN_P;
3872
3873 if (is_vlmax_len)
3874 emit_vlmax_insn (icode, insn_flags, ops);
3875 else
3876 emit_nonvlmax_insn (icode, insn_flags, ops, len);
3877 }
3878
3879 /* Return RVV_VUNDEF if the ELSE value is a scratch rtx. */
3880 static rtx
3881 get_else_operand (rtx op)
3882 {
3883 return GET_CODE (op) == SCRATCH ? RVV_VUNDEF (GET_MODE (op)) : op;
3884 }
3885
3886 /* Expand unary ops COND_LEN_*. */
3887 void
3888 expand_cond_len_unop (unsigned icode, rtx *ops)
3889 {
3890 rtx dest = ops[0];
3891 rtx mask = ops[1];
3892 rtx src = ops[2];
3893 rtx merge = get_else_operand (ops[3]);
3894 rtx len = ops[4];
3895
3896 rtx cond_ops[] = {dest, mask, merge, src};
3897 expand_cond_len_op (icode, UNARY_OP_P, cond_ops, len);
3898 }
3899
3900 /* Expand unary ops COND_*. */
3901 void
3902 expand_cond_unop (unsigned icode, rtx *ops)
3903 {
3904 rtx dest = ops[0];
3905 rtx mask = ops[1];
3906 rtx src = ops[2];
3907 rtx merge = get_else_operand (ops[3]);
3908 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
3909
3910 rtx cond_ops[] = {dest, mask, merge, src};
3911 expand_cond_len_op (icode, UNARY_OP_P, cond_ops, len);
3912 }
3913
3914 /* Expand binary ops COND_LEN_*. */
3915 void
3916 expand_cond_len_binop (unsigned icode, rtx *ops)
3917 {
3918 rtx dest = ops[0];
3919 rtx mask = ops[1];
3920 rtx src1 = ops[2];
3921 rtx src2 = ops[3];
3922 rtx merge = get_else_operand (ops[4]);
3923 rtx len = ops[5];
3924
3925 rtx cond_ops[] = {dest, mask, merge, src1, src2};
3926 expand_cond_len_op (icode, BINARY_OP_P, cond_ops, len);
3927 }
3928
3929 /* Expand binary ops COND_*. */
3930 void
3931 expand_cond_binop (unsigned icode, rtx *ops)
3932 {
3933 rtx dest = ops[0];
3934 rtx mask = ops[1];
3935 rtx src1 = ops[2];
3936 rtx src2 = ops[3];
3937 rtx merge = get_else_operand (ops[4]);
3938 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
3939
3940 rtx cond_ops[] = {dest, mask, merge, src1, src2};
3941 expand_cond_len_op (icode, BINARY_OP_P, cond_ops, len);
3942 }
3943
3944 /* Prepare insn_code for gather_load/scatter_store according to
3945 the vector mode and index mode. */
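/* Illustrative example (instruction names are the usual RVV mnemonics, given
   here only as an informal sketch): an indexed load of 64-bit elements with
   32-bit offsets has dst_eew / src_eew = 2 and therefore picks the
   "x2 greater EEW" pattern (roughly a vluxei32.v with SEW = 64), while equal
   EEWs pick the "same EEW" pattern (vluxei<eew>.v).  */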
3946 static insn_code
3947 prepare_gather_scatter (machine_mode vec_mode, machine_mode idx_mode,
3948 bool is_load)
3949 {
3950 if (!is_load)
3951 return code_for_pred_indexed_store (UNSPEC_UNORDERED, vec_mode, idx_mode);
3952 else
3953 {
3954 unsigned src_eew_bitsize = GET_MODE_BITSIZE (GET_MODE_INNER (idx_mode));
3955 unsigned dst_eew_bitsize = GET_MODE_BITSIZE (GET_MODE_INNER (vec_mode));
3956 if (dst_eew_bitsize == src_eew_bitsize)
3957 return code_for_pred_indexed_load_same_eew (UNSPEC_UNORDERED, vec_mode);
3958 else if (dst_eew_bitsize > src_eew_bitsize)
3959 {
3960 unsigned factor = dst_eew_bitsize / src_eew_bitsize;
3961 switch (factor)
3962 {
3963 case 2:
3964 return code_for_pred_indexed_load_x2_greater_eew (
3965 UNSPEC_UNORDERED, vec_mode);
3966 case 4:
3967 return code_for_pred_indexed_load_x4_greater_eew (
3968 UNSPEC_UNORDERED, vec_mode);
3969 case 8:
3970 return code_for_pred_indexed_load_x8_greater_eew (
3971 UNSPEC_UNORDERED, vec_mode);
3972 default:
3973 gcc_unreachable ();
3974 }
3975 }
3976 else
3977 {
3978 unsigned factor = src_eew_bitsize / dst_eew_bitsize;
3979 switch (factor)
3980 {
3981 case 2:
3982 return code_for_pred_indexed_load_x2_smaller_eew (
3983 UNSPEC_UNORDERED, vec_mode);
3984 case 4:
3985 return code_for_pred_indexed_load_x4_smaller_eew (
3986 UNSPEC_UNORDERED, vec_mode);
3987 case 8:
3988 return code_for_pred_indexed_load_x8_smaller_eew (
3989 UNSPEC_UNORDERED, vec_mode);
3990 default:
3991 gcc_unreachable ();
3992 }
3993 }
3994 }
3995 }
3996
3997 /* Expand LEN_MASK_{GATHER_LOAD,SCATTER_STORE}. */
3998 void
3999 expand_gather_scatter (rtx *ops, bool is_load)
4000 {
4001 rtx ptr, vec_offset, vec_reg;
4002 bool zero_extend_p;
4003 int scale_log2;
4004 rtx mask = ops[5];
4005 rtx len = ops[6];
4006 if (is_load)
4007 {
4008 vec_reg = ops[0];
4009 ptr = ops[1];
4010 vec_offset = ops[2];
4011 zero_extend_p = INTVAL (ops[3]);
4012 scale_log2 = exact_log2 (INTVAL (ops[4]));
4013 }
4014 else
4015 {
4016 vec_reg = ops[4];
4017 ptr = ops[0];
4018 vec_offset = ops[1];
4019 zero_extend_p = INTVAL (ops[2]);
4020 scale_log2 = exact_log2 (INTVAL (ops[3]));
4021 }
4022
4023 machine_mode vec_mode = GET_MODE (vec_reg);
4024 machine_mode idx_mode = GET_MODE (vec_offset);
4025 scalar_mode inner_idx_mode = GET_MODE_INNER (idx_mode);
4026 unsigned inner_offsize = GET_MODE_BITSIZE (inner_idx_mode);
4027 poly_int64 nunits = GET_MODE_NUNITS (vec_mode);
4028 poly_int64 value;
4029 bool is_vlmax = is_vlmax_len_p (vec_mode, len);
4030
4031 /* Extend the offset element to address width. */
4032 if (inner_offsize < BITS_PER_WORD)
4033 {
4034 /* 7.2. Vector Load/Store Addressing Modes.
4035 If the vector offset elements are narrower than XLEN, they are
4036 zero-extended to XLEN before adding to the ptr effective address. If
4037 the vector offset elements are wider than XLEN, the least-significant
4038 XLEN bits are used in the address calculation. An implementation must
4039 raise an illegal instruction exception if the EEW is not supported for
4040 offset elements.
4041
4042 The RVV spec only covers the scale_log2 == 0 case. */
4043 if (!zero_extend_p || scale_log2 != 0)
4044 {
4045 if (zero_extend_p)
4046 inner_idx_mode
4047 = int_mode_for_size (inner_offsize * 2, 0).require ();
4048 else
4049 inner_idx_mode = int_mode_for_size (BITS_PER_WORD, 0).require ();
4050 machine_mode new_idx_mode
4051 = get_vector_mode (inner_idx_mode, nunits).require ();
4052 rtx tmp = gen_reg_rtx (new_idx_mode);
4053 emit_insn (gen_extend_insn (tmp, vec_offset, new_idx_mode, idx_mode,
4054 zero_extend_p ? true : false));
4055 vec_offset = tmp;
4056 idx_mode = new_idx_mode;
4057 }
4058 }
4059
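  /* Illustrative example (RV64 assumed): a vector of 32-bit offsets with a
     scale of 8 is first zero- or sign-extended (per ZERO_EXTEND_P) to 64-bit
     elements above, and then shifted left by scale_log2 = 3 below.  */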
4060 if (scale_log2 != 0)
4061 {
4062 rtx tmp = expand_binop (idx_mode, ashl_optab, vec_offset,
4063 gen_int_mode (scale_log2, Pmode), NULL_RTX, 0,
4064 OPTAB_DIRECT);
4065 vec_offset = tmp;
4066 }
4067
4068 insn_code icode = prepare_gather_scatter (vec_mode, idx_mode, is_load);
4069 if (is_vlmax)
4070 {
4071 if (is_load)
4072 {
4073 rtx load_ops[]
4074 = {vec_reg, mask, ptr, vec_offset};
4075 emit_vlmax_insn (icode, BINARY_OP_TAMA, load_ops);
4076 }
4077 else
4078 {
4079 rtx store_ops[] = {mask, ptr, vec_offset, vec_reg};
4080 emit_vlmax_insn (icode, SCATTER_OP_M, store_ops);
4081 }
4082 }
4083 else
4084 {
4085 if (is_load)
4086 {
4087 rtx load_ops[]
4088 = {vec_reg, mask, ptr, vec_offset};
4089 emit_nonvlmax_insn (icode, BINARY_OP_TAMA, load_ops, len);
4090 }
4091 else
4092 {
4093 rtx store_ops[] = {mask, ptr, vec_offset, vec_reg};
4094 emit_nonvlmax_insn (icode, SCATTER_OP_M, store_ops, len);
4095 }
4096 }
4097 }
4098
4099 /* Expand COND_LEN_*. */
4100 void
4101 expand_cond_len_ternop (unsigned icode, rtx *ops)
4102 {
4103 rtx dest = ops[0];
4104 rtx mask = ops[1];
4105 rtx src1 = ops[2];
4106 rtx src2 = ops[3];
4107 rtx src3 = ops[4];
4108 rtx merge = get_else_operand (ops[5]);
4109 rtx len = ops[6];
4110
4111 rtx cond_ops[] = {dest, mask, src1, src2, src3, merge};
4112 expand_cond_len_op (icode, TERNARY_OP_P, cond_ops, len);
4113 }
4114
4115 /* Expand COND_*. */
4116 void
4117 expand_cond_ternop (unsigned icode, rtx *ops)
4118 {
4119 rtx dest = ops[0];
4120 rtx mask = ops[1];
4121 rtx src1 = ops[2];
4122 rtx src2 = ops[3];
4123 rtx src3 = ops[4];
4124 rtx merge = get_else_operand (ops[5]);
4125 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
4126
4127 rtx cond_ops[] = {dest, mask, src1, src2, src3, merge};
4128 expand_cond_len_op (icode, TERNARY_OP_P, cond_ops, len);
4129 }
4130
4131 /* Expand reduction operations.
4132 Case 1: ops = {scalar_dest, vector_src}
4133 Case 2: ops = {scalar_dest, vector_src, mask, vl}
4134 */
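/* Illustrative sketch (instruction names given informally): for an integer
   PLUS reduction this broadcasts INIT into an LMUL-1 register (a vmv.s.x-like
   scalar move), performs the vredsum.vs-style reduction into another LMUL-1
   register, and finally reads element 0 back with vmv.x.s (or vfmv.f.s for
   floating point).  */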
4135 void
4136 expand_reduction (unsigned unspec, unsigned insn_flags, rtx *ops, rtx init)
4137 {
4138 rtx scalar_dest = ops[0];
4139 rtx vector_src = ops[1];
4140 machine_mode vmode = GET_MODE (vector_src);
4141 machine_mode vel_mode = GET_MODE (scalar_dest);
4142 machine_mode m1_mode = get_m1_mode (vel_mode).require ();
4143
4144 rtx m1_tmp = gen_reg_rtx (m1_mode);
4145 rtx scalar_move_ops[] = {m1_tmp, init};
4146 emit_nonvlmax_insn (code_for_pred_broadcast (m1_mode), SCALAR_MOVE_OP,
4147 scalar_move_ops,
4148 need_mask_operand_p (insn_flags) ? ops[3]
4149 : CONST1_RTX (Pmode));
4150 rtx m1_tmp2 = gen_reg_rtx (m1_mode);
4151 rtx reduc_ops[] = {m1_tmp2, vector_src, m1_tmp};
4152 insn_code icode = code_for_pred (unspec, vmode);
4153
4154 if (need_mask_operand_p (insn_flags))
4155 {
4156 rtx mask_len_reduc_ops[] = {m1_tmp2, ops[2], vector_src, m1_tmp};
4157 emit_nonvlmax_insn (icode, insn_flags, mask_len_reduc_ops, ops[3]);
4158 }
4159 else
4160 emit_vlmax_insn (icode, insn_flags, reduc_ops);
4161
4162 emit_insn (gen_pred_extract_first (m1_mode, scalar_dest, m1_tmp2));
4163 }
4164
4165 /* Prepare ops for ternary operations.
4166 It can be called before or after RA. */
4167 void
4168 prepare_ternary_operands (rtx *ops)
4169 {
4170 machine_mode mode = GET_MODE (ops[0]);
4171
4172 if (!rtx_equal_p (ops[5], RVV_VUNDEF (mode))
4173 && (VECTOR_MODE_P (GET_MODE (ops[2]))
4174 && !rtx_equal_p (ops[2], ops[5]))
4175 && !rtx_equal_p (ops[3], ops[5])
4176 && !rtx_equal_p (ops[4], ops[5]))
4177 {
4178 /* RA would fail to find a vector REG and report an ICE, so we pre-merge
4179 the ops for LMUL = 8. */
4180 if (satisfies_constraint_Wc1 (ops[1]))
4181 {
4182 emit_move_insn (ops[0], ops[5]);
4183 emit_insn (gen_pred_mov (mode, ops[0], ops[1], ops[0], ops[4], ops[6],
4184 ops[7], ops[8], ops[9]));
4185 }
4186 else
4187 emit_insn (gen_pred_merge (mode, ops[0], RVV_VUNDEF (mode), ops[5],
4188 ops[4], ops[1], ops[6], ops[7], ops[9]));
4189 ops[5] = ops[4] = ops[0];
4190 }
4191 else
4192 {
4193 /* Swap the multiplication ops if the fallback value is the
4194 second of the two. */
4195 if (rtx_equal_p (ops[3], ops[5]))
4196 std::swap (ops[2], ops[3]);
4197
4198 /* TODO: ??? Maybe we could support splitting FMA (a, 4, b)
4199 into PLUS (ASHIFT (a, 2), b) according to uarchs. */
4200 }
4201 gcc_assert (rtx_equal_p (ops[5], RVV_VUNDEF (mode))
4202 || rtx_equal_p (ops[5], ops[2]) || rtx_equal_p (ops[5], ops[4]));
4203 }
4204
4205 /* Expand VEC_MASK_LEN_{LOAD_LANES,STORE_LANES}. */
4206 void
4207 expand_lanes_load_store (rtx *ops, bool is_load)
4208 {
4209 poly_int64 value;
4210 rtx mask = ops[2];
4211 rtx len = ops[3];
4212 rtx addr = is_load ? XEXP (ops[1], 0) : XEXP (ops[0], 0);
4213 rtx reg = is_load ? ops[0] : ops[1];
4214 machine_mode mode = GET_MODE (ops[0]);
4215
4216 if (is_vlmax_len_p (mode, len))
4217 {
4218 /* If the length operand is equal to VF, it is VLMAX load/store. */
4219 if (is_load)
4220 {
4221 rtx m_ops[] = {reg, mask, addr};
4222 emit_vlmax_insn (code_for_pred_unit_strided_load (mode), UNARY_OP_TAMA,
4223 m_ops);
4224 }
4225 else
4226 {
4227 len = gen_reg_rtx (Pmode);
4228 emit_vlmax_vsetvl (mode, len);
4229 emit_insn (gen_pred_unit_strided_store (mode, mask, addr, reg, len,
4230 get_avl_type_rtx (VLMAX)));
4231 }
4232 }
4233 else
4234 {
4235 if (!satisfies_constraint_K (len))
4236 len = force_reg (Pmode, len);
4237 if (is_load)
4238 {
4239 rtx m_ops[] = {reg, mask, addr};
4240 emit_nonvlmax_insn (code_for_pred_unit_strided_load (mode),
4241 UNARY_OP_TAMA, m_ops, len);
4242 }
4243 else
4244 emit_insn (gen_pred_unit_strided_store (mode, mask, addr, reg, len,
4245 get_avl_type_rtx (NONVLMAX)));
4246 }
4247 }
4248
4249 /* Expand LEN_FOLD_EXTRACT_LAST. */
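/* Illustrative walk-through (values chosen for illustration only): with
   vect = { a, b, c, d } and mask = { 1, 0, 1, 0 } the popcount is 2, so
   index = 1; vcompress packs the active elements to { a, c, ... }, the
   slide-down by 1 moves c to element 0, and the final extract-first returns
   c, the last active element.  If the popcount is 0, the default value is
   returned instead.  */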
4250 void
4251 expand_fold_extract_last (rtx *ops)
4252 {
4253 rtx dst = ops[0];
4254 rtx default_value = ops[1];
4255 rtx mask = ops[2];
4256 rtx anchor = gen_reg_rtx (Pmode);
4257 rtx index = gen_reg_rtx (Pmode);
4258 rtx vect = ops[3];
4259 rtx else_label = gen_label_rtx ();
4260 rtx end_label = gen_label_rtx ();
4261 rtx len = ops[4];
4262 poly_int64 value;
4263 machine_mode mode = GET_MODE (vect);
4264 machine_mode mask_mode = GET_MODE (mask);
4265 rtx compress_vect = gen_reg_rtx (mode);
4266 rtx slide_vect = gen_reg_rtx (mode);
4267 insn_code icode;
4268
4269 if (is_vlmax_len_p (mode, len))
4270 len = NULL_RTX;
4271
4272 /* Calculate the number of 1 bits in the mask. */
4273 rtx cpop_ops[] = {anchor, mask};
4274 if (len)
4275 emit_nonvlmax_insn (code_for_pred_popcount (mask_mode, Pmode), CPOP_OP,
4276 cpop_ops, len);
4277 else
4278 emit_vlmax_insn (code_for_pred_popcount (mask_mode, Pmode), CPOP_OP,
4279 cpop_ops);
4280
4281 riscv_expand_conditional_branch (else_label, EQ, anchor, const0_rtx);
4282 emit_insn (gen_rtx_SET (index, gen_rtx_PLUS (Pmode, anchor, constm1_rtx)));
4283 /* Compress the vector. */
4284 icode = code_for_pred_compress (mode);
4285 rtx compress_ops[] = {compress_vect, vect, mask};
4286 if (len)
4287 emit_nonvlmax_insn (icode, COMPRESS_OP, compress_ops, len);
4288 else
4289 emit_vlmax_insn (icode, COMPRESS_OP, compress_ops);
4290 /* Slide down by INDEX so the last active element lands in element 0 of a new vector. */
4291 rtx slide_ops[] = {slide_vect, compress_vect, index};
4292 icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, mode);
4293 if (len)
4294 emit_nonvlmax_insn (icode, BINARY_OP, slide_ops, len);
4295 else
4296 emit_vlmax_insn (icode, BINARY_OP, slide_ops);
4297 /* Emit v(f)mv.[xf].s. */
4298 emit_insn (gen_pred_extract_first (mode, dst, slide_vect));
4299
4300 emit_jump_insn (gen_jump (end_label));
4301 emit_barrier ();
4302 emit_label (else_label);
4303 emit_move_insn (dst, default_value);
4304 emit_label (end_label);
4305 }
4306
4307 /* Return true if the LMUL of the comparison mode is less than or equal to one. */
4308 bool
4309 cmp_lmul_le_one (machine_mode mode)
4310 {
4311 if (riscv_v_ext_vector_mode_p (mode))
4312 return known_le (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR);
4313 else if (riscv_v_ext_vls_mode_p (mode))
4314 return known_le (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN);
4315 return false;
4316 }
4317
4318 /* Return true if the LMUL of the comparison mode is greater than one. */
4319 bool
4320 cmp_lmul_gt_one (machine_mode mode)
4321 {
4322 if (riscv_v_ext_vector_mode_p (mode))
4323 return known_gt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR);
4324 else if (riscv_v_ext_vls_mode_p (mode))
4325 return known_gt (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN);
4326 return false;
4327 }
4328
4329 /* Return true if the VLS mode is legal. There are 2 cases here.
4330
4331 1. Enable VLS modes for VLA vectorization since the fixed-length VLMAX mode
4332 is the highest-priority choice and should not conflict with VLS modes.
4333 2. Enable VLS modes for some cases in fixed-vlmax, i.e. when the bitsize of
4334 the VLS mode is smaller than the minimal VLA mode.
4335
4336 Take vlen = 2048 as example for case 2.
4337
4338 Note: Below table based on vlen = 2048.
4339 +----------------------------------------------------+----------------------+
4340 | VLS mode | VLA mode |
4341 +----------------------------------------------------+----------------------+
4342 | Name | Precision | Inner Precision | Enabled | Min mode | Min bits |
4343 +------------+-----------+-----------------+---------+-----------+----------+
4344 | V1BI | 1 | 1 | Yes | RVVMF64BI | 32 |
4345 | V2BI | 2 | 1 | Yes | RVVMF64BI | 32 |
4346 | V4BI | 4 | 1 | Yes | RVVMF64BI | 32 |
4347 | V8BI | 8 | 1 | Yes | RVVMF64BI | 32 |
4348 | V16BI | 16 | 1 | Yes | RVVMF64BI | 32 |
4349 | V32BI | 32 | 1 | NO | RVVMF64BI | 32 |
4350 | V64BI | 64 | 1 | NO | RVVMF64BI | 32 |
4351 | ... | ... | ... | ... | RVVMF64BI | 32 |
4352 | V4096BI | 4096 | 1 | NO | RVVMF64BI | 32 |
4353 +------------+-----------+-----------------+---------+-----------+----------+
4354 | V1QI | 8 | 8 | Yes | RVVMF8QI | 256 |
4355 | V2QI | 16 | 8 | Yes | RVVMF8QI | 256 |
4356 | V4QI | 32 | 8 | Yes | RVVMF8QI | 256 |
4357 | V8QI | 64 | 8 | Yes | RVVMF8QI | 256 |
4358 | V16QI | 128 | 8 | Yes | RVVMF8QI | 256 |
4359 | V32QI | 256 | 8 | NO | RVVMF8QI | 256 |
4360 | V64QI | 512 | 8 | NO | RVVMF8QI | 256 |
4361 | ... | ... | .. | ... | RVVMF8QI | 256 |
4362 | V4096QI | 32768 | 8 | NO | RVVMF8QI | 256 |
4363 +------------+-----------+-----------------+---------+-----------+----------+
4364 | V1HI | 16 | 16 | Yes | RVVMF4HI | 512 |
4365 | V2HI | 32 | 16 | Yes | RVVMF4HI | 512 |
4366 | V4HI | 64 | 16 | Yes | RVVMF4HI | 512 |
4367 | V8HI | 128 | 16 | Yes | RVVMF4HI | 512 |
4368 | V16HI | 256 | 16 | Yes | RVVMF4HI | 512 |
4369 | V32HI | 512 | 16 | NO | RVVMF4HI | 512 |
4370 | V64HI | 1024 | 16 | NO | RVVMF4HI | 512 |
4371 | ... | ... | .. | ... | RVVMF4HI | 512 |
4372 | V2048HI | 32768 | 16 | NO | RVVMF4HI | 512 |
4373 +------------+-----------+-----------------+---------+-----------+----------+
4374 | V1SI/SF | 32 | 32 | Yes | RVVMF2SI | 1024 |
4375 | V2SI/SF | 64 | 32 | Yes | RVVMF2SI | 1024 |
4376 | V4SI/SF | 128 | 32 | Yes | RVVMF2SI | 1024 |
4377 | V8SI/SF | 256 | 32 | Yes | RVVMF2SI | 1024 |
4378 | V16SI/SF | 512 | 32 | Yes | RVVMF2SI | 1024 |
4379 | V32SI/SF | 1024 | 32 | NO | RVVMF2SI | 1024 |
4380 | V64SI/SF | 2048 | 32 | NO | RVVMF2SI | 1024 |
4381 | ... | ... | .. | ... | RVVMF2SI | 1024 |
4382 | V1024SI/SF | 32768 | 32 | NO | RVVMF2SI | 1024 |
4383 +------------+-----------+-----------------+---------+-----------+----------+
4384 | V1DI/DF | 64 | 64 | Yes | RVVM1DI | 2048 |
4385 | V2DI/DF | 128 | 64 | Yes | RVVM1DI | 2048 |
4386 | V4DI/DF | 256 | 64 | Yes | RVVM1DI | 2048 |
4387 | V8DI/DF | 512 | 64 | Yes | RVVM1DI | 2048 |
4388 | V16DI/DF | 1024 | 64 | Yes | RVVM1DI | 2048 |
4389 | V32DI/DF | 2048 | 64 | NO | RVVM1DI | 2048 |
4390 | V64DI/DF | 4096 | 64 | NO | RVVM1DI | 2048 |
4391 | ... | ... | .. | ... | RVVM1DI | 2048 |
4392 | V512DI/DF | 32768 | 64 | NO | RVVM1DI | 2048 |
4393 +------------+-----------+-----------------+---------+-----------+----------+
4394
4395 Then the condition for a VLS mode in fixed-vlmax is:
4396 PRECISION (VLSmode) < VLEN / (64 / PRECISION (VLS_inner_mode)). */
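/* For example (restating the condition above), with vlen = 2048 and a QImode
   inner mode (precision 8) the bound is 2048 / (64 / 8) = 256, so V16QI
   (128 bits) is enabled while V32QI (256 bits) is not, matching the table
   above.  */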
4397 bool
4398 vls_mode_valid_p (machine_mode vls_mode)
4399 {
4400 if (!TARGET_VECTOR)
4401 return false;
4402
4403 if (riscv_autovec_preference == RVV_SCALABLE)
4404 {
4405 if (GET_MODE_CLASS (vls_mode) != MODE_VECTOR_BOOL
4406 && !ordered_p (TARGET_MAX_LMUL * BITS_PER_RISCV_VECTOR,
4407 GET_MODE_PRECISION (vls_mode)))
4408 /* We enable VLS modes which are aligned with TARGET_MAX_LMUL and
4409 BITS_PER_RISCV_VECTOR.
4410
4411 E.g. when TARGET_MAX_LMUL = 1 and BITS_PER_RISCV_VECTOR = (128, 128),
4412 we enable VLS modes that have a fixed size <= 128 bits. Since ordered_p is
4413 false between VLA modes with size = (128, 128) bits and a VLS mode
4414 with size = 128 bits, we would end up with multiple ICEs in
4415 middle-end generic code. */
4416 return false;
4417 return true;
4418 }
4419
4420 if (riscv_autovec_preference == RVV_FIXED_VLMAX)
4421 {
4422 machine_mode inner_mode = GET_MODE_INNER (vls_mode);
4423 int precision = GET_MODE_PRECISION (inner_mode).to_constant ();
4424 int min_vlmax_bitsize = TARGET_MIN_VLEN / (64 / precision);
4425
4426 return GET_MODE_PRECISION (vls_mode).to_constant () < min_vlmax_bitsize;
4427 }
4428
4429 return false;
4430 }
4431
4432 /* We don't have to convert a floating-point value to integer when its
4433 fractional part is zero. Thus, there is a limit for the half, single and
4434 double precision floating-point types: a value greater than or equal to
4435 the limit has no fractional part.
4436
4437 1. Half floating point.
4438 +-----------+---------------+
4439 | float | binary layout |
4440 +-----------+---------------+
4441 | 1023.5 | 0x63ff |
4442 +-----------+---------------+
4443 | 1024.0 | 0x6400 |
4444 +-----------+---------------+
4445 | 1025.0 | 0x6401 |
4446 +-----------+---------------+
4447 | ... | ... |
4448
4449 All half-precision floating-point values are unchanged by ceil if they are
4450 greater than or equal to 1024.
4451
4452 2. Single floating point.
4453 +-----------+---------------+
4454 | float | binary layout |
4455 +-----------+---------------+
4456 | 8388607.5 | 0x4affffff |
4457 +-----------+---------------+
4458 | 8388608.0 | 0x4b000000 |
4459 +-----------+---------------+
4460 | 8388609.0 | 0x4b000001 |
4461 +-----------+---------------+
4462 | ... | ... |
4463
4464 All single-precision floating-point values are unchanged by ceil if they are
4465 greater than or equal to 8388608.
4466
4467 3. Double floating point.
4468 +--------------------+--------------------+
4469 | float | binary layout |
4470 +--------------------+--------------------+
4471 | 4503599627370495.5 | 0X432fffffffffffff |
4472 +--------------------+--------------------+
4473 | 4503599627370496.0 | 0X4330000000000000 |
4474 +--------------------+--------------------+
4475 | 4503599627370497.0 | 0X4330000000000001 |
4476 +--------------------+--------------------+
4477 | ... | ... |
4478
4479 All double-precision floating-point values are unchanged by ceil if they are
4480 greater than or equal to 4503599627370496.
4481 */
4482 static rtx
4483 get_fp_rounding_coefficient (machine_mode inner_mode)
4484 {
4485 REAL_VALUE_TYPE real;
4486
4487 if (inner_mode == E_HFmode)
4488 real_from_integer (&real, inner_mode, 1024, SIGNED);
4489 else if (inner_mode == E_SFmode)
4490 real_from_integer (&real, inner_mode, 8388608, SIGNED);
4491 else if (inner_mode == E_DFmode)
4492 real_from_integer (&real, inner_mode, 4503599627370496, SIGNED);
4493 else
4494 gcc_unreachable ();
4495
4496 return const_double_from_real_value (real, inner_mode);
4497 }
4498
4499 static rtx
4500 emit_vec_float_cmp_mask (rtx fp_vector, rtx_code code, rtx fp_scalar,
4501 machine_mode vec_fp_mode)
4502 {
4503 /* Step-1: Prepare the scalar float compare register. */
4504 rtx fp_reg = gen_reg_rtx (GET_MODE_INNER (vec_fp_mode));
4505 emit_insn (gen_move_insn (fp_reg, fp_scalar));
4506
4507 /* Step-2: Generate the mask. */
4508 machine_mode mask_mode = get_mask_mode (vec_fp_mode);
4509 rtx mask = gen_reg_rtx (mask_mode);
4510 rtx cmp = gen_rtx_fmt_ee (code, mask_mode, fp_vector, fp_reg);
4511 rtx cmp_ops[] = {mask, cmp, fp_vector, fp_reg};
4512 insn_code icode = code_for_pred_cmp_scalar (vec_fp_mode);
4513 emit_vlmax_insn (icode, COMPARE_OP, cmp_ops);
4514
4515 return mask;
4516 }
4517
4518 static void
4519 emit_vec_copysign (rtx op_dest, rtx op_src_0, rtx op_src_1,
4520 machine_mode vec_mode)
4521 {
4522 rtx sgnj_ops[] = {op_dest, op_src_0, op_src_1};
4523 insn_code icode = code_for_pred (UNSPEC_VCOPYSIGN, vec_mode);
4524
4525 emit_vlmax_insn (icode, BINARY_OP, sgnj_ops);
4526 }
4527
4528 static void
4529 emit_vec_abs (rtx op_dest, rtx op_src, machine_mode vec_mode)
4530 {
4531 rtx abs_ops[] = {op_dest, op_src};
4532 insn_code icode = code_for_pred (ABS, vec_mode);
4533
4534 emit_vlmax_insn (icode, UNARY_OP, abs_ops);
4535 }
4536
4537 static void
4538 emit_vec_cvt_x_f (rtx op_dest, rtx op_src, rtx mask,
4539 insn_type type, machine_mode vec_mode)
4540 {
4541 insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4542
4543 if (type & USE_VUNDEF_MERGE_P)
4544 {
4545 rtx cvt_x_ops[] = {op_dest, mask, op_src};
4546 emit_vlmax_insn (icode, type, cvt_x_ops);
4547 }
4548 else
4549 {
4550 rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src};
4551 emit_vlmax_insn (icode, type, cvt_x_ops);
4552 }
4553 }
4554
4555 static void
4556 emit_vec_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
4557 machine_mode vec_mode)
4558 {
4559 rtx ops[] = {op_dest, op_src};
4560 insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4561
4562 emit_vlmax_insn (icode, type, ops);
4563 }
4564
4565 static void
4566 emit_vec_narrow_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
4567 machine_mode vec_mode)
4568 {
4569 rtx ops[] = {op_dest, op_src};
4570 insn_code icode = code_for_pred_narrow_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4571
4572 emit_vlmax_insn (icode, type, ops);
4573 }
4574
4575 static void
4576 emit_vec_widden_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
4577 machine_mode vec_mode)
4578 {
4579 rtx ops[] = {op_dest, op_src};
4580 insn_code icode = code_for_pred_widen_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4581
4582 emit_vlmax_insn (icode, type, ops);
4583 }
4584
4585 static void
4586 emit_vec_widden_cvt_f_f (rtx op_dest, rtx op_src, insn_type type,
4587 machine_mode vec_mode)
4588 {
4589 rtx ops[] = {op_dest, op_src};
4590 insn_code icode = code_for_pred_extend (vec_mode);
4591
4592 emit_vlmax_insn (icode, type, ops);
4593 }
4594
4595 static void
4596 emit_vec_cvt_f_x (rtx op_dest, rtx op_src, rtx mask,
4597 insn_type type, machine_mode vec_mode)
4598 {
4599 rtx cvt_fp_ops[] = {op_dest, mask, op_dest, op_src};
4600 insn_code icode = code_for_pred (FLOAT, vec_mode);
4601
4602 emit_vlmax_insn (icode, type, cvt_fp_ops);
4603 }
4604
4605 static void
4606 emit_vec_cvt_x_f_rtz (rtx op_dest, rtx op_src, rtx mask,
4607 insn_type type, machine_mode vec_mode)
4608 {
4609 insn_code icode = code_for_pred (FIX, vec_mode);
4610
4611 if (type & USE_VUNDEF_MERGE_P)
4612 {
4613 rtx cvt_x_ops[] = {op_dest, mask, op_src};
4614 emit_vlmax_insn (icode, type, cvt_x_ops);
4615 }
4616 else
4617 {
4618 rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src};
4619 emit_vlmax_insn (icode, type, cvt_x_ops);
4620 }
4621 }
4622
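/* Expand a vector ceil (round towards +inf).  Illustrative SFmode
   walk-through (values chosen for illustration only):
     op_1              = { 1.5, -0.25, 8388608.0 }
     Step-1 abs        = { 1.5,  0.25, 8388608.0 }
     Step-2 mask       = { 1, 1, 0 }      (abs value < 8388608.0)
     Step-3 cvt x <- f = { 2, 0, - }      (masked, FRM = RUP)
     Step-4 cvt f <- x = { 2.0, 0.0, 8388608.0 }  (inactive lane kept)
     Step-5 copysign   = { 2.0, -0.0, 8388608.0 }
   which is ceil of the input, including the -0.0 case.  */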
4623 void
4624 expand_vec_ceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4625 machine_mode vec_int_mode)
4626 {
4627 /* Step-1: Get the abs float value for mask generation. */
4628 emit_vec_abs (op_0, op_1, vec_fp_mode);
4629
4630 /* Step-2: Generate the mask on const fp. */
4631 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4632 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4633
4634 /* Step-3: Convert to integer on mask, with rounding up (aka ceil). */
4635 rtx tmp = gen_reg_rtx (vec_int_mode);
4636 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RUP, vec_fp_mode);
4637
4638 /* Step-4: Convert to floating-point on mask for the final result.
4639 To avoid unnecessary frm register access, we use RUP here as well; it will
4640 never actually round up because the tmp rtx already holds integral values
4641 from the float-to-int conversion. */
4642 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RUP, vec_fp_mode);
4643
4644 /* Step-5: Retrieve the sign bit for -0.0. */
4645 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4646 }
4647
4648 void
4649 expand_vec_floor (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4650 machine_mode vec_int_mode)
4651 {
4652 /* Step-1: Get the abs float value for mask generation. */
4653 emit_vec_abs (op_0, op_1, vec_fp_mode);
4654
4655 /* Step-2: Generate the mask on const fp. */
4656 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4657 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4658
4659 /* Step-3: Convert to integer on mask, with rounding down (aka floor). */
4660 rtx tmp = gen_reg_rtx (vec_int_mode);
4661 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RDN, vec_fp_mode);
4662
4663 /* Step-4: Convert to floating-point on mask for the floor result. */
4664 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RDN, vec_fp_mode);
4665
4666 /* Step-5: Retrieve the sign bit for -0.0. */
4667 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4668 }
4669
4670 void
4671 expand_vec_nearbyint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4672 machine_mode vec_int_mode)
4673 {
4674 /* Step-1: Get the abs float value for mask generation. */
4675 emit_vec_abs (op_0, op_1, vec_fp_mode);
4676
4677 /* Step-2: Generate the mask on const fp. */
4678 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4679 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4680
4681 /* Step-3: Back up the FP exception flags; nearbyint never raises exceptions. */
4682 rtx fflags = gen_reg_rtx (SImode);
4683 emit_insn (gen_riscv_frflags (fflags));
4684
4685 /* Step-4: Convert to integer on mask, using the dynamic rounding mode (aka nearbyint). */
4686 rtx tmp = gen_reg_rtx (vec_int_mode);
4687 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_DYN, vec_fp_mode);
4688
4689 /* Step-5: Convert to floating-point on mask for the nearbyint result. */
4690 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
4691
4692 /* Step-6: Restore FP exception flags. */
4693 emit_insn (gen_riscv_fsflags (fflags));
4694
4695 /* Step-7: Retrieve the sign bit for -0.0. */
4696 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4697 }
4698
4699 void
4700 expand_vec_rint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4701 machine_mode vec_int_mode)
4702 {
4703 /* Step-1: Get the abs float value for mask generation. */
4704 emit_vec_abs (op_0, op_1, vec_fp_mode);
4705
4706 /* Step-2: Generate the mask on const fp. */
4707 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4708 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4709
4710 /* Step-3: Convert to integer on mask, with dyn rounding (aka rint). */
4711 rtx tmp = gen_reg_rtx (vec_int_mode);
4712 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_DYN, vec_fp_mode);
4713
4714 /* Step-4: Convert to floating-point on mask for the rint result. */
4715 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
4716
4717 /* Step-5: Retrieve the sign bit for -0.0. */
4718 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4719 }
4720
4721 void
4722 expand_vec_round (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4723 machine_mode vec_int_mode)
4724 {
4725 /* Step-1: Get the abs float value for mask generation. */
4726 emit_vec_abs (op_0, op_1, vec_fp_mode);
4727
4728 /* Step-2: Generate the mask on const fp. */
4729 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4730 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4731
4732 /* Step-3: Convert to integer on mask, rounding to nearest (aka round). */
4733 rtx tmp = gen_reg_rtx (vec_int_mode);
4734 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RMM, vec_fp_mode);
4735
4736 /* Step-4: Convert to floating-point on mask for the round result. */
4737 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RMM, vec_fp_mode);
4738
4739 /* Step-5: Retrieve the sign bit for -0.0. */
4740 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4741 }
4742
4743 void
4744 expand_vec_trunc (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4745 machine_mode vec_int_mode)
4746 {
4747 /* Step-1: Get the abs float value for mask generation. */
4748 emit_vec_abs (op_0, op_1, vec_fp_mode);
4749
4750 /* Step-2: Generate the mask on const fp. */
4751 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4752 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4753
4754 /* Step-3: Convert to integer on mask, rounding to zero (aka truncate). */
4755 rtx tmp = gen_reg_rtx (vec_int_mode);
4756 emit_vec_cvt_x_f_rtz (tmp, op_1, mask, UNARY_OP_TAMA, vec_fp_mode);
4757
4758 /* Step-4: Convert to floating-point on mask for the trunc result. */
4759 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
4760
4761 /* Step-5: Retrieve the sign bit for -0.0. */
4762 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4763 }
4764
4765 void
4766 expand_vec_roundeven (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4767 machine_mode vec_int_mode)
4768 {
4769 /* Step-1: Get the abs float value for mask generation. */
4770 emit_vec_abs (op_0, op_1, vec_fp_mode);
4771
4772 /* Step-2: Generate the mask on const fp. */
4773 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4774 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4775
4776 /* Step-3: Convert to integer on mask, rounding to nearest, ties to even. */
4777 rtx tmp = gen_reg_rtx (vec_int_mode);
4778 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RNE, vec_fp_mode);
4779
4780 /* Step-4: Convert to floating-point on mask for the roundeven result. */
4781 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RNE, vec_fp_mode);
4782
4783 /* Step-5: Retrieve the sign bit for -0.0. */
4784 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4785 }
4786
4787 /* Handling the rounding from floating-point to int/long/long long. */
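/* Illustrative examples for the size cases below (instruction names given
   informally): a DFmode -> DImode conversion is the same-size case
   (vfcvt.x.f.v), DFmode -> SImode is the narrowing case (vfncvt.x.f.w),
   SFmode -> DImode is the widening case (vfwcvt.x.f.v), and HFmode -> DImode
   first widens HF to SF and then uses the widening convert.  */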
4788 static void
4789 emit_vec_rounding_to_integer (rtx op_0, rtx op_1, insn_type type,
4790 machine_mode vec_fp_mode,
4791 machine_mode vec_int_mode,
4792 machine_mode vec_bridge_mode = E_VOIDmode)
4793 {
4794 poly_uint16 vec_fp_size = GET_MODE_SIZE (vec_fp_mode);
4795 poly_uint16 vec_int_size = GET_MODE_SIZE (vec_int_mode);
4796
4797 if (known_eq (vec_fp_size, vec_int_size)) /* SF => SI, DF => DI. */
4798 emit_vec_cvt_x_f (op_0, op_1, type, vec_fp_mode);
4799 else if (maybe_eq (vec_fp_size, vec_int_size * 2)) /* DF => SI. */
4800 emit_vec_narrow_cvt_x_f (op_0, op_1, type, vec_fp_mode);
4801 else if (maybe_eq (vec_fp_size * 2, vec_int_size)) /* SF => DI, HF => SI. */
4802 emit_vec_widden_cvt_x_f (op_0, op_1, type, vec_int_mode);
4803 else if (maybe_eq (vec_fp_size * 4, vec_int_size)) /* HF => DI. */
4804 {
4805 gcc_assert (vec_bridge_mode != E_VOIDmode);
4806
4807 rtx op_sf = gen_reg_rtx (vec_bridge_mode);
4808
4809 /* Step-1: HF => SF, no rounding here. */
4810 emit_vec_widden_cvt_f_f (op_sf, op_1, UNARY_OP, vec_bridge_mode);
4811 /* Step-2: SF => DI. */
4812 emit_vec_widden_cvt_x_f (op_0, op_sf, type, vec_int_mode);
4813 }
4814 else
4815 gcc_unreachable ();
4816 }
4817
4818 void
4819 expand_vec_lrint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4820 machine_mode vec_int_mode, machine_mode vec_bridge_mode)
4821 {
4822 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_DYN, vec_fp_mode,
4823 vec_int_mode, vec_bridge_mode);
4824 }
4825
4826 void
4827 expand_vec_lround (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4828 machine_mode vec_int_mode, machine_mode vec_bridge_mode)
4829 {
4830 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RMM, vec_fp_mode,
4831 vec_int_mode, vec_bridge_mode);
4832 }
4833
4834 void
4835 expand_vec_lceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4836 machine_mode vec_int_mode)
4837 {
4838 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RUP, vec_fp_mode,
4839 vec_int_mode);
4840 }
4841
4842 void
4843 expand_vec_lfloor (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4844 machine_mode vec_int_mode)
4845 {
4846 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RDN, vec_fp_mode,
4847 vec_int_mode);
4848 }
4849
4850 /* Vectorize popcount by the Wilkes-Wheeler-Gill algorithm that libgcc uses as
4851 well. */
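/* Worked example on a single 16-bit element (illustrative only), src = 0x00ff:
     x1 = 0x00ff - ((0x00ff >> 1) & 0x5555) = 0x00ff - 0x0055 = 0x00aa
     x2 = (0x00aa & 0x3333) + ((0x00aa >> 2) & 0x3333) = 0x0022 + 0x0022 = 0x0044
     x3 = (0x0044 + (0x0044 >> 4)) & 0x0f0f = 0x0048 & 0x0f0f = 0x0008
     x4 = (0x0008 * 0x0101) >> (16 - 8) = 0x0808 >> 8 = 0x0008
   giving popcount (0x00ff) = 8.  */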
4852 void
4853 expand_popcount (rtx *ops)
4854 {
4855 rtx dst = ops[0];
4856 rtx src = ops[1];
4857 machine_mode mode = GET_MODE (dst);
4858 scalar_mode imode = GET_MODE_INNER (mode);
4859 static const uint64_t m5 = 0x5555555555555555ULL;
4860 static const uint64_t m3 = 0x3333333333333333ULL;
4861 static const uint64_t mf = 0x0F0F0F0F0F0F0F0FULL;
4862 static const uint64_t m1 = 0x0101010101010101ULL;
4863
4864 rtx x1 = gen_reg_rtx (mode);
4865 rtx x2 = gen_reg_rtx (mode);
4866 rtx x3 = gen_reg_rtx (mode);
4867 rtx x4 = gen_reg_rtx (mode);
4868
4869 /* x1 = src - ((src >> 1) & 0x5555...); */
4870 rtx shift1 = expand_binop (mode, lshr_optab, src, GEN_INT (1), NULL, true,
4871 OPTAB_DIRECT);
4872
4873 rtx and1 = gen_reg_rtx (mode);
4874 rtx ops1[] = {and1, shift1, gen_int_mode (m5, imode)};
4875 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
4876 ops1);
4877
4878 x1 = expand_binop (mode, sub_optab, src, and1, NULL, true, OPTAB_DIRECT);
4879
4880 /* x2 = (x1 & 0x3333333333333333ULL) + ((x1 >> 2) & 0x3333333333333333ULL);
4881 */
4882 rtx and2 = gen_reg_rtx (mode);
4883 rtx ops2[] = {and2, x1, gen_int_mode (m3, imode)};
4884 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
4885 ops2);
4886
4887 rtx shift2 = expand_binop (mode, lshr_optab, x1, GEN_INT (2), NULL, true,
4888 OPTAB_DIRECT);
4889
4890 rtx and22 = gen_reg_rtx (mode);
4891 rtx ops22[] = {and22, shift2, gen_int_mode (m3, imode)};
4892 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
4893 ops22);
4894
4895 x2 = expand_binop (mode, add_optab, and2, and22, NULL, true, OPTAB_DIRECT);
4896
4897 /* x3 = (x2 + (x2 >> 4)) & 0x0f0f0f0f0f0f0f0fULL; */
4898 rtx shift3 = expand_binop (mode, lshr_optab, x2, GEN_INT (4), NULL, true,
4899 OPTAB_DIRECT);
4900
4901 rtx plus3
4902 = expand_binop (mode, add_optab, x2, shift3, NULL, true, OPTAB_DIRECT);
4903
4904 rtx ops3[] = {x3, plus3, gen_int_mode (mf, imode)};
4905 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
4906 ops3);
4907
4908 /* dest = (x3 * 0x0101010101010101ULL) >> 56; */
4909 rtx mul4 = gen_reg_rtx (mode);
4910 rtx ops4[] = {mul4, x3, gen_int_mode (m1, imode)};
4911 emit_vlmax_insn (code_for_pred_scalar (MULT, mode), riscv_vector::BINARY_OP,
4912 ops4);
4913
4914 x4 = expand_binop (mode, lshr_optab, mul4,
4915 GEN_INT (GET_MODE_BITSIZE (imode) - 8), NULL, true,
4916 OPTAB_DIRECT);
4917
4918 emit_move_insn (dst, x4);
4919 }
4920
4921 /* Return true if it is VLMAX AVL TYPE. */
4922 bool
4923 vlmax_avl_type_p (rtx_insn *rinsn)
4924 {
4925 extract_insn_cached (rinsn);
4926 int index = get_attr_avl_type_idx (rinsn);
4927 if (index == INVALID_ATTRIBUTE)
4928 return false;
4929 rtx avl_type = recog_data.operand[index];
4930 return INTVAL (avl_type) == VLMAX;
4931 }
4932
4933 /* Return true if it is an RVV instruction that depends on the global VL
4934 status register. */
4935 bool
4936 has_vl_op (rtx_insn *rinsn)
4937 {
4938 return recog_memoized (rinsn) >= 0 && get_attr_has_vl_op (rinsn);
4939 }
4940
4941 /* Get default tail policy. */
4942 static bool
4943 get_default_ta ()
4944 {
4945 /* For instructions that don't require TA, we still need a default value
4946 to emit vsetvl. We pick the default value according to the preferred policy. */
4947 return (bool) (get_prefer_tail_policy () & 0x1
4948 || (get_prefer_tail_policy () >> 1 & 0x1));
4949 }
4950
4951 /* Helper function to get TA operand. */
4952 bool
4953 tail_agnostic_p (rtx_insn *rinsn)
4954 {
4955 /* If it doesn't have TA, we return agnostic by default. */
4956 extract_insn_cached (rinsn);
4957 int ta = get_attr_ta (rinsn);
4958 return ta == INVALID_ATTRIBUTE ? get_default_ta () : IS_AGNOSTIC (ta);
4959 }
4960
4961 /* Change the insn and assert that the change succeeds. */
4962 void
4963 validate_change_or_fail (rtx object, rtx *loc, rtx new_rtx, bool in_group)
4964 {
4965 bool change_p = validate_change (object, loc, new_rtx, in_group);
4966 gcc_assert (change_p);
4967 }
4968
4969 /* Return true if it is NONVLMAX AVL TYPE. */
4970 bool
4971 nonvlmax_avl_type_p (rtx_insn *rinsn)
4972 {
4973 extract_insn_cached (rinsn);
4974 int index = get_attr_avl_type_idx (rinsn);
4975 if (index == INVALID_ATTRIBUTE)
4976 return false;
4977 rtx avl_type = recog_data.operand[index];
4978 return INTVAL (avl_type) == NONVLMAX;
4979 }
4980
4981 /* Return true if RTX is RVV VLMAX AVL. */
4982 bool
4983 vlmax_avl_p (rtx x)
4984 {
4985 return x && rtx_equal_p (x, RVV_VLMAX);
4986 }
4987
4988 /* Helper function to get SEW operand. We always have SEW value for
4989 all RVV instructions that have VTYPE OP. */
4990 uint8_t
4991 get_sew (rtx_insn *rinsn)
4992 {
4993 return get_attr_sew (rinsn);
4994 }
4995
4996 /* Helper function to get VLMUL operand. We always have VLMUL value for
4997 all RVV instructions that have VTYPE OP. */
4998 enum vlmul_type
4999 get_vlmul (rtx_insn *rinsn)
5000 {
5001 return (enum vlmul_type) get_attr_vlmul (rinsn);
5002 }
5003
5004 /* Count the number of occurrences of REGNO in RINSN. */
5005 int
5006 count_regno_occurrences (rtx_insn *rinsn, unsigned int regno)
5007 {
5008 int count = 0;
5009 extract_insn (rinsn);
5010 for (int i = 0; i < recog_data.n_operands; i++)
5011 if (refers_to_regno_p (regno, recog_data.operand[i]))
5012 count++;
5013 return count;
5014 }
5015
5016 /* Return true if OP can be broadcast directly. */
5017 bool
5018 can_be_broadcasted_p (rtx op)
5019 {
5020 machine_mode mode = GET_MODE (op);
5021 /* We don't allow RA (register allocation) reloads to generate
5022 (vec_duplicate:DI reg) on an RV32 system, whereas we do allow
5023 (vec_duplicate:DI mem) on an RV32 system. */
5024 if (!can_create_pseudo_p () && !FLOAT_MODE_P (mode)
5025 && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (Pmode))
5026 && !satisfies_constraint_Wdm (op))
5027 return false;
5028
5029 if (satisfies_constraint_K (op) || register_operand (op, mode)
5030 || satisfies_constraint_Wdm (op) || rtx_equal_p (op, CONST0_RTX (mode)))
5031 return true;
5032
5033 return can_create_pseudo_p () && nonmemory_operand (op, mode);
5034 }
5035
5036 void
5037 emit_vec_extract (rtx target, rtx src, rtx index)
5038 {
5039 machine_mode vmode = GET_MODE (src);
5040 machine_mode smode = GET_MODE (target);
5041 class expand_operand ops[3];
5042 enum insn_code icode
5043 = convert_optab_handler (vec_extract_optab, vmode, smode);
5044 gcc_assert (icode != CODE_FOR_nothing);
5045 create_output_operand (&ops[0], target, smode);
5046 ops[0].target = 1;
5047 create_input_operand (&ops[1], src, vmode);
5048
5049 poly_int64 val;
5050 if (poly_int_rtx_p (index, &val))
5051 create_integer_operand (&ops[2], val);
5052 else
5053 create_input_operand (&ops[2], index, Pmode);
5054
5055 expand_insn (icode, 3, ops);
5056 if (ops[0].value != target)
5057 emit_move_insn (target, ops[0].value);
5058 }
5059
5060 /* Return true if the offset mode is a valid mode that we can use for
5061 gather/scatter autovectorization. */
5062 bool
5063 gather_scatter_valid_offset_p (machine_mode mode)
5064 {
5065 /* If the element size of offset mode is already >= Pmode size,
5066 we don't need any extensions. */
5067 if (known_ge (GET_MODE_SIZE (GET_MODE_INNER (mode)), UNITS_PER_WORD))
5068 return true;
5069
5070 /* Since we will very likely have to extend the offset mode into a vector
5071 Pmode, disable gather/scatter autovectorization if we can't extend the
5072 offset mode into vector Pmode. */
5073 if (!get_vector_mode (Pmode, GET_MODE_NUNITS (mode)).exists ())
5074 return false;
5075 return true;
5076 }
5077
5078 /* Implement TARGET_ESTIMATED_POLY_VALUE.
5079 Look into the tuning structure for an estimate.
5080 KIND specifies the type of requested estimate: min, max or likely.
5081 For cores with a known VLA width all three estimates are the same.
5082 For generic VLA tuning we want to distinguish the maximum estimate from
5083 the minimum and likely ones.
5084 The likely estimate is the same as the minimum in that case to give a
5085 conservative behavior of auto-vectorizing with VLA when it is a win
5086 even for VLA vectorization.
5087 When VLA width information is available VAL.coeffs[1] is multiplied by
5088 the number of VLA chunks over the initial VLS bits. */
5089 HOST_WIDE_INT
5090 estimated_poly_value (poly_int64 val, unsigned int kind)
5091 {
5092 unsigned int width_source
5093 = BITS_PER_RISCV_VECTOR.is_constant ()
5094 ? (unsigned int) BITS_PER_RISCV_VECTOR.to_constant ()
5095 : (unsigned int) RVV_SCALABLE;
5096
5097 /* If there is no core-specific information then the minimum and likely
5098 values are based on TARGET_MIN_VLEN vectors and the maximum is based on
5099 the architectural maximum of 65536 bits. */
5100 unsigned int min_vlen_bytes = TARGET_MIN_VLEN / 8 - 1;
5101 if (width_source == RVV_SCALABLE)
5102 switch (kind)
5103 {
5104 case POLY_VALUE_MIN:
5105 case POLY_VALUE_LIKELY:
5106 return val.coeffs[0];
5107
5108 case POLY_VALUE_MAX:
5109 return val.coeffs[0] + val.coeffs[1] * min_vlen_bytes;
5110 }
5111
5112 /* Allow BITS_PER_RISCV_VECTOR to be a bitmask of different VL, treating the
5113 lowest as likely. This could be made more general if future -mtune
5114 options need it to be. */
5115 if (kind == POLY_VALUE_MAX)
5116 width_source = 1 << floor_log2 (width_source);
5117 else
5118 width_source = least_bit_hwi (width_source);
5119
5120 /* If the core provides width information, use that. */
5121 HOST_WIDE_INT over_min_vlen = width_source - TARGET_MIN_VLEN;
5122 return val.coeffs[0] + val.coeffs[1] * over_min_vlen / TARGET_MIN_VLEN;
5123 }
5124
5125 } // namespace riscv_vector