1 /* Subroutines used for code generation for RISC-V 'V' Extension for
3 Copyright (C) 2022-2024 Free Software Foundation, Inc.
4 Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
13 GCC is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define IN_TARGET_CODE 1
24 /* We have a maximum of 11 operands for RVV instruction patterns according to
26 #define RVV_INSN_OPERANDS_MAX 11
30 #include "coretypes.h"
34 #include "insn-config.h"
35 #include "insn-attr.h"
39 #include "stringpool.h"
46 #include "targhooks.h"
49 #include "tm-constrs.h"
50 #include "rtx-vector-builder.h"
51 #include "targhooks.h"
54 using namespace riscv_vector
;
56 namespace riscv_vector
{
58 /* Return true if NUNTIS <=31 so that we can use immediate AVL in vsetivli. */
60 imm_avl_p (machine_mode mode
)
62 poly_uint64 nunits
= GET_MODE_NUNITS (mode
);
64 return nunits
.is_constant ()
65 /* The vsetivli can only hold register 0~31. */
66 ? (IN_RANGE (nunits
.to_constant (), 0, 31))
67 /* Only allowed in VLS-VLMAX mode. */
71 /* Return true if LEN is equal to NUNITS that out of the range [0, 31]. */
73 is_vlmax_len_p (machine_mode mode
, rtx len
)
76 return poly_int_rtx_p (len
, &value
)
77 && known_eq (value
, GET_MODE_NUNITS (mode
));
80 /* Helper functions for insn_flags && insn_types */
82 /* Return true if caller need pass mask operand for insn pattern with
86 need_mask_operand_p (unsigned insn_flags
)
88 return (insn_flags
& HAS_MASK_P
)
89 && !(insn_flags
& (USE_ONE_TRUE_MASK_P
| USE_ALL_TRUES_MASK_P
));
92 template <int MAX_OPERANDS
> class insn_expander
95 insn_expander () = delete;
97 insn_expander (unsigned insn_flags
, bool vlmax_p
)
98 : m_insn_flags (insn_flags
), m_opno (0), m_vlmax_p (vlmax_p
),
104 void check_insn_flags () const
106 if (m_insn_flags
& USE_ONE_TRUE_MASK_P
)
107 /* USE_ONE_TRUE_MASK_P is dependent on HAS_MASK_P. */
108 gcc_assert ((m_insn_flags
& HAS_MASK_P
));
110 if (m_insn_flags
& USE_ALL_TRUES_MASK_P
)
111 /* USE_ALL_TRUES_MASK_P is dependent on HAS_MASK_P. */
112 gcc_assert ((m_insn_flags
& HAS_MASK_P
));
114 /* USE_ONE_TRUE_MASK_P and USE_ALL_TRUES_MASK_P are mutually exclusive. */
115 gcc_assert (!((m_insn_flags
& USE_ONE_TRUE_MASK_P
)
116 && (m_insn_flags
& USE_ALL_TRUES_MASK_P
)));
118 if (m_insn_flags
& USE_VUNDEF_MERGE_P
)
119 /* USE_VUNDEF_MERGE_P is dependent on HAS_MERGE_P. */
120 gcc_assert ((m_insn_flags
& HAS_MERGE_P
));
122 /* TU_POLICY_P and TDEFAULT_POLICY_P are mutually exclusive. */
124 !((m_insn_flags
& TU_POLICY_P
) && (m_insn_flags
& TDEFAULT_POLICY_P
)));
126 /* MU_POLICY_P and MDEFAULT_POLICY_P are mutually exclusive. */
128 !((m_insn_flags
& MU_POLICY_P
) && (m_insn_flags
& MDEFAULT_POLICY_P
)));
130 /* NULLARY_OP_P, UNARY_OP_P, BINARY_OP_P, TERNARY_OP_P are mutually
133 !((m_insn_flags
& NULLARY_OP_P
)
134 && ((m_insn_flags
& UNARY_OP_P
) || (m_insn_flags
& BINARY_OP_P
)
135 || (m_insn_flags
& TERNARY_OP_P
))));
137 !((m_insn_flags
& UNARY_OP_P
)
138 && ((m_insn_flags
& NULLARY_OP_P
) || (m_insn_flags
& BINARY_OP_P
)
139 || (m_insn_flags
& TERNARY_OP_P
))));
141 !((m_insn_flags
& BINARY_OP_P
)
142 && ((m_insn_flags
& NULLARY_OP_P
) || (m_insn_flags
& UNARY_OP_P
)
143 || (m_insn_flags
& TERNARY_OP_P
))));
145 !((m_insn_flags
& TERNARY_OP_P
)
146 && ((m_insn_flags
& NULLARY_OP_P
) || (m_insn_flags
& UNARY_OP_P
)
147 || (m_insn_flags
& BINARY_OP_P
))));
150 void set_vl (rtx vl
) { m_vl_op
= vl
; }
152 void add_output_operand (rtx x
, machine_mode mode
)
154 create_output_operand (&m_ops
[m_opno
++], x
, mode
);
155 gcc_assert (m_opno
<= MAX_OPERANDS
);
157 void add_input_operand (rtx x
, machine_mode mode
)
159 create_input_operand (&m_ops
[m_opno
++], x
, mode
);
160 gcc_assert (m_opno
<= MAX_OPERANDS
);
162 void add_all_one_mask_operand (machine_mode mask_mode
)
164 add_input_operand (CONSTM1_RTX (mask_mode
), mask_mode
);
166 void add_first_one_true_mask_operand (machine_mode mask_mode
)
168 add_input_operand (gen_scalar_move_mask (mask_mode
), mask_mode
);
170 void add_vundef_operand (machine_mode dest_mode
)
172 add_input_operand (RVV_VUNDEF (dest_mode
), dest_mode
);
174 void add_policy_operand ()
176 if (m_insn_flags
& TU_POLICY_P
)
178 rtx tail_policy_rtx
= gen_int_mode (TAIL_UNDISTURBED
, Pmode
);
179 add_input_operand (tail_policy_rtx
, Pmode
);
181 else if (m_insn_flags
& TDEFAULT_POLICY_P
)
183 rtx tail_policy_rtx
= gen_int_mode (get_prefer_tail_policy (), Pmode
);
184 add_input_operand (tail_policy_rtx
, Pmode
);
187 if (m_insn_flags
& MU_POLICY_P
)
189 rtx mask_policy_rtx
= gen_int_mode (MASK_UNDISTURBED
, Pmode
);
190 add_input_operand (mask_policy_rtx
, Pmode
);
192 else if (m_insn_flags
& MDEFAULT_POLICY_P
)
194 rtx mask_policy_rtx
= gen_int_mode (get_prefer_mask_policy (), Pmode
);
195 add_input_operand (mask_policy_rtx
, Pmode
);
198 void add_avl_type_operand (avl_type type
)
200 add_input_operand (gen_int_mode (type
, Pmode
), Pmode
);
204 add_rounding_mode_operand (enum floating_point_rounding_mode rounding_mode
)
206 rtx frm_rtx
= gen_int_mode (rounding_mode
, Pmode
);
207 add_input_operand (frm_rtx
, Pmode
);
210 /* Return the vtype mode based on insn_flags.
211 vtype mode mean the mode vsetvl insn set. */
213 get_vtype_mode (rtx
*ops
)
215 machine_mode vtype_mode
;
216 if (m_insn_flags
& VTYPE_MODE_FROM_OP1_P
)
217 vtype_mode
= GET_MODE (ops
[1]);
219 vtype_mode
= GET_MODE (ops
[0]);
223 void emit_insn (enum insn_code icode
, rtx
*ops
)
227 /* It's true if any operand is memory operand. */
228 bool any_mem_p
= false;
230 machine_mode vtype_mode
= get_vtype_mode (ops
);
231 machine_mode mask_mode
= get_mask_mode (vtype_mode
);
233 /* Add dest operand. */
234 if (m_insn_flags
& HAS_DEST_P
)
236 rtx op
= ops
[opno
++];
237 any_mem_p
|= MEM_P (op
);
238 add_output_operand (op
, GET_MODE (op
));
241 /* Add mask operand. */
242 if (m_insn_flags
& USE_ONE_TRUE_MASK_P
)
243 add_first_one_true_mask_operand (mask_mode
);
244 else if (m_insn_flags
& USE_ALL_TRUES_MASK_P
)
245 add_all_one_mask_operand (mask_mode
);
246 else if (m_insn_flags
& HAS_MASK_P
)
248 machine_mode mode
= insn_data
[(int) icode
].operand
[m_opno
].mode
;
249 gcc_assert (mode
!= VOIDmode
);
250 add_input_operand (ops
[opno
++], mode
);
253 /* Add merge operand. */
254 if (m_insn_flags
& USE_VUNDEF_MERGE_P
)
255 /* Same as dest operand. */
256 add_vundef_operand (GET_MODE (ops
[0]));
257 else if (m_insn_flags
& HAS_MERGE_P
)
259 machine_mode mode
= insn_data
[(int) icode
].operand
[m_opno
].mode
;
260 gcc_assert (mode
!= VOIDmode
);
261 add_input_operand (ops
[opno
++], mode
);
264 if (m_insn_flags
& NULLARY_OP_P
)
266 else if (m_insn_flags
& UNARY_OP_P
)
268 else if (m_insn_flags
& BINARY_OP_P
)
270 else if (m_insn_flags
& TERNARY_OP_P
)
275 /* Add the remain operands. */
276 for (; num_ops
; num_ops
--, opno
++)
278 any_mem_p
|= MEM_P (ops
[opno
]);
279 machine_mode mode
= insn_data
[(int) icode
].operand
[m_opno
].mode
;
280 /* 'create_input_operand doesn't allow VOIDmode.
281 According to vector.md, we may have some patterns that do not have
282 explicit machine mode specifying the operand. Such operands are
284 if (mode
== VOIDmode
)
287 /* Early assertion ensures same mode since maybe_legitimize_operand
289 gcc_assert (GET_MODE (ops
[opno
]) == VOIDmode
290 || GET_MODE (ops
[opno
]) == mode
);
292 add_input_operand (ops
[opno
], mode
);
295 /* Add vl operand. */
300 if (riscv_v_ext_vls_mode_p (vtype_mode
))
302 /* VLS modes always set VSETVL by
303 "vsetvl zero, rs1/imm". */
304 poly_uint64 nunits
= GET_MODE_NUNITS (vtype_mode
);
305 len
= gen_int_mode (nunits
, Pmode
);
308 else if (can_create_pseudo_p ())
310 len
= gen_reg_rtx (Pmode
);
311 emit_vlmax_vsetvl (vtype_mode
, len
);
315 gcc_assert (len
!= NULL_RTX
);
316 add_input_operand (len
, Pmode
);
318 /* Add tail and mask policy operands. */
319 add_policy_operand ();
321 /* Add avl_type operand. */
322 add_avl_type_operand (
323 vls_p
? avl_type::VLS
324 : (m_vlmax_p
? avl_type::VLMAX
: avl_type::NONVLMAX
));
326 /* Add rounding mode operand. */
327 if (m_insn_flags
& FRM_DYN_P
)
328 add_rounding_mode_operand (FRM_DYN
);
329 else if (m_insn_flags
& FRM_RUP_P
)
330 add_rounding_mode_operand (FRM_RUP
);
331 else if (m_insn_flags
& FRM_RDN_P
)
332 add_rounding_mode_operand (FRM_RDN
);
333 else if (m_insn_flags
& FRM_RMM_P
)
334 add_rounding_mode_operand (FRM_RMM
);
335 else if (m_insn_flags
& FRM_RNE_P
)
336 add_rounding_mode_operand (FRM_RNE
);
338 gcc_assert (insn_data
[(int) icode
].n_operands
== m_opno
);
339 expand (icode
, any_mem_p
);
342 void expand (enum insn_code icode
, bool temporary_volatile_p
= false)
344 if (temporary_volatile_p
)
346 temporary_volatile_ok
v (true);
347 expand_insn (icode
, m_opno
, m_ops
);
350 expand_insn (icode
, m_opno
, m_ops
);
354 unsigned m_insn_flags
;
358 expand_operand m_ops
[MAX_OPERANDS
];
361 /* Emit an RVV insn with a vector length that equals the number of units of the
362 vector mode. For VLA modes this corresponds to VLMAX.
364 Unless the vector length can be encoded in the vsetivl[i] instruction this
365 function must only be used as long as we can create pseudo registers. This is
366 because it will set a pseudo register to VLMAX using vsetvl and use this as
367 definition for the vector length. */
369 emit_vlmax_insn (unsigned icode
, unsigned insn_flags
, rtx
*ops
)
371 insn_expander
<RVV_INSN_OPERANDS_MAX
> e (insn_flags
, true);
372 gcc_assert (can_create_pseudo_p () || imm_avl_p (e
.get_vtype_mode (ops
)));
374 e
.emit_insn ((enum insn_code
) icode
, ops
);
377 /* Like emit_vlmax_insn but must only be used when we cannot create pseudo
378 registers anymore. This function, however, takes a predefined vector length
379 from the value in VL. */
381 emit_vlmax_insn_lra (unsigned icode
, unsigned insn_flags
, rtx
*ops
, rtx vl
)
383 gcc_assert (!can_create_pseudo_p ());
384 machine_mode mode
= GET_MODE (ops
[0]);
386 if (imm_avl_p (mode
))
388 /* Even though VL is a real hardreg already allocated since
389 it is post-RA now, we still gain benefits that we emit
390 vsetivli zero, imm instead of vsetvli VL, zero which is
391 we can be more flexible in post-RA instruction scheduling. */
392 insn_expander
<RVV_INSN_OPERANDS_MAX
> e (insn_flags
, false);
393 e
.set_vl (gen_int_mode (GET_MODE_NUNITS (mode
), Pmode
));
394 e
.emit_insn ((enum insn_code
) icode
, ops
);
398 insn_expander
<RVV_INSN_OPERANDS_MAX
> e (insn_flags
, true);
400 e
.emit_insn ((enum insn_code
) icode
, ops
);
404 /* Emit an RVV insn with a predefined vector length. Contrary to
405 emit_vlmax_insn the instruction's vector length is not deduced from its mode
406 but taken from the value in VL. */
408 emit_nonvlmax_insn (unsigned icode
, unsigned insn_flags
, rtx
*ops
, rtx vl
)
410 insn_expander
<RVV_INSN_OPERANDS_MAX
> e (insn_flags
, false);
412 e
.emit_insn ((enum insn_code
) icode
, ops
);
415 class rvv_builder
: public rtx_vector_builder
418 rvv_builder () : rtx_vector_builder () {}
419 rvv_builder (machine_mode mode
, unsigned int npatterns
,
420 unsigned int nelts_per_pattern
)
421 : rtx_vector_builder (mode
, npatterns
, nelts_per_pattern
)
423 m_inner_mode
= GET_MODE_INNER (mode
);
424 m_inner_bits_size
= GET_MODE_BITSIZE (m_inner_mode
);
425 m_inner_bytes_size
= GET_MODE_SIZE (m_inner_mode
);
426 m_mask_mode
= get_mask_mode (mode
);
429 int_mode_for_size (inner_bits_size (), 0).exists (&m_inner_int_mode
));
431 = get_vector_mode (m_inner_int_mode
, GET_MODE_NUNITS (mode
)).require ();
434 bool can_duplicate_repeating_sequence_p ();
435 rtx
get_merged_repeating_sequence ();
437 bool repeating_sequence_use_merge_profitable_p ();
438 bool combine_sequence_use_slideup_profitable_p ();
439 bool combine_sequence_use_merge_profitable_p ();
440 rtx
get_merge_scalar_mask (unsigned int, machine_mode
) const;
442 bool single_step_npatterns_p () const;
443 bool npatterns_all_equal_p () const;
444 bool interleaved_stepped_npatterns_p () const;
445 bool npatterns_vid_diff_repeated_p () const;
447 machine_mode
new_mode () const { return m_new_mode
; }
448 scalar_mode
inner_mode () const { return m_inner_mode
; }
449 scalar_int_mode
inner_int_mode () const { return m_inner_int_mode
; }
450 machine_mode
mask_mode () const { return m_mask_mode
; }
451 machine_mode
int_mode () const { return m_int_mode
; }
452 unsigned int inner_bits_size () const { return m_inner_bits_size
; }
453 unsigned int inner_bytes_size () const { return m_inner_bytes_size
; }
456 scalar_mode m_inner_mode
;
457 scalar_int_mode m_inner_int_mode
;
458 machine_mode m_new_mode
;
459 scalar_int_mode m_new_inner_mode
;
460 machine_mode m_mask_mode
;
461 machine_mode m_int_mode
;
462 unsigned int m_inner_bits_size
;
463 unsigned int m_inner_bytes_size
;
466 /* Return true if the vector duplicated by a super element which is the fusion
467 of consecutive elements.
469 v = { a, b, a, b } super element = ab, v = { ab, ab } */
471 rvv_builder::can_duplicate_repeating_sequence_p ()
473 poly_uint64 new_size
= exact_div (full_nelts (), npatterns ());
474 unsigned int new_inner_size
= m_inner_bits_size
* npatterns ();
475 if (!int_mode_for_size (new_inner_size
, 0).exists (&m_new_inner_mode
)
476 || GET_MODE_SIZE (m_new_inner_mode
) > UNITS_PER_WORD
477 || !get_vector_mode (m_new_inner_mode
, new_size
).exists (&m_new_mode
))
479 if (full_nelts ().is_constant ())
480 return repeating_sequence_p (0, full_nelts ().to_constant (), npatterns ());
481 return nelts_per_pattern () == 1;
484 /* Return true if it is a repeating sequence that using
485 merge approach has better codegen than using default
486 approach (slide1down).
489 {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
494 for merging a we need mask 101010....
495 for merging b we need mask 010101....
497 Foreach element in the npattern, we need to build a mask in scalar register.
498 Mostely we need 3 instructions (aka COST = 3), which is consist of 2 scalar
499 instruction and 1 scalar move to v0 register. Finally we need vector merge
505 vmerge.vxm v9, v9, a1, v0
507 So the overall (roughly) COST of Sequence A = (3 + 1) * npatterns = 8.
508 If we use slide1down, the COST = nelts = 16 > 8 (COST of merge).
509 So return true in this case as it is profitable.
512 {a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h}
517 COST of merge approach = (3 + 1) * npatterns = 24
518 COST of slide1down approach = nelts = 16
519 Return false in this case as it is NOT profitable in merge approach.
522 rvv_builder::repeating_sequence_use_merge_profitable_p ()
524 if (inner_bytes_size () > UNITS_PER_WORD
)
527 unsigned int nelts
= full_nelts ().to_constant ();
529 if (!repeating_sequence_p (0, nelts
, npatterns ()))
532 unsigned int merge_cost
= 1;
533 unsigned int build_merge_mask_cost
= 3;
534 unsigned int slide1down_cost
= nelts
;
536 return (build_merge_mask_cost
+ merge_cost
) * npatterns () < slide1down_cost
;
539 /* Return true if it's worthwhile to use slideup combine 2 vectors. */
541 rvv_builder::combine_sequence_use_slideup_profitable_p ()
543 int nelts
= full_nelts ().to_constant ();
544 int leading_ndups
= this->count_dups (0, nelts
- 1, 1);
545 int trailing_ndups
= this->count_dups (nelts
- 1, -1, -1);
547 /* ??? Current heuristic we do is we do combine 2 vectors
549 1. # of leading same elements is equal to # of trailing same elements.
550 2. Both of above are equal to nelts / 2.
551 Otherwise, it is not profitable. */
552 return leading_ndups
== trailing_ndups
&& trailing_ndups
== nelts
/ 2;
555 /* Return true if it's worthwhile to use merge combine vector with a scalar. */
557 rvv_builder::combine_sequence_use_merge_profitable_p ()
559 int nelts
= full_nelts ().to_constant ();
560 int leading_ndups
= this->count_dups (0, nelts
- 1, 1);
561 int trailing_ndups
= this->count_dups (nelts
- 1, -1, -1);
562 int nregs
= riscv_get_v_regno_alignment (int_mode ());
564 if (leading_ndups
+ trailing_ndups
!= nelts
)
567 /* Leading elements num > 255 which exceeds the maximum value
568 of QImode, we will need to use HImode. */
570 if (leading_ndups
> 255 || nregs
> 2)
572 if (!get_vector_mode (HImode
, nelts
).exists (&mode
))
574 /* We will need one more AVL/VL toggling vsetvl instruction. */
575 return leading_ndups
> 4 && trailing_ndups
> 4;
578 /* { a, a, a, b, b, ... , b } and { b, b, b, a, a, ... , a }
579 consume 3 slide instructions. */
580 return leading_ndups
> 3 && trailing_ndups
> 3;
583 /* Merge the repeating sequence into a single element and return the RTX. */
585 rvv_builder::get_merged_repeating_sequence ()
587 scalar_int_mode mode
= Pmode
;
588 rtx target
= gen_reg_rtx (mode
);
589 emit_move_insn (target
, const0_rtx
);
590 rtx imm
= gen_int_mode ((1ULL << m_inner_bits_size
) - 1, mode
);
591 /* { a, b, a, b }: Generate duplicate element = b << bits | a. */
592 for (unsigned int i
= 0; i
< npatterns (); i
++)
594 unsigned int loc
= m_inner_bits_size
* i
;
595 rtx shift
= gen_int_mode (loc
, mode
);
596 rtx ele
= gen_lowpart (mode
, elt (i
));
597 rtx tmp
= expand_simple_binop (mode
, AND
, ele
, imm
, NULL_RTX
, false,
599 rtx tmp2
= expand_simple_binop (mode
, ASHIFT
, tmp
, shift
, NULL_RTX
, false,
601 rtx tmp3
= expand_simple_binop (mode
, IOR
, tmp2
, target
, NULL_RTX
, false,
603 emit_move_insn (target
, tmp3
);
605 if (GET_MODE_SIZE (m_new_inner_mode
) < UNITS_PER_WORD
)
606 return gen_lowpart (m_new_inner_mode
, target
);
610 /* Get the mask for merge approach.
612 Consider such following case:
613 {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
614 To merge "a", the mask should be 1010....
615 To merge "b", the mask should be 0101....
618 rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern
,
619 machine_mode inner_mode
) const
621 unsigned HOST_WIDE_INT mask
= 0;
622 unsigned HOST_WIDE_INT base_mask
= (1ULL << index_in_pattern
);
623 /* Here we construct a mask pattern that will later be broadcast
624 to a vector register. The maximum broadcast size for vmv.v.x/vmv.s.x
625 is determined by the length of a vector element (ELEN) and not by
626 XLEN so make sure we do not exceed it. One example is -march=zve32*
627 which mandates ELEN == 32 but can be combined with -march=rv64
629 unsigned int elen
= TARGET_VECTOR_ELEN_64
? 64 : 32;
631 gcc_assert (elen
% npatterns () == 0);
633 int limit
= elen
/ npatterns ();
635 for (int i
= 0; i
< limit
; i
++)
636 mask
|= base_mask
<< (i
* npatterns ());
638 return gen_int_mode (mask
, inner_mode
);
641 /* Return true if the variable-length vector is single step.
642 Single step means step all patterns in NPATTERNS are equal.
643 Consider this following case:
645 CASE 1: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
646 { 0, 2, 2, 4, 4, 6, ... }
647 First pattern: step1 = 2 - 0 = 2
649 Second pattern: step1 = 4 - 2 = 2
651 Since all steps of NPATTERNS are equal step = 2.
652 Return true in this case.
654 CASE 2: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
655 { 0, 1, 2, 4, 4, 7, ... }
656 First pattern: step1 = 2 - 0 = 2
658 Second pattern: step1 = 4 - 1 = 3
660 Since not all steps are equal, return false. */
662 rvv_builder::single_step_npatterns_p () const
664 if (nelts_per_pattern () != 3)
668 = rtx_to_poly_int64 (elt (npatterns ())) - rtx_to_poly_int64 (elt (0));
669 for (unsigned int i
= 0; i
< npatterns (); i
++)
671 poly_int64 ele0
= rtx_to_poly_int64 (elt (i
));
672 poly_int64 ele1
= rtx_to_poly_int64 (elt (npatterns () + i
));
673 poly_int64 ele2
= rtx_to_poly_int64 (elt (npatterns () * 2 + i
));
674 poly_int64 diff1
= ele1
- ele0
;
675 poly_int64 diff2
= ele2
- ele1
;
676 if (maybe_ne (step
, diff1
) || maybe_ne (step
, diff2
))
682 /* Return true if the diff between const vector and vid sequence
683 is repeated. For example as below cases:
684 The diff means the const vector - vid.
686 CONST VECTOR: {3, 2, 1, 0, 7, 6, 5, 4, ... }
687 VID : {0, 1, 2, 3, 4, 5, 6, 7, ... }
688 DIFF(MINUS) : {3, 1,-1,-3, 3, 1,-1,-3, ... }
689 The diff sequence {3, 1,-1,-3} is repeated in the npattern and
690 return TRUE for case 1.
693 CONST VECTOR: {-4, 4,-3, 5,-2, 6,-1, 7, ...}
694 VID : { 0, 1, 2, 3, 4, 5, 6, 7, ... }
695 DIFF(MINUS) : {-4, 3,-5,-2,-6, 1,-7, 0, ... }
696 The diff sequence {-4, 3} is not repated in the npattern and
697 return FALSE for case 2. */
699 rvv_builder::npatterns_vid_diff_repeated_p () const
701 if (nelts_per_pattern () != 3)
703 else if (npatterns () == 0)
706 for (unsigned i
= 0; i
< npatterns (); i
++)
708 poly_int64 diff_0
= rtx_to_poly_int64 (elt (i
)) - i
;
710 = rtx_to_poly_int64 (elt (npatterns () + i
)) - npatterns () - i
;
712 if (maybe_ne (diff_0
, diff_1
))
719 /* Return true if the permutation consists of two
720 interleaved patterns with a constant step each.
721 TODO: We currently only support NPATTERNS = 2. */
723 rvv_builder::interleaved_stepped_npatterns_p () const
725 if (npatterns () != 2 || nelts_per_pattern () != 3)
727 for (unsigned int i
= 0; i
< npatterns (); i
++)
729 poly_int64 ele0
= rtx_to_poly_int64 (elt (i
));
730 poly_int64 ele1
= rtx_to_poly_int64 (elt (npatterns () + i
));
731 poly_int64 ele2
= rtx_to_poly_int64 (elt (npatterns () * 2 + i
));
732 poly_int64 diff1
= ele1
- ele0
;
733 poly_int64 diff2
= ele2
- ele1
;
734 if (maybe_ne (diff1
, diff2
))
740 /* Return true if all elements of NPATTERNS are equal.
743 { 2, 2, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, ... }
745 { 2, 2, 2, 2, 2, 2, 2, 2, 8, 8, 8, 8, 8, 8, 8, 8, ... }
746 We only check ele[0] ~ ele[NPATTERNS - 1] whether they are the same.
747 We don't need to check the elements[n] with n >= NPATTERNS since
748 they don't belong to the same pattern.
751 rvv_builder::npatterns_all_equal_p () const
753 poly_int64 ele0
= rtx_to_poly_int64 (elt (0));
754 for (unsigned int i
= 1; i
< npatterns (); i
++)
756 poly_int64 ele
= rtx_to_poly_int64 (elt (i
));
757 if (!known_eq (ele
, ele0
))
764 get_sew (machine_mode mode
)
766 unsigned int sew
= GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
768 : GET_MODE_BITSIZE (GET_MODE_INNER (mode
));
772 /* Return true if X is a const_vector with all duplicate elements, which is in
773 the range between MINVAL and MAXVAL. */
775 const_vec_all_same_in_range_p (rtx x
, HOST_WIDE_INT minval
,
776 HOST_WIDE_INT maxval
)
779 return (const_vec_duplicate_p (x
, &elt
) && CONST_INT_P (elt
)
780 && IN_RANGE (INTVAL (elt
), minval
, maxval
));
783 /* Return true if VEC is a constant in which every element is in the range
784 [MINVAL, MAXVAL]. The elements do not need to have the same value.
786 This function also exists in aarch64, we may unify it in middle-end in the
790 const_vec_all_in_range_p (rtx vec
, poly_int64 minval
, poly_int64 maxval
)
792 if (!CONST_VECTOR_P (vec
)
793 || GET_MODE_CLASS (GET_MODE (vec
)) != MODE_VECTOR_INT
)
797 if (!CONST_VECTOR_STEPPED_P (vec
))
798 nunits
= const_vector_encoded_nelts (vec
);
799 else if (!CONST_VECTOR_NUNITS (vec
).is_constant (&nunits
))
802 for (int i
= 0; i
< nunits
; i
++)
804 rtx vec_elem
= CONST_VECTOR_ELT (vec
, i
);
806 if (!poly_int_rtx_p (vec_elem
, &value
)
807 || maybe_lt (value
, minval
)
808 || maybe_gt (value
, maxval
))
814 /* Return a const vector of VAL. The VAL can be either const_int or
818 gen_const_vector_dup (machine_mode mode
, poly_int64 val
)
820 scalar_mode smode
= GET_MODE_INNER (mode
);
821 rtx c
= gen_int_mode (val
, smode
);
822 if (!val
.is_constant () && GET_MODE_SIZE (smode
) > GET_MODE_SIZE (Pmode
))
824 /* When VAL is const_poly_int value, we need to explicitly broadcast
825 it into a vector using RVV broadcast instruction. */
826 return expand_vector_broadcast (mode
, c
);
828 return gen_const_vec_duplicate (mode
, c
);
831 /* Emit a vlmax vsetvl instruction. This should only be used when
832 optimization is disabled or after vsetvl insertion pass. */
834 emit_hard_vlmax_vsetvl (machine_mode vmode
, rtx vl
)
836 unsigned int sew
= get_sew (vmode
);
837 emit_insn (gen_vsetvl (Pmode
, vl
, RVV_VLMAX
, gen_int_mode (sew
, Pmode
),
838 gen_int_mode (get_vlmul (vmode
), Pmode
), const0_rtx
,
843 emit_vlmax_vsetvl (machine_mode vmode
, rtx vl
)
845 unsigned int sew
= get_sew (vmode
);
846 enum vlmul_type vlmul
= get_vlmul (vmode
);
847 unsigned int ratio
= calculate_ratio (sew
, vlmul
);
850 emit_hard_vlmax_vsetvl (vmode
, vl
);
852 emit_insn (gen_vlmax_avl (Pmode
, vl
, gen_int_mode (ratio
, Pmode
)));
855 /* Calculate SEW/LMUL ratio. */
857 calculate_ratio (unsigned int sew
, enum vlmul_type vlmul
)
889 /* SCALABLE means that the vector-length is agnostic (run-time invariant and
890 compile-time unknown). FIXED meands that the vector-length is specific
891 (compile-time known). Both RVV_SCALABLE and RVV_FIXED_VLMAX are doing
892 auto-vectorization using VLMAX vsetvl configuration. */
894 autovec_use_vlmax_p (void)
896 return (riscv_autovec_preference
== RVV_SCALABLE
897 || riscv_autovec_preference
== RVV_FIXED_VLMAX
);
900 /* This function emits VLMAX vrgather instruction. Emit vrgather.vx/vi when sel
901 is a const duplicate vector. Otherwise, emit vrgather.vv. */
903 emit_vlmax_gather_insn (rtx target
, rtx op
, rtx sel
)
907 machine_mode data_mode
= GET_MODE (target
);
908 machine_mode sel_mode
= GET_MODE (sel
);
909 if (const_vec_duplicate_p (sel
, &elt
))
911 icode
= code_for_pred_gather_scalar (data_mode
);
914 else if (maybe_ne (GET_MODE_SIZE (data_mode
), GET_MODE_SIZE (sel_mode
)))
915 icode
= code_for_pred_gatherei16 (data_mode
);
917 icode
= code_for_pred_gather (data_mode
);
918 rtx ops
[] = {target
, op
, sel
};
919 emit_vlmax_insn (icode
, BINARY_OP
, ops
);
923 emit_vlmax_masked_gather_mu_insn (rtx target
, rtx op
, rtx sel
, rtx mask
)
927 machine_mode data_mode
= GET_MODE (target
);
928 machine_mode sel_mode
= GET_MODE (sel
);
929 if (const_vec_duplicate_p (sel
, &elt
))
931 icode
= code_for_pred_gather_scalar (data_mode
);
934 else if (maybe_ne (GET_MODE_SIZE (data_mode
), GET_MODE_SIZE (sel_mode
)))
935 icode
= code_for_pred_gatherei16 (data_mode
);
937 icode
= code_for_pred_gather (data_mode
);
938 rtx ops
[] = {target
, mask
, target
, op
, sel
};
939 emit_vlmax_insn (icode
, BINARY_OP_TAMU
, ops
);
942 /* According to RVV ISA spec (16.5.1. Synthesizing vdecompress):
943 https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc
945 There is no inverse vdecompress provided, as this operation can be readily
946 synthesized using iota and a masked vrgather:
948 Desired functionality of 'vdecompress'
949 7 6 5 4 3 2 1 0 # vid
951 e d c b a # packed vector of 5 elements
952 1 0 0 1 1 1 0 1 # mask vector of 8 elements
953 p q r s t u v w # destination register before vdecompress
955 e q r d c b v a # result of vdecompress
957 # v1 holds packed data
958 # v11 holds input expanded vector and result
959 viota.m v10, v0 # Calc iota from mask in v0
960 vrgather.vv v11, v1, v10, v0.t # Expand into destination
961 p q r s t u v w # v11 destination register
962 e d c b a # v1 source vector
963 1 0 0 1 1 1 0 1 # v0 mask vector
965 4 4 4 3 2 1 1 0 # v10 result of viota.m
966 e q r d c b v a # v11 destination after vrgather using viota.m under mask
969 emit_vlmax_decompress_insn (rtx target
, rtx op0
, rtx op1
, rtx mask
)
971 machine_mode data_mode
= GET_MODE (target
);
972 machine_mode sel_mode
= related_int_vector_mode (data_mode
).require ();
973 if (GET_MODE_INNER (data_mode
) == QImode
)
974 sel_mode
= get_vector_mode (HImode
, GET_MODE_NUNITS (data_mode
)).require ();
976 rtx sel
= gen_reg_rtx (sel_mode
);
977 rtx iota_ops
[] = {sel
, mask
};
978 emit_vlmax_insn (code_for_pred_iota (sel_mode
), UNARY_OP
, iota_ops
);
979 emit_vlmax_gather_insn (target
, op0
, sel
);
980 emit_vlmax_masked_gather_mu_insn (target
, op1
, sel
, mask
);
983 /* Emit merge instruction. */
986 get_repeating_sequence_dup_machine_mode (const rvv_builder
&builder
,
987 machine_mode mask_bit_mode
)
989 unsigned mask_precision
= GET_MODE_PRECISION (mask_bit_mode
).to_constant ();
990 unsigned mask_scalar_size
= mask_precision
> builder
.inner_bits_size ()
991 ? builder
.inner_bits_size () : mask_precision
;
993 scalar_mode inner_mode
;
994 unsigned minimal_bits_size
;
996 switch (mask_scalar_size
)
1000 minimal_bits_size
= TARGET_MIN_VLEN
/ 8; /* AKA RVVMF8. */
1003 inner_mode
= HImode
;
1004 minimal_bits_size
= TARGET_MIN_VLEN
/ 4; /* AKA RVVMF4. */
1007 inner_mode
= SImode
;
1008 minimal_bits_size
= TARGET_MIN_VLEN
/ 2; /* AKA RVVMF2. */
1011 inner_mode
= DImode
;
1012 minimal_bits_size
= TARGET_MIN_VLEN
/ 1; /* AKA RVVM1. */
1019 gcc_assert (mask_precision
% mask_scalar_size
== 0);
1021 uint64_t dup_nunit
= mask_precision
> mask_scalar_size
1022 ? mask_precision
/ mask_scalar_size
: minimal_bits_size
/ mask_scalar_size
;
1024 return get_vector_mode (inner_mode
, dup_nunit
).require ();
1027 /* Expand series const vector. If VID is NULL_RTX, we use vid.v
1028 instructions to generate sequence for VID:
1030 VID = { 0, 1, 2, 3, ... }
1032 Otherwise, we use the VID argument directly. */
1035 expand_vec_series (rtx dest
, rtx base
, rtx step
, rtx vid
)
1037 machine_mode mode
= GET_MODE (dest
);
1038 poly_int64 nunits_m1
= GET_MODE_NUNITS (mode
) - 1;
1040 rtx result
= register_operand (dest
, mode
) ? dest
: gen_reg_rtx (mode
);
1042 /* VECT_IV = BASE + I * STEP. */
1044 /* Step 1: Generate I = { 0, 1, 2, ... } by vid.v. */
1045 bool reverse_p
= !vid
&& rtx_equal_p (step
, constm1_rtx
)
1046 && poly_int_rtx_p (base
, &value
)
1047 && known_eq (nunits_m1
, value
);
1050 vid
= gen_reg_rtx (mode
);
1052 emit_vlmax_insn (code_for_pred_series (mode
), NULLARY_OP
, op
);
1059 {nunits - 1, nunits - 2, ... , 0}.
1060 nunits can be either const_int or const_poly_int.
1064 vrsub nunits - 1, v. */
1066 = {result
, vid
, gen_int_mode (nunits_m1
, GET_MODE_INNER (mode
))};
1067 insn_code icode
= code_for_pred_sub_reverse_scalar (mode
);
1068 emit_vlmax_insn (icode
, BINARY_OP
, ops
);
1072 /* Step 2: Generate I * STEP.
1073 - STEP is 1, we don't emit any instructions.
1074 - STEP is power of 2, we use vsll.vi/vsll.vx.
1075 - STEP is non-power of 2, we use vmul.vx. */
1076 if (rtx_equal_p (step
, const1_rtx
))
1080 step_adj
= gen_reg_rtx (mode
);
1081 if (CONST_INT_P (step
) && pow2p_hwi (INTVAL (step
)))
1083 /* Emit logical left shift operation. */
1084 int shift
= exact_log2 (INTVAL (step
));
1085 rtx shift_amount
= gen_int_mode (shift
, Pmode
);
1086 insn_code icode
= code_for_pred_scalar (ASHIFT
, mode
);
1087 rtx ops
[] = {step_adj
, vid
, shift_amount
};
1088 emit_vlmax_insn (icode
, BINARY_OP
, ops
);
1092 insn_code icode
= code_for_pred_scalar (MULT
, mode
);
1093 rtx ops
[] = {step_adj
, vid
, step
};
1094 emit_vlmax_insn (icode
, BINARY_OP
, ops
);
1098 /* Step 3: Generate BASE + I * STEP.
1099 - BASE is 0, use result of vid.
1100 - BASE is not 0, we use vadd.vx/vadd.vi. */
1101 if (rtx_equal_p (base
, const0_rtx
))
1102 emit_move_insn (result
, step_adj
);
1105 insn_code icode
= code_for_pred_scalar (PLUS
, mode
);
1106 rtx ops
[] = {result
, step_adj
, base
};
1107 emit_vlmax_insn (icode
, BINARY_OP
, ops
);
1112 emit_move_insn (dest
, result
);
1116 expand_const_vector (rtx target
, rtx src
)
1118 machine_mode mode
= GET_MODE (target
);
1119 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
)
1123 const_vec_duplicate_p (src
, &elt
)
1124 && (rtx_equal_p (elt
, const0_rtx
) || rtx_equal_p (elt
, const1_rtx
)));
1125 rtx ops
[] = {target
, src
};
1126 emit_vlmax_insn (code_for_pred_mov (mode
), UNARY_MASK_OP
, ops
);
1131 if (const_vec_duplicate_p (src
, &elt
))
1133 rtx tmp
= register_operand (target
, mode
) ? target
: gen_reg_rtx (mode
);
1134 /* Element in range -16 ~ 15 integer or 0.0 floating-point,
1135 we use vmv.v.i instruction. */
1136 if (satisfies_constraint_vi (src
) || satisfies_constraint_Wc0 (src
))
1138 rtx ops
[] = {tmp
, src
};
1139 emit_vlmax_insn (code_for_pred_mov (mode
), UNARY_OP
, ops
);
1143 /* Emit vec_duplicate<mode> split pattern before RA so that
1144 we could have a better optimization opportunity in LICM
1145 which will hoist vmv.v.x outside the loop and in fwprop && combine
1146 which will transform 'vv' into 'vx' instruction.
1148 The reason we don't emit vec_duplicate<mode> split pattern during
1149 RA since the split stage after RA is a too late stage to generate
1150 RVV instruction which need an additional register (We can't
1151 allocate a new register after RA) for VL operand of vsetvl
1152 instruction (vsetvl a5, zero). */
1153 if (lra_in_progress
)
1155 rtx ops
[] = {tmp
, elt
};
1156 emit_vlmax_insn (code_for_pred_broadcast (mode
), UNARY_OP
, ops
);
1160 struct expand_operand ops
[2];
1161 enum insn_code icode
= optab_handler (vec_duplicate_optab
, mode
);
1162 gcc_assert (icode
!= CODE_FOR_nothing
);
1163 create_output_operand (&ops
[0], tmp
, mode
);
1164 create_input_operand (&ops
[1], elt
, GET_MODE_INNER (mode
));
1165 expand_insn (icode
, 2, ops
);
1171 emit_move_insn (target
, tmp
);
1175 /* Support scalable const series vector. */
1177 if (const_vec_series_p (src
, &base
, &step
))
1179 expand_vec_series (target
, base
, step
);
1183 /* Handle variable-length vector. */
1184 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
1185 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
1186 rvv_builder
builder (mode
, npatterns
, nelts_per_pattern
);
1187 for (unsigned int i
= 0; i
< nelts_per_pattern
; i
++)
1189 for (unsigned int j
= 0; j
< npatterns
; j
++)
1190 builder
.quick_push (CONST_VECTOR_ELT (src
, i
* npatterns
+ j
));
1192 builder
.finalize ();
1194 if (CONST_VECTOR_DUPLICATE_P (src
))
1196 /* Handle the case with repeating sequence that NELTS_PER_PATTERN = 1
1197 E.g. NPATTERNS = 4, v = { 0, 2, 6, 7, ... }
1198 NPATTERNS = 8, v = { 0, 2, 6, 7, 19, 20, 8, 7 ... }
1199 The elements within NPATTERNS are not necessary regular. */
1200 if (builder
.can_duplicate_repeating_sequence_p ())
1202 /* We handle the case that we can find a vector containter to hold
1203 element bitsize = NPATTERNS * ele_bitsize.
1205 NPATTERNS = 8, element width = 8
1206 v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
1207 In this case, we can combine NPATTERNS element into a larger
1208 element. Use element width = 64 and broadcast a vector with
1209 all element equal to 0x0706050403020100. */
1210 rtx ele
= builder
.get_merged_repeating_sequence ();
1211 rtx dup
= expand_vector_broadcast (builder
.new_mode (), ele
);
1212 emit_move_insn (target
, gen_lowpart (mode
, dup
));
1216 /* We handle the case that we can't find a vector containter to hold
1217 element bitsize = NPATTERNS * ele_bitsize.
1219 NPATTERNS = 8, element width = 16
1220 v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
1221 Since NPATTERNS * element width = 128, we can't find a container
1224 In this case, we use NPATTERNS merge operations to generate such
1226 unsigned int nbits
= npatterns
- 1;
1228 /* Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
1229 rtx vid
= gen_reg_rtx (builder
.int_mode ());
1231 emit_vlmax_insn (code_for_pred_series (builder
.int_mode ()),
1234 /* Generate vid_repeat = { 0, 1, ... nbits, ... } */
1235 rtx vid_repeat
= gen_reg_rtx (builder
.int_mode ());
1236 rtx and_ops
[] = {vid_repeat
, vid
,
1237 gen_int_mode (nbits
, builder
.inner_int_mode ())};
1238 emit_vlmax_insn (code_for_pred_scalar (AND
, builder
.int_mode ()),
1239 BINARY_OP
, and_ops
);
1241 rtx tmp
= gen_reg_rtx (builder
.mode ());
1242 rtx dup_ops
[] = {tmp
, builder
.elt (0)};
1243 emit_vlmax_insn (code_for_pred_broadcast (builder
.mode ()), UNARY_OP
,
1245 for (unsigned int i
= 1; i
< builder
.npatterns (); i
++)
1247 /* Generate mask according to i. */
1248 rtx mask
= gen_reg_rtx (builder
.mask_mode ());
1249 rtx const_vec
= gen_const_vector_dup (builder
.int_mode (), i
);
1250 expand_vec_cmp (mask
, EQ
, vid_repeat
, const_vec
);
1252 /* Merge scalar to each i. */
1253 rtx tmp2
= gen_reg_rtx (builder
.mode ());
1254 rtx merge_ops
[] = {tmp2
, tmp
, builder
.elt (i
), mask
};
1255 insn_code icode
= code_for_pred_merge_scalar (builder
.mode ());
1256 emit_vlmax_insn (icode
, MERGE_OP
, merge_ops
);
1259 emit_move_insn (target
, tmp
);
1262 else if (CONST_VECTOR_STEPPED_P (src
))
1264 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
1265 if (builder
.single_step_npatterns_p ())
1267 /* Describe the case by choosing NPATTERNS = 4 as an example. */
1270 /* Step 1: Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
1271 rtx vid
= gen_reg_rtx (builder
.mode ());
1272 rtx vid_ops
[] = {vid
};
1273 icode
= code_for_pred_series (builder
.mode ());
1274 emit_vlmax_insn (icode
, NULLARY_OP
, vid_ops
);
1276 if (builder
.npatterns_all_equal_p ())
1278 /* Generate the variable-length vector following this rule:
1279 { a, a, a + step, a + step, a + step * 2, a + step * 2, ...}
1280 E.g. { 0, 0, 8, 8, 16, 16, ... } */
1281 /* We want to create a pattern where value[ix] = floor (ix /
1282 NPATTERNS). As NPATTERNS is always a power of two we can
1283 rewrite this as = ix & -NPATTERNS. */
1284 /* Step 2: VID AND -NPATTERNS:
1285 { 0&-4, 1&-4, 2&-4, 3 &-4, 4 &-4, 5 &-4, 6 &-4, 7 &-4, ... }
1288 = gen_int_mode (-builder
.npatterns (), builder
.inner_mode ());
1289 rtx tmp
= gen_reg_rtx (builder
.mode ());
1290 rtx and_ops
[] = {tmp
, vid
, imm
};
1291 icode
= code_for_pred_scalar (AND
, builder
.mode ());
1292 emit_vlmax_insn (icode
, BINARY_OP
, and_ops
);
1293 HOST_WIDE_INT init_val
= INTVAL (builder
.elt (0));
1295 emit_move_insn (target
, tmp
);
1298 rtx dup
= gen_const_vector_dup (builder
.mode (), init_val
);
1299 rtx add_ops
[] = {target
, tmp
, dup
};
1300 icode
= code_for_pred (PLUS
, builder
.mode ());
1301 emit_vlmax_insn (icode
, BINARY_OP
, add_ops
);
1306 /* Generate the variable-length vector following this rule:
1307 { a, b, a + step, b + step, a + step*2, b + step*2, ... } */
1309 if (builder
.npatterns_vid_diff_repeated_p ())
1311 /* Case 1: For example as below:
1312 {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8... }
1313 We have 3 - 0 = 3 equals 7 - 4 = 3, the sequence is
1314 repeated as below after minus vid.
1315 {3, 1, -1, -3, 3, 1, -1, -3...}
1316 Then we can simplify the diff code gen to at most
1318 rvv_builder
v (builder
.mode (), builder
.npatterns (), 1);
1320 /* Step 1: Generate diff = TARGET - VID. */
1321 for (unsigned int i
= 0; i
< v
.npatterns (); ++i
)
1323 poly_int64 diff
= rtx_to_poly_int64 (builder
.elt (i
)) - i
;
1324 v
.quick_push (gen_int_mode (diff
, v
.inner_mode ()));
1327 /* Step 2: Generate result = VID + diff. */
1328 rtx vec
= v
.build ();
1329 rtx add_ops
[] = {target
, vid
, vec
};
1330 emit_vlmax_insn (code_for_pred (PLUS
, builder
.mode ()),
1331 BINARY_OP
, add_ops
);
1335 /* Case 2: For example as below:
1336 { -4, 4, -4 + 1, 4 + 1, -4 + 2, 4 + 2, -4 + 3, 4 + 3, ... }
1338 rvv_builder
v (builder
.mode (), builder
.npatterns (), 1);
1340 /* Step 1: Generate { a, b, a, b, ... } */
1341 for (unsigned int i
= 0; i
< v
.npatterns (); ++i
)
1342 v
.quick_push (builder
.elt (i
));
1343 rtx new_base
= v
.build ();
1345 /* Step 2: Generate tmp = VID >> LOG2 (NPATTERNS). Â */
1347 = gen_int_mode (exact_log2 (builder
.npatterns ()),
1348 builder
.inner_mode ());
1349 rtx tmp
= expand_simple_binop (builder
.mode (), LSHIFTRT
,
1350 vid
, shift_count
, NULL_RTX
,
1351 false, OPTAB_DIRECT
);
1353 /* Step 3: Generate tmp2 = tmp * step. Â */
1354 rtx tmp2
= gen_reg_rtx (builder
.mode ());
1356 = simplify_binary_operation (MINUS
, builder
.inner_mode (),
1357 builder
.elt (v
.npatterns()),
1359 expand_vec_series (tmp2
, const0_rtx
, step
, tmp
);
1361 /* Step 4: Generate target = tmp2 + new_base. Â */
1362 rtx add_ops
[] = {target
, tmp2
, new_base
};
1363 emit_vlmax_insn (code_for_pred (PLUS
, builder
.mode ()),
1364 BINARY_OP
, add_ops
);
1368 else if (builder
.interleaved_stepped_npatterns_p ())
1370 rtx base1
= builder
.elt (0);
1371 rtx base2
= builder
.elt (1);
1373 = rtx_to_poly_int64 (builder
.elt (builder
.npatterns ()))
1374 - rtx_to_poly_int64 (base1
);
1376 = rtx_to_poly_int64 (builder
.elt (builder
.npatterns () + 1))
1377 - rtx_to_poly_int64 (base2
);
1379 /* For { 1, 0, 2, 0, ... , n - 1, 0 }, we can use larger EEW
1380 integer vector mode to generate such vector efficiently.
1382 E.g. EEW = 16, { 2, 0, 4, 0, ... }
1384 can be interpreted into:
1386 EEW = 32, { 2, 4, ... } */
1387 unsigned int new_smode_bitsize
= builder
.inner_bits_size () * 2;
1388 scalar_int_mode new_smode
;
1389 machine_mode new_mode
;
1390 poly_uint64 new_nunits
1391 = exact_div (GET_MODE_NUNITS (builder
.mode ()), 2);
1392 if (int_mode_for_size (new_smode_bitsize
, 0).exists (&new_smode
)
1393 && get_vector_mode (new_smode
, new_nunits
).exists (&new_mode
))
1395 rtx tmp
= gen_reg_rtx (new_mode
);
1396 base1
= gen_int_mode (rtx_to_poly_int64 (base1
), new_smode
);
1397 expand_vec_series (tmp
, base1
, gen_int_mode (step1
, new_smode
));
1399 if (rtx_equal_p (base2
, const0_rtx
) && known_eq (step2
, 0))
1400 /* { 1, 0, 2, 0, ... }. */
1401 emit_move_insn (target
, gen_lowpart (mode
, tmp
));
1402 else if (known_eq (step2
, 0))
1404 /* { 1, 1, 2, 1, ... }. */
1405 rtx scalar
= expand_simple_binop (
1407 gen_int_mode (rtx_to_poly_int64 (base2
), new_smode
),
1408 gen_int_mode (builder
.inner_bits_size (), new_smode
),
1409 NULL_RTX
, false, OPTAB_DIRECT
);
1410 rtx tmp2
= gen_reg_rtx (new_mode
);
1411 rtx and_ops
[] = {tmp2
, tmp
, scalar
};
1412 emit_vlmax_insn (code_for_pred_scalar (AND
, new_mode
),
1413 BINARY_OP
, and_ops
);
1414 emit_move_insn (target
, gen_lowpart (mode
, tmp2
));
1418 /* { 1, 3, 2, 6, ... }. */
1419 rtx tmp2
= gen_reg_rtx (new_mode
);
1420 base2
= gen_int_mode (rtx_to_poly_int64 (base2
), new_smode
);
1421 expand_vec_series (tmp2
, base2
,
1422 gen_int_mode (step2
, new_smode
));
1423 rtx shifted_tmp2
= expand_simple_binop (
1424 new_mode
, ASHIFT
, tmp2
,
1425 gen_int_mode (builder
.inner_bits_size (), Pmode
), NULL_RTX
,
1426 false, OPTAB_DIRECT
);
1427 rtx tmp3
= gen_reg_rtx (new_mode
);
1428 rtx ior_ops
[] = {tmp3
, tmp
, shifted_tmp2
};
1429 emit_vlmax_insn (code_for_pred (IOR
, new_mode
), BINARY_OP
,
1431 emit_move_insn (target
, gen_lowpart (mode
, tmp3
));
1436 rtx vid
= gen_reg_rtx (mode
);
1437 expand_vec_series (vid
, const0_rtx
, const1_rtx
);
1438 /* Transform into { 0, 0, 1, 1, 2, 2, ... }. */
1440 = expand_simple_binop (mode
, LSHIFTRT
, vid
, const1_rtx
,
1441 NULL_RTX
, false, OPTAB_DIRECT
);
1442 rtx tmp1
= gen_reg_rtx (mode
);
1443 rtx tmp2
= gen_reg_rtx (mode
);
1444 expand_vec_series (tmp1
, base1
,
1445 gen_int_mode (step1
, builder
.inner_mode ()),
1447 expand_vec_series (tmp2
, base2
,
1448 gen_int_mode (step2
, builder
.inner_mode ()),
1451 /* Transform into { 0, 1, 0, 1, 0, 1, ... }. */
1452 rtx and_vid
= gen_reg_rtx (mode
);
1453 rtx and_ops
[] = {and_vid
, vid
, const1_rtx
};
1454 emit_vlmax_insn (code_for_pred_scalar (AND
, mode
), BINARY_OP
,
1456 rtx mask
= gen_reg_rtx (builder
.mask_mode ());
1457 expand_vec_cmp (mask
, EQ
, and_vid
, CONST1_RTX (mode
));
1459 rtx ops
[] = {target
, tmp1
, tmp2
, mask
};
1460 emit_vlmax_insn (code_for_pred_merge (mode
), MERGE_OP
, ops
);
1463 else if (npatterns
== 1 && nelts_per_pattern
== 3)
1465 /* Generate the following CONST_VECTOR:
1466 { base0, base1, base1 + step, base1 + step * 2, ... } */
1467 rtx base0
= builder
.elt (0);
1468 rtx base1
= builder
.elt (1);
1469 rtx base2
= builder
.elt (2);
1471 rtx step
= simplify_binary_operation (MINUS
, builder
.inner_mode (),
1474 /* Step 1 - { base1, base1 + step, base1 + step * 2, ... } */
1475 rtx tmp
= gen_reg_rtx (mode
);
1476 expand_vec_series (tmp
, base1
, step
);
1477 /* Step 2 - { base0, base1, base1 + step, base1 + step * 2, ... } */
1478 if (!rtx_equal_p (base0
, const0_rtx
))
1479 base0
= force_reg (builder
.inner_mode (), base0
);
1481 insn_code icode
= optab_handler (vec_shl_insert_optab
, mode
);
1482 gcc_assert (icode
!= CODE_FOR_nothing
);
1483 emit_insn (GEN_FCN (icode
) (target
, tmp
, base0
));
1486 /* TODO: We will enable more variable-length vector in the future. */
1493 /* Get the frm mode with given CONST_INT rtx, the default mode is
1495 enum floating_point_rounding_mode
1496 get_frm_mode (rtx operand
)
1498 gcc_assert (CONST_INT_P (operand
));
1500 switch (INTVAL (operand
))
1521 /* Expand a pre-RA RVV data move from SRC to DEST.
1522 It expands move for RVV fractional vector modes.
1523 Return true if the move as already been emitted. */
1525 legitimize_move (rtx dest
, rtx
*srcp
)
1528 machine_mode mode
= GET_MODE (dest
);
1529 if (CONST_VECTOR_P (src
))
1531 expand_const_vector (dest
, src
);
1535 if (riscv_v_ext_vls_mode_p (mode
))
1537 if (GET_MODE_NUNITS (mode
).to_constant () <= 31)
1539 /* For NUNITS <= 31 VLS modes, we don't need extrac
1540 scalar regisers so we apply the naive (set (op0) (op1)) pattern. */
1541 if (can_create_pseudo_p ())
1543 /* Need to force register if mem <- !reg. */
1544 if (MEM_P (dest
) && !REG_P (src
))
1545 *srcp
= force_reg (mode
, src
);
1550 else if (GET_MODE_NUNITS (mode
).to_constant () > 31 && lra_in_progress
)
1552 emit_insn (gen_mov_lra (mode
, Pmode
, dest
, src
));
1558 /* In order to decrease the memory traffic, we don't use whole register
1559 * load/store for the LMUL less than 1 and mask mode, so those case will
1560 * require one extra general purpose register, but it's not allowed during
1561 * LRA process, so we have a special move pattern used for LRA, which will
1562 * defer the expansion after LRA. */
1563 if ((known_lt (GET_MODE_SIZE (mode
), BYTES_PER_RISCV_VECTOR
)
1564 || GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
)
1567 emit_insn (gen_mov_lra (mode
, Pmode
, dest
, src
));
1571 if (known_ge (GET_MODE_SIZE (mode
), BYTES_PER_RISCV_VECTOR
)
1572 && GET_MODE_CLASS (mode
) != MODE_VECTOR_BOOL
)
1574 /* Need to force register if mem <- !reg. */
1575 if (MEM_P (dest
) && !REG_P (src
))
1576 *srcp
= force_reg (mode
, src
);
1582 if (register_operand (src
, mode
) && register_operand (dest
, mode
))
1584 emit_insn (gen_rtx_SET (dest
, src
));
1589 = GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
? UNARY_MASK_OP
: UNARY_OP
;
1590 if (!register_operand (src
, mode
) && !register_operand (dest
, mode
))
1592 rtx tmp
= gen_reg_rtx (mode
);
1595 rtx ops
[] = {tmp
, src
};
1596 emit_vlmax_insn (code_for_pred_mov (mode
), insn_flags
, ops
);
1599 emit_move_insn (tmp
, src
);
1603 if (satisfies_constraint_vu (src
))
1606 rtx ops
[] = {dest
, src
};
1607 emit_vlmax_insn (code_for_pred_mov (mode
), insn_flags
, ops
);
1611 /* VTYPE information for machine_mode. */
1612 struct mode_vtype_group
1614 enum vlmul_type vlmul
[NUM_MACHINE_MODES
];
1615 uint8_t ratio
[NUM_MACHINE_MODES
];
1616 machine_mode subpart_mode
[NUM_MACHINE_MODES
];
1617 uint8_t nf
[NUM_MACHINE_MODES
];
1620 #define ENTRY(MODE, REQUIREMENT, VLMUL, RATIO) \
1621 vlmul[MODE##mode] = VLMUL; \
1622 ratio[MODE##mode] = RATIO;
1623 #define TUPLE_ENTRY(MODE, REQUIREMENT, SUBPART_MODE, NF, VLMUL, RATIO) \
1624 subpart_mode[MODE##mode] = SUBPART_MODE##mode; \
1625 nf[MODE##mode] = NF; \
1626 vlmul[MODE##mode] = VLMUL; \
1627 ratio[MODE##mode] = RATIO;
1628 #include "riscv-vector-switch.def"
1634 static mode_vtype_group mode_vtype_infos
;
1636 /* Get vlmul field value by comparing LMUL with BYTES_PER_RISCV_VECTOR. */
1638 get_vlmul (machine_mode mode
)
1640 /* For VLS modes, the vlmul should be dynamically
1641 calculated since we need to adjust VLMUL according
1642 to TARGET_MIN_VLEN. */
1643 if (riscv_v_ext_vls_mode_p (mode
))
1645 int size
= GET_MODE_BITSIZE (mode
).to_constant ();
1646 int inner_size
= GET_MODE_BITSIZE (GET_MODE_INNER (mode
));
1647 if (size
< TARGET_MIN_VLEN
)
1649 int factor
= TARGET_MIN_VLEN
/ size
;
1650 if (inner_size
== 8)
1651 factor
= MIN (factor
, 8);
1652 else if (inner_size
== 16)
1653 factor
= MIN (factor
, 4);
1654 else if (inner_size
== 32)
1655 factor
= MIN (factor
, 2);
1656 else if (inner_size
== 64)
1657 factor
= MIN (factor
, 1);
1678 int factor
= size
/ TARGET_MIN_VLEN
;
1695 return mode_vtype_infos
.vlmul
[mode
];
1698 /* Return the VLMAX rtx of vector mode MODE. */
1700 get_vlmax_rtx (machine_mode mode
)
1702 gcc_assert (riscv_v_ext_vector_mode_p (mode
));
1703 return gen_int_mode (GET_MODE_NUNITS (mode
), Pmode
);
1706 /* Return the NF value of the corresponding mode. */
1708 get_nf (machine_mode mode
)
1710 /* We don't allow non-tuple modes go through this function. */
1711 gcc_assert (riscv_v_ext_tuple_mode_p (mode
));
1712 return mode_vtype_infos
.nf
[mode
];
1715 /* Return the subpart mode of the tuple mode. For RVVM2x2SImode,
1716 the subpart mode is RVVM2SImode. This will help to build
1717 array/struct type in builtins. */
1719 get_subpart_mode (machine_mode mode
)
1721 /* We don't allow non-tuple modes go through this function. */
1722 gcc_assert (riscv_v_ext_tuple_mode_p (mode
));
1723 return mode_vtype_infos
.subpart_mode
[mode
];
1726 /* Get ratio according to machine mode. */
1728 get_ratio (machine_mode mode
)
1730 if (riscv_v_ext_vls_mode_p (mode
))
1732 unsigned int sew
= get_sew (mode
);
1733 vlmul_type vlmul
= get_vlmul (mode
);
1755 return mode_vtype_infos
.ratio
[mode
];
1758 /* Get ta according to operand[tail_op_idx]. */
1762 if (INTVAL (ta
) == TAIL_ANY
)
1763 return INVALID_ATTRIBUTE
;
1767 /* Get ma according to operand[mask_op_idx]. */
1771 if (INTVAL (ma
) == MASK_ANY
)
1772 return INVALID_ATTRIBUTE
;
1776 /* Get prefer tail policy. */
1778 get_prefer_tail_policy ()
1780 /* TODO: By default, we choose to use TAIL_ANY which allows
1781 compiler pick up either agnostic or undisturbed. Maybe we
1782 will have a compile option like -mprefer=agnostic to set
1787 /* Get prefer mask policy. */
1789 get_prefer_mask_policy ()
1791 /* TODO: By default, we choose to use MASK_ANY which allows
1792 compiler pick up either agnostic or undisturbed. Maybe we
1793 will have a compile option like -mprefer=agnostic to set
1798 /* Get avl_type rtx. */
1800 get_avl_type_rtx (enum avl_type type
)
1802 return gen_int_mode (type
, Pmode
);
1805 /* Return the appropriate mask mode for MODE. */
1808 get_mask_mode (machine_mode mode
)
1810 poly_int64 nunits
= GET_MODE_NUNITS (mode
);
1811 if (riscv_v_ext_tuple_mode_p (mode
))
1813 unsigned int nf
= get_nf (mode
);
1814 nunits
= exact_div (nunits
, nf
);
1816 return get_vector_mode (BImode
, nunits
).require ();
1819 /* Return the appropriate M1 mode for MODE. */
1821 static opt_machine_mode
1822 get_m1_mode (machine_mode mode
)
1824 scalar_mode smode
= GET_MODE_INNER (mode
);
1825 unsigned int bytes
= GET_MODE_SIZE (smode
);
1826 poly_uint64 m1_nunits
= exact_div (BYTES_PER_RISCV_VECTOR
, bytes
);
1827 return get_vector_mode (smode
, m1_nunits
);
1830 /* Return the RVV vector mode that has NUNITS elements of mode INNER_MODE.
1831 This function is not only used by builtins, but also will be used by
1832 auto-vectorization in the future. */
1834 get_vector_mode (scalar_mode inner_mode
, poly_uint64 nunits
)
1836 enum mode_class mclass
;
1837 if (inner_mode
== E_BImode
)
1838 mclass
= MODE_VECTOR_BOOL
;
1839 else if (FLOAT_MODE_P (inner_mode
))
1840 mclass
= MODE_VECTOR_FLOAT
;
1842 mclass
= MODE_VECTOR_INT
;
1844 FOR_EACH_MODE_IN_CLASS (mode
, mclass
)
1845 if (inner_mode
== GET_MODE_INNER (mode
)
1846 && known_eq (nunits
, GET_MODE_NUNITS (mode
))
1847 && (riscv_v_ext_vector_mode_p (mode
)
1848 || riscv_v_ext_vls_mode_p (mode
)))
1850 return opt_machine_mode ();
1853 /* Return the RVV tuple mode if we can find the legal tuple mode for the
1854 corresponding subpart mode and NF. */
1856 get_tuple_mode (machine_mode subpart_mode
, unsigned int nf
)
1858 poly_uint64 nunits
= GET_MODE_NUNITS (subpart_mode
) * nf
;
1859 scalar_mode inner_mode
= GET_MODE_INNER (subpart_mode
);
1860 enum mode_class mclass
= GET_MODE_CLASS (subpart_mode
);
1862 FOR_EACH_MODE_IN_CLASS (mode
, mclass
)
1863 if (inner_mode
== GET_MODE_INNER (mode
)
1864 && known_eq (nunits
, GET_MODE_NUNITS (mode
))
1865 && riscv_v_ext_tuple_mode_p (mode
)
1866 && get_subpart_mode (mode
) == subpart_mode
)
1868 return opt_machine_mode ();
1874 if (!CONST_INT_P (x
))
1876 return IN_RANGE (INTVAL (x
), -16, 15);
1882 if (!CONST_INT_P (x
))
1884 return IN_RANGE (INTVAL (x
), -15, 16);
1888 has_vi_variant_p (rtx_code code
, rtx x
)
1912 return neg_simm5_p (x
);
1920 sew64_scalar_helper (rtx
*operands
, rtx
*scalar_op
, rtx vl
,
1921 machine_mode vector_mode
, bool has_vi_variant_p
,
1922 void (*emit_vector_func
) (rtx
*, rtx
), enum avl_type type
)
1924 machine_mode scalar_mode
= GET_MODE_INNER (vector_mode
);
1925 if (has_vi_variant_p
)
1927 *scalar_op
= force_reg (scalar_mode
, *scalar_op
);
1933 if (!rtx_equal_p (*scalar_op
, const0_rtx
))
1934 *scalar_op
= force_reg (scalar_mode
, *scalar_op
);
1938 if (immediate_operand (*scalar_op
, Pmode
))
1940 if (!rtx_equal_p (*scalar_op
, const0_rtx
))
1941 *scalar_op
= force_reg (Pmode
, *scalar_op
);
1943 *scalar_op
= gen_rtx_SIGN_EXTEND (scalar_mode
, *scalar_op
);
1947 if (CONST_INT_P (*scalar_op
))
1949 if (maybe_gt (GET_MODE_SIZE (scalar_mode
), GET_MODE_SIZE (Pmode
)))
1950 *scalar_op
= force_const_mem (scalar_mode
, *scalar_op
);
1952 *scalar_op
= force_reg (scalar_mode
, *scalar_op
);
1955 rtx tmp
= gen_reg_rtx (vector_mode
);
1956 rtx ops
[] = {tmp
, *scalar_op
};
1958 emit_vlmax_insn (code_for_pred_broadcast (vector_mode
), UNARY_OP
, ops
);
1960 emit_nonvlmax_insn (code_for_pred_broadcast (vector_mode
), UNARY_OP
, ops
,
1962 emit_vector_func (operands
, tmp
);
1967 /* Get { ... ,0, 0, 0, ..., 0, 0, 0, 1 } mask. */
1969 gen_scalar_move_mask (machine_mode mode
)
1971 rtx_vector_builder
builder (mode
, 1, 2);
1972 builder
.quick_push (const1_rtx
);
1973 builder
.quick_push (const0_rtx
);
1974 return builder
.build ();
1978 compute_vlmax (unsigned vector_bits
, unsigned elt_size
, unsigned min_size
)
1980 // Original equation:
1981 // VLMAX = (VectorBits / EltSize) * LMUL
1982 // where LMUL = MinSize / TARGET_MIN_VLEN
1983 // The following equations have been reordered to prevent loss of precision
1984 // when calculating fractional LMUL.
1985 return ((vector_bits
/ elt_size
) * min_size
) / TARGET_MIN_VLEN
;
1989 get_unknown_min_value (machine_mode mode
)
1991 enum vlmul_type vlmul
= get_vlmul (mode
);
1995 return TARGET_MIN_VLEN
;
1997 return TARGET_MIN_VLEN
* 2;
1999 return TARGET_MIN_VLEN
* 4;
2001 return TARGET_MIN_VLEN
* 8;
2008 force_vector_length_operand (rtx vl
)
2010 if (CONST_INT_P (vl
) && !satisfies_constraint_K (vl
))
2011 return force_reg (Pmode
, vl
);
2016 gen_no_side_effects_vsetvl_rtx (machine_mode vmode
, rtx vl
, rtx avl
)
2018 unsigned int sew
= get_sew (vmode
);
2019 rtx tail_policy
= gen_int_mode (get_prefer_tail_policy (), Pmode
);
2020 rtx mask_policy
= gen_int_mode (get_prefer_mask_policy (), Pmode
);
2021 return gen_vsetvl_no_side_effects (Pmode
, vl
, avl
, gen_int_mode (sew
, Pmode
),
2022 gen_int_mode (get_vlmul (vmode
), Pmode
),
2023 tail_policy
, mask_policy
);
2026 /* GET VL * 2 rtx. */
2028 get_vl_x2_rtx (rtx avl
, machine_mode mode
, machine_mode demote_mode
)
2030 rtx i32vl
= NULL_RTX
;
2031 if (CONST_INT_P (avl
))
2033 unsigned elt_size
= GET_MODE_BITSIZE (GET_MODE_INNER (mode
));
2034 unsigned min_size
= get_unknown_min_value (mode
);
2035 unsigned vlen_max
= RVV_65536
;
2036 unsigned vlmax_max
= compute_vlmax (vlen_max
, elt_size
, min_size
);
2037 unsigned vlen_min
= TARGET_MIN_VLEN
;
2038 unsigned vlmax_min
= compute_vlmax (vlen_min
, elt_size
, min_size
);
2040 unsigned HOST_WIDE_INT avl_int
= INTVAL (avl
);
2041 if (avl_int
<= vlmax_min
)
2042 i32vl
= gen_int_mode (2 * avl_int
, Pmode
);
2043 else if (avl_int
>= 2 * vlmax_max
)
2045 // Just set i32vl to VLMAX in this situation
2046 i32vl
= gen_reg_rtx (Pmode
);
2048 gen_no_side_effects_vsetvl_rtx (demote_mode
, i32vl
, RVV_VLMAX
));
2052 // For AVL between (MinVLMAX, 2 * MaxVLMAX), the actual working vl
2053 // is related to the hardware implementation.
2054 // So let the following code handle
2059 // Using vsetvli instruction to get actually used length which related to
2060 // the hardware implementation
2061 rtx i64vl
= gen_reg_rtx (Pmode
);
2063 gen_no_side_effects_vsetvl_rtx (mode
, i64vl
, force_reg (Pmode
, avl
)));
2064 // scale 2 for 32-bit length
2065 i32vl
= gen_reg_rtx (Pmode
);
2067 gen_rtx_SET (i32vl
, gen_rtx_ASHIFT (Pmode
, i64vl
, const1_rtx
)));
2070 return force_vector_length_operand (i32vl
);
bool
slide1_sew64_helper (int unspec, machine_mode mode, machine_mode demote_mode,
                     machine_mode demote_mask_mode, rtx *ops)
{
  rtx scalar_op = ops[4];
  rtx avl = ops[5];
  machine_mode scalar_mode = GET_MODE_INNER (mode);
  if (rtx_equal_p (scalar_op, const0_rtx))
    {
      ops[5] = force_vector_length_operand (ops[5]);
      return false;
    }

  if (TARGET_64BIT)
    {
      ops[4] = force_reg (scalar_mode, scalar_op);
      ops[5] = force_vector_length_operand (ops[5]);
      return false;
    }

  if (immediate_operand (scalar_op, Pmode))
    {
      ops[4] = gen_rtx_SIGN_EXTEND (scalar_mode, force_reg (Pmode, scalar_op));
      ops[5] = force_vector_length_operand (ops[5]);
      return false;
    }

  if (CONST_INT_P (scalar_op))
    scalar_op = force_reg (scalar_mode, scalar_op);

  rtx vl_x2 = get_vl_x2_rtx (avl, mode, demote_mode);

  rtx demote_scalar_op1, demote_scalar_op2;
  if (unspec == UNSPEC_VSLIDE1UP)
    {
      demote_scalar_op1 = gen_highpart (Pmode, scalar_op);
      demote_scalar_op2 = gen_lowpart (Pmode, scalar_op);
    }
  else
    {
      demote_scalar_op1 = gen_lowpart (Pmode, scalar_op);
      demote_scalar_op2 = gen_highpart (Pmode, scalar_op);
    }

  rtx temp = gen_reg_rtx (demote_mode);
  rtx ta = gen_int_mode (get_prefer_tail_policy (), Pmode);
  rtx ma = gen_int_mode (get_prefer_mask_policy (), Pmode);
  rtx merge = RVV_VUNDEF (demote_mode);
  /* Handle vslide1<ud>_tu.  */
  if (register_operand (ops[2], mode)
      && rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1]))))
    merge = gen_lowpart (demote_mode, ops[2]);

  emit_insn (gen_pred_slide (unspec, demote_mode, temp,
                             CONSTM1_RTX (demote_mask_mode), merge,
                             gen_lowpart (demote_mode, ops[3]),
                             demote_scalar_op1, vl_x2, ta, ma, ops[8]));
  emit_insn (gen_pred_slide (unspec, demote_mode,
                             gen_lowpart (demote_mode, ops[0]),
                             CONSTM1_RTX (demote_mask_mode), merge, temp,
                             demote_scalar_op2, vl_x2, ta, ma, ops[8]));

  if (!rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1])))
      && !rtx_equal_p (ops[2], RVV_VUNDEF (GET_MODE (ops[2]))))
    emit_insn (gen_pred_merge (mode, ops[0], ops[2], ops[2], ops[0], ops[1],
                               force_vector_length_operand (ops[5]), ops[6],
                               ops[8]));
  return true;
}
rtx
gen_avl_for_scalar_move (rtx avl)
{
  /* AVL for a scalar move behaves differently between 0 and values larger
     than 0.  */
  if (CONST_INT_P (avl))
    {
      /* So we can just set AVL to 1 for any constant other than 0.  */
      if (rtx_equal_p (avl, const0_rtx))
        return const0_rtx;
      else
        return const1_rtx;
    }
  else
    {
      /* For a non-constant value, we set any non-zero value to 1 by
         `sgtu new_avl,input_avl,zero` + `vsetvli`.  */
      rtx tmp = gen_reg_rtx (Pmode);
      emit_insn (
        gen_rtx_SET (tmp, gen_rtx_fmt_ee (GTU, Pmode, avl, const0_rtx)));
      return tmp;
    }
}
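
/* E.g. (illustrative): for a non-constant AVL in a1 this emits roughly
     sgtu a2,a1,zero    # a2 = (a1 > 0) ? 1 : 0
   so the subsequent vsetvli only ever sees AVL = 0 or AVL = 1.  */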
/* Expand tuple mode data movement.  */
void
expand_tuple_move (rtx *ops)
{
  unsigned int i;
  machine_mode tuple_mode = GET_MODE (ops[0]);
  machine_mode subpart_mode = get_subpart_mode (tuple_mode);
  poly_int64 subpart_size = GET_MODE_SIZE (subpart_mode);
  unsigned int nf = get_nf (tuple_mode);
  bool fractional_p = known_lt (subpart_size, BYTES_PER_RISCV_VECTOR);

  if (REG_P (ops[0]) && CONST_VECTOR_P (ops[1]))
    {
      rtx val;
      gcc_assert (can_create_pseudo_p ()
                  && const_vec_duplicate_p (ops[1], &val));
      for (i = 0; i < nf; ++i)
        {
          poly_int64 offset = i * subpart_size;
          rtx subreg
            = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
          rtx dup = gen_const_vec_duplicate (subpart_mode, val);
          emit_move_insn (subreg, dup);
        }
    }
  else if (REG_P (ops[0]) && REG_P (ops[1]))
    {
      for (i = 0; i < nf; ++i)
        {
          int index = i;

          /* Take NF = 2 and LMUL = 1 for example: when the destination
             overlaps the source, copy the subparts in the order that does
             not clobber not-yet-moved subparts.  */
          if (REGNO (ops[0]) > REGNO (ops[1]))
            index = nf - 1 - i;
          poly_int64 offset = index * subpart_size;
          rtx dst_subreg
            = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
          rtx src_subreg
            = simplify_gen_subreg (subpart_mode, ops[1], tuple_mode, offset);
          emit_insn (gen_rtx_SET (dst_subreg, src_subreg));
        }
    }
  else
    {
      /* Expand tuple memory data movement.  */
      gcc_assert (MEM_P (ops[0]) || MEM_P (ops[1]));
      rtx offset = gen_int_mode (subpart_size, Pmode);
      if (!subpart_size.is_constant ())
        {
          emit_move_insn (ops[2],
                          gen_int_mode (BYTES_PER_RISCV_VECTOR, Pmode));
          if (fractional_p)
            {
              unsigned int factor
                = exact_div (BYTES_PER_RISCV_VECTOR, subpart_size)
                    .to_constant ();
              rtx pat
                = gen_rtx_ASHIFTRT (Pmode, ops[2],
                                    gen_int_mode (exact_log2 (factor), Pmode));
              emit_insn (gen_rtx_SET (ops[2], pat));
            }

          if (known_gt (subpart_size, BYTES_PER_RISCV_VECTOR))
            {
              unsigned int factor
                = exact_div (subpart_size, BYTES_PER_RISCV_VECTOR)
                    .to_constant ();
              rtx pat
                = gen_rtx_ASHIFT (Pmode, ops[2],
                                  gen_int_mode (exact_log2 (factor), Pmode));
              emit_insn (gen_rtx_SET (ops[2], pat));
            }
          offset = ops[2];
        }

      /* Non-fractional LMUL has whole register moves that don't require a
         vsetvl for VLMAX.  */
      if (fractional_p)
        emit_vlmax_vsetvl (subpart_mode, ops[4]);

      if (MEM_P (ops[1]))
        {
          /* Load operations.  */
          emit_move_insn (ops[3], XEXP (ops[1], 0));
          for (i = 0; i < nf; i++)
            {
              rtx subreg = simplify_gen_subreg (subpart_mode, ops[0],
                                                tuple_mode, i * subpart_size);
              if (i != 0)
                {
                  rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
                  emit_insn (gen_rtx_SET (ops[3], new_addr));
                }
              rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);

              if (fractional_p)
                {
                  rtx operands[] = {subreg, mem};
                  emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode),
                                       UNARY_OP, operands, ops[4]);
                }
              else
                emit_move_insn (subreg, mem);
            }
        }
      else
        {
          /* Store operations.  */
          emit_move_insn (ops[3], XEXP (ops[0], 0));
          for (i = 0; i < nf; i++)
            {
              rtx subreg = simplify_gen_subreg (subpart_mode, ops[1],
                                                tuple_mode, i * subpart_size);
              if (i != 0)
                {
                  rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
                  emit_insn (gen_rtx_SET (ops[3], new_addr));
                }
              rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);

              if (fractional_p)
                {
                  rtx operands[] = {mem, subreg};
                  emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode),
                                       UNARY_OP, operands, ops[4]);
                }
              else
                emit_move_insn (mem, subreg);
            }
        }
    }
}
/* Return the vectorization machine mode for RVV according to LMUL.  */
machine_mode
preferred_simd_mode (scalar_mode mode)
{
  if (autovec_use_vlmax_p ())
    {
      /* We use LMUL = 1 as the base byte size, which is
         BYTES_PER_RISCV_VECTOR, and riscv_autovec_lmul as the multiply
         factor to calculate the NUNITS to get the auto-vectorization
         mode.  */
      poly_uint64 nunits;
      poly_uint64 vector_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;
      poly_uint64 scalar_size = GET_MODE_SIZE (mode);
      /* Disable vectorization when we can't find an RVV mode for it.
         E.g. -march=rv64gc_zve32x doesn't have a vector mode to vectorize
         a double (DFmode) type.  */
      if (!multiple_p (vector_size, scalar_size, &nunits))
        return word_mode;
      machine_mode rvv_mode;
      if (get_vector_mode (mode, nunits).exists (&rvv_mode))
        return rvv_mode;
    }
  return word_mode;
}
/* Subroutine of riscv_vector_expand_vector_init.
   Works as follows:
   (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
   (b) Skip leading elements from BUILDER, which are the same as
       element NELTS_REQD - 1.
   (c) Insert earlier elements in reverse order in TARGET using
       vslide1down.  */
static void
expand_vector_init_insert_elems (rtx target, const rvv_builder &builder,
                                 int nelts_reqd)
{
  machine_mode mode = GET_MODE (target);
  rtx dup = expand_vector_broadcast (mode, builder.elt (0));
  emit_move_insn (target, dup);
  int ndups = builder.count_dups (0, nelts_reqd - 1, 1);
  for (int i = ndups; i < nelts_reqd; i++)
    {
      unsigned int unspec
        = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1DOWN : UNSPEC_VSLIDE1DOWN;
      insn_code icode = code_for_pred_slide (unspec, mode);
      rtx ops[] = {target, target, builder.elt (i)};
      emit_vlmax_insn (icode, BINARY_OP, ops);
    }
}
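
/* E.g. (illustrative): for v = {a, b, c, d} this broadcasts a to get
   {a, a, a, a}, then emits three vslide1down operations with b, c and d,
   yielding {a, a, a, b} -> {a, a, b, c} -> {a, b, c, d}.  */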
/* Use the merge approach to initialize the vector with a repeating
   sequence:
     v = {a, b, a, b, a, b, a, b}.

     v = broadcast (a).
     mask = 0b01010101....
     v = merge (v, b, mask)
*/
static void
expand_vector_init_merge_repeating_sequence (rtx target,
                                             const rvv_builder &builder)
{
  /* We can't use BIT mode (BI) directly to generate mask = 0b01010...
     since we don't have such instruction in RVV.
     Instead, we should use INT mode (QI/HI/SI/DI) with integer move
     instruction to generate the mask data we want.  */
  machine_mode mask_bit_mode = get_mask_mode (builder.mode ());
  machine_mode mask_int_mode
    = get_repeating_sequence_dup_machine_mode (builder, mask_bit_mode);
  uint64_t full_nelts = builder.full_nelts ().to_constant ();

  /* Step 1: Broadcast the first pattern.  */
  rtx ops[] = {target, force_reg (builder.inner_mode (), builder.elt (0))};
  emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()),
                   UNARY_OP, ops);
  /* Step 2: Merge the rest iterations of the pattern.  */
  for (unsigned int i = 1; i < builder.npatterns (); i++)
    {
      /* Step 2-1: Generate mask register v0 for each merge.  */
      rtx merge_mask
        = builder.get_merge_scalar_mask (i, GET_MODE_INNER (mask_int_mode));
      rtx mask = gen_reg_rtx (mask_bit_mode);
      rtx dup = gen_reg_rtx (mask_int_mode);

      if (full_nelts <= builder.inner_bits_size ()) /* vmv.s.x.  */
        {
          rtx ops[] = {dup, merge_mask};
          emit_nonvlmax_insn (code_for_pred_broadcast (GET_MODE (dup)),
                              SCALAR_MOVE_OP, ops, CONST1_RTX (Pmode));
        }
      else /* vmv.v.x.  */
        {
          rtx ops[] = {dup, force_reg (GET_MODE_INNER (mask_int_mode),
                                       merge_mask)};
          rtx vl = gen_int_mode (CEIL (full_nelts, builder.inner_bits_size ()),
                                 Pmode);
          emit_nonvlmax_insn (code_for_pred_broadcast (mask_int_mode), UNARY_OP,
                              ops, vl);
        }

      emit_move_insn (mask, gen_lowpart (mask_bit_mode, dup));

      /* Step 2-2: Merge pattern according to the mask.  */
      rtx ops[] = {target, target, builder.elt (i), mask};
      emit_vlmax_insn (code_for_pred_merge_scalar (GET_MODE (target)),
                       MERGE_OP, ops);
    }
}
/* Use the slideup approach to combine two vectors:
     v = {a, a, a, a, b, b, b, b}

     v1 = {a, a, a, a, a, a, a, a}
     v2 = {b, b, b, b, b, b, b, b}
     v = slideup (v1, v2, nelt / 2)
*/
static void
expand_vector_init_slideup_combine_sequence (rtx target,
                                             const rvv_builder &builder)
{
  machine_mode mode = GET_MODE (target);
  int nelts = builder.full_nelts ().to_constant ();
  rtx first_elt = builder.elt (0);
  rtx last_elt = builder.elt (nelts - 1);
  rtx low = expand_vector_broadcast (mode, first_elt);
  rtx high = expand_vector_broadcast (mode, last_elt);
  insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, mode);
  rtx ops[] = {target, low, high, gen_int_mode (nelts / 2, Pmode)};
  emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
}
/* Use the merge approach to merge a scalar into a vector:
     v = {a, a, a, a, a, a, b, b}

     v1 = {a, a, a, a, a, a, a, a}
     mask = {0, 0, 0, 0, 0, 0, 1, 1}
     v = merge (v1, b, mask)
*/
static void
expand_vector_init_merge_combine_sequence (rtx target,
                                           const rvv_builder &builder)
{
  machine_mode mode = GET_MODE (target);
  machine_mode imode = builder.int_mode ();
  machine_mode mmode = builder.mask_mode ();
  int nelts = builder.full_nelts ().to_constant ();
  int leading_ndups = builder.count_dups (0, nelts - 1, 1);
  if ((leading_ndups > 255 && GET_MODE_INNER (imode) == QImode)
      || riscv_get_v_regno_alignment (imode) > 1)
    imode = get_vector_mode (HImode, nelts).require ();

  /* Generate vid = { 0, 1, 2, ..., n }.  */
  rtx vid = gen_reg_rtx (imode);
  expand_vec_series (vid, const0_rtx, const1_rtx);

  /* Generate mask.  */
  rtx mask = gen_reg_rtx (mmode);
  insn_code icode = code_for_pred_cmp_scalar (imode);
  rtx index = gen_int_mode (leading_ndups - 1, builder.inner_int_mode ());
  rtx dup_rtx = gen_rtx_VEC_DUPLICATE (imode, index);
  /* vmsgtu.vi/vmsgtu.vx.  */
  rtx cmp = gen_rtx_fmt_ee (GTU, mmode, vid, dup_rtx);
  rtx sel = builder.elt (nelts - 1);
  rtx mask_ops[] = {mask, cmp, vid, index};
  emit_vlmax_insn (icode, COMPARE_OP, mask_ops);

  /* Duplicate the first elements.  */
  rtx dup = expand_vector_broadcast (mode, builder.elt (0));
  /* Merge scalar into vector according to mask.  */
  rtx merge_ops[] = {target, dup, sel, mask};
  icode = code_for_pred_merge_scalar (mode);
  emit_vlmax_insn (icode, MERGE_OP, merge_ops);
}
/* Subroutine of expand_vec_init to handle the case
   when all trailing elements of builder are the same.
   This works as follows:
   (a) Broadcast the last vector element into TARGET.
   (b) Insert the remaining elements into TARGET using vslide1up.

   ??? The heuristic used is to do the above if the number of same trailing
   elements is greater than leading_ndups, loosely based on the
   heuristic from mostly_zeros_p.  May need fine-tuning.  */
static bool
expand_vector_init_trailing_same_elem (rtx target,
                                       const rtx_vector_builder &builder,
                                       int nelts_reqd)
{
  int leading_ndups = builder.count_dups (0, nelts_reqd - 1, 1);
  int trailing_ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
  machine_mode mode = GET_MODE (target);

  if (trailing_ndups > leading_ndups)
    {
      rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1));
      for (int i = nelts_reqd - trailing_ndups - 1; i >= 0; i--)
        {
          unsigned int unspec
            = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
          insn_code icode = code_for_pred_slide (unspec, mode);
          rtx tmp = gen_reg_rtx (mode);
          rtx ops[] = {tmp, dup, builder.elt (i)};
          emit_vlmax_insn (icode, BINARY_OP, ops);
          /* slide1up needs source and dest to be different REGs.  */
          dup = tmp;
        }
      emit_move_insn (target, dup);
      return true;
    }

  return false;
}
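
/* E.g. (illustrative): for v = {a, b, x, x, x} the broadcast gives
   {x, x, x, x, x}; a vslide1up with b yields {b, x, x, x, x} and a second
   vslide1up with a yields {a, b, x, x, x}.  */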
/* Initialize register TARGET from the elements in PARALLEL rtx VALS.  */
void
expand_vec_init (rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  int nelts = XVECLEN (vals, 0);

  rvv_builder v (mode, nelts, 1);
  for (int i = 0; i < nelts; i++)
    v.quick_push (XVECEXP (vals, 0, i));
  v.finalize ();

  /* Case 1: Convert v = { a, b, a, b } into v = { ab, ab }.  */
  if (v.can_duplicate_repeating_sequence_p ())
    {
      rtx ele = v.get_merged_repeating_sequence ();
      rtx dup = expand_vector_broadcast (v.new_mode (), ele);
      emit_move_insn (target, gen_lowpart (mode, dup));
      return;
    }

  /* Case 2: Optimize repeating sequence cases that Case 1 can
     not handle and it is profitable.  For example:
     ELEMENT BITSIZE = 64.
     v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}.
     We can't find a vector mode for "ab" which would be combined into a
     128-bit element to duplicate.  */
  if (v.repeating_sequence_use_merge_profitable_p ())
    {
      expand_vector_init_merge_repeating_sequence (target, v);
      return;
    }

  /* Case 3: Optimize combine sequence.
     E.g. v = {a, a, a, a, a, a, a, a, b, b, b, b, b, b, b, b}.
     Generate vector:
     v1 = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}.
     v2 = {b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b}.
     Then slide v2 up into v1 by nelt / 2.  */
  if (v.combine_sequence_use_slideup_profitable_p ())
    {
      expand_vector_init_slideup_combine_sequence (target, v);
      return;
    }

  /* Case 4: Optimize combine sequence.
     E.g. v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}.
     Generate vector:
     v = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}.
     Generate mask:
     mask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}.
     Merge b into v by mask:
     v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}.  */
  if (v.combine_sequence_use_merge_profitable_p ())
    {
      expand_vector_init_merge_combine_sequence (target, v);
      return;
    }

  /* Optimize trailing same elements sequence:
     v = {y, y2, y3, y4, y5, x, x, x, x, x, x, x, x, x, x, x};  */
  if (!expand_vector_init_trailing_same_elem (target, v, nelts))
    /* Handle the common situation by vslide1down.  This function can handle
       any situation of vec_init<mode>.  Only the cases that are not optimized
       above will fall through here.  */
    expand_vector_init_insert_elems (target, v, nelts);
}
/* Get insn code for the corresponding comparison.  */
static insn_code
get_cmp_insn_code (rtx_code code, machine_mode mode)
{
  insn_code icode;
  switch (code)
    {
    case EQ:
    case NE:
    case LE:
    case LEU:
    case GT:
    case GTU:
    case LTGT:
      icode = code_for_pred_cmp (mode);
      break;
    case LT:
    case LTU:
    case GE:
    case GEU:
      if (FLOAT_MODE_P (mode))
        icode = code_for_pred_cmp (mode);
      else
        icode = code_for_pred_ltge (mode);
      break;
    default:
      gcc_unreachable ();
    }
  return icode;
}
/* This hook gives the vectorizer more vector mode options.  We want it to not
   only try modes with the maximum number of units a full vector can hold but
   for example also half the number of units for a smaller elements size.
   Such vectors can be promoted to a full vector of widened elements
   (still with the same number of elements, essentially vectorizing at a
   fixed number of units rather than a fixed number of bytes).  */
unsigned int
autovectorize_vector_modes (vector_modes *modes, bool)
{
  if (autovec_use_vlmax_p ())
    {
      poly_uint64 full_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;

      /* Start with a RVV<LMUL>QImode where LMUL is the number of units that
         fit a whole vector.
         Then try LMUL = nunits / 2, nunits / 4 and nunits / 8 which
         is guided by the extensions we have available (vf2, vf4 and vf8).

         - full_size: Try using full vectors for all element types.
         - full_size / 2:
           Try using 16-bit containers for 8-bit elements and full vectors
           for wider elements.
         - full_size / 4:
           Try using 32-bit containers for 8-bit and 16-bit elements and
           full vectors for wider elements.
         - full_size / 8:
           Try using 64-bit containers for all element types.  */
      static const int rvv_factors[] = {1, 2, 4, 8, 16, 32, 64};
      for (unsigned int i = 0; i < sizeof (rvv_factors) / sizeof (int); i++)
        {
          poly_uint64 units;
          machine_mode mode;
          if (can_div_trunc_p (full_size, rvv_factors[i], &units)
              && get_vector_mode (QImode, units).exists (&mode))
            modes->safe_push (mode);
        }
    }
  /* Push all VLS modes according to TARGET_MIN_VLEN.  */
  unsigned int i = 0;
  unsigned int base_size = TARGET_MIN_VLEN * TARGET_MAX_LMUL / 8;
  unsigned int size = base_size;
  machine_mode mode;
  while (size > 0 && get_vector_mode (QImode, size).exists (&mode))
    {
      if (vls_mode_valid_p (mode))
        modes->safe_push (mode);

      i++;
      size = base_size / (1U << i);
    }
  /* Enable LOOP_VINFO comparison in COST model.  */
  return VECT_COMPARE_COSTS;
}
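
/* E.g. (illustrative): with TARGET_MIN_VLEN = 128 and TARGET_MAX_LMUL = 2,
   base_size = 128 * 2 / 8 = 32, so the VLS loop above pushes the QImode
   vector modes of 32, 16, 8, 4, 2 and 1 bytes, as long as each mode exists
   and is a valid VLS mode.  */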
/* Return true if we can find the related MODE according to default LMUL.  */
static bool
can_find_related_mode_p (machine_mode vector_mode, scalar_mode element_mode,
                         poly_uint64 *nunits)
{
  if (!autovec_use_vlmax_p ())
    return false;
  if (riscv_v_ext_vector_mode_p (vector_mode)
      && multiple_p (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL,
                     GET_MODE_SIZE (element_mode), nunits))
    return true;
  if (riscv_v_ext_vls_mode_p (vector_mode)
      && multiple_p (TARGET_MIN_VLEN * TARGET_MAX_LMUL,
                     GET_MODE_SIZE (element_mode), nunits))
    return true;
  return false;
}
/* If the given VECTOR_MODE is an RVV mode, first get the largest number
   of units that fit into a full vector at the given ELEMENT_MODE.
   We will have the vectorizer call us with a successively decreasing
   number of units (as specified in autovectorize_vector_modes).
   The starting mode is always the one specified by preferred_simd_mode.  */
opt_machine_mode
vectorize_related_mode (machine_mode vector_mode, scalar_mode element_mode,
                        poly_uint64 nunits)
{
  /* TODO: We will support RVV VLS auto-vectorization mode in the future.  */
  poly_uint64 min_units;
  if (can_find_related_mode_p (vector_mode, element_mode, &min_units))
    {
      machine_mode rvv_mode;
      if (maybe_ne (nunits, 0U))
        {
          /* If we were given a number of units NUNITS, try to find an
             RVV vector mode of inner mode ELEMENT_MODE with the same
             number of units.  */
          if (multiple_p (min_units, nunits)
              && get_vector_mode (element_mode, nunits).exists (&rvv_mode))
            return rvv_mode;
        }
      else
        {
          /* Look for a vector mode with the same number of units as the
             VECTOR_MODE we were given.  We keep track of the minimum
             number of units so far which determines the smallest necessary
             but largest possible, suitable mode for vectorization.  */
          min_units = ordered_min (min_units, GET_MODE_SIZE (vector_mode));
          if (get_vector_mode (element_mode, min_units).exists (&rvv_mode))
            return rvv_mode;
        }
    }

  return default_vectorize_related_mode (vector_mode, element_mode, nunits);
}
/* Expand an RVV comparison.  */
void
expand_vec_cmp (rtx target, rtx_code code, rtx op0, rtx op1)
{
  machine_mode mask_mode = GET_MODE (target);
  machine_mode data_mode = GET_MODE (op0);
  insn_code icode = get_cmp_insn_code (code, data_mode);

  if (code == LTGT)
    {
      rtx lt = gen_reg_rtx (mask_mode);
      rtx gt = gen_reg_rtx (mask_mode);
      expand_vec_cmp (lt, LT, op0, op1);
      expand_vec_cmp (gt, GT, op0, op1);
      icode = code_for_pred (IOR, mask_mode);
      rtx ops[] = {target, lt, gt};
      emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
      return;
    }

  rtx cmp = gen_rtx_fmt_ee (code, mask_mode, op0, op1);
  rtx ops[] = {target, cmp, op0, op1};
  emit_vlmax_insn (icode, COMPARE_OP, ops);
}
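
/* E.g. (illustrative): an LTGT comparison of va and vb expands to roughly
     vmflt.vv  v8,va,vb
     vmfgt.vv  v9,va,vb
     vmor.mm   v0,v8,v9
   since there is no single native LTGT instruction.  */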
void
expand_vec_cmp (rtx target, rtx_code code, rtx mask, rtx maskoff, rtx op0,
                rtx op1)
{
  machine_mode mask_mode = GET_MODE (target);
  machine_mode data_mode = GET_MODE (op0);
  insn_code icode = get_cmp_insn_code (code, data_mode);

  if (code == LTGT)
    {
      rtx lt = gen_reg_rtx (mask_mode);
      rtx gt = gen_reg_rtx (mask_mode);
      expand_vec_cmp (lt, LT, mask, maskoff, op0, op1);
      expand_vec_cmp (gt, GT, mask, maskoff, op0, op1);
      icode = code_for_pred (IOR, mask_mode);
      rtx ops[] = {target, lt, gt};
      emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
      return;
    }

  rtx cmp = gen_rtx_fmt_ee (code, mask_mode, op0, op1);
  rtx ops[] = {target, mask, maskoff, cmp, op0, op1};
  emit_vlmax_insn (icode, COMPARE_OP_MU, ops);
}
/* Expand an RVV floating-point comparison:

   If CAN_INVERT_P is true, the caller can also handle inverted results;
   return true if the result is in fact inverted.  */
bool
expand_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1,
                      bool can_invert_p)
{
  machine_mode mask_mode = GET_MODE (target);
  machine_mode data_mode = GET_MODE (op0);

  /* If can_invert_p = true:
     It suffices to implement a u>= b as !(a < b) but with the NaNs masked off:

       vmfeq.vv    v0, va, va
       vmfeq.vv    v1, vb, vb
       vmand.mm    v0, v0, v1
       vmflt.vv    v0, va, vb, v0.t
       vmnot.m     v0, v0

     And, if !HONOR_SNANS, then you can remove the vmand.mm by masking the
     second vmfeq.vv:

       vmfeq.vv    v0, va, va
       vmfeq.vv    v0, vb, vb, v0.t
       vmflt.vv    v0, va, vb, v0.t
       vmnot.m     v0, v0

     If can_invert_p = false:

       # Example of implementing isgreater()
       vmfeq.vv v0, va, va        # Only set where A is not NaN.
       vmfeq.vv v1, vb, vb        # Only set where B is not NaN.
       vmand.mm v0, v0, v1        # Only set where A and B are ordered,
       vmfgt.vv v0, va, vb, v0.t  #  so only set flags on ordered values.
  */

  rtx eq0 = gen_reg_rtx (mask_mode);
  rtx eq1 = gen_reg_rtx (mask_mode);
  switch (code)
    {
    case EQ:
    case NE:
    case LT:
    case LE:
    case GT:
    case GE:
    case LTGT:
      /* There is native support for the comparison.  */
      expand_vec_cmp (target, code, op0, op1);
      return false;
    case UNEQ:
    case ORDERED:
    case UNORDERED:
    case UNLT:
    case UNLE:
    case UNGT:
    case UNGE:
      /* vmfeq.vv v0, va, va  */
      expand_vec_cmp (eq0, EQ, op0, op0);
      if (HONOR_SNANS (data_mode))
        {
          /*
             vmfeq.vv    v1, vb, vb
             vmand.mm    v0, v0, v1
          */
          expand_vec_cmp (eq1, EQ, op1, op1);
          insn_code icode = code_for_pred (AND, mask_mode);
          rtx ops[] = {eq0, eq0, eq1};
          emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
        }
      else
        {
          /* vmfeq.vv v0, vb, vb, v0.t  */
          expand_vec_cmp (eq0, EQ, eq0, eq0, op1, op1);
        }
      break;
    default:
      gcc_unreachable ();
    }

  if (code == ORDERED)
    {
      emit_move_insn (target, eq0);
      return false;
    }

  /* There is native support for the inverse comparison.  */
  code = reverse_condition_maybe_unordered (code);
  if (code == ORDERED)
    emit_move_insn (target, eq0);
  else
    expand_vec_cmp (eq0, code, eq0, eq0, op0, op1);

  if (can_invert_p)
    {
      emit_move_insn (target, eq0);
      return true;
    }

  /* We use one_cmpl<mode>2 to make the Combine PASS combine mask instructions
     into: vmand.mm/vmnor.mm/vmnand.mm/vmnor.mm/vmxnor.mm.  */
  emit_insn (gen_rtx_SET (target, gen_rtx_NOT (mask_mode, eq0)));
  return false;
}
/* Modulo all SEL indices to ensure they are all in range of [0, MAX_SEL].
   MAX_SEL is nunits - 1 if rtx_equal_p (op0, op1).  Otherwise, it is
   2 * nunits - 1.  */
static rtx
modulo_sel_indices (rtx op0, rtx op1, rtx sel)
{
  rtx sel_mod;
  machine_mode sel_mode = GET_MODE (sel);
  poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
  poly_uint64 max_sel = rtx_equal_p (op0, op1) ? nunits - 1 : 2 * nunits - 1;
  /* If SEL is a variable-length CONST_VECTOR, we don't need to modulo it.
     Likewise if SEL is constant-length within [0, MAX_SEL], there is no need
     to modulo the indices.  */
  if (CONST_VECTOR_P (sel)
      && (!nunits.is_constant () || const_vec_all_in_range_p (sel, 0, max_sel)))
    sel_mod = sel;
  else
    {
      rtx mod = gen_const_vector_dup (sel_mode, max_sel);
      sel_mod
        = expand_simple_binop (sel_mode, AND, sel, mod, NULL, 0, OPTAB_DIRECT);
    }
  return sel_mod;
}
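
/* E.g. (illustrative): with nunits = 4 and op0 == op1, MAX_SEL = 3 and
   sel = {0, 5, 2, 7} becomes {0, 1, 2, 3} after the AND.  The AND acts
   as a modulo because MAX_SEL is a power of 2 minus 1.  */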
/* Implement vec_perm<mode>.  */
void
expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
{
  machine_mode data_mode = GET_MODE (target);
  machine_mode sel_mode = GET_MODE (sel);
  poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);

  /* Check if the sel only references the first values vector and each
     select index is in range of [0, nunits - 1]; a single vrgather
     instruction is enough.  Since we will use vrgatherei16.vv for
     variable-length vectors, it is never out of range and we don't need
     to modulo the index.  */
  if (nunits.is_constant () && const_vec_all_in_range_p (sel, 0, nunits - 1))
    {
      emit_vlmax_gather_insn (target, op0, sel);
      return;
    }

  /* Check if all the indices are the same.  */
  rtx elt;
  if (const_vec_duplicate_p (sel, &elt))
    {
      poly_uint64 value = rtx_to_poly_int64 (elt);
      rtx op = op0;
      if (maybe_gt (value, nunits - 1))
        {
          sel = gen_const_vector_dup (sel_mode, value - nunits);
          op = op1;
        }
      emit_vlmax_gather_insn (target, op, sel);
      return;
    }

  /* Note: vec_perm indices are supposed to wrap when they go beyond the
     size of the two value vectors, i.e. the upper bits of the indices
     are effectively ignored.  RVV vrgather instead produces 0 for any
     out-of-range indices, so we need to modulo all the vec_perm indices
     to ensure they are all in range of [0, nunits - 1] when op0 == op1
     or all in range of [0, 2 * nunits - 1] when op0 != op1.  */
  rtx sel_mod = modulo_sel_indices (op0, op1, sel);

  /* Check if the two values vectors are the same.  */
  if (rtx_equal_p (op0, op1))
    {
      emit_vlmax_gather_insn (target, op0, sel_mod);
      return;
    }

  /* The following sequence handles the case that:
     __builtin_shufflevector (vec1, vec2, index...), where the index can be
     any value in range of [0, 2 * nunits - 1].  */
  machine_mode mask_mode;
  mask_mode = get_mask_mode (data_mode);
  rtx mask = gen_reg_rtx (mask_mode);
  rtx max_sel = gen_const_vector_dup (sel_mode, nunits);

  /* Step 1: generate a mask that should select everything >= nunits into
     the mask.  */
  expand_vec_cmp (mask, GEU, sel_mod, max_sel);

  /* Step 2: gather every op0 value indexed by sel into target;
     we don't need to care about the result of the elements
     whose index >= nunits.  */
  emit_vlmax_gather_insn (target, op0, sel_mod);

  /* Step 3: shift the range from (nunits, max_of_mode] to
     [0, max_of_mode - nunits].  */
  rtx tmp = gen_reg_rtx (sel_mode);
  rtx ops[] = {tmp, sel_mod, max_sel};
  emit_vlmax_insn (code_for_pred (MINUS, sel_mode), BINARY_OP, ops);

  /* Step 4: gather those into the previously masked-out elements
     of target.  */
  emit_vlmax_masked_gather_mu_insn (target, op1, tmp, mask);
}
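
/* E.g. (illustrative): nunits = 4, op0 = {a0,a1,a2,a3}, op1 = {b0,b1,b2,b3}
   and sel = {0, 5, 2, 7}.  Step 1 gives mask = {0, 1, 0, 1}; step 2 gathers
   {a0, x, a2, x} (the x lanes are don't-care); step 3 subtracts nunits from
   the selector (only the masked lanes matter); step 4 gathers b1 and b3
   into those lanes, giving {a0, b1, a2, b3}.  */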
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST for RVV.  */

/* vec_perm support.  */

struct expand_vec_perm_d
{
  rtx target, op0, op1;
  vec_perm_indices perm;
  machine_mode vmode;
  machine_mode op_mode;
  bool one_vector_p;
  bool testing_p;
};
/* Return the appropriate index mode for gather instructions.  */
opt_machine_mode
get_gather_index_mode (struct expand_vec_perm_d *d)
{
  machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
  poly_uint64 nunits = GET_MODE_NUNITS (d->vmode);

  if (GET_MODE_INNER (d->vmode) == QImode)
    {
      if (nunits.is_constant ())
        {
          /* If the indices are an LMUL8 CONST_VECTOR and any element value
             exceeds the range 0 ~ 255, forbid such a permutation, since we
             would need a vector HImode to hold such indices and we don't
             have it.  */
          if (!d->perm.all_in_range_p (0, 255)
              && !get_vector_mode (HImode, nunits).exists (&sel_mode))
            return opt_machine_mode ();
        }
      else
        {
          /* Permuting two SEW8 variable-length vectors needs
             vrgatherei16.vv.  Otherwise, it could overflow the index
             range.  */
          if (!get_vector_mode (HImode, nunits).exists (&sel_mode))
            return opt_machine_mode ();
        }
    }
  else if (riscv_get_v_regno_alignment (sel_mode) > 1
           && GET_MODE_INNER (sel_mode) != HImode)
    sel_mode = get_vector_mode (HImode, nunits).require ();
  return sel_mode;
}
/* Recognize the patterns that we can use a merge operation to shuffle the
   vectors.  The value of each element (index i) in the selector can only be
   either i or nunits + i.  We will check the pattern is actually monotonic.

   E.g.
   v = VEC_PERM_EXPR (v0, v1, selector),
   selector = { 0, nunits + 1, 2, nunits + 3, 4, nunits + 5, ... }

   We can transform such pattern into:

   v = vcond_mask (v0, v1, mask),
   mask = { 0, 1, 0, 1, 0, 1, ... }.  */
static bool
shuffle_merge_patterns (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  machine_mode sel_mode = related_int_vector_mode (vmode).require ();
  int n_patterns = d->perm.encoding ().npatterns ();
  poly_int64 vec_len = d->perm.length ();

  for (int i = 0; i < n_patterns; ++i)
    if (!known_eq (d->perm[i], i) && !known_eq (d->perm[i], vec_len + i))
      return false;

  /* Check the pattern is monotonic here, otherwise, return false.  */
  for (int i = n_patterns; i < n_patterns * 2; i++)
    if (!d->perm.series_p (i, n_patterns, i, n_patterns)
        && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
      return false;

  /* We need to use a precomputed mask for such a situation and such a mask
     can only be computed in compile-time known size modes.  */
  bool indices_fit_selector_p
    = GET_MODE_BITSIZE (GET_MODE_INNER (vmode)) > 8 || known_lt (vec_len, 256);
  if (!indices_fit_selector_p && !vec_len.is_constant ())
    return false;

  if (d->testing_p)
    return true;

  machine_mode mask_mode = get_mask_mode (vmode);
  rtx mask = gen_reg_rtx (mask_mode);

  if (indices_fit_selector_p)
    {
      /* MASK = SELECTOR < NUNITS ? 1 : 0.  */
      rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
      rtx x = gen_int_mode (vec_len, GET_MODE_INNER (sel_mode));
      insn_code icode = code_for_pred_cmp_scalar (sel_mode);
      rtx cmp = gen_rtx_fmt_ee (LTU, mask_mode, sel, x);
      rtx ops[] = {mask, cmp, sel, x};
      emit_vlmax_insn (icode, COMPARE_OP, ops);
    }
  else
    {
      /* For EEW8 where NUNITS may be larger than 255, we can't use vmsltu
         directly to generate the selector mask; instead, we can only use a
         precomputed mask.

         E.g. selector = <0, 257, 2, 259> for an EEW8 vector with
         NUNITS = 256: we don't have a QImode scalar register to hold values
         larger than 255.  We also cannot hold that in a vector QImode
         register if LMUL = 8, and, since there is no larger HI mode vector,
         we cannot create a larger selector.

         As the mask is a simple {0, 1, ...} pattern and the length is known
         we can store it in a scalar register and broadcast it to a mask
         register.  */
      gcc_assert (vec_len.is_constant ());
      int size = CEIL (GET_MODE_NUNITS (mask_mode).to_constant (), 8);
      machine_mode mode = get_vector_mode (QImode, size).require ();
      rtx tmp = gen_reg_rtx (mode);
      rvv_builder v (mode, 1, size);
      for (int i = 0; i < vec_len.to_constant () / 8; i++)
        {
          uint8_t value = 0;
          for (int j = 0; j < 8; j++)
            {
              int index = i * 8 + j;
              if (known_lt (d->perm[index], 256))
                value |= 1 << j;
            }
          v.quick_push (gen_int_mode (value, QImode));
        }
      emit_move_insn (tmp, v.build ());
      emit_move_insn (mask, gen_lowpart (mask_mode, tmp));
    }

  /* TARGET = MASK ? OP0 : OP1.  */
  /* Swap op0 and op1 since the order is opposite to pred_merge.  */
  rtx ops2[] = {d->target, d->op1, d->op0, mask};
  emit_vlmax_insn (code_for_pred_merge (vmode), MERGE_OP, ops2);
  return true;
}
/* Recognize consecutive indices where a single vrgather.v[x|i] can shuffle
   the vectors.

   e.g. short[8] = VEC_PERM_EXPR <a, a, {0,1,0,1,0,1,0,1}>
   Use SEW = 32, index = 1 vrgather.vi to get the result.  */
static bool
shuffle_consecutive_patterns (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  scalar_mode smode = GET_MODE_INNER (vmode);
  poly_int64 vec_len = d->perm.length ();
  HOST_WIDE_INT elt;

  if (!vec_len.is_constant () || !d->perm[0].is_constant (&elt))
    return false;
  int vlen = vec_len.to_constant ();

  /* Compute the last element index of the consecutive pattern from the
     leading consecutive elements.  */
  int last_consecutive_idx = -1;
  int consecutive_num = -1;
  for (int i = 1; i < vlen; i++)
    {
      if (maybe_ne (d->perm[i], d->perm[i - 1] + 1))
        break;
      last_consecutive_idx = i;
      consecutive_num = last_consecutive_idx + 1;
    }

  int new_vlen = vlen / consecutive_num;
  if (last_consecutive_idx < 0 || consecutive_num == vlen
      || !pow2p_hwi (consecutive_num) || !pow2p_hwi (new_vlen))
    return false;

  /* VEC_PERM <..., (index, index + 1, ... index + consecutive_num - 1)>.
     All elements of index, index + 1, ... index + consecutive_num - 1 should
     be located in the same vector.  */
  if (maybe_ge (d->perm[0], vec_len)
      != maybe_ge (d->perm[last_consecutive_idx], vec_len))
    return false;

  /* If a vector has 8 elements, we allow optimizations on consecutive
     patterns e.g. <0, 1, 2, 3, 0, 1, 2, 3> or <4, 5, 6, 7, 4, 5, 6, 7>.
     Other patterns like <2, 3, 4, 5, 2, 3, 4, 5> are not feasible patterns
     to be optimized.  */
  if (d->perm[0].to_constant () % consecutive_num != 0)
    return false;

  unsigned int container_bits = consecutive_num * GET_MODE_BITSIZE (smode);
  if (container_bits > 64)
    return false;
  else if (container_bits == 64)
    {
      if (!TARGET_VECTOR_ELEN_64)
        return false;
      else if (FLOAT_MODE_P (smode) && !TARGET_VECTOR_ELEN_FP_64)
        return false;
    }

  /* Check the rest of the elements are the same consecutive pattern.  */
  for (int i = consecutive_num; i < vlen; i++)
    if (maybe_ne (d->perm[i], d->perm[i % consecutive_num]))
      return false;

  if (FLOAT_MODE_P (smode))
    smode = float_mode_for_size (container_bits).require ();
  else
    smode = int_mode_for_size (container_bits, 0).require ();
  if (!get_vector_mode (smode, new_vlen).exists (&vmode))
    return false;
  machine_mode sel_mode = related_int_vector_mode (vmode).require ();

  /* Success!  */
  if (d->testing_p)
    return true;

  int index = elt / consecutive_num;
  if (index >= new_vlen)
    index = index - new_vlen;
  rtx sel = gen_const_vector_dup (sel_mode, index);
  rtx op = elt >= vlen ? d->op0 : d->op1;
  emit_vlmax_gather_insn (gen_lowpart (vmode, d->target),
                          gen_lowpart (vmode, op), sel);
  return true;
}
/* Recognize the patterns that we can use a compress operation to shuffle the
   vectors.  The perm selector of a compress pattern is divided into 2 parts:
   The first part is the random index numbers < NUNITS.
   The second part is the consecutive last N index numbers >= NUNITS.

   E.g.
   v = VEC_PERM_EXPR (v0, v1, selector),
   selector = { 0, 2, 6, 7 }

   We can transform such pattern into:

   op1 = vcompress (op0, mask)
   mask = { 1, 0, 1, 0 }
   v = op1.  */
static bool
shuffle_compress_patterns (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  poly_int64 vec_len = d->perm.length ();

  if (!vec_len.is_constant ())
    return false;

  int vlen = vec_len.to_constant ();

  /* It's not worthwhile when the compress pattern has fewer than 4
     elements, and we can't modulo indices for the compress pattern.  */
  if (known_ge (d->perm[vlen - 1], vlen * 2) || vlen < 4)
    return false;

  /* Compress pattern doesn't work for one vector.  */
  if (d->one_vector_p)
    return false;

  /* The compress point is the point where all selector elements with index
     i >= compress point are a consecutive increasing series with each
     selector value >= NUNITS.  In this case, we could compress all elements
     of i < compress point into op1.  */
  int compress_point = -1;
  for (int i = 0; i < vlen; i++)
    {
      if (compress_point < 0 && known_ge (d->perm[i], vec_len))
        {
          compress_point = i;
          break;
        }
    }

  /* We don't apply the compress approach if we can't find the compress
     point.  */
  if (compress_point < 0)
    return false;

  /* We can only apply the compress approach when all index values from 0 to
     the compress point are increasing.  */
  for (int i = 1; i < compress_point; i++)
    if (maybe_le (d->perm[i], d->perm[i - 1]))
      return false;

  /* It must be a series increasing from the compress point.  */
  for (int i = 1 + compress_point; i < vlen; i++)
    if (maybe_ne (d->perm[i], d->perm[i - 1] + 1))
      return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  /* Check whether we need to slide op1 up to apply the compress approach.

     E.g. For index = { 0, 2, 6, 7}, since d->perm[i - 1] = 7 which
          is 2 * NUNITS - 1, we don't need to slide up.

          For index = { 0, 2, 5, 6}, we need to slide op1 up before
          we apply the compress approach.  */
  bool need_slideup_p = maybe_ne (d->perm[vlen - 1], 2 * vec_len - 1)
                        && !const_vec_duplicate_p (d->op1);

  /* If we leave it directly to be handled by the general gather,
     the code sequence will be:
       VECTOR LOAD  selector
       GEU          mask, selector, NUNITS
       GATHER       dest, op0, selector
       SUB          selector, selector, NUNITS
       GATHER       dest, op1, selector, mask
     Each ALU operation is considered as COST = 1 and VECTOR LOAD is
     considered as COST = 4.  So, we consider the general gather handling
     COST = 9.
     TODO: This cost is not accurate, we can adjust it by tune info.  */
  int general_cost = 9;

  /* If we can use the compress approach, the code sequence will be:
       MASK LOAD    mask
       COMPRESS     op1, op0, mask
     If it needs a slide up, it will be:
       MASK LOAD    mask
       SLIDEUP      op1
       COMPRESS     op1, op0, mask
     By default, mask load COST = 2.
     TODO: This cost is not accurate, we can adjust it by tune info.  */
  int compress_cost = 4;

  if (general_cost <= compress_cost)
    return false;

  /* Build a mask that is true when the selector element is true.  */
  machine_mode mask_mode = get_mask_mode (vmode);
  rvv_builder builder (mask_mode, vlen, 1);
  for (int i = 0; i < vlen; i++)
    {
      bool is_compress_index = false;
      for (int j = 0; j < compress_point; j++)
        {
          if (known_eq (d->perm[j], i))
            {
              is_compress_index = true;
              break;
            }
        }
      if (is_compress_index)
        builder.quick_push (CONST1_RTX (BImode));
      else
        builder.quick_push (CONST0_RTX (BImode));
    }
  rtx mask = force_reg (mask_mode, builder.build ());

  rtx merge = d->op1;
  if (need_slideup_p)
    {
      int slideup_cnt = vlen - (d->perm[vlen - 1].to_constant () % vlen) - 1;
      merge = gen_reg_rtx (vmode);
      rtx ops[] = {merge, d->op1, gen_int_mode (slideup_cnt, Pmode)};
      insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
      emit_vlmax_insn (icode, BINARY_OP, ops);
    }

  insn_code icode = code_for_pred_compress (vmode);
  rtx ops[] = {d->target, merge, d->op0, mask};
  emit_vlmax_insn (icode, COMPRESS_OP_MERGE, ops);
  return true;
}
/* Recognize decompress patterns:

   1. VEC_PERM_EXPR op0 and op1
      with isel = { 0, nunits, 1, nunits + 1, ... }.
      Decompress op0 and op1 vectors with the mask = { 0, 1, 0, 1, ... }.

   2. VEC_PERM_EXPR op0 and op1
      with isel = { 1/2 nunits, 3/2 nunits, 1/2 nunits+1, 3/2 nunits+1,... }.
      Slide down op0 and op1 with OFFSET = 1/2 nunits.
      Decompress op0 and op1 vectors with the mask = { 0, 1, 0, 1, ... }.  */
static bool
shuffle_decompress_patterns (struct expand_vec_perm_d *d)
{
  poly_uint64 nelt = d->perm.length ();
  machine_mode mask_mode = get_mask_mode (d->vmode);

  /* For constant size indices, we don't need to handle them here.
     Just leave them to vec_perm<mode>.  */
  if (d->perm.length ().is_constant ())
    return false;

  poly_uint64 first = d->perm[0];
  if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
      || !d->perm.series_p (0, 2, first, 1)
      || !d->perm.series_p (1, 2, first + nelt, 1))
    return false;

  /* Permuting two SEW8 variable-length vectors needs vrgatherei16.vv.
     Otherwise, it could overflow the index range.  */
  machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
  if (GET_MODE_INNER (d->vmode) == QImode
      && !get_vector_mode (HImode, nelt).exists (&sel_mode))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  rtx op0, op1;
  if (known_eq (first, 0U))
    {
      op0 = d->op0;
      op1 = d->op1;
    }
  else
    {
      op0 = gen_reg_rtx (d->vmode);
      op1 = gen_reg_rtx (d->vmode);
      insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, d->vmode);
      rtx ops0[] = {op0, d->op0, gen_int_mode (first, Pmode)};
      rtx ops1[] = {op1, d->op1, gen_int_mode (first, Pmode)};
      emit_vlmax_insn (icode, BINARY_OP, ops0);
      emit_vlmax_insn (icode, BINARY_OP, ops1);
    }

  /* Generate { 0, 1, .... } mask.  */
  rtx vid = gen_reg_rtx (sel_mode);
  rtx vid_repeat = gen_reg_rtx (sel_mode);
  expand_vec_series (vid, const0_rtx, const1_rtx);
  rtx and_ops[] = {vid_repeat, vid, const1_rtx};
  emit_vlmax_insn (code_for_pred_scalar (AND, sel_mode), BINARY_OP, and_ops);
  rtx const_vec = gen_const_vector_dup (sel_mode, 1);
  rtx mask = gen_reg_rtx (mask_mode);
  expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
  emit_vlmax_decompress_insn (d->target, op0, op1, mask);
  return true;
}
/* Recognize the byte-swap (__builtin_bswap) permutation pattern.  */
static bool
shuffle_bswap_pattern (struct expand_vec_perm_d *d)
{
  HOST_WIDE_INT diff;
  unsigned i, size, step;

  if (!d->one_vector_p || !d->perm[0].is_constant (&diff) || !diff)
    return false;

  step = diff + 1;
  size = step * GET_MODE_UNIT_BITSIZE (d->vmode);

  switch (size)
    {
    case 16:
      break;
    case 32:
    case 64:
      /* Keep using vrgather in those cases.  */
      return false;
    default:
      return false;
    }

  /* We will have a VEC_PERM_EXPR after rtl expand when invoking
     __builtin_bswap.  The vrgatherei16.vv based expansion generates about
     9 instructions in a loop, no matter whether it is bswap16, bswap32 or
     bswap64.  But for bswap16 we can have an even simpler code sequence,
     with only 7 instructions in the loop (two shifts and an or).
     Unfortunately, that style grows to 13 and 24 instructions in the loop
     for bswap32 and bswap64.  Thus, we keep vrgather (9 insns) for both
     bswap64 and bswap32, but take the shift-and-or approach (7 insns)
     for bswap16.  */

  for (i = 0; i < step; i++)
    if (!d->perm.series_p (i, step, diff - i, step))
      return false;

  /* Disable when nunits < 4 since the later generic approach
     is more profitable on BSWAP.  */
  if (!known_gt (GET_MODE_NUNITS (d->vmode), 2))
    return false;

  if (d->testing_p)
    return true;

  machine_mode vhi_mode;
  poly_uint64 vhi_nunits = exact_div (GET_MODE_NUNITS (d->vmode), 2);

  if (!get_vector_mode (HImode, vhi_nunits).exists (&vhi_mode))
    return false;

  /* Step-1: Move op0 to src with VHI mode.  */
  rtx src = gen_reg_rtx (vhi_mode);
  emit_move_insn (src, gen_lowpart (vhi_mode, d->op0));

  /* Step-2: Shift right 8 bits to dest.  */
  rtx dest = expand_binop (vhi_mode, lshr_optab, src, gen_int_mode (8, Pmode),
                           NULL_RTX, 0, OPTAB_DIRECT);

  /* Step-3: Shift left 8 bits to src.  */
  src = expand_binop (vhi_mode, ashl_optab, src, gen_int_mode (8, Pmode),
                      NULL_RTX, 0, OPTAB_DIRECT);

  /* Step-4: Logic Or dest and src to dest.  */
  dest = expand_binop (vhi_mode, ior_optab, dest, src,
                       NULL_RTX, 0, OPTAB_DIRECT);

  /* Step-5: Move dest to target with VQI mode.  */
  emit_move_insn (d->target, gen_lowpart (d->vmode, dest));

  return true;
}
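
/* E.g. (illustrative): for bswap16 on vector v8, reinterpreted in HImode,
   this emits roughly
     vsrl.vi  v10,v8,8
     vsll.vi  v8,v8,8
     vor.vv   v8,v8,v10
   swapping the two bytes of every 16-bit element.  */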
/* Recognize the pattern that can be shuffled by the vec_extract and
   slide1up approach.  */
static bool
shuffle_extract_and_slide1up_patterns (struct expand_vec_perm_d *d)
{
  poly_int64 nunits = GET_MODE_NUNITS (d->vmode);

  /* Recognize { nunits - 1, nunits, nunits + 1, ... }.  */
  if (!d->perm.series_p (0, 2, nunits - 1, 2)
      || !d->perm.series_p (1, 2, nunits, 2))
    return false;

  /* Disable when nunits < 4 since the later generic approach
     is more profitable on indices = { nunits - 1, nunits }.  */
  if (!known_gt (nunits, 2))
    return false;

  if (d->testing_p)
    return true;

  /* Extract the last element of the first vector.  */
  scalar_mode smode = GET_MODE_INNER (d->vmode);
  rtx tmp = gen_reg_rtx (smode);
  emit_vec_extract (tmp, d->op0, gen_int_mode (nunits - 1, Pmode));

  /* Insert the scalar into element 0.  */
  unsigned int unspec
    = FLOAT_MODE_P (d->vmode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
  insn_code icode = code_for_pred_slide (unspec, d->vmode);
  rtx ops[] = {d->target, d->op1, tmp};
  emit_vlmax_insn (icode, BINARY_OP, ops);
  return true;
}
/* Recognize a selector that forms an arithmetic series and implement it
   with an expanded series followed by a gather.  */
static bool
shuffle_series_patterns (struct expand_vec_perm_d *d)
{
  if (!d->one_vector_p || d->perm.encoding ().npatterns () != 1)
    return false;

  poly_int64 el1 = d->perm[0];
  poly_int64 el2 = d->perm[1];
  poly_int64 el3 = d->perm[2];

  poly_int64 step1 = el2 - el1;
  poly_int64 step2 = el3 - el2;

  bool need_insert = false;
  bool have_series = false;

  /* Check for a full series.  */
  if (known_ne (step1, 0) && d->perm.series_p (0, 1, el1, step1))
    have_series = true;

  /* Check for a series starting at the second element.  */
  else if (known_ne (step2, 0) && d->perm.series_p (1, 1, el2, step2))
    {
      have_series = true;
      need_insert = true;
    }

  if (!have_series)
    return false;

  /* Disable shuffle if we can't find an appropriate integer index mode for
     gather.  */
  machine_mode sel_mode;
  if (!get_gather_index_mode (d).exists (&sel_mode))
    return false;

  if (d->testing_p)
    return true;

  /* Create the series.  */
  machine_mode eltmode = Pmode;
  rtx series = gen_reg_rtx (sel_mode);
  expand_vec_series (series, gen_int_mode (need_insert ? el2 : el1, eltmode),
                     gen_int_mode (need_insert ? step2 : step1, eltmode));

  /* Insert the remaining element if necessary.  */
  if (need_insert)
    {
      insn_code icode = code_for_pred_slide (UNSPEC_VSLIDE1UP, sel_mode);
      rtx ops[]
        = {series, series, gen_int_mode (el1, GET_MODE_INNER (sel_mode))};
      emit_vlmax_insn (icode, BINARY_OP, ops);
    }

  emit_vlmax_gather_insn (d->target, d->op0, series);

  return true;
}
/* Recognize the pattern that can be shuffled by the generic approach.  */
static bool
shuffle_generic_patterns (struct expand_vec_perm_d *d)
{
  machine_mode sel_mode;

  /* We don't enable SLP for non-power-of-2 NPATTERNS.  */
  if (!pow2p_hwi (d->perm.encoding ().npatterns ()))
    return false;

  /* Disable shuffle if we can't find an appropriate integer index mode for
     gather.  */
  if (!get_gather_index_mode (d).exists (&sel_mode))
    return false;

  if (d->testing_p)
    return true;

  rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
  /* Some FIXED-VLMAX/VLS vector permutation situations call the targethook
     instead of expanding vec_perm<mode>; we handle them directly.  */
  expand_vec_perm (d->target, d->op0, d->op1, sel);
  return true;
}
/* This function recognizes and supports different permutation patterns
   and enables VLA SLP auto-vectorization.  */
static bool
expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  gcc_assert (d->op_mode != E_VOIDmode);

  /* The pattern matching functions above are written to look for a small
     number to begin the sequence (0, 1, N/2).  If we begin with an index
     from the second operand, we can swap the operands.  */
  poly_int64 nelt = d->perm.length ();
  if (known_ge (d->perm[0], nelt))
    {
      d->perm.rotate_inputs (1);
      std::swap (d->op0, d->op1);
    }

  if (known_gt (nelt, 1))
    {
      if (d->vmode == d->op_mode)
        {
          if (shuffle_merge_patterns (d))
            return true;
          if (shuffle_consecutive_patterns (d))
            return true;
          if (shuffle_compress_patterns (d))
            return true;
          if (shuffle_decompress_patterns (d))
            return true;
          if (shuffle_bswap_pattern (d))
            return true;
          if (shuffle_extract_and_slide1up_patterns (d))
            return true;
          if (shuffle_series_patterns (d))
            return true;
          if (shuffle_generic_patterns (d))
            return true;
        }
    }
  return false;
}
/* This function implements TARGET_VECTORIZE_VEC_PERM_CONST by using RVV
   instructions.  */
bool
expand_vec_perm_const (machine_mode vmode, machine_mode op_mode, rtx target,
                       rtx op0, rtx op1, const vec_perm_indices &sel)
{
  /* RVV doesn't have Mask type pack/unpack instructions and we don't use
     masks to do the iteration loop control.  Just disable it directly.  */
  if (GET_MODE_CLASS (vmode) == MODE_VECTOR_BOOL)
    return false;
  /* FIXME: Explicitly disable VLA interleave SLP vectorization when we
     may encounter ICE for poly size (1, 1) vectors in loop vectorizer.
     Ideally, the middle-end loop vectorizer should be able to disable it
     itself; we can remove the code here when the middle-end code is able
     to disable VLA SLP vectorization for poly size (1, 1) VF.  */
  if (!BYTES_PER_RISCV_VECTOR.is_constant ()
      && maybe_lt (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL,
                   poly_int64 (16, 16)))
    return false;

  struct expand_vec_perm_d d;

  /* Check whether the mask can be applied to a single vector.  */
  if (sel.ninputs () == 1 || (op0 && rtx_equal_p (op0, op1)))
    d.one_vector_p = true;
  else if (sel.all_from_input_p (0))
    {
      d.one_vector_p = true;
      op1 = op0;
    }
  else if (sel.all_from_input_p (1))
    {
      d.one_vector_p = true;
      op0 = op1;
    }
  else
    d.one_vector_p = false;

  d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
                     sel.nelts_per_input ());
  d.vmode = vmode;
  d.op_mode = op_mode;
  d.target = target;
  d.op0 = op0 ? force_reg (op_mode, op0) : NULL_RTX;
  if (op0 == op1)
    d.op1 = d.op0;
  else
    d.op1 = op1 ? force_reg (op_mode, op1) : NULL_RTX;
  d.testing_p = !target;

  if (!d.testing_p)
    return expand_vec_perm_const_1 (&d);

  rtx_insn *last = get_last_insn ();
  bool ret = expand_vec_perm_const_1 (&d);
  gcc_assert (last == get_last_insn ());

  return ret;
}
/* Generate a no-side-effects vsetvl to get the vector length.  */
void
expand_select_vl (rtx *ops)
{
  poly_int64 nunits = rtx_to_poly_int64 (ops[2]);
  if (CONST_INT_P (ops[1]) && known_le (INTVAL (ops[1]), nunits))
    {
      /* If the length is known <= VF, we just use the length directly
         instead of using vsetvli.

         E.g. _255 = .SELECT_VL (3, POLY_INT_CST [4, 4]);
         We move 3 into _255 instead of using an explicit vsetvl.  */
      emit_move_insn (ops[0], ops[1]);
      return;
    }

  /* We arbitrarily picked QImode as the inner scalar mode to get a vector
     mode, since vsetvl only demands the ratio.  We let the VSETVL PASS
     optimize it.  */
  scalar_int_mode mode = QImode;
  machine_mode rvv_mode = get_vector_mode (mode, nunits).require ();
  emit_insn (gen_no_side_effects_vsetvl_rtx (rvv_mode, ops[0], ops[1]));
}
/* Expand MASK_LEN_{LOAD,STORE}.  */
void
expand_load_store (rtx *ops, bool is_load)
{
  rtx mask = ops[2];
  rtx len = ops[3];
  machine_mode mode = GET_MODE (ops[0]);

  if (is_vlmax_len_p (mode, len))
    {
      /* If the length operand is equal to VF, it is a VLMAX load/store.  */
      if (is_load)
        {
          rtx m_ops[] = {ops[0], mask, ops[1]};
          emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP_TAMA, m_ops);
        }
      else
        {
          len = gen_reg_rtx (Pmode);
          emit_vlmax_vsetvl (mode, len);
          emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len,
                                     get_avl_type_rtx (VLMAX)));
        }
    }
  else
    {
      if (!satisfies_constraint_K (len))
        len = force_reg (Pmode, len);
      if (is_load)
        {
          rtx m_ops[] = {ops[0], mask, ops[1]};
          emit_nonvlmax_insn (code_for_pred_mov (mode), UNARY_OP_TAMA, m_ops,
                              len);
        }
      else
        emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len,
                                   get_avl_type_rtx (NONVLMAX)));
    }
}
/* Return true if the operation is a floating-point operation that needs
   FRM.  */
static bool
needs_fp_rounding (unsigned icode, machine_mode mode)
{
  if (!FLOAT_MODE_P (mode))
    return false;

  return icode != maybe_code_for_pred (SMIN, mode)
         && icode != maybe_code_for_pred (UNSPEC_VFMIN, mode)
         && icode != maybe_code_for_pred (SMAX, mode)
         && icode != maybe_code_for_pred (UNSPEC_VFMAX, mode)
         && icode != maybe_code_for_pred (NEG, mode)
         && icode != maybe_code_for_pred (ABS, mode)
         /* narrower-FP -> FP  */
         && icode != maybe_code_for_pred_extend (mode)
         /* narrower-INT -> FP  */
         && icode != maybe_code_for_pred_widen (FLOAT, mode)
         && icode != maybe_code_for_pred_widen (UNSIGNED_FLOAT, mode)
         && icode != maybe_code_for_pred (UNSPEC_VCOPYSIGN, mode)
         && icode != maybe_code_for_pred_mov (mode);
}
/* Subroutine to expand COND_LEN_* patterns.  */
static void
expand_cond_len_op (unsigned icode, insn_flags op_type, rtx *ops, rtx len)
{
  rtx dest = ops[0];
  rtx mask = ops[1];
  machine_mode mode = GET_MODE (dest);
  machine_mode mask_mode = GET_MODE (mask);
  bool is_dummy_mask = rtx_equal_p (mask, CONSTM1_RTX (mask_mode));
  bool is_vlmax_len = is_vlmax_len_p (mode, len);

  unsigned insn_flags = HAS_DEST_P | HAS_MASK_P | HAS_MERGE_P | op_type;
  /* FIXME: We don't support simplification of COND_LEN_NEG (..., dummy len,
     dummy mask) into NEG_EXPR in GIMPLE FOLD yet.  So, we do such
     simplification in the RISC-V backend and may do that in the middle-end
     in the future.  */
  if (is_dummy_mask && is_vlmax_len)
    insn_flags |= TDEFAULT_POLICY_P | MDEFAULT_POLICY_P;
  else if (is_dummy_mask)
    insn_flags |= TU_POLICY_P | MDEFAULT_POLICY_P;
  else if (is_vlmax_len)
    insn_flags |= TDEFAULT_POLICY_P | MU_POLICY_P;
  else
    insn_flags |= TU_POLICY_P | MU_POLICY_P;

  if (needs_fp_rounding (icode, mode))
    insn_flags |= FRM_DYN_P;

  if (is_vlmax_len)
    emit_vlmax_insn (icode, insn_flags, ops);
  else
    emit_nonvlmax_insn (icode, insn_flags, ops, len);
}
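
/* Summary of the policy selection above (illustrative):

     mask       length    tail policy   mask policy
     all-ones   VLMAX     default       default
     all-ones   partial   undisturbed   default
     real mask  VLMAX     default       undisturbed
     real mask  partial   undisturbed   undisturbed

   i.e. a partial length needs a tail-undisturbed (TU) policy so the tail
   elements keep the merge value, and a real mask needs a mask-undisturbed
   (MU) policy for the masked-off elements.  */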
/* Return RVV_VUNDEF if the ELSE value is a scratch rtx.  */
static rtx
get_else_operand (rtx op)
{
  return GET_CODE (op) == SCRATCH ? RVV_VUNDEF (GET_MODE (op)) : op;
}
/* Expand unary ops COND_LEN_*.  */
void
expand_cond_len_unop (unsigned icode, rtx *ops)
{
  rtx dest = ops[0];
  rtx mask = ops[1];
  rtx src = ops[2];
  rtx merge = get_else_operand (ops[3]);
  rtx len = ops[4];

  rtx cond_ops[] = {dest, mask, merge, src};
  expand_cond_len_op (icode, UNARY_OP_P, cond_ops, len);
}

/* Expand unary ops COND_*.  */
void
expand_cond_unop (unsigned icode, rtx *ops)
{
  rtx dest = ops[0];
  rtx mask = ops[1];
  rtx src = ops[2];
  rtx merge = get_else_operand (ops[3]);
  rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);

  rtx cond_ops[] = {dest, mask, merge, src};
  expand_cond_len_op (icode, UNARY_OP_P, cond_ops, len);
}

/* Expand binary ops COND_LEN_*.  */
void
expand_cond_len_binop (unsigned icode, rtx *ops)
{
  rtx dest = ops[0];
  rtx mask = ops[1];
  rtx src1 = ops[2];
  rtx src2 = ops[3];
  rtx merge = get_else_operand (ops[4]);
  rtx len = ops[5];

  rtx cond_ops[] = {dest, mask, merge, src1, src2};
  expand_cond_len_op (icode, BINARY_OP_P, cond_ops, len);
}

/* Expand binary ops COND_*.  */
void
expand_cond_binop (unsigned icode, rtx *ops)
{
  rtx dest = ops[0];
  rtx mask = ops[1];
  rtx src1 = ops[2];
  rtx src2 = ops[3];
  rtx merge = get_else_operand (ops[4]);
  rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);

  rtx cond_ops[] = {dest, mask, merge, src1, src2};
  expand_cond_len_op (icode, BINARY_OP_P, cond_ops, len);
}
/* Prepare the insn_code for gather_load/scatter_store according to
   the vector mode and index mode.  */
static insn_code
prepare_gather_scatter (machine_mode vec_mode, machine_mode idx_mode,
                        bool is_load)
{
  if (!is_load)
    return code_for_pred_indexed_store (UNSPEC_UNORDERED, vec_mode, idx_mode);
  else
    {
      unsigned src_eew_bitsize = GET_MODE_BITSIZE (GET_MODE_INNER (idx_mode));
      unsigned dst_eew_bitsize = GET_MODE_BITSIZE (GET_MODE_INNER (vec_mode));
      if (dst_eew_bitsize == src_eew_bitsize)
        return code_for_pred_indexed_load_same_eew (UNSPEC_UNORDERED,
                                                    vec_mode);
      else if (dst_eew_bitsize > src_eew_bitsize)
        {
          unsigned factor = dst_eew_bitsize / src_eew_bitsize;
          switch (factor)
            {
            case 2:
              return code_for_pred_indexed_load_x2_greater_eew (
                UNSPEC_UNORDERED, vec_mode);
            case 4:
              return code_for_pred_indexed_load_x4_greater_eew (
                UNSPEC_UNORDERED, vec_mode);
            case 8:
              return code_for_pred_indexed_load_x8_greater_eew (
                UNSPEC_UNORDERED, vec_mode);
            default:
              gcc_unreachable ();
            }
        }
      else
        {
          unsigned factor = src_eew_bitsize / dst_eew_bitsize;
          switch (factor)
            {
            case 2:
              return code_for_pred_indexed_load_x2_smaller_eew (
                UNSPEC_UNORDERED, vec_mode);
            case 4:
              return code_for_pred_indexed_load_x4_smaller_eew (
                UNSPEC_UNORDERED, vec_mode);
            case 8:
              return code_for_pred_indexed_load_x8_smaller_eew (
                UNSPEC_UNORDERED, vec_mode);
            default:
              gcc_unreachable ();
            }
        }
    }
}
/* Expand LEN_MASK_{GATHER_LOAD,SCATTER_STORE}.  */
void
expand_gather_scatter (rtx *ops, bool is_load)
{
  rtx ptr, vec_offset, vec_reg;
  bool zero_extend_p;
  int scale_log2;
  rtx mask = ops[5];
  rtx len = ops[6];
  if (is_load)
    {
      vec_reg = ops[0];
      ptr = ops[1];
      vec_offset = ops[2];
      zero_extend_p = INTVAL (ops[3]);
      scale_log2 = exact_log2 (INTVAL (ops[4]));
    }
  else
    {
      vec_reg = ops[4];
      ptr = ops[0];
      vec_offset = ops[1];
      zero_extend_p = INTVAL (ops[2]);
      scale_log2 = exact_log2 (INTVAL (ops[3]));
    }

  machine_mode vec_mode = GET_MODE (vec_reg);
  machine_mode idx_mode = GET_MODE (vec_offset);
  scalar_mode inner_idx_mode = GET_MODE_INNER (idx_mode);
  unsigned inner_offsize = GET_MODE_BITSIZE (inner_idx_mode);
  poly_int64 nunits = GET_MODE_NUNITS (vec_mode);
  bool is_vlmax = is_vlmax_len_p (vec_mode, len);

  /* Extend the offset element to address width.  */
  if (inner_offsize < BITS_PER_WORD)
    {
      /* 7.2. Vector Load/Store Addressing Modes.
         If the vector offset elements are narrower than XLEN, they are
         zero-extended to XLEN before adding to the ptr effective address.  If
         the vector offset elements are wider than XLEN, the least-significant
         XLEN bits are used in the address calculation.  An implementation
         must raise an illegal instruction exception if the EEW is not
         supported for offset elements.

         The RVV spec only refers to the scale_log2 == 0 case.  */
      if (!zero_extend_p || scale_log2 != 0)
        {
          if (zero_extend_p)
            inner_idx_mode
              = int_mode_for_size (inner_offsize * 2, 0).require ();
          else
            inner_idx_mode = int_mode_for_size (BITS_PER_WORD, 0).require ();
          machine_mode new_idx_mode
            = get_vector_mode (inner_idx_mode, nunits).require ();
          rtx tmp = gen_reg_rtx (new_idx_mode);
          emit_insn (gen_extend_insn (tmp, vec_offset, new_idx_mode, idx_mode,
                                      zero_extend_p ? true : false));
          vec_offset = tmp;
          idx_mode = new_idx_mode;
        }
    }

  if (scale_log2 != 0)
    {
      rtx tmp = expand_binop (idx_mode, ashl_optab, vec_offset,
                              gen_int_mode (scale_log2, Pmode), NULL_RTX, 0,
                              OPTAB_DIRECT);
      vec_offset = tmp;
    }

  insn_code icode = prepare_gather_scatter (vec_mode, idx_mode, is_load);
  if (is_vlmax)
    {
      if (is_load)
        {
          rtx load_ops[] = {vec_reg, mask, ptr, vec_offset};
          emit_vlmax_insn (icode, BINARY_OP_TAMA, load_ops);
        }
      else
        {
          rtx store_ops[] = {mask, ptr, vec_offset, vec_reg};
          emit_vlmax_insn (icode, SCATTER_OP_M, store_ops);
        }
    }
  else
    {
      if (is_load)
        {
          rtx load_ops[] = {vec_reg, mask, ptr, vec_offset};
          emit_nonvlmax_insn (icode, BINARY_OP_TAMA, load_ops, len);
        }
      else
        {
          rtx store_ops[] = {mask, ptr, vec_offset, vec_reg};
          emit_nonvlmax_insn (icode, SCATTER_OP_M, store_ops, len);
        }
    }
}
4101 expand_cond_len_ternop (unsigned icode
, rtx
*ops
)
4108 rtx merge
= get_else_operand (ops
[5]);
4111 rtx cond_ops
[] = {dest
, mask
, src1
, src2
, src3
, merge
};
4112 expand_cond_len_op (icode
, TERNARY_OP_P
, cond_ops
, len
);
4115 /* Expand COND_*. */
4117 expand_cond_ternop (unsigned icode
, rtx
*ops
)
4124 rtx merge
= get_else_operand (ops
[5]);
4125 rtx len
= gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest
)), Pmode
);
4127 rtx cond_ops
[] = {dest
, mask
, src1
, src2
, src3
, merge
};
4128 expand_cond_len_op (icode
, TERNARY_OP_P
, cond_ops
, len
);
/* Expand reduction operations.
   Case 1: ops = {scalar_dest, vector_src}
   Case 2: ops = {scalar_dest, vector_src, mask, vl}.  */
void
expand_reduction (unsigned unspec, unsigned insn_flags, rtx *ops, rtx init)
{
  rtx scalar_dest = ops[0];
  rtx vector_src = ops[1];
  machine_mode vmode = GET_MODE (vector_src);
  machine_mode vel_mode = GET_MODE (scalar_dest);
  machine_mode m1_mode = get_m1_mode (vel_mode).require ();

  rtx m1_tmp = gen_reg_rtx (m1_mode);
  rtx scalar_move_ops[] = {m1_tmp, init};
  emit_nonvlmax_insn (code_for_pred_broadcast (m1_mode), SCALAR_MOVE_OP,
		      scalar_move_ops,
		      need_mask_operand_p (insn_flags) ? ops[3]
						       : CONST1_RTX (Pmode));
  rtx m1_tmp2 = gen_reg_rtx (m1_mode);
  rtx reduc_ops[] = {m1_tmp2, vector_src, m1_tmp};
  insn_code icode = code_for_pred (unspec, vmode);

  if (need_mask_operand_p (insn_flags))
    {
      rtx mask_len_reduc_ops[] = {m1_tmp2, ops[2], vector_src, m1_tmp};
      emit_nonvlmax_insn (icode, insn_flags, mask_len_reduc_ops, ops[3]);
    }
  else
    emit_vlmax_insn (icode, insn_flags, reduc_ops);

  emit_insn (gen_pred_extract_first (m1_mode, scalar_dest, m1_tmp2));
}
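/* Illustrative only (an added sketch, not from the original source): for
   an unmasked integer sum reduction of a SImode vector this expands to
   roughly:

     vmv.s.x    v1, a1        ; broadcast INIT into an M1 scalar slot
     vredsum.vs v1, v2, v1    ; v1[0] = INIT + sum of active elements
     vmv.x.s    a0, v1        ; extract the first element  */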
/* Prepare ops for ternary operations.
   It can be called before or after RA.  */
void
prepare_ternary_operands (rtx *ops)
{
  machine_mode mode = GET_MODE (ops[0]);

  if (!rtx_equal_p (ops[5], RVV_VUNDEF (mode))
      && (VECTOR_MODE_P (GET_MODE (ops[2]))
	  && !rtx_equal_p (ops[2], ops[5]))
      && !rtx_equal_p (ops[3], ops[5])
      && !rtx_equal_p (ops[4], ops[5]))
    {
      /* RA will fail to find a vector REG and report an ICE, so we
	 pre-merge the ops for LMUL = 8.  */
      if (satisfies_constraint_Wc1 (ops[1]))
	{
	  emit_move_insn (ops[0], ops[5]);
	  emit_insn (gen_pred_mov (mode, ops[0], ops[1], ops[0], ops[4],
				   ops[6], ops[7], ops[8], ops[9]));
	}
      else
	emit_insn (gen_pred_merge (mode, ops[0], RVV_VUNDEF (mode), ops[5],
				   ops[4], ops[1], ops[6], ops[7], ops[9]));
      ops[5] = ops[4] = ops[0];
    }
  else
    {
      /* Swap the multiplication ops if the fallback value is the
	 second of the two.  */
      if (rtx_equal_p (ops[3], ops[5]))
	std::swap (ops[2], ops[3]);

      /* TODO: ??? Maybe we could support splitting FMA (a, 4, b)
	 into PLUS (ASHIFT (a, 2), b) according to uarchs.  */
    }
  gcc_assert (rtx_equal_p (ops[5], RVV_VUNDEF (mode))
	      || rtx_equal_p (ops[5], ops[2]) || rtx_equal_p (ops[5], ops[4]));
}
/* Expand VEC_MASK_LEN_{LOAD_LANES,STORE_LANES}.  */
void
expand_lanes_load_store (rtx *ops, bool is_load)
{
  rtx mask = ops[2];
  rtx len = ops[3];
  rtx addr = is_load ? XEXP (ops[1], 0) : XEXP (ops[0], 0);
  rtx reg = is_load ? ops[0] : ops[1];
  machine_mode mode = GET_MODE (ops[0]);

  if (is_vlmax_len_p (mode, len))
    {
      /* If the length operand is equal to VF, it is a VLMAX load/store.  */
      if (is_load)
	{
	  rtx m_ops[] = {reg, mask, addr};
	  emit_vlmax_insn (code_for_pred_unit_strided_load (mode),
			   UNARY_OP_TAMA, m_ops);
	}
      else
	{
	  len = gen_reg_rtx (Pmode);
	  emit_vlmax_vsetvl (mode, len);
	  emit_insn (gen_pred_unit_strided_store (mode, mask, addr, reg, len,
						  get_avl_type_rtx (VLMAX)));
	}
    }
  else
    {
      if (!satisfies_constraint_K (len))
	len = force_reg (Pmode, len);
      if (is_load)
	{
	  rtx m_ops[] = {reg, mask, addr};
	  emit_nonvlmax_insn (code_for_pred_unit_strided_load (mode),
			      UNARY_OP_TAMA, m_ops, len);
	}
      else
	emit_insn (gen_pred_unit_strided_store (mode, mask, addr, reg, len,
						get_avl_type_rtx (NONVLMAX)));
    }
}
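/* A hedged example (added here, not from the original source): for a
   2-tuple of SImode vectors the unit-strided patterns map to the
   segment load/store instructions, e.g.

     vsetvli     zero, a2, e32, m1, ta, ma
     vlseg2e32.v v4, (a0), v0.t   ; load {v4, v5} de-interleaved  */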
/* Expand LEN_FOLD_EXTRACT_LAST.  */
void
expand_fold_extract_last (rtx *ops)
{
  rtx dst = ops[0];
  rtx default_value = ops[1];
  rtx mask = ops[2];
  rtx anchor = gen_reg_rtx (Pmode);
  rtx index = gen_reg_rtx (Pmode);
  rtx vect = ops[3];
  rtx else_label = gen_label_rtx ();
  rtx end_label = gen_label_rtx ();
  rtx len = ops[4];
  machine_mode mode = GET_MODE (vect);
  machine_mode mask_mode = GET_MODE (mask);
  rtx compress_vect = gen_reg_rtx (mode);
  rtx slide_vect = gen_reg_rtx (mode);
  insn_code icode;

  if (is_vlmax_len_p (mode, len))
    len = NULL_RTX;

  /* Calculate the number of 1-bits in the mask.  */
  rtx cpop_ops[] = {anchor, mask};
  if (len)
    emit_nonvlmax_insn (code_for_pred_popcount (mask_mode, Pmode), CPOP_OP,
			cpop_ops, len);
  else
    emit_vlmax_insn (code_for_pred_popcount (mask_mode, Pmode), CPOP_OP,
		     cpop_ops);

  riscv_expand_conditional_branch (else_label, EQ, anchor, const0_rtx);
  emit_insn (gen_rtx_SET (index, gen_rtx_PLUS (Pmode, anchor, constm1_rtx)));
  /* Compress the vector.  */
  icode = code_for_pred_compress (mode);
  rtx compress_ops[] = {compress_vect, vect, mask};
  if (len)
    emit_nonvlmax_insn (icode, COMPRESS_OP, compress_ops, len);
  else
    emit_vlmax_insn (icode, COMPRESS_OP, compress_ops);
  /* Emit the slide down to index 0 in a new vector.  */
  rtx slide_ops[] = {slide_vect, compress_vect, index};
  icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, mode);
  if (len)
    emit_nonvlmax_insn (icode, BINARY_OP, slide_ops, len);
  else
    emit_vlmax_insn (icode, BINARY_OP, slide_ops);
  /* Emit v(f)mv.[xf].s.  */
  emit_insn (gen_pred_extract_first (mode, dst, slide_vect));

  emit_jump_insn (gen_jump (end_label));
  emit_barrier ();

  emit_label (else_label);
  emit_move_insn (dst, default_value);
  emit_label (end_label);
}
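/* Sketch of the emitted control flow (an illustrative assumption, not
   from the original source):

     vcpop.m       a5, v0           ; count of active lanes
     beqz          a5, .Lelse       ; no active lane -> default value
     addi          a5, a5, -1
     vcompress.vm  v2, v1, v0       ; pack active elements to the front
     vslidedown.vx v2, v2, a5       ; move the last active one to lane 0
     vmv.x.s       a0, v2
     j             .Lend
   .Lelse:
     mv            a0, a1           ; default_value
   .Lend:                                                               */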
/* Return true if the LMUL of the comparison is less than or equal to one.  */
bool
cmp_lmul_le_one (machine_mode mode)
{
  if (riscv_v_ext_vector_mode_p (mode))
    return known_le (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR);
  else if (riscv_v_ext_vls_mode_p (mode))
    return known_le (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN);
  return false;
}

/* Return true if the LMUL of the comparison is greater than one.  */
bool
cmp_lmul_gt_one (machine_mode mode)
{
  if (riscv_v_ext_vector_mode_p (mode))
    return known_gt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR);
  else if (riscv_v_ext_vls_mode_p (mode))
    return known_gt (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN);
  return false;
}
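/* For instance (an added illustration): with TARGET_MIN_VLEN = 128, an
   RVVM2SImode vector occupies two vector registers (LMUL = 2), so
   cmp_lmul_gt_one holds for it, while RVVM1SImode and the fractional
   RVVMF2SImode satisfy cmp_lmul_le_one.  */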
/* Return true if the VLS mode is legal.  There are 2 cases here.

   1. Enable VLS modes for VLA vectorization, since the fixed-length VLMAX
      mode is the highest-priority choice and should not conflict with VLS
      modes.
   2. Enable VLS modes for some cases in fixed-vlmax, i.e. when the bitsize
      of the VLS mode is smaller than the minimal VLA mode.

   Take vlen = 2048 as an example for case 2; the table below is based on
   vlen = 2048.

   +----------------------------------------------------+----------------------+
   | VLS mode                                           | VLA mode             |
   +----------------------------------------------------+----------------------+
   | Name       | Precision | Inner Precision | Enabled | Min mode  | Min bits |
   +------------+-----------+-----------------+---------+-----------+----------+
   | V1BI       | 1         | 1               | Yes     | RVVMF64BI | 32       |
   | V2BI       | 2         | 1               | Yes     | RVVMF64BI | 32       |
   | V4BI       | 4         | 1               | Yes     | RVVMF64BI | 32       |
   | V8BI       | 8         | 1               | Yes     | RVVMF64BI | 32       |
   | V16BI      | 16        | 1               | Yes     | RVVMF64BI | 32       |
   | V32BI      | 32        | 1               | NO      | RVVMF64BI | 32       |
   | V64BI      | 64        | 1               | NO      | RVVMF64BI | 32       |
   | ...        | ...       | ...             | ...     | RVVMF64BI | 32       |
   | V4096BI    | 4096      | 1               | NO      | RVVMF64BI | 32       |
   +------------+-----------+-----------------+---------+-----------+----------+
   | V1QI       | 8         | 8               | Yes     | RVVMF8QI  | 256      |
   | V2QI       | 16        | 8               | Yes     | RVVMF8QI  | 256      |
   | V4QI       | 32        | 8               | Yes     | RVVMF8QI  | 256      |
   | V8QI       | 64        | 8               | Yes     | RVVMF8QI  | 256      |
   | V16QI      | 128       | 8               | Yes     | RVVMF8QI  | 256      |
   | V32QI      | 256       | 8               | NO      | RVVMF8QI  | 256      |
   | V64QI      | 512       | 8               | NO      | RVVMF8QI  | 256      |
   | ...        | ...       | ..              | ...     | RVVMF8QI  | 256      |
   | V4096QI    | 32768     | 8               | NO      | RVVMF8QI  | 256      |
   +------------+-----------+-----------------+---------+-----------+----------+
   | V1HI       | 16        | 16              | Yes     | RVVMF4HI  | 512      |
   | V2HI       | 32        | 16              | Yes     | RVVMF4HI  | 512      |
   | V4HI       | 64        | 16              | Yes     | RVVMF4HI  | 512      |
   | V8HI       | 128       | 16              | Yes     | RVVMF4HI  | 512      |
   | V16HI      | 256       | 16              | Yes     | RVVMF4HI  | 512      |
   | V32HI      | 512       | 16              | NO      | RVVMF4HI  | 512      |
   | V64HI      | 1024      | 16              | NO      | RVVMF4HI  | 512      |
   | ...        | ...       | ..              | ...     | RVVMF4HI  | 512      |
   | V2048HI    | 32768     | 16              | NO      | RVVMF4HI  | 512      |
   +------------+-----------+-----------------+---------+-----------+----------+
   | V1SI/SF    | 32        | 32              | Yes     | RVVMF2SI  | 1024     |
   | V2SI/SF    | 64        | 32              | Yes     | RVVMF2SI  | 1024     |
   | V4SI/SF    | 128       | 32              | Yes     | RVVMF2SI  | 1024     |
   | V8SI/SF    | 256       | 32              | Yes     | RVVMF2SI  | 1024     |
   | V16SI/SF   | 512       | 32              | Yes     | RVVMF2SI  | 1024     |
   | V32SI/SF   | 1024      | 32              | NO      | RVVMF2SI  | 1024     |
   | V64SI/SF   | 2048      | 32              | NO      | RVVMF2SI  | 1024     |
   | ...        | ...       | ..              | ...     | RVVMF2SI  | 1024     |
   | V1024SI/SF | 32768     | 32              | NO      | RVVMF2SI  | 1024     |
   +------------+-----------+-----------------+---------+-----------+----------+
   | V1DI/DF    | 64        | 64              | Yes     | RVVM1DI   | 2048     |
   | V2DI/DF    | 128       | 64              | Yes     | RVVM1DI   | 2048     |
   | V4DI/DF    | 256       | 64              | Yes     | RVVM1DI   | 2048     |
   | V8DI/DF    | 512       | 64              | Yes     | RVVM1DI   | 2048     |
   | V16DI/DF   | 1024      | 64              | Yes     | RVVM1DI   | 2048     |
   | V32DI/DF   | 2048      | 64              | NO      | RVVM1DI   | 2048     |
   | V64DI/DF   | 4096      | 64              | NO      | RVVM1DI   | 2048     |
   | ...        | ...       | ..              | ...     | RVVM1DI   | 2048     |
   | V512DI/DF  | 32768     | 64              | NO      | RVVM1DI   | 2048     |
   +------------+-----------+-----------------+---------+-----------+----------+

   This gives the condition for a VLS mode in fixed-vlmax:
     PRECISION (VLSmode) < VLEN / (64 / PRECISION (VLS_inner_mode)).  */
bool
vls_mode_valid_p (machine_mode vls_mode)
{
  if (riscv_autovec_preference == RVV_SCALABLE)
    {
      if (GET_MODE_CLASS (vls_mode) != MODE_VECTOR_BOOL
	  && !ordered_p (TARGET_MAX_LMUL * BITS_PER_RISCV_VECTOR,
			 GET_MODE_PRECISION (vls_mode)))
	/* We enable VLS modes which are aligned with TARGET_MAX_LMUL and
	   BITS_PER_RISCV_VECTOR.

	   e.g. when TARGET_MAX_LMUL = 1 and BITS_PER_RISCV_VECTOR =
	   (128, 128), we enable VLS modes that have a fixed size <= 128
	   bits.  Since ordered_p is false between VLA modes with size =
	   (128, 128) bits and a VLS mode with size = 128 bits, we would
	   otherwise end up with multiple ICEs in middle-end generic code.  */
	return false;
      return true;
    }

  if (riscv_autovec_preference == RVV_FIXED_VLMAX)
    {
      machine_mode inner_mode = GET_MODE_INNER (vls_mode);
      int precision = GET_MODE_PRECISION (inner_mode).to_constant ();
      int min_vlmax_bitsize = TARGET_MIN_VLEN / (64 / precision);

      return GET_MODE_PRECISION (vls_mode).to_constant () < min_vlmax_bitsize;
    }

  return false;
}
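/* Worked example of the fixed-vlmax condition above (an added
   illustration): with TARGET_MIN_VLEN = 2048, V16QI has precision 128
   and inner precision 8, so min_vlmax_bitsize = 2048 / (64 / 8) = 256
   and 128 < 256 enables the mode; V32QI has precision 256, and since
   256 < 256 is false it stays disabled, matching the table above.  */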
/* We don't have to convert the floating point to integer when the
   fractional part is zero.  Thus, there is a limit for the half, single
   and double precision floating point: a value whose magnitude is greater
   than or equal to the limit has no fractional part at all.

   1. Half floating point.
      +-----------+---------------+
      | float     | binary layout |
      +-----------+---------------+
      | 1023.5    | 0x63ff        |
      +-----------+---------------+
      | 1024.0    | 0x6400        |
      +-----------+---------------+
      | 1025.0    | 0x6401        |
      +-----------+---------------+
      ...

      All half floating point will be unchanged for ceil if it is
      greater than or equal to 1024.

   2. Single floating point.
      +-----------+---------------+
      | float     | binary layout |
      +-----------+---------------+
      | 8388607.5 | 0x4affffff    |
      +-----------+---------------+
      | 8388608.0 | 0x4b000000    |
      +-----------+---------------+
      | 8388609.0 | 0x4b000001    |
      +-----------+---------------+
      ...

      All single floating point will be unchanged for ceil if it is
      greater than or equal to 8388608.

   3. Double floating point.
      +--------------------+--------------------+
      | float              | binary layout      |
      +--------------------+--------------------+
      | 4503599627370495.5 | 0x432fffffffffffff |
      +--------------------+--------------------+
      | 4503599627370496.0 | 0x4330000000000000 |
      +--------------------+--------------------+
      | 4503599627370497.0 | 0x4330000000000001 |
      +--------------------+--------------------+
      ...

      All double floating point will be unchanged for ceil if it is
      greater than or equal to 4503599627370496.  */
static rtx
get_fp_rounding_coefficient (machine_mode inner_mode)
{
  REAL_VALUE_TYPE real;

  if (inner_mode == E_HFmode)
    real_from_integer (&real, inner_mode, 1024, SIGNED);
  else if (inner_mode == E_SFmode)
    real_from_integer (&real, inner_mode, 8388608, SIGNED);
  else if (inner_mode == E_DFmode)
    real_from_integer (&real, inner_mode, 4503599627370496, SIGNED);
  else
    gcc_unreachable ();

  return const_double_from_real_value (real, inner_mode);
}
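/* Sanity check of the limits above (an added note): the limit is
   2^(number of mantissa bits), i.e. 2^10 = 1024 for HFmode, 2^23 =
   8388608 for SFmode and 2^52 = 4503599627370496 for DFmode.  Any value
   whose magnitude reaches that limit has a ULP of at least one, hence no
   fractional part, and can safely skip the round-trip conversion.  */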
static rtx
emit_vec_float_cmp_mask (rtx fp_vector, rtx_code code, rtx fp_scalar,
			 machine_mode vec_fp_mode)
{
  /* Step-1: Prepare the scalar float compare register.  */
  rtx fp_reg = gen_reg_rtx (GET_MODE_INNER (vec_fp_mode));
  emit_insn (gen_move_insn (fp_reg, fp_scalar));

  /* Step-2: Generate the mask.  */
  machine_mode mask_mode = get_mask_mode (vec_fp_mode);
  rtx mask = gen_reg_rtx (mask_mode);
  rtx cmp = gen_rtx_fmt_ee (code, mask_mode, fp_vector, fp_reg);
  rtx cmp_ops[] = {mask, cmp, fp_vector, fp_reg};
  insn_code icode = code_for_pred_cmp_scalar (vec_fp_mode);
  emit_vlmax_insn (icode, COMPARE_OP, cmp_ops);

  return mask;
}
static void
emit_vec_copysign (rtx op_dest, rtx op_src_0, rtx op_src_1,
		   machine_mode vec_mode)
{
  rtx sgnj_ops[] = {op_dest, op_src_0, op_src_1};
  insn_code icode = code_for_pred (UNSPEC_VCOPYSIGN, vec_mode);

  emit_vlmax_insn (icode, BINARY_OP, sgnj_ops);
}

static void
emit_vec_abs (rtx op_dest, rtx op_src, machine_mode vec_mode)
{
  rtx abs_ops[] = {op_dest, op_src};
  insn_code icode = code_for_pred (ABS, vec_mode);

  emit_vlmax_insn (icode, UNARY_OP, abs_ops);
}
static void
emit_vec_cvt_x_f (rtx op_dest, rtx op_src, rtx mask,
		  insn_type type, machine_mode vec_mode)
{
  insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode);

  if (type & USE_VUNDEF_MERGE_P)
    {
      rtx cvt_x_ops[] = {op_dest, mask, op_src};
      emit_vlmax_insn (icode, type, cvt_x_ops);
    }
  else
    {
      rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src};
      emit_vlmax_insn (icode, type, cvt_x_ops);
    }
}

static void
emit_vec_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
		  machine_mode vec_mode)
{
  rtx ops[] = {op_dest, op_src};
  insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode);

  emit_vlmax_insn (icode, type, ops);
}
static void
emit_vec_narrow_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
			 machine_mode vec_mode)
{
  rtx ops[] = {op_dest, op_src};
  insn_code icode = code_for_pred_narrow_fcvt_x_f (UNSPEC_VFCVT, vec_mode);

  emit_vlmax_insn (icode, type, ops);
}

static void
emit_vec_widden_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
			 machine_mode vec_mode)
{
  rtx ops[] = {op_dest, op_src};
  insn_code icode = code_for_pred_widen_fcvt_x_f (UNSPEC_VFCVT, vec_mode);

  emit_vlmax_insn (icode, type, ops);
}
static void
emit_vec_widden_cvt_f_f (rtx op_dest, rtx op_src, insn_type type,
			 machine_mode vec_mode)
{
  rtx ops[] = {op_dest, op_src};
  insn_code icode = code_for_pred_extend (vec_mode);

  emit_vlmax_insn (icode, type, ops);
}

static void
emit_vec_cvt_f_x (rtx op_dest, rtx op_src, rtx mask,
		  insn_type type, machine_mode vec_mode)
{
  rtx cvt_fp_ops[] = {op_dest, mask, op_dest, op_src};
  insn_code icode = code_for_pred (FLOAT, vec_mode);

  emit_vlmax_insn (icode, type, cvt_fp_ops);
}
static void
emit_vec_cvt_x_f_rtz (rtx op_dest, rtx op_src, rtx mask,
		      insn_type type, machine_mode vec_mode)
{
  insn_code icode = code_for_pred (FIX, vec_mode);

  if (type & USE_VUNDEF_MERGE_P)
    {
      rtx cvt_x_ops[] = {op_dest, mask, op_src};
      emit_vlmax_insn (icode, type, cvt_x_ops);
    }
  else
    {
      rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src};
      emit_vlmax_insn (icode, type, cvt_x_ops);
    }
}
void
expand_vec_ceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
		 machine_mode vec_int_mode)
{
  /* Step-1: Get the abs float value for mask generation.  */
  emit_vec_abs (op_0, op_1, vec_fp_mode);

  /* Step-2: Generate the mask on const fp.  */
  rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
  rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);

  /* Step-3: Convert to integer on mask, with rounding up (aka ceil).  */
  rtx tmp = gen_reg_rtx (vec_int_mode);
  emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RUP, vec_fp_mode);

  /* Step-4: Convert to floating-point on mask for the final result.
     To avoid unnecessary frm register access, we use RUP here and it will
     never do the rounding up because the tmp rtx comes from the float
     to int conversion.  */
  emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RUP, vec_fp_mode);

  /* Step-5: Retrieve the sign bit for -0.0.  */
  emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
}
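/* Illustrative expansion (an added sketch, not from the original source)
   for ceil on a single-precision vector:

     vfabs.v     v2, v1            ; |x| for the mask
     vmflt.vf    v0, v2, fa0       ; active where |x| < 2^23
     vfcvt.x.f.v v3, v1, v0.t      ; to integer, frm = rup
     vfcvt.f.x.v v1, v3, v0.t      ; back to float, inactive lanes kept
     vfsgnj.vv   v1, v1, v4        ; restore the sign of -0.0

   Lanes with |x| >= 2^23 are already integral and pass through.  */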
void
expand_vec_floor (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
		  machine_mode vec_int_mode)
{
  /* Step-1: Get the abs float value for mask generation.  */
  emit_vec_abs (op_0, op_1, vec_fp_mode);

  /* Step-2: Generate the mask on const fp.  */
  rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
  rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);

  /* Step-3: Convert to integer on mask, with rounding down (aka floor).  */
  rtx tmp = gen_reg_rtx (vec_int_mode);
  emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RDN, vec_fp_mode);

  /* Step-4: Convert to floating-point on mask for the floor result.  */
  emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RDN, vec_fp_mode);

  /* Step-5: Retrieve the sign bit for -0.0.  */
  emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
}
void
expand_vec_nearbyint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
		      machine_mode vec_int_mode)
{
  /* Step-1: Get the abs float value for mask generation.  */
  emit_vec_abs (op_0, op_1, vec_fp_mode);

  /* Step-2: Generate the mask on const fp.  */
  rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
  rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);

  /* Step-3: Back up the FP exception flags; nearbyint never raises
     exceptions.  */
  rtx fflags = gen_reg_rtx (SImode);
  emit_insn (gen_riscv_frflags (fflags));

  /* Step-4: Convert to integer on mask, with the dynamic rounding mode
     (aka nearbyint).  */
  rtx tmp = gen_reg_rtx (vec_int_mode);
  emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_DYN, vec_fp_mode);

  /* Step-5: Convert to floating-point on mask for the nearbyint result.  */
  emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);

  /* Step-6: Restore the FP exception flags.  */
  emit_insn (gen_riscv_fsflags (fflags));

  /* Step-7: Retrieve the sign bit for -0.0.  */
  emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
}
void
expand_vec_rint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
		 machine_mode vec_int_mode)
{
  /* Step-1: Get the abs float value for mask generation.  */
  emit_vec_abs (op_0, op_1, vec_fp_mode);

  /* Step-2: Generate the mask on const fp.  */
  rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
  rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);

  /* Step-3: Convert to integer on mask, with dyn rounding (aka rint).  */
  rtx tmp = gen_reg_rtx (vec_int_mode);
  emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_DYN, vec_fp_mode);

  /* Step-4: Convert to floating-point on mask for the rint result.  */
  emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);

  /* Step-5: Retrieve the sign bit for -0.0.  */
  emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
}
void
expand_vec_round (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
		  machine_mode vec_int_mode)
{
  /* Step-1: Get the abs float value for mask generation.  */
  emit_vec_abs (op_0, op_1, vec_fp_mode);

  /* Step-2: Generate the mask on const fp.  */
  rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
  rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);

  /* Step-3: Convert to integer on mask, rounding to nearest (aka round).  */
  rtx tmp = gen_reg_rtx (vec_int_mode);
  emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RMM, vec_fp_mode);

  /* Step-4: Convert to floating-point on mask for the round result.  */
  emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RMM, vec_fp_mode);

  /* Step-5: Retrieve the sign bit for -0.0.  */
  emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
}
void
expand_vec_trunc (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
		  machine_mode vec_int_mode)
{
  /* Step-1: Get the abs float value for mask generation.  */
  emit_vec_abs (op_0, op_1, vec_fp_mode);

  /* Step-2: Generate the mask on const fp.  */
  rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
  rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);

  /* Step-3: Convert to integer on mask, rounding to zero (aka truncate).  */
  rtx tmp = gen_reg_rtx (vec_int_mode);
  emit_vec_cvt_x_f_rtz (tmp, op_1, mask, UNARY_OP_TAMA, vec_fp_mode);

  /* Step-4: Convert to floating-point on mask for the trunc result.  */
  emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);

  /* Step-5: Retrieve the sign bit for -0.0.  */
  emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
}
void
expand_vec_roundeven (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
		      machine_mode vec_int_mode)
{
  /* Step-1: Get the abs float value for mask generation.  */
  emit_vec_abs (op_0, op_1, vec_fp_mode);

  /* Step-2: Generate the mask on const fp.  */
  rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
  rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);

  /* Step-3: Convert to integer on mask, rounding to nearest, ties to
     even.  */
  rtx tmp = gen_reg_rtx (vec_int_mode);
  emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RNE, vec_fp_mode);

  /* Step-4: Convert to floating-point on mask for the roundeven result.  */
  emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RNE, vec_fp_mode);

  /* Step-5: Retrieve the sign bit for -0.0.  */
  emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
}
/* Handle the rounding from floating-point to int/long/long long.  */
static void
emit_vec_rounding_to_integer (rtx op_0, rtx op_1, insn_type type,
			      machine_mode vec_fp_mode,
			      machine_mode vec_int_mode,
			      machine_mode vec_bridge_mode = E_VOIDmode)
{
  poly_uint16 vec_fp_size = GET_MODE_SIZE (vec_fp_mode);
  poly_uint16 vec_int_size = GET_MODE_SIZE (vec_int_mode);

  if (known_eq (vec_fp_size, vec_int_size)) /* SF => SI, DF => DI.  */
    emit_vec_cvt_x_f (op_0, op_1, type, vec_fp_mode);
  else if (maybe_eq (vec_fp_size, vec_int_size * 2)) /* DF => SI.  */
    emit_vec_narrow_cvt_x_f (op_0, op_1, type, vec_fp_mode);
  else if (maybe_eq (vec_fp_size * 2, vec_int_size)) /* SF => DI, HF => SI.  */
    emit_vec_widden_cvt_x_f (op_0, op_1, type, vec_int_mode);
  else if (maybe_eq (vec_fp_size * 4, vec_int_size)) /* HF => DI.  */
    {
      gcc_assert (vec_bridge_mode != E_VOIDmode);

      rtx op_sf = gen_reg_rtx (vec_bridge_mode);

      /* Step-1: HF => SF, no rounding here.  */
      emit_vec_widden_cvt_f_f (op_sf, op_1, UNARY_OP, vec_bridge_mode);
      /* Step-2: SF => DI.  */
      emit_vec_widden_cvt_x_f (op_0, op_sf, type, vec_int_mode);
    }
  else
    gcc_unreachable ();
}
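/* For example (an added illustration): converting an HFmode vector to
   DImode hits the vec_fp_size * 4 == vec_int_size case, so the bridge
   mode is the corresponding SFmode vector; vfwcvt.f.f.v widens HF -> SF
   exactly, then vfwcvt.x.f.v performs the rounding SF -> DI
   conversion.  */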
void
expand_vec_lrint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
		  machine_mode vec_int_mode, machine_mode vec_bridge_mode)
{
  emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_DYN, vec_fp_mode,
				vec_int_mode, vec_bridge_mode);
}

void
expand_vec_lround (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
		   machine_mode vec_int_mode, machine_mode vec_bridge_mode)
{
  emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RMM, vec_fp_mode,
				vec_int_mode, vec_bridge_mode);
}

void
expand_vec_lceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
		  machine_mode vec_int_mode)
{
  emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RUP, vec_fp_mode,
				vec_int_mode);
}

void
expand_vec_lfloor (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
		   machine_mode vec_int_mode)
{
  emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RDN, vec_fp_mode,
				vec_int_mode);
}
/* Vectorize popcount by the Wilkes-Wheeler-Gill algorithm that libgcc uses as
   well.  */
void
expand_popcount (rtx *ops)
{
  rtx dst = ops[0];
  rtx src = ops[1];
  machine_mode mode = GET_MODE (dst);
  scalar_mode imode = GET_MODE_INNER (mode);
  static const uint64_t m5 = 0x5555555555555555ULL;
  static const uint64_t m3 = 0x3333333333333333ULL;
  static const uint64_t mf = 0x0F0F0F0F0F0F0F0FULL;
  static const uint64_t m1 = 0x0101010101010101ULL;

  rtx x1 = gen_reg_rtx (mode);
  rtx x2 = gen_reg_rtx (mode);
  rtx x3 = gen_reg_rtx (mode);
  rtx x4 = gen_reg_rtx (mode);

  /* x1 = src - ((src >> 1) & 0x5555555555555555ULL);  */
  rtx shift1 = expand_binop (mode, lshr_optab, src, GEN_INT (1), NULL, true,
			     OPTAB_DIRECT);

  rtx and1 = gen_reg_rtx (mode);
  rtx ops1[] = {and1, shift1, gen_int_mode (m5, imode)};
  emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
		   ops1);

  x1 = expand_binop (mode, sub_optab, src, and1, NULL, true, OPTAB_DIRECT);

  /* x2 = (x1 & 0x3333333333333333ULL)
	  + ((x1 >> 2) & 0x3333333333333333ULL);  */
  rtx and2 = gen_reg_rtx (mode);
  rtx ops2[] = {and2, x1, gen_int_mode (m3, imode)};
  emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
		   ops2);

  rtx shift2 = expand_binop (mode, lshr_optab, x1, GEN_INT (2), NULL, true,
			     OPTAB_DIRECT);

  rtx and22 = gen_reg_rtx (mode);
  rtx ops22[] = {and22, shift2, gen_int_mode (m3, imode)};
  emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
		   ops22);

  x2 = expand_binop (mode, add_optab, and2, and22, NULL, true, OPTAB_DIRECT);

  /* x3 = (x2 + (x2 >> 4)) & 0x0f0f0f0f0f0f0f0fULL;  */
  rtx shift3 = expand_binop (mode, lshr_optab, x2, GEN_INT (4), NULL, true,
			     OPTAB_DIRECT);

  rtx plus3
    = expand_binop (mode, add_optab, x2, shift3, NULL, true, OPTAB_DIRECT);

  rtx ops3[] = {x3, plus3, gen_int_mode (mf, imode)};
  emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
		   ops3);

  /* dest = (x3 * 0x0101010101010101ULL) >> (bitsize (imode) - 8),
     e.g. >> 56 for DImode elements.  */
  rtx mul4 = gen_reg_rtx (mode);
  rtx ops4[] = {mul4, x3, gen_int_mode (m1, imode)};
  emit_vlmax_insn (code_for_pred_scalar (MULT, mode), riscv_vector::BINARY_OP,
		   ops4);

  x4 = expand_binop (mode, lshr_optab, mul4,
		     GEN_INT (GET_MODE_BITSIZE (imode) - 8), NULL, true,
		     OPTAB_DIRECT);

  emit_move_insn (dst, x4);
}
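/* Worked example on one QImode element (an added illustration): src =
   0xda (0b11011010, popcount 5):
     x1 = 0xda - ((0xda >> 1) & 0x55) = 0xda - 0x45 = 0x95  ; pairs 2,1,1,1
     x2 = (0x95 & 0x33) + ((0x95 >> 2) & 0x33) = 0x11 + 0x21 = 0x32 ; 3,2
     x3 = (0x32 + (0x32 >> 4)) & 0x0f = 0x05
     dst = (0x05 * 0x01) >> (8 - 8) = 5
   For QImode elements the 64-bit masks are truncated by gen_int_mode and
   the final shift amount is zero.  */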
/* Return true if it is a VLMAX AVL TYPE.  */
bool
vlmax_avl_type_p (rtx_insn *rinsn)
{
  extract_insn_cached (rinsn);
  int index = get_attr_avl_type_idx (rinsn);
  if (index == INVALID_ATTRIBUTE)
    return false;
  rtx avl_type = recog_data.operand[index];
  return INTVAL (avl_type) == VLMAX;
}
/* Return true if it is an RVV instruction that depends on the VL global
   status register.  */
bool
has_vl_op (rtx_insn *rinsn)
{
  return recog_memoized (rinsn) >= 0 && get_attr_has_vl_op (rinsn);
}

/* Get the default tail policy.  */
static bool
get_default_ta ()
{
  /* For an instruction that doesn't require TA, we still need a default value
     to emit vsetvl.  We pick the default value according to the preferred
     policy.  */
  return (bool) (get_prefer_tail_policy () & 0x1
		 || (get_prefer_tail_policy () >> 1 & 0x1));
}
/* Helper function to get the TA operand.  */
bool
tail_agnostic_p (rtx_insn *rinsn)
{
  /* If it doesn't have TA, we return agnostic by default.  */
  extract_insn_cached (rinsn);
  int ta = get_attr_ta (rinsn);
  return ta == INVALID_ATTRIBUTE ? get_default_ta () : IS_AGNOSTIC (ta);
}

/* Change the insn and assert that the change always happens.  */
void
validate_change_or_fail (rtx object, rtx *loc, rtx new_rtx, bool in_group)
{
  bool change_p = validate_change (object, loc, new_rtx, in_group);
  gcc_assert (change_p);
}
/* Return true if it is a NONVLMAX AVL TYPE.  */
bool
nonvlmax_avl_type_p (rtx_insn *rinsn)
{
  extract_insn_cached (rinsn);
  int index = get_attr_avl_type_idx (rinsn);
  if (index == INVALID_ATTRIBUTE)
    return false;
  rtx avl_type = recog_data.operand[index];
  return INTVAL (avl_type) == NONVLMAX;
}

/* Return true if RTX is an RVV VLMAX AVL.  */
bool
vlmax_avl_p (rtx x)
{
  return x && rtx_equal_p (x, RVV_VLMAX);
}
/* Helper function to get the SEW operand.  We always have the SEW value for
   all RVV instructions that have a VTYPE OP.  */
uint8_t
get_sew (rtx_insn *rinsn)
{
  return get_attr_sew (rinsn);
}

/* Helper function to get the VLMUL operand.  We always have the VLMUL value
   for all RVV instructions that have a VTYPE OP.  */
enum vlmul_type
get_vlmul (rtx_insn *rinsn)
{
  return (enum vlmul_type) get_attr_vlmul (rinsn);
}
/* Count the number of occurrences of REGNO in RINSN.  */
int
count_regno_occurrences (rtx_insn *rinsn, unsigned int regno)
{
  int count = 0;
  extract_insn (rinsn);
  for (int i = 0; i < recog_data.n_operands; i++)
    if (refers_to_regno_p (regno, recog_data.operand[i]))
      count++;
  return count;
}
/* Return true if OP can be directly broadcast.  */
bool
can_be_broadcasted_p (rtx op)
{
  machine_mode mode = GET_MODE (op);
  /* We don't allow RA (register allocation) reload to generate
     (vec_duplicate:DI reg) in an RV32 system, whereas we do allow
     (vec_duplicate:DI mem) there.  */
  if (!can_create_pseudo_p () && !FLOAT_MODE_P (mode)
      && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (Pmode))
      && !satisfies_constraint_Wdm (op))
    return false;

  if (satisfies_constraint_K (op) || register_operand (op, mode)
      || satisfies_constraint_Wdm (op) || rtx_equal_p (op, CONST0_RTX (mode)))
    return true;

  return can_create_pseudo_p () && nonmemory_operand (op, mode);
}
void
emit_vec_extract (rtx target, rtx src, rtx index)
{
  machine_mode vmode = GET_MODE (src);
  machine_mode smode = GET_MODE (target);
  class expand_operand ops[3];
  enum insn_code icode
    = convert_optab_handler (vec_extract_optab, vmode, smode);
  gcc_assert (icode != CODE_FOR_nothing);
  create_output_operand (&ops[0], target, smode);
  ops[0].target = 1;
  create_input_operand (&ops[1], src, vmode);

  poly_int64 val;
  if (poly_int_rtx_p (index, &val))
    create_integer_operand (&ops[2], val);
  else
    create_input_operand (&ops[2], index, Pmode);

  expand_insn (icode, 3, ops);
  if (ops[0].value != target)
    emit_move_insn (target, ops[0].value);
}
/* Return true if the offset mode is a valid mode that we use for
   gather/scatter autovectorization.  */
bool
gather_scatter_valid_offset_p (machine_mode mode)
{
  /* If the element size of the offset mode is already >= Pmode size,
     we don't need any extension.  */
  if (known_ge (GET_MODE_SIZE (GET_MODE_INNER (mode)), UNITS_PER_WORD))
    return true;

  /* Since we will very likely need to extend the offset mode into vector
     Pmode, disable gather/scatter autovectorization if such an extension
     is not possible.  */
  if (!get_vector_mode (Pmode, GET_MODE_NUNITS (mode)).exists ())
    return false;

  return true;
}
/* Implement TARGET_ESTIMATED_POLY_VALUE.
   Look into the tuning structure for an estimate.
   KIND specifies the type of requested estimate: min, max or likely.
   For cores with a known VLA width all three estimates are the same.
   For generic VLA tuning we want to distinguish the maximum estimate from
   the minimum and likely ones.
   The likely estimate is the same as the minimum in that case to give a
   conservative behavior of auto-vectorizing with VLA when it is a win
   even for VLA vectorization.
   When VLA width information is available VAL.coeffs[1] is multiplied by
   the number of VLA chunks over the initial VLS bits.  */
HOST_WIDE_INT
estimated_poly_value (poly_int64 val, unsigned int kind)
{
  unsigned int width_source
    = BITS_PER_RISCV_VECTOR.is_constant ()
	? (unsigned int) BITS_PER_RISCV_VECTOR.to_constant ()
	: (unsigned int) RVV_SCALABLE;

  /* If there is no core-specific information then the minimum and likely
     values are based on TARGET_MIN_VLEN vectors and the maximum is based on
     the architectural maximum of 65536 bits.  */
  unsigned int min_vlen_bytes = TARGET_MIN_VLEN / 8 - 1;
  if (width_source == RVV_SCALABLE)
    switch (kind)
      {
      case POLY_VALUE_MIN:
      case POLY_VALUE_LIKELY:
	return val.coeffs[0];

      case POLY_VALUE_MAX:
	return val.coeffs[0] + val.coeffs[1] * min_vlen_bytes;

      default:
	gcc_unreachable ();
      }

  /* Allow BITS_PER_RISCV_VECTOR to be a bitmask of different VL, treating the
     lowest as likely.  This could be made more general if future -mtune
     options need it to be.  */
  if (kind == POLY_VALUE_MAX)
    width_source = 1 << floor_log2 (width_source);
  else
    width_source = least_bit_hwi (width_source);

  /* If the core provides width information, use that.  */
  HOST_WIDE_INT over_min_vlen = width_source - TARGET_MIN_VLEN;
  return val.coeffs[0] + val.coeffs[1] * over_min_vlen / TARGET_MIN_VLEN;
}
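/* Worked example (an added note, assuming TARGET_MIN_VLEN = 128 and no
   per-core tuning, i.e. width_source == RVV_SCALABLE): a poly_int64
   value of 4 + 4x estimates to 4 for POLY_VALUE_MIN/LIKELY and to
   4 + 4 * (128 / 8 - 1) = 64 for POLY_VALUE_MAX.  */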
} // namespace riscv_vector