return false;
}
+/* Helper function to emit a vmv.v.x/vmv.v.i or one of the float
+   variants.  If VL is not given, a VLMAX insn is emitted, otherwise
+   a non-VLMAX insn with length VL.
+   If the value to be broadcast is not suitable for vmv.v.x, fall
+   back to a vlse with zero stride.  That in turn has a fallback if
+   the uarch prefers not to use a strided load for broadcast.  */
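+
+/* Illustratively, the emitted insn ends up as one of the following
+   (SEW = 64 is assumed for the strided fallback):
+     vmv.v.x  vd,rs
+     vmv.v.i  vd,simm5
+     vfmv.v.f vd,fs
+     vlse64.v vd,(addr),zero  */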
+
+void
+expand_broadcast (machine_mode mode, rtx *ops, rtx vl)
+{
+ rtx elt = ops[1];
+ avl_type type = vl ? NONVLMAX : VLMAX;
+ if (can_be_broadcast_p (elt))
+ emit_avltype_insn (code_for_pred_broadcast (mode), UNARY_OP, ops,
+ type, vl);
+ else
+ emit_avltype_insn (code_for_pred_strided_broadcast (mode),
+ UNARY_OP, ops, type, vl);
+}
+
+/* Similar to expand_broadcast but emits a vmv.s.x/vfmv.s.f instead. */
+
+void
+expand_set_first (machine_mode mode, rtx *ops, rtx vl)
+{
+ rtx elt = ops[1];
+ avl_type type = vl ? NONVLMAX : VLMAX;
+ if (can_be_broadcast_p (elt))
+ emit_avltype_insn (code_for_pred_broadcast (mode),
+ SCALAR_MOVE_OP, ops, type, vl);
+ else
+ emit_avltype_insn (code_for_pred_strided_broadcast (mode),
+ SCALAR_MOVE_OP, ops, type, vl);
+}
+
+/* Similar to expand_set_first but keeps the tail elements
+   unchanged (TU).  */
+
+void
+expand_set_first_tu (machine_mode mode, rtx *ops, rtx vl)
+{
+ rtx elt = ops[2];
+ if (!vl)
+ vl = const1_rtx;
+ if (can_be_broadcast_p (elt))
+ emit_nonvlmax_insn (code_for_pred_broadcast (mode),
+ SCALAR_MOVE_MERGED_OP_TU, ops, vl);
+ else
+ emit_nonvlmax_insn (code_for_pred_strided_broadcast (mode),
+ SCALAR_MOVE_MERGED_OP_TU, ops, vl);
+}
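+
+/* A minimal usage sketch of the helpers above (hypothetical caller;
+   TARGET, ELT and VL are illustrative, and the declarations are
+   assumed to default VL to NULL_RTX):
+
+     rtx ops[] = {target, elt};
+     expand_broadcast (mode, ops);          VLMAX broadcast
+     expand_broadcast (mode, ops, vl);      non-VLMAX with length VL
+     expand_set_first (mode, ops, vl);      vmv.s.x/vfmv.s.f  */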
+
static void
expand_const_vec_duplicate (rtx target, rtx src, rtx elt)
{
if (lra_in_progress)
{
rtx ops[] = {result, elt};
- emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops);
+ expand_broadcast (mode, ops);
}
else
{
{
dup = gen_reg_rtx (builder->new_mode ());
rtx ops[] = {dup, ele};
- emit_vlmax_insn (code_for_pred_broadcast (builder->new_mode ()),
- UNARY_OP, ops);
+ expand_broadcast (builder->new_mode (), ops);
}
else
dup = expand_vector_broadcast (builder->new_mode (), ele);
rtx tmp1 = gen_reg_rtx (builder->mode ());
rtx dup_ops[] = {tmp1, builder->elt (0)};
- emit_vlmax_insn (code_for_pred_broadcast (builder->mode ()), UNARY_OP,
- dup_ops);
+ expand_broadcast (builder->mode (), dup_ops);
for (unsigned int i = 1; i < builder->npatterns (); i++)
{
}
}
+/* This is a helper for binary ops with DImode scalar operands that are
+   broadcast (like vadd.vx v1, a1).
+   Instead of duplicating similar code in all the expanders this
+   function unifies the handling.  For 64-bit targets all we do is
+   choose between the vi variant (if available) and the register
+   variant.  For 32-bit targets we either create the sign-extending
+   variant of vop.vx (when the immediate fits 32 bits) or emit a
+   vector broadcast of the 64-bit register/immediate and switch to a
+   vop.vv (replacing the scalar op with the broadcast vector).  */
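+
+/* E.g. on rv32 a vadd.vx with a 64-bit scalar is not possible;
+   illustratively (VTMP and the address are made up), we emit
+     vlse64.v vtmp,(addr),zero     broadcast the 64-bit scalar
+     vadd.vv  vd,vs,vtmp           and use the vector-vector variant.  */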
+
bool
sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
machine_mode vector_mode, bool has_vi_variant_p,
void (*emit_vector_func) (rtx *, rtx), enum avl_type type)
{
machine_mode scalar_mode = GET_MODE_INNER (vector_mode);
+
+  /* If the scalar broadcast op fits the immediate range, use the
+     vop.vi variant if there is one.  */
if (has_vi_variant_p)
{
*scalar_op = force_reg (scalar_mode, *scalar_op);
return false;
}
+ /* On a 64-bit target we can always use the vop.vx variant. */
if (TARGET_64BIT)
{
if (!rtx_equal_p (*scalar_op, const0_rtx))
return false;
}
+  /* On 32-bit targets, if there is no vop.vi variant but the immediate
+     fits 32 bits, we can use the sign-extending (SI -> DI) vop.vx
+     variants.  */
if (immediate_operand (*scalar_op, Pmode))
{
if (!rtx_equal_p (*scalar_op, const0_rtx))
return false;
}
- bool avoid_strided_broadcast = false;
+  /* Now we're left with a 64-bit immediate or a register.
+     We cannot use a vop.vx variant but must broadcast the value first
+     and switch to a vop.vv variant.
+     The broadcast is either done via vlse64.v vd, (addr), zero
+     or by loading one 64-bit element (vle64.v) and using a
+     broadcasting vrgather.vi.  This is decided when splitting
+     the strided broadcast insn.  */
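+  /* Roughly, the two strategies are
+       vlse64.v    vd,(addr),zero       strided broadcast
+     versus
+       vle64.v     vtmp,(addr)
+       vrgather.vi vd,vtmp,0            element load + broadcast gather
+     (illustrative asm, register names made up).  */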
+ gcc_assert (!TARGET_64BIT
+ && (CONST_INT_P (*scalar_op)
+ || register_operand (*scalar_op, scalar_mode)));
+
if (CONST_INT_P (*scalar_op))
{
if (maybe_gt (GET_MODE_SIZE (scalar_mode), GET_MODE_SIZE (Pmode)))
- {
- if (strided_load_broadcast_p ())
- *scalar_op = force_const_mem (scalar_mode, *scalar_op);
- else
- avoid_strided_broadcast = true;
- }
+ *scalar_op = force_const_mem (scalar_mode, *scalar_op);
else
*scalar_op = force_reg (scalar_mode, *scalar_op);
}
rtx tmp = gen_reg_rtx (vector_mode);
- if (!avoid_strided_broadcast)
- {
- rtx ops[] = {tmp, *scalar_op};
- emit_avltype_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops,
- type, vl);
- }
- else
- {
- /* Load scalar as V1DI and broadcast via vrgather.vi. */
- rtx tmp1 = gen_reg_rtx (V1DImode);
- emit_move_insn (tmp1, lowpart_subreg (V1DImode, *scalar_op,
- scalar_mode));
- tmp1 = lowpart_subreg (vector_mode, tmp1, V1DImode);
-
- rtx ops[] = {tmp, tmp1, CONST0_RTX (Pmode)};
- emit_vlmax_insn (code_for_pred_gather_scalar (vector_mode),
- BINARY_OP, ops);
- }
-
+ rtx ops[] = {tmp, *scalar_op};
+ emit_avltype_insn (code_for_pred_strided_broadcast (vector_mode),
+ UNARY_OP, ops, type, vl);
emit_vector_func (operands, tmp);
return true;
/* Step 1: Broadcast the first pattern. */
rtx ops[] = {target, force_reg (builder.inner_mode (), builder.elt (0))};
- emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()),
- UNARY_OP, ops);
+ expand_broadcast (builder.mode (), ops);
/* Step 2: Merge the rest iteration of pattern. */
for (unsigned int i = 1; i < builder.npatterns (); i++)
{
if (full_nelts <= builder.inner_bits_size ()) /* vmv.s.x. */
{
rtx ops[] = {dup, merge_mask};
- emit_nonvlmax_insn (code_for_pred_broadcast (GET_MODE (dup)),
- SCALAR_MOVE_OP, ops, CONST1_RTX (Pmode));
+ expand_set_first (GET_MODE (dup), ops);
}
else /* vmv.v.x. */
{
force_reg (GET_MODE_INNER (mask_int_mode), merge_mask)};
rtx vl = gen_int_mode (CEIL (full_nelts, builder.inner_bits_size ()),
Pmode);
- emit_nonvlmax_insn (code_for_pred_broadcast (mask_int_mode), UNARY_OP,
- ops, vl);
+ expand_broadcast (mask_int_mode, ops, vl);
}
emit_move_insn (mask, gen_lowpart (mask_bit_mode, dup));
rtx m1_tmp = gen_reg_rtx (m1_mode);
rtx scalar_move_ops[] = {m1_tmp, init};
- insn_code icode = code_for_pred_broadcast (m1_mode);
if (need_mask_operand_p (insn_flags))
{
if (need_vl0_safe)
- emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, const1_rtx);
+ expand_set_first (m1_mode, scalar_move_ops, const1_rtx);
else
- emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, vl_op);
+ expand_set_first (m1_mode, scalar_move_ops, vl_op);
}
else
- emit_vlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops);
+ expand_set_first (m1_mode, scalar_move_ops);
rtx m1_tmp2 = gen_reg_rtx (m1_mode);
rtx reduc_ops[] = {m1_tmp2, vector_src, m1_tmp};
+ insn_code icode;
if (need_vl0_safe)
icode = code_for_pred (unspec_for_vl0_safe, vmode);
else
return count;
}
-/* Return true if the OP can be directly broadcast. */
+/* Return true if the OP can be broadcast with a
+ v[f]mv.v.[xif] instruction. */
+
bool
can_be_broadcast_p (rtx op)
{
machine_mode mode = GET_MODE (op);
- /* We don't allow RA (register allocation) reload generate
- (vec_duplicate:DI reg) in RV32 system wheras we allow
- (vec_duplicate:DI mem) in RV32 system. */
- if (!can_create_pseudo_p () && !FLOAT_MODE_P (mode)
- && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (Pmode))
- && !satisfies_constraint_Wdm (op))
+
+  /* Zero always works, and we can always put an immediate into a
+     register.
+     What's tricky is that for an immediate we don't know which
+     register mode it will end up in, i.e. what element size we want
+     to broadcast.  So even if the immediate is small it might still
+     end up in a DImode register that we cannot broadcast.
+     vmv.s.x, i.e. a single-element set, can handle this, though,
+     because it implicitly sign-extends to SEW.  */
+ if (rtx_equal_p (op, CONST0_RTX (mode))
+ || const_int_operand (op, Xmode))
+ return true;
+
+ /* Do not accept DImode broadcasts on !TARGET_64BIT. Those
+ are handled by strided broadcast. */
+ if (INTEGRAL_MODE_P (mode)
+ && maybe_gt (GET_MODE_SIZE (mode), UNITS_PER_WORD))
+ return false;
+
+  /* We can handle non-register operands that can be forced into a
+     register; these don't need to use strided broadcast.  */
+ if (INTEGRAL_MODE_P (mode)
+ && (memory_operand (op, mode) || CONST_POLY_INT_P (op))
+ && can_create_pseudo_p ())
+ return true;
+
+  /* Likewise, do not accept HFmode broadcasts if vfmv.v.f is not
+     available for 16-bit elements (i.e. without Zvfh).  */
+ if (mode == HFmode && !TARGET_ZVFH)
+ return false;
+
+  /* The same holds for float modes, except that we can always handle
+     64-bit doubles, even on !TARGET_64BIT.  16-bit HFmode has already
+     been ruled out above.  */
+ if (FLOAT_MODE_P (mode)
+ && (memory_operand (op, mode) || CONSTANT_P (op))
+ && can_create_pseudo_p ())
+ return true;
+
+  /* After excluding all the cases we cannot handle, the remaining
+     register operands can always be broadcast.  */
+ if (register_operand (op, mode))
+ return true;
+
+ return false;
+}
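+
+/* Illustrative examples: (const_int 5) is always accepted (the
+   immediate is sign-extended to SEW); a DImode register is accepted
+   on rv64 but rejected on rv32, where strided_broadcast_p handles it
+   instead.  */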
+
+/* Return true for all operands that cannot use vmv.v.x, vfmv.v.f,
+   vmv.s.x, or vfmv.s.f but rather need to go via memory.  */
+
+bool
+strided_broadcast_p (rtx op)
+{
+ machine_mode mode = GET_MODE (op);
+ if (!memory_operand (op, mode)
+ && !register_operand (op, mode)
+ && !rtx_equal_p (op, CONST0_RTX (mode))
+ && !const_int_operand (op, mode))
return false;
- if (satisfies_constraint_K (op) || register_operand (op, mode)
- || (strided_load_broadcast_p () && satisfies_constraint_Wdm (op))
- || rtx_equal_p (op, CONST0_RTX (mode)))
+  /* !TARGET_64BIT does not have a vmv.v.x/vmv.s.x for 64-bit
+     DImode elements.  */
+ if (INTEGRAL_MODE_P (mode)
+ && maybe_gt (GET_MODE_SIZE (mode), UNITS_PER_WORD))
return true;
- return can_create_pseudo_p () && nonmemory_operand (op, mode);
+  /* Zvfhmin does not have a vfmv.v.f/vfmv.s.f for 16-bit elements.  */
+ if (!TARGET_ZVFH && mode == HFmode)
+ return true;
+
+ return false;
}
void
return false;
}
-/* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f. */
+/* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f.
+ That's the case if we're dealing with a scalar broadcast that
+ has VL = 1. */
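+/* E.g., illustratively, a  vmv.v.x vd,rs  with VL = 1 can become
+   vmv.s.x vd,rs.  */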
+
bool
splat_to_scalar_move_p (rtx *ops)
{
(define_expand "vec_duplicate<mode>"
[(set (match_operand:V_VLS 0 "register_operand")
(vec_duplicate:V_VLS
- (match_operand:<VEL> 1 "direct_broadcast_operand")))]
+ (match_operand:<VEL> 1 "any_broadcast_operand")))]
"TARGET_VECTOR"
{
- /* Early expand DImode broadcast in RV32 system to avoid RA reload
- generate (set (reg) (vec_duplicate:DI)). */
+ /* Don't keep a DImode broadcast for RV32 in the vec_duplicate form.
+ Otherwise combine or late combine could end up doing
+ "64-bit broadcast" (!= vmv.v.x)
+ + vadd.vv
+ = vadd.vx
+ which would be invalid. */
bool gt_p = maybe_gt (GET_MODE_SIZE (<VEL>mode), GET_MODE_SIZE (Pmode));
if (!FLOAT_MODE_P (<VEL>mode) && gt_p)
{
- riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode),
- riscv_vector::UNARY_OP, operands);
- DONE;
+ riscv_vector::emit_vlmax_insn
+ (code_for_pred_strided_broadcast
+ (<MODE>mode), riscv_vector::UNARY_OP, operands);
+ DONE;
}
- /* Otherwise, allow it fall into general vec_duplicate pattern
- which allow us to have vv->vx combine optimization in later pass. */
+
+ /* Even though we can eventually broadcast any permissible
+ constant by moving it into a register we need to force
+ any non-immediate one into a register here.
+ If we didn't do that we couldn't fwprop/late-combine
+ vec_duplicate 123.45f
+ + vfadd.vv
+ = vfadd.vf
+ because the constant is valid for vec_duplicate but not
+ for vfadd.vf. Therefore we need to do
+ fa0 = 123.45f
+ vec_duplicate fa0
+ + vfadd.vv
+ = vfadd.vf */
+ if (!satisfies_constraint_P (operands[1])
+ && !satisfies_constraint_J (operands[1])
+ && !rtx_equal_p (operands[1], CONST0_RTX (<VEL>mode))
+ && !memory_operand (operands[1], <VEL>mode))
+ operands[1] = force_reg (<VEL>mode, operands[1]);
+
+ /* Otherwise keep the vec_duplicate pattern until split. */
})
;; According to GCC internal:
(define_insn_and_split "*vec_duplicate<mode>"
[(set (match_operand:V_VLS 0 "register_operand")
(vec_duplicate:V_VLS
- (match_operand:<VEL> 1 "direct_broadcast_operand")))]
+ (match_operand:<VEL> 1 "any_broadcast_operand")))]
"TARGET_VECTOR && can_create_pseudo_p ()"
"#"
"&& 1"
[(const_int 0)]
{
- if (!strided_load_broadcast_p ()
- && TARGET_ZVFHMIN && !TARGET_ZVFH && <VEL>mode == HFmode)
- {
- /* For Float16, reinterpret as HImode, broadcast and reinterpret
- back. */
- poly_uint64 nunits = GET_MODE_NUNITS (<MODE>mode);
- machine_mode vmodehi
- = riscv_vector::get_vector_mode (HImode, nunits).require ();
- rtx ops[] = {lowpart_subreg (vmodehi, operands[0], <MODE>mode),
- lowpart_subreg (HImode, operands[1], HFmode)};
- riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (vmodehi),
- riscv_vector::UNARY_OP, ops);
- }
- else
+ if (riscv_vector::can_be_broadcast_p (operands[1]))
riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode),
riscv_vector::UNARY_OP, operands);
+ else
+ riscv_vector::emit_vlmax_insn (code_for_pred_strided_broadcast
+ (<MODE>mode), riscv_vector::UNARY_OP,
+ operands);
+
DONE;
}
[(set_attr "type" "vector")]
(match_operand:V_VLS 2 "vector_merge_operand")))]
"TARGET_VECTOR"
{
- /* Transform vmv.v.x/vfmv.v.f (avl = 1) into vmv.s.x since vmv.s.x/vfmv.s.f
- has better chances to do vsetvl fusion in vsetvl pass. */
bool wrap_vec_dup = true;
rtx vec_cst = NULL_RTX;
- if (riscv_vector::splat_to_scalar_move_p (operands))
- {
- operands[1] = riscv_vector::gen_scalar_move_mask (<VM>mode);
- operands[3] = force_reg (<VEL>mode, operands[3]);
- }
- else if (immediate_operand (operands[3], <VEL>mode)
- && (vec_cst = gen_const_vec_duplicate (<MODE>mode, operands[3]))
- && (/* -> pred_broadcast<mode>_zero */
- (vector_least_significant_set_mask_operand (operands[1],
- <VM>mode)
- && vector_const_0_operand (vec_cst, <MODE>mode))
- || (/* pred_broadcast<mode>_imm */
- vector_all_trues_mask_operand (operands[1], <VM>mode)
- && vector_const_int_or_double_0_operand (vec_cst,
- <MODE>mode))))
+ if (immediate_operand (operands[3], <VEL>mode)
+ && (vec_cst = gen_const_vec_duplicate (<MODE>mode, operands[3]))
+ && (/* -> pred_broadcast<mode>_zero */
+ (vector_least_significant_set_mask_operand (operands[1],
+ <VM>mode)
+ && vector_const_0_operand (vec_cst, <MODE>mode))
+ || (/* pred_broadcast<mode>_imm */
+ vector_all_trues_mask_operand (operands[1], <VM>mode)
+ && vector_const_int_or_double_0_operand (vec_cst,
+ <MODE>mode))))
{
operands[3] = vec_cst;
wrap_vec_dup = false;
}
- /* Handle vmv.s.x instruction (Wb1 mask) which has memory scalar. */
- else if (satisfies_constraint_Wdm (operands[3]))
- {
- if (satisfies_constraint_Wb1 (operands[1]))
- {
- /* Case 1: vmv.s.x (TA, x == memory) ==> vlse.v (TA) */
- if (satisfies_constraint_vu (operands[2]))
- operands[1] = CONSTM1_RTX (<VM>mode);
- else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode))
- {
- /* Case 2: vmv.s.x (TU, x == memory) ==>
- vl = 0 or 1; + vlse.v (TU) in RV32 system */
- operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]);
- operands[1] = CONSTM1_RTX (<VM>mode);
- }
- else
- /* Case 3: load x (memory) to register. */
- operands[3] = force_reg (<VEL>mode, operands[3]);
- }
- }
- else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)
- && (immediate_operand (operands[3], Pmode)
+ else if (GET_MODE_SIZE (<VEL>mode) > UNITS_PER_WORD
+ && satisfies_constraint_Wb1 (operands[1])
+ && (immediate_operand (operands[3], Xmode)
|| (CONST_POLY_INT_P (operands[3])
&& known_ge (rtx_to_poly_int64 (operands[3]), 0U)
- && known_le (rtx_to_poly_int64 (operands[3]), GET_MODE_SIZE (<MODE>mode)))))
+ && known_le (rtx_to_poly_int64 (operands[3]),
+ GET_MODE_SIZE (<MODE>mode)))))
{
rtx tmp = gen_reg_rtx (Pmode);
poly_int64 value = rtx_to_poly_int64 (operands[3]);
- emit_move_insn (tmp, gen_int_mode (value, Pmode));
+ emit_move_insn (tmp, gen_int_mode (value, Xmode));
operands[3] = gen_rtx_SIGN_EXTEND (<VEL>mode, tmp);
}
- /* Never load (const_int 0) into a register, that's silly. */
- else if (operands[3] == CONST0_RTX (<VEL>mode))
+
+  /* For a vmv.v.x, never load (const_int 0) or an immediate that is
+     valid for vmv.v.i into a register; we can use vmv.v.i directly.  */
+ else if (satisfies_constraint_Wc1 (operands[1])
+ && (satisfies_constraint_P (operands[3])
+ || operands[3] == CONST0_RTX (<VEL>mode)))
;
- /* If we're broadcasting [-16..15] across more than just
- element 0, then we can use vmv.v.i directly, thus avoiding
- the load of the constant into a GPR. */
- else if (CONST_INT_P (operands[3])
- && IN_RANGE (INTVAL (operands[3]), -16, 15)
- && !satisfies_constraint_Wb1 (operands[1]))
+  /* For vmv.s.x we can use the zero register directly:
+     vmv.s.x vd, zero.  */
+ else if (satisfies_constraint_Wb1 (operands[1])
+ && operands[3] == CONST0_RTX (<VEL>mode))
;
else
operands[3] = force_reg (<VEL>mode, operands[3]);
operands[3] = gen_rtx_VEC_DUPLICATE (<MODE>mode, operands[3]);
})
-(define_insn_and_split "*pred_broadcast<mode>"
- [(set (match_operand:V_VLSI 0 "register_operand" "=vr, vr, vd, vd, vr, vr, vr, vr")
+(define_insn_and_rewrite "*pred_broadcast<mode>"
+ [(set (match_operand:V_VLSI 0 "register_operand" "=vr, vr, vr, vr")
(if_then_else:V_VLSI
(unspec:<VM>
- [(match_operand:<VM> 1 "vector_broadcast_mask_operand" "Wc1,Wc1, vm, vm,Wc1,Wc1,Wb1,Wb1")
- (match_operand 4 "vector_length_operand" "rvl,rvl,rvl,rvl,rvl,rvl,rvl,rvl")
- (match_operand 5 "const_int_operand" " i, i, i, i, i, i, i, i")
- (match_operand 6 "const_int_operand" " i, i, i, i, i, i, i, i")
- (match_operand 7 "const_int_operand" " i, i, i, i, i, i, i, i")
+ [(match_operand:<VM> 1 "vector_broadcast_mask_operand" "Wc1,Wc1,Wb1,Wb1")
+ (match_operand 4 "vector_length_operand" "rvl,rvl,rvl,rvl")
+ (match_operand 5 "const_int_operand" " i, i, i, i")
+ (match_operand 6 "const_int_operand" " i, i, i, i")
+ (match_operand 7 "const_int_operand" " i, i, i, i")
(reg:SI VL_REGNUM)
(reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
(vec_duplicate:V_VLSI
- (match_operand:<VEL> 3 "direct_broadcast_operand" "rP,rP,Wdm,Wdm,Wdm,Wdm, rJ, rJ"))
- (match_operand:V_VLSI 2 "vector_merge_operand" "vu, 0, vu, 0, vu, 0, vu, 0")))]
+ (match_operand:<VEL> 3 "direct_broadcast_operand" " rP, rP, rJ, rJ"))
+ (match_operand:V_VLSI 2 "vector_merge_operand" " vu, 0, vu, 0")))]
"TARGET_VECTOR"
"@
vmv.v.%o3\t%0,%3
vmv.v.%o3\t%0,%3
- vlse<sew>.v\t%0,%3,zero,%1.t
- vlse<sew>.v\t%0,%3,zero,%1.t
- vlse<sew>.v\t%0,%3,zero
- vlse<sew>.v\t%0,%3,zero
vmv.s.x\t%0,%z3
vmv.s.x\t%0,%z3"
- "(register_operand (operands[3], <VEL>mode)
- || CONST_POLY_INT_P (operands[3]))
- && GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)"
- [(const_int 0)]
- {
- gcc_assert (can_create_pseudo_p ());
- if (CONST_POLY_INT_P (operands[3]))
- {
- rtx tmp = gen_reg_rtx (<VEL>mode);
- emit_move_insn (tmp, operands[3]);
- operands[3] = tmp;
- }
-
- /* For SEW = 64 in RV32 system, we expand vmv.s.x:
- andi a2,a2,1
- vsetvl zero,a2,e64
- vlse64.v */
- if (satisfies_constraint_Wb1 (operands[1]))
- {
- operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]);
- operands[1] = CONSTM1_RTX (<VM>mode);
- }
-
- /* If the target doesn't want a strided-load broadcast we go with a regular
- V1DImode load and a broadcast gather. */
- if (strided_load_broadcast_p ())
- {
- rtx mem = assign_stack_local (<VEL>mode, GET_MODE_SIZE (<VEL>mode),
- GET_MODE_ALIGNMENT (<VEL>mode));
- mem = validize_mem (mem);
- emit_move_insn (mem, operands[3]);
- mem = gen_rtx_MEM (<VEL>mode, force_reg (Pmode, XEXP (mem, 0)));
-
- emit_insn
- (gen_pred_broadcast<mode>
- (operands[0], operands[1], operands[2], mem,
- operands[4], operands[5], operands[6], operands[7]));
- }
- else
- {
- rtx tmp = gen_reg_rtx (V1DImode);
- emit_move_insn (tmp, lowpart_subreg (V1DImode, operands[3],
- <VEL>mode));
- tmp = lowpart_subreg (<MODE>mode, tmp, V1DImode);
-
- emit_insn
- (gen_pred_gather<mode>_scalar
- (operands[0], operands[1], operands[2], tmp, CONST0_RTX (Pmode),
- operands[4], operands[5], operands[6], operands[7]));
- }
- DONE;
- }
- [(set_attr "type" "vimov,vimov,vlds,vlds,vlds,vlds,vimovxv,vimovxv")
+ "&& (operands[1] == CONSTM1_RTX (<VM>mode)
+ && operands[4] == CONST1_RTX (Pmode)
+ && (register_operand (operands[3], <VEL>mode)
+ || satisfies_constraint_J (operands[3])))"
+{
+ /* A broadcast of a single element is just a vmv.s.x. */
+ operands[1] = riscv_vector::gen_scalar_move_mask (<VM>mode);
+}
+ [(set_attr "type" "vimov,vimov,vimovxv,vimovxv")
(set_attr "mode" "<MODE>")])
-(define_insn "*pred_broadcast<mode>_zvfh"
- [(set (match_operand:V_VLSF 0 "register_operand" "=vr, vr, vr, vr")
+(define_insn_and_rewrite "pred_broadcast<mode>_zvfh"
+ [(set (match_operand:V_VLSF 0 "register_operand" "=vr, vr, vr, vr")
(if_then_else:V_VLSF
(unspec:<VM>
- [(match_operand:<VM> 1 "vector_broadcast_mask_operand" "Wc1, Wc1, Wb1, Wb1")
- (match_operand 4 "vector_length_operand" "rvl, rvl, rvl, rvl")
- (match_operand 5 "const_int_operand" " i, i, i, i")
- (match_operand 6 "const_int_operand" " i, i, i, i")
- (match_operand 7 "const_int_operand" " i, i, i, i")
+ [(match_operand:<VM> 1 "vector_broadcast_mask_operand" "Wc1,Wc1,Wb1,Wb1")
+ (match_operand 4 "vector_length_operand" "rvl,rvl,rvl,rvl")
+ (match_operand 5 "const_int_operand" " i, i, i, i")
+ (match_operand 6 "const_int_operand" " i, i, i, i")
+ (match_operand 7 "const_int_operand" " i, i, i, i")
(reg:SI VL_REGNUM)
(reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
(vec_duplicate:V_VLSF
- (match_operand:<VEL> 3 "direct_broadcast_operand" " f, f, f, f"))
- (match_operand:V_VLSF 2 "vector_merge_operand" " vu, 0, vu, 0")))]
+ (match_operand:<VEL> 3 "direct_broadcast_operand" " f, f, f, f"))
+ (match_operand:V_VLSF 2 "vector_merge_operand" " vu, 0, vu, 0")))]
"TARGET_VECTOR"
"@
vfmv.v.f\t%0,%3
vfmv.v.f\t%0,%3
vfmv.s.f\t%0,%3
vfmv.s.f\t%0,%3"
+ "&& (operands[1] == CONSTM1_RTX (<VM>mode)
+ && operands[4] == CONST1_RTX (Pmode)
+ && (register_operand (operands[3], <VEL>mode)
+ || satisfies_constraint_J (operands[3])))"
+{
+ /* A broadcast of a single element is just a vfmv.s.f. */
+ operands[1] = riscv_vector::gen_scalar_move_mask (<VM>mode);
+}
[(set_attr "type" "vfmov,vfmov,vfmovfv,vfmovfv")
(set_attr "mode" "<MODE>")])
-(define_insn "*pred_broadcast<mode>_zvfhmin"
- [(set (match_operand:V_VLSF_ZVFHMIN 0 "register_operand" "=vr, vr, vr, vr")
- (if_then_else:V_VLSF_ZVFHMIN
- (unspec:<VM>
- [(match_operand:<VM> 1 "vector_broadcast_mask_operand" " vm, vm, Wc1, Wc1")
- (match_operand 4 "vector_length_operand" "rvl, rvl, rvl, rvl")
- (match_operand 5 "const_int_operand" " i, i, i, i")
- (match_operand 6 "const_int_operand" " i, i, i, i")
- (match_operand 7 "const_int_operand" " i, i, i, i")
- (reg:SI VL_REGNUM)
- (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
- (vec_duplicate:V_VLSF_ZVFHMIN
- (match_operand:<VEL> 3 "direct_broadcast_operand" " A, A, A, A"))
- (match_operand:V_VLSF_ZVFHMIN 2 "vector_merge_operand" " vu, 0, vu, 0")))]
- "TARGET_VECTOR && strided_load_broadcast_p ()"
- "@
- vlse<sew>.v\t%0,%3,zero,%1.t
- vlse<sew>.v\t%0,%3,zero,%1.t
- vlse<sew>.v\t%0,%3,zero
- vlse<sew>.v\t%0,%3,zero"
- [(set_attr "type" "vlds,vlds,vlds,vlds")
- (set_attr "mode" "<MODE>")])
-
(define_insn "*pred_broadcast<mode>_extended_scalar"
[(set (match_operand:V_VLSI_D 0 "register_operand" "=vr, vr, vr, vr")
(if_then_else:V_VLSI_D
[(set_attr "type" "vimov,vimov")
(set_attr "mode" "<MODE>")])
+(define_expand "@pred_strided_broadcast<mode>"
+ [(set (match_operand:V_VLS 0 "register_operand")
+ (if_then_else:V_VLS
+ (unspec:<VM>
+ [(match_operand:<VM> 1 "strided_broadcast_mask_operand")
+ (match_operand 4 "vector_length_operand")
+ (match_operand 5 "const_int_operand")
+ (match_operand 6 "const_int_operand")
+ (match_operand 7 "const_int_operand")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+ (vec_duplicate:V_VLS
+ (match_operand:<VEL> 3 "strided_broadcast_operand"))
+ (match_operand:V_VLS 2 "vector_merge_operand")))]
+ "TARGET_VECTOR"
+{
+ if (satisfies_constraint_Wb1 (operands[1]))
+ {
+      /* If we're asked to set a single element (like vmv.s.x, but we
+	 need to go via memory here), use an all-ones mask: under a
+	 tail-agnostic policy we may overwrite all elements, and under
+	 a tail-undisturbed policy the vector length is clamped
+	 below.  */
+ operands[1] = CONSTM1_RTX (<VM>mode);
+ if (!satisfies_constraint_vu (operands[2])
+ && GET_MODE_SIZE (<VEL>mode) > UNITS_PER_WORD)
+ {
+	  /* SEW = 64 on rv32 with tail undisturbed: we must not
+	     overwrite the residual elements, so clamp the vector
+	     length to 0 or 1.  */
+ operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]);
+ }
+ }
+})
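+
+;; E.g. a tail-undisturbed single-element set of a 64-bit value on rv32
+;; becomes, illustratively (register names made up):
+;;   andi     a2,a2,1
+;;   vsetvli  zero,a2,e64,m1,tu,ma
+;;   vlse64.v vd,0(a0),zero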
+
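+;; If the uarch prefers not to use a strided load for broadcast, the
+;; insn below splits into an element load plus a broadcast gather,
+;; roughly:
+;;   vle64.v     v1,(a0)
+;;   vrgather.vi vd,v1,0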
+(define_insn_and_split "*pred_strided_broadcast<mode>"
+ [(set (match_operand:V_VLSI 0 "register_operand" "=vd, vd, vr, vr")
+ (if_then_else:V_VLSI
+ (unspec:<VM>
+ [(match_operand:<VM> 1 "strided_broadcast_mask_operand" " vm, vm,Wc1,Wc1")
+ (match_operand 4 "vector_length_operand" "rvl,rvl,rvl,rvl")
+ (match_operand 5 "const_int_operand" " i, i, i, i")
+ (match_operand 6 "const_int_operand" " i, i, i, i")
+ (match_operand 7 "const_int_operand" " i, i, i, i")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+ (vec_duplicate:V_VLSI
+ (match_operand:<VEL> 3 "strided_broadcast_operand" " A, A, A, A"))
+ (match_operand:V_VLSI 2 "vector_merge_operand" " vu, 0, vu, 0")))]
+ "TARGET_VECTOR"
+ "@
+ vlse<sew>.v\t%0,%3,zero,%1.t
+ vlse<sew>.v\t%0,%3,zero,%1.t
+ vlse<sew>.v\t%0,%3,zero
+ vlse<sew>.v\t%0,%3,zero"
+ "&& !strided_load_broadcast_p () && can_create_pseudo_p ()"
+ [(const_int 0)]
+ {
+ rtx tmp = gen_reg_rtx (V1DImode);
+ emit_move_insn (tmp, gen_lowpart (V1DImode, operands[3]));
+ tmp = lowpart_subreg (<MODE>mode, tmp, V1DImode);
+
+ emit_insn
+ (gen_pred_gather<mode>_scalar
+ (operands[0], operands[1], operands[2], tmp, CONST0_RTX (Pmode),
+ operands[4], operands[5], operands[6], operands[7]));
+ DONE;
+ }
+ [(set_attr "type" "vlds,vlds,vlds,vlds")
+ (set_attr "mode" "<MODE>")])
+
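+;; Without Zvfh there is no 16-bit vfmv.v.f.  If the uarch also avoids
+;; strided-load broadcast, the pattern below instead broadcasts on the
+;; same-sized HImode vector mode, roughly:
+;;   lh       a1,0(a0)
+;;   vmv.v.x  vd,a1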
+(define_insn_and_split "*pred_strided_broadcast<mode>_zvfhmin"
+ [(set (match_operand:V_VLSF_ZVFHMIN 0 "register_operand" "=vr, vr, vr, vr")
+ (if_then_else:V_VLSF_ZVFHMIN
+ (unspec:<VM>
+ [(match_operand:<VM> 1 "strided_broadcast_mask_operand" " vm, vm, Wc1, Wc1")
+ (match_operand 4 "vector_length_operand" "rvl, rvl, rvl, rvl")
+ (match_operand 5 "const_int_operand" " i, i, i, i")
+ (match_operand 6 "const_int_operand" " i, i, i, i")
+ (match_operand 7 "const_int_operand" " i, i, i, i")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+ (vec_duplicate:V_VLSF_ZVFHMIN
+ (match_operand:<VEL> 3 "strided_broadcast_operand" " A, A, A, A"))
+ (match_operand:V_VLSF_ZVFHMIN 2 "vector_merge_operand" " vu, 0, vu, 0")))]
+ "TARGET_VECTOR"
+ "@
+ vlse<sew>.v\t%0,%3,zero,%1.t
+ vlse<sew>.v\t%0,%3,zero,%1.t
+ vlse<sew>.v\t%0,%3,zero
+ vlse<sew>.v\t%0,%3,zero"
+ "&& !strided_load_broadcast_p ()
+ && <VEL>mode == HFmode
+ && can_create_pseudo_p ()"
+ [(const_int 0)]
+ {
+ poly_uint64 nunits = GET_MODE_NUNITS (<MODE>mode);
+ machine_mode vmodehi
+ = riscv_vector::get_vector_mode (HImode, nunits).require ();
+ rtx ops[] = {gen_lowpart (vmodehi, operands[0]),
+ gen_lowpart (HImode, operands[3])};
+ riscv_vector::emit_avltype_insn (code_for_pred_broadcast (vmodehi),
+ riscv_vector::UNARY_OP, ops,
+ (riscv_vector::avl_type) INTVAL (operands[7]),
+ operands[4]);
+ DONE;
+ }
+ [(set_attr "type" "vlds,vlds,vlds,vlds")
+ (set_attr "mode" "<MODE>")])
+
;; -------------------------------------------------------------------------------
;; ---- Predicated Strided loads/stores
;; -------------------------------------------------------------------------------