(and (match_code "const_vector")
(match_test "rtx_equal_p (op, riscv_vector::gen_scalar_move_mask (GET_MODE (op)))")))
-(define_memory_constraint "Wdm"
+(define_constraint "Wdm"
"Vector duplicate memory operand"
- (and (match_code "mem")
- (match_code "reg" "0")))
+ (and (match_test "strided_load_broadcast_p ()")
+ (and (match_code "mem")
+ (match_code "reg" "0"))))
;; Vendor ISA extension constraints.
;; The scalar operand can be directly broadcast by RVV instructions.
(define_predicate "direct_broadcast_operand"
- (match_test "riscv_vector::can_be_broadcasted_p (op)"))
+ (match_test "riscv_vector::can_be_broadcast_p (op)"))
;; A CONST_INT operand that has exactly two bits cleared.
(define_predicate "const_nottwobits_operand"
void emit_hard_vlmax_vsetvl (machine_mode, rtx);
void emit_vlmax_insn (unsigned, unsigned, rtx *);
void emit_nonvlmax_insn (unsigned, unsigned, rtx *, rtx);
+void emit_avltype_insn (unsigned, unsigned, rtx *, avl_type, rtx = nullptr);
void emit_vlmax_insn_lra (unsigned, unsigned, rtx *, rtx);
enum vlmul_type get_vlmul (machine_mode);
rtx get_vlmax_rtx (machine_mode);
enum vlmul_type get_vlmul (rtx_insn *);
int count_regno_occurrences (rtx_insn *, unsigned int);
bool imm_avl_p (machine_mode);
-bool can_be_broadcasted_p (rtx);
+bool can_be_broadcast_p (rtx);
bool gather_scatter_valid_offset_p (machine_mode);
HOST_WIDE_INT estimated_poly_value (poly_int64, unsigned int);
bool whole_reg_to_reg_move_p (rtx *, machine_mode, int);
extern bool th_print_operand_address (FILE *, machine_mode, rtx);
#endif
+extern bool strided_load_broadcast_p (void);
extern bool riscv_use_divmod_expander (void);
void riscv_init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
extern bool
expand_vector_broadcast (mode, mem); \
insn = get_last_insn (); \
src = SET_SRC (PATTERN (insn)); \
- ASSERT_TRUE (MEM_P (XEXP (src, 0))); \
- ASSERT_TRUE ( \
- rtx_equal_p (src, gen_rtx_VEC_DUPLICATE (mode, XEXP (src, 0)))); \
+ if (strided_load_broadcast_p ()) \
+ { \
+ ASSERT_TRUE (MEM_P (XEXP (src, 0))); \
+ ASSERT_TRUE ( \
+ rtx_equal_p (src, \
+ gen_rtx_VEC_DUPLICATE (mode, XEXP (src, 0)))); \
+ } \
end_sequence (); \
/* Test vmv.v.x or vfmv.v.f. */ \
start_sequence (); \
e.emit_insn ((enum insn_code) icode, ops);
}
+/* Emit either a VLMAX insn or a non-VLMAX insn depending on TYPE. For a
+ non-VLMAX insn, the length must be specified in VL. */
+
+void
+emit_avltype_insn (unsigned icode, unsigned insn_flags, rtx *ops,
+ avl_type type, rtx vl)
+{
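+  /* A non-NULL VL together with a non-VLMAX TYPE selects the
+     explicit-length variant; otherwise emit a VLMAX insn.  */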
+ if (type != avl_type::VLMAX && vl != NULL_RTX)
+ {
+ insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
+ e.set_vl (vl);
+ e.emit_insn ((enum insn_code) icode, ops);
+ }
+ else
+ {
+ insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
+ e.emit_insn ((enum insn_code) icode, ops);
+ }
+}
+
/* Return true if the vector is duplicated by a super element which is the
   fusion of consecutive elements.
return false;
}
+ bool avoid_strided_broadcast = false;
if (CONST_INT_P (*scalar_op))
{
if (maybe_gt (GET_MODE_SIZE (scalar_mode), GET_MODE_SIZE (Pmode)))
- *scalar_op = force_const_mem (scalar_mode, *scalar_op);
+ {
+ if (strided_load_broadcast_p ())
+ *scalar_op = force_const_mem (scalar_mode, *scalar_op);
+ else
+ avoid_strided_broadcast = true;
+ }
else
*scalar_op = force_reg (scalar_mode, *scalar_op);
}
rtx tmp = gen_reg_rtx (vector_mode);
- rtx ops[] = {tmp, *scalar_op};
- if (type == VLMAX)
- emit_vlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops);
+ if (!avoid_strided_broadcast)
+ {
+ rtx ops[] = {tmp, *scalar_op};
+ emit_avltype_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops,
+ type, vl);
+ }
else
- emit_nonvlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops,
- vl);
+ {
+ /* Load scalar as V1DI and broadcast via vrgather.vi. */
+ rtx tmp1 = gen_reg_rtx (V1DImode);
+ emit_move_insn (tmp1, lowpart_subreg (V1DImode, *scalar_op,
+ scalar_mode));
+ tmp1 = lowpart_subreg (vector_mode, tmp1, V1DImode);
+
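+      /* A gather with index 0 replicates element 0 across the whole vector,
+         giving us the broadcast without a strided load.  */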
+ rtx ops[] = {tmp, tmp1, CONST0_RTX (Pmode)};
+ emit_vlmax_insn (code_for_pred_gather_scalar (vector_mode),
+ BINARY_OP, ops);
+ }
+
emit_vector_func (operands, tmp);
return true;
return count;
}
-/* Return true if the OP can be directly broadcasted. */
+/* Return true if the OP can be directly broadcast. */
bool
-can_be_broadcasted_p (rtx op)
+can_be_broadcast_p (rtx op)
{
machine_mode mode = GET_MODE (op);
/* We don't allow RA (register allocation) reload generate
return false;
if (satisfies_constraint_K (op) || register_operand (op, mode)
- || satisfies_constraint_Wdm (op) || rtx_equal_p (op, CONST0_RTX (mode)))
+ || (strided_load_broadcast_p () && satisfies_constraint_Wdm (op))
+ || rtx_equal_p (op, CONST0_RTX (mode)))
return true;
return can_create_pseudo_p () && nonmemory_operand (op, mode);
bool vector_unaligned_access;
bool use_divmod_expansion;
bool overlap_op_by_pieces;
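+  /* Whether to use a zero-stride vector load to broadcast a scalar.  */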
+ bool use_zero_stride_load;
bool speculative_sched_vsetvl;
unsigned int fusible_ops;
const struct cpu_vector_cost *vec_costs;
false, /* vector_unaligned_access */
false, /* use_divmod_expansion */
false, /* overlap_op_by_pieces */
+ true, /* use_zero_stride_load */
false, /* speculative_sched_vsetvl */
RISCV_FUSE_NOTHING, /* fusible_ops */
NULL, /* vector cost */
false, /* vector_unaligned_access */
false, /* use_divmod_expansion */
false, /* overlap_op_by_pieces */
+ true, /* use_zero_stride_load */
false, /* speculative_sched_vsetvl */
RISCV_FUSE_NOTHING, /* fusible_ops */
NULL, /* vector cost */
false, /* vector_unaligned_access */
false, /* use_divmod_expansion */
false, /* overlap_op_by_pieces */
+ true, /* use_zero_stride_load */
false, /* speculative_sched_vsetvl */
RISCV_FUSE_NOTHING, /* fusible_ops */
NULL, /* vector cost */
false, /* vector_unaligned_access */
false, /* use_divmod_expansion */
false, /* overlap_op_by_pieces */
+ true, /* use_zero_stride_load */
false, /* speculative_sched_vsetvl */
RISCV_FUSE_LUI_ADDI | RISCV_FUSE_AUIPC_ADDI, /* fusible_ops */
&generic_vector_cost, /* vector cost */
false, /* vector_unaligned_access */
false, /* use_divmod_expansion */
false, /* overlap_op_by_pieces */
+ true, /* use_zero_stride_load */
false, /* speculative_sched_vsetvl */
RISCV_FUSE_LUI_ADDI | RISCV_FUSE_AUIPC_ADDI, /* fusible_ops */
&generic_vector_cost, /* vector cost */
false, /* vector_unaligned_access */
false, /* use_divmod_expansion */
false, /* overlap_op_by_pieces */
+ true, /* use_zero_stride_load */
false, /* speculative_sched_vsetvl */
RISCV_FUSE_NOTHING, /* fusible_ops */
NULL, /* vector cost */
false, /* vector_unaligned_access */
false, /* use_divmod_expansion */
false, /* overlap_op_by_pieces */
+ true, /* use_zero_stride_load */
false, /* speculative_sched_vsetvl */
RISCV_FUSE_ZEXTW | RISCV_FUSE_ZEXTH, /* fusible_ops */
NULL, /* vector cost */
true, /* vector_unaligned_access */
false, /* use_divmod_expansion */
true, /* overlap_op_by_pieces */
+ true, /* use_zero_stride_load */
false, /* speculative_sched_vsetvl */
RISCV_FUSE_NOTHING, /* fusible_ops */
&generic_vector_cost, /* vector cost */
true, /* vector_unaligned_access */
true, /* use_divmod_expansion */
true, /* overlap_op_by_pieces */
+ true, /* use_zero_stride_load */
false, /* speculative_sched_vsetvl */
RISCV_FUSE_NOTHING, /* fusible_ops */
&generic_vector_cost, /* vector cost */
false, /* vector_unaligned_access */
false, /* use_divmod_expansion */
false, /* overlap_op_by_pieces */
+ true, /* use_zero_stride_load */
false, /* speculative_sched_vsetvl */
RISCV_FUSE_NOTHING, /* fusible_ops */
NULL, /* vector cost */
false, /* vector_unaligned_access */
true, /* use_divmod_expansion */
false, /* overlap_op_by_pieces */
+ true, /* use_zero_stride_load */
false, /* speculative_sched_vsetvl */
RISCV_FUSE_NOTHING, /* fusible_ops */
NULL, /* vector cost */
gen_lowpart (QImode, shift)));
}
+/* Return TRUE if we should use a zero-stride load to broadcast a scalar,
+   FALSE otherwise.  */
+
+bool
+strided_load_broadcast_p ()
+{
+ return tune_param->use_zero_stride_load;
+}
+
/* Return TRUE if we should use the divmod expander, FALSE otherwise. This
allows the behavior to be tuned for specific implementations as well as
when optimizing for size. */
"&& 1"
[(const_int 0)]
{
- riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode),
- riscv_vector::UNARY_OP, operands);
+ if (!strided_load_broadcast_p ()
+ && TARGET_ZVFHMIN && !TARGET_ZVFH && <VEL>mode == HFmode)
+ {
+ /* For Float16, reinterpret as HImode, broadcast and reinterpret
+ back. */
+ poly_uint64 nunits = GET_MODE_NUNITS (<MODE>mode);
+ machine_mode vmodehi
+ = riscv_vector::get_vector_mode (HImode, nunits).require ();
+ rtx ops[] = {lowpart_subreg (vmodehi, operands[0], <MODE>mode),
+ lowpart_subreg (HImode, operands[1], HFmode)};
+ riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (vmodehi),
+ riscv_vector::UNARY_OP, ops);
+ }
+ else
+ riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode),
+ riscv_vector::UNARY_OP, operands);
DONE;
}
[(set_attr "type" "vector")]
}
}
else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)
- && (immediate_operand (operands[3], Pmode)
+ && (immediate_operand (operands[3], Pmode)
|| (CONST_POLY_INT_P (operands[3])
&& known_ge (rtx_to_poly_int64 (operands[3]), 0U)
&& known_le (rtx_to_poly_int64 (operands[3]), GET_MODE_SIZE (<MODE>mode)))))
"(register_operand (operands[3], <VEL>mode)
|| CONST_POLY_INT_P (operands[3]))
&& GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)"
- [(set (match_dup 0)
- (if_then_else:V_VLSI (unspec:<VM> [(match_dup 1) (match_dup 4)
- (match_dup 5) (match_dup 6) (match_dup 7)
- (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
- (vec_duplicate:V_VLSI (match_dup 3))
- (match_dup 2)))]
+ [(const_int 0)]
{
gcc_assert (can_create_pseudo_p ());
if (CONST_POLY_INT_P (operands[3]))
emit_move_insn (tmp, operands[3]);
operands[3] = tmp;
}
- rtx m = assign_stack_local (<VEL>mode, GET_MODE_SIZE (<VEL>mode),
- GET_MODE_ALIGNMENT (<VEL>mode));
- m = validize_mem (m);
- emit_move_insn (m, operands[3]);
- m = gen_rtx_MEM (<VEL>mode, force_reg (Pmode, XEXP (m, 0)));
- operands[3] = m;
/* For SEW = 64 in RV32 system, we expand vmv.s.x:
andi a2,a2,1
operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]);
operands[1] = CONSTM1_RTX (<VM>mode);
}
+
+  /* If the target doesn't want a strided-load broadcast, go with a regular
+     V1DImode load and a broadcast gather instead.  */
+ if (strided_load_broadcast_p ())
+ {
+ rtx mem = assign_stack_local (<VEL>mode, GET_MODE_SIZE (<VEL>mode),
+ GET_MODE_ALIGNMENT (<VEL>mode));
+ mem = validize_mem (mem);
+ emit_move_insn (mem, operands[3]);
+ mem = gen_rtx_MEM (<VEL>mode, force_reg (Pmode, XEXP (mem, 0)));
+
+ emit_insn
+ (gen_pred_broadcast<mode>
+ (operands[0], operands[1], operands[2], mem,
+ operands[4], operands[5], operands[6], operands[7]));
+ }
+ else
+ {
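+      /* Move the 64-bit scalar into a V1DI register and replicate its
+         element 0 with a gather, avoiding the stack spill and the
+         zero-stride load.  */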
+ rtx tmp = gen_reg_rtx (V1DImode);
+ emit_move_insn (tmp, lowpart_subreg (V1DImode, operands[3],
+ <VEL>mode));
+ tmp = lowpart_subreg (<MODE>mode, tmp, V1DImode);
+
+ emit_insn
+ (gen_pred_gather<mode>_scalar
+ (operands[0], operands[1], operands[2], tmp, CONST0_RTX (Pmode),
+ operands[4], operands[5], operands[6], operands[7]));
+ }
+ DONE;
}
[(set_attr "type" "vimov,vimov,vlds,vlds,vlds,vlds,vimovxv,vimovxv")
(set_attr "mode" "<MODE>")])
(reg:SI VL_REGNUM)
(reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
(vec_duplicate:V_VLSF_ZVFHMIN
- (match_operand:<VEL> 3 "direct_broadcast_operand" "Wdm, Wdm, Wdm, Wdm"))
+ (match_operand:<VEL> 3 "direct_broadcast_operand" " A, A, A, A"))
(match_operand:V_VLSF_ZVFHMIN 2 "vector_merge_operand" " vu, 0, vu, 0")))]
- "TARGET_VECTOR"
+ "TARGET_VECTOR && strided_load_broadcast_p ()"
"@
vlse<sew>.v\t%0,%3,zero,%1.t
vlse<sew>.v\t%0,%3,zero,%1.t