This patch would like to combine the vec_duplicate + vsub.vv into the
vrsub.vx, as in the example code below. The related combine pattern
depends on the cost of moving a value from a general-purpose register
to a vector register (GR2VR) for the vec_duplicate. Late-combine will
take action if the cost of GR2VR is zero, and reject the combination
if the GR2VR cost is greater than zero.
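In RTL terms, late-combine forward-propagates the vec_duplicate into
the binary operation, after which the insn matches the vec_dup-first
vx pattern and is expanded via expand_vx_binary_vec_dup_vec into
vrsub.vx. A rough sketch only; the pseudo register numbers and the
V4SI mode are made up for illustration:

  ;; Before: the broadcast is a separate insn, hoisted out of the
  ;; loop (the vmv.v.x in the "before" assembly below).
  (set (reg:V4SI 137) (vec_duplicate:V4SI (reg:SI 136)))
  (set (reg:V4SI 139) (minus:V4SI (reg:V4SI 137) (reg:V4SI 138)))

  ;; After: the vec_duplicate is folded into the minus, so no
  ;; separate vmv.v.x is needed when the GR2VR cost is zero.
  (set (reg:V4SI 139) (minus:V4SI (vec_duplicate:V4SI (reg:SI 136))
                                  (reg:V4SI 138)))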
Assume we have the example code below, where the GR2VR cost is 0.
#define DEF_VX_BINARY_REVERSE_CASE_0(T, OP, NAME)                 \
  void                                                            \
  test_vx_binary_reverse_##NAME##_##T##_case_0 (T * restrict out, \
                                                T * restrict in,  \
                                                T x, unsigned n)  \
  {                                                               \
    for (unsigned i = 0; i < n; i++)                              \
      out[i] = x OP in[i];                                        \
  }
DEF_VX_BINARY_REVERSE_CASE_0(int32_t, -, rsub)
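For reference, the invocation above expands to the function below (a
plain expansion of the macro, assuming <stdint.h> is included for
int32_t):

  void
  test_vx_binary_reverse_rsub_int32_t_case_0 (int32_t * restrict out,
                                              int32_t * restrict in,
                                              int32_t x, unsigned n)
  {
    for (unsigned i = 0; i < n; i++)
      out[i] = x - in[i];
  }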
Before this patch:
test_vx_binary_reverse_rsub_int32_t_case_0:
        beq     a3,zero,.L27
        vsetvli a5,zero,e32,m1,ta,ma
        vmv.v.x v2,a2
        slli    a3,a3,32
        srli    a3,a3,32
.L22:
        vsetvli a5,a3,e32,m1,ta,ma
        vle32.v v1,0(a1)
        slli    a4,a5,2
        sub     a3,a3,a5
        add     a1,a1,a4
        vsub.vv v1,v2,v1
        vse32.v v1,0(a0)
        add     a0,a0,a4
        bne     a3,zero,.L22
After this patch:
test_vx_binary_reverse_rsub_int32_t_case_0:
        beq     a3,zero,.L27
        slli    a3,a3,32
        srli    a3,a3,32
.L22:
        vsetvli a5,a3,e32,m1,ta,ma
        vle32.v v1,0(a1)
        slli    a4,a5,2
        sub     a3,a3,a5
        add     a1,a1,a4
        vrsub.vx v1,v1,a2
        vse32.v v1,0(a0)
        add     a0,a0,a4
        bne     a3,zero,.L22
The below test suites are passed for this patch.
* The rv64gcv full regression test.
gcc/ChangeLog:
* config/riscv/autovec-opt.md: Leverage the new functions to
expand the vx insns.
* config/riscv/riscv-protos.h (expand_vx_binary_vec_dup_vec): Add
new function decl to expand the form v = vop(vec_dup(x), v).
(expand_vx_binary_vec_vec_dup): Ditto but for the form
v = vop(v, vec_dup(x)).
* config/riscv/riscv-v.cc (expand_vx_binary_vec_dup_vec): Add new
function impl to expand vx for v = vop(vec_dup(x), v).
(expand_vx_binary_vec_vec_dup): Ditto but for the other form
v = vop(v, vec_dup(x)).
Signed-off-by: Pan Li <pan2.li@intel.com>
"&& 1"
  [(const_int 0)]
  {
-    rtx ops[] = {operands[0], operands[2], operands[1]};
-    riscv_vector::emit_vlmax_insn (code_for_pred_scalar (<CODE>, <MODE>mode),
-                                   riscv_vector::BINARY_OP, ops);
+    riscv_vector::expand_vx_binary_vec_dup_vec (operands[0], operands[2],
+                                                operands[1], <CODE>,
+                                                <MODE>mode);
  }
  [(set_attr "type" "vialu")])

(define_insn_and_split "*<optab>_vx_<mode>"
  [(set (match_operand:V_VLSI 0 "register_operand")
        (any_int_binop_no_shift_vx:V_VLSI
-         (match_operand:V_VLSI 2 "<binop_rhs2_predicate>")
+         (match_operand:V_VLSI 1 "<binop_rhs2_predicate>")
          (vec_duplicate:V_VLSI
-           (match_operand:<VEL> 1 "register_operand"))))]
+           (match_operand:<VEL> 2 "register_operand"))))]
  "TARGET_VECTOR && can_create_pseudo_p ()"
  "#"
  "&& 1"
  [(const_int 0)]
  {
-    rtx ops[] = {operands[0], operands[2], operands[1]};
-    riscv_vector::emit_vlmax_insn (code_for_pred_scalar (<CODE>, <MODE>mode),
-                                   riscv_vector::BINARY_OP, ops);
+    riscv_vector::expand_vx_binary_vec_vec_dup (operands[0], operands[1],
+                                                operands[2], <CODE>,
+                                                <MODE>mode);
  }
  [(set_attr "type" "vialu")])
                             machine_mode);
void expand_vec_oct_sstrunc (rtx, rtx, machine_mode, machine_mode,
                             machine_mode);
+void expand_vx_binary_vec_dup_vec (rtx, rtx, rtx, rtx_code, machine_mode);
+void expand_vx_binary_vec_vec_dup (rtx, rtx, rtx, rtx_code, machine_mode);
#endif
bool sew64_scalar_helper (rtx *, rtx *, rtx, machine_mode,
                          bool, void (*)(rtx *, rtx), enum avl_type);
  expand_vec_double_sstrunc (op_0, quad_rtx, quad_mode);
}

+/* Expand the binary vx combine for the form v2 = vop (vec_dup (x), v1),
+   i.e. the first op comes from the vec_duplicate and the second op is
+   the vector register.  */
+
+void
+expand_vx_binary_vec_dup_vec (rtx op_0, rtx op_1, rtx op_2,
+                              rtx_code code, machine_mode mode)
+{
+  enum insn_code icode;
+
+  switch (code)
+    {
+    case PLUS:
+      icode = code_for_pred_scalar (code, mode);
+      break;
+    case MINUS:
+      /* vec_dup (x) - v requires the reverse-sub (vrsub) insn.  */
+      icode = code_for_pred_sub_reverse_scalar (mode);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  rtx ops[] = {op_0, op_1, op_2};
+  emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops);
+}
+
+/* Expand the binary vx combine for the form v2 = vop (v1, vec_dup (x)),
+   i.e. the second op comes from the vec_duplicate and the first op is
+   the vector register.  */
+
+void
+expand_vx_binary_vec_vec_dup (rtx op_0, rtx op_1, rtx op_2,
+                              rtx_code code, machine_mode mode)
+{
+  enum insn_code icode;
+
+  switch (code)
+    {
+    case MINUS:
+      /* v - vec_dup (x) maps to the plain vsub scalar insn.  */
+      icode = code_for_pred_scalar (code, mode);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  rtx ops[] = {op_0, op_1, op_2};
+  emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops);
+}
+
/* Vectorize popcount by the Wilkes-Wheeler-Gill algorithm that libgcc uses as
   well.  */
void