From bdc4062a0796788e44d5e6ecd753268a8b453cc7 Mon Sep 17 00:00:00 2001 From: Andrew Stubbs Date: Thu, 12 Jun 2025 16:57:23 +0000 Subject: [PATCH] amdgcn: add more insn patterns using vec_duplicate These new insns allow more efficient use of scalar inputs to 64-bit vector add and mul. Also, the patch adjusts the existing mul.._dup because it was actually a dup2 (the vec_duplicate is on the second input), and that was inconveniently inconsistent. The patterns are generally useful, but will be used directly by a follow-up patch. gcc/ChangeLog: * config/gcn/gcn-valu.md (add3_dup): New. (add3_dup_exec): New. (mul3_highpart_dup): New. (mul3_dup): Move the vec_duplicate to operand 1. (mul3_dup_exec): New. (vec_series): Adjust call to gen_mul3_dup. * config/gcn/gcn.cc (gcn_expand_vector_init): Likewise. --- gcc/config/gcn/gcn-valu.md | 181 ++++++++++++++++++++++++++++++++++++- gcc/config/gcn/gcn.cc | 4 +- 2 files changed, 179 insertions(+), 6 deletions(-) diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md index 7c4dde1cfce..dfa6b1523bd 100644 --- a/gcc/config/gcn/gcn-valu.md +++ b/gcc/config/gcn/gcn-valu.md @@ -1645,6 +1645,39 @@ [(set_attr "type" "vmult") (set_attr "length" "8")]) +(define_insn_and_split "add3_dup" + [(set (match_operand:V_DI 0 "register_operand" "= v") + (plus:V_DI + (vec_duplicate:V_DI + (match_operand:DI 1 "register_operand" "SvB")) + (match_operand:V_DI 2 "gcn_alu_operand" "vDb"))) + (clobber (reg:DI VCC_REG)) + (clobber (match_scratch: 3 "=&v"))] + "" + "#" + "gcn_can_split_p (mode, operands[0]) + && gcn_can_split_p (mode, operands[1]) + && gcn_can_split_p (mode, operands[2])" + [(const_int 0)] + { + rtx vcc = gen_rtx_REG (DImode, VCC_REG); + emit_insn (gen_add3_vcc_dup + (gcn_operand_part (mode, operands[0], 0), + gcn_operand_part (DImode, operands[1], 0), + gcn_operand_part (mode, operands[2], 0), + vcc)); + emit_insn (gen_vec_duplicate (operands[3], + gcn_operand_part (DImode, operands[1], 1))); + emit_insn (gen_addc3 +
(gcn_operand_part (mode, operands[0], 1), + operands[3], + gcn_operand_part (mode, operands[2], 1), + vcc, vcc)); + DONE; + } + [(set_attr "type" "vmult") + (set_attr "length" "8")]) + (define_insn_and_split "add3_exec" [(set (match_operand:V_DI 0 "register_operand" "= v") (vec_merge:V_DI @@ -1682,6 +1715,49 @@ [(set_attr "type" "vmult") (set_attr "length" "8")]) +(define_insn_and_split "add3_dup_exec" + [(set (match_operand:V_DI 0 "register_operand" "= v") + (vec_merge:V_DI + (plus:V_DI + (vec_duplicate:V_DI + (match_operand:DI 1 "register_operand" "SvB")) + (match_operand:V_DI 2 "gcn_alu_operand" "vDb")) + (match_operand:V_DI 3 "gcn_register_or_unspec_operand" " U0") + (match_operand:DI 4 "gcn_exec_reg_operand" " e"))) + (clobber (reg:DI VCC_REG)) + (clobber (match_scratch: 5 "=&v"))] + "" + "#" + "gcn_can_split_p (mode, operands[0]) + && gcn_can_split_p (mode, operands[1]) + && gcn_can_split_p (mode, operands[2]) + && gcn_can_split_p (mode, operands[4])" + [(const_int 0)] + { + rtx vcc = gen_rtx_REG (DImode, VCC_REG); + emit_insn (gen_add3_vcc_dup_exec + (gcn_operand_part (mode, operands[0], 0), + gcn_operand_part (DImode, operands[1], 0), + gcn_operand_part (mode, operands[2], 0), + vcc, + gcn_operand_part (mode, operands[3], 0), + operands[4])); + emit_insn (gen_vec_duplicate_exec (operands[5], + gcn_operand_part (DImode, operands[1], 1), + gcn_gen_undef (mode), + operands[4])); + emit_insn (gen_addc3_exec + (gcn_operand_part (mode, operands[0], 1), + operands[5], + gcn_operand_part (mode, operands[2], 1), + vcc, vcc, + gcn_operand_part (mode, operands[3], 1), + operands[4])); + DONE; + } + [(set_attr "type" "vmult") + (set_attr "length" "8")]) + (define_insn_and_split "sub3" [(set (match_operand:V_DI 0 "register_operand" "= v, v") (minus:V_DI @@ -2187,6 +2263,22 @@ [(set_attr "type" "vop3a") (set_attr "length" "8")]) +(define_insn "mul3_highpart_dup" + [(set (match_operand:V_SI 0 "register_operand" "= v") + (truncate:V_SI + (lshiftrt: + (mult: + (any_extend:
+ (vec_duplicate:V_SI + (match_operand:SI 1 "gcn_alu_operand" "SvA"))) + (any_extend: + (match_operand:V_SI 2 "gcn_alu_operand" " vA"))) + (const_int 32))))] + "" + "v_mul_hi0\t%0, %2, %1" + [(set_attr "type" "vop3a") + (set_attr "length" "8")]) + (define_insn "mul3" [(set (match_operand:V_INT_1REG 0 "register_operand" "= v") (mult:V_INT_1REG @@ -2198,11 +2290,11 @@ (set_attr "length" "8")]) (define_insn "mul3_dup" - [(set (match_operand:V_INT_1REG 0 "register_operand" "= v") + [(set (match_operand:V_INT_1REG 0 "register_operand" "= v") (mult:V_INT_1REG - (match_operand:V_INT_1REG 1 "gcn_alu_operand" "%vSvA") (vec_duplicate:V_INT_1REG - (match_operand: 2 "gcn_alu_operand" " SvA"))))] + (match_operand: 1 "gcn_alu_operand" "SvA")) + (match_operand:V_INT_1REG 2 "gcn_alu_operand" " vA")))] "" "v_mul_lo_u32\t%0, %1, %2" [(set_attr "type" "vop3a") @@ -2238,6 +2330,37 @@ DONE; }) +(define_insn_and_split "mul3_dup" + [(set (match_operand:V_DI 0 "register_operand" "=&v") + (mult:V_DI + (vec_duplicate:V_DI + (match_operand:DI 1 "gcn_alu_operand" " Sv")) + (match_operand:V_DI 2 "gcn_alu_operand" "vDA"))) + (clobber (match_scratch: 3 "=&v"))] + "" + "#" + "reload_completed" + [(const_int 0)] + { + rtx out_lo = gcn_operand_part (mode, operands[0], 0); + rtx out_hi = gcn_operand_part (mode, operands[0], 1); + rtx left_lo = gcn_operand_part (DImode, operands[1], 0); + rtx left_hi = gcn_operand_part (DImode, operands[1], 1); + rtx right_lo = gcn_operand_part (mode, operands[2], 0); + rtx right_hi = gcn_operand_part (mode, operands[2], 1); + rtx tmp = operands[3]; + + emit_insn (gen_mul3_dup (out_lo, left_lo, right_lo)); + emit_insn (gen_umul3_highpart_dup (out_hi, left_lo, right_lo)); + emit_insn (gen_mul3_dup (tmp, left_hi, right_lo)); + emit_insn (gen_add3 (out_hi, out_hi, tmp)); + emit_insn (gen_mul3_dup (tmp, left_lo, right_hi)); + emit_insn (gen_add3 (out_hi, out_hi, tmp)); + /* The left_hi * right_hi partial product has weight 2^64, so it cannot + affect the low 64 bits of the product and must not be added.  */ +
DONE; + }) + (define_insn_and_split "mul3_exec" [(set (match_operand:V_DI 0 "register_operand" "=&v") (vec_merge:V_DI @@ -2286,6 +2409,56 @@ DONE; }) +(define_insn_and_split "mul3_dup_exec" + [(set (match_operand:V_DI 0 "register_operand" "=&v") + (vec_merge:V_DI + (mult:V_DI + (vec_duplicate:V_DI + (match_operand:DI 1 "gcn_alu_operand" " Sv")) + (match_operand:V_DI 2 "gcn_alu_operand" "vDA")) + (match_operand:V_DI 3 "gcn_register_or_unspec_operand" " U0") + (match_operand:DI 4 "gcn_exec_reg_operand" " e"))) + (clobber (match_scratch: 5 "=&v"))] + "" + "#" + "reload_completed" + [(const_int 0)] + { + rtx out_lo = gcn_operand_part (mode, operands[0], 0); + rtx out_hi = gcn_operand_part (mode, operands[0], 1); + rtx left_lo = gcn_operand_part (DImode, operands[1], 0); + rtx left_hi = gcn_operand_part (DImode, operands[1], 1); + rtx right_lo = gcn_operand_part (mode, operands[2], 0); + rtx right_hi = gcn_operand_part (mode, operands[2], 1); + rtx exec = operands[4]; + rtx tmp = operands[5]; + + rtx old_lo, old_hi; + if (GET_CODE (operands[3]) == UNSPEC) + { + old_lo = old_hi = gcn_gen_undef (mode); + } + else + { + old_lo = gcn_operand_part (mode, operands[3], 0); + old_hi = gcn_operand_part (mode, operands[3], 1); + } + + rtx undef = gcn_gen_undef (mode); + + emit_insn (gen_mul3_dup_exec (out_lo, left_lo, right_lo, old_lo, + exec)); + emit_insn (gen_umul3_highpart_dup_exec (out_hi, left_lo, right_lo, + old_hi, exec)); + emit_insn (gen_mul3_dup_exec (tmp, left_hi, right_lo, undef, exec)); + emit_insn (gen_add3_exec (out_hi, out_hi, tmp, out_hi, exec)); + emit_insn (gen_mul3_dup_exec (tmp, left_lo, right_hi, undef, exec)); + emit_insn (gen_add3_exec (out_hi, out_hi, tmp, out_hi, exec)); + /* The left_hi * right_hi partial product has weight 2^64, so it cannot + affect the low 64 bits of the product and must not be added.  */ + DONE; + }) + (define_insn_and_split "mul3_zext" [(set (match_operand:V_DI 0 "register_operand" "=&v") (mult:V_DI @@ -4397,7 +4570,7 @@ rtx tmp =
gen_reg_rtx (mode); rtx v1 = gen_rtx_REG (mode, VGPR_REGNO (1)); - emit_insn (gen_mul3_dup (tmp, v1, operands[2])); + emit_insn (gen_mul3_dup (tmp, operands[2], v1)); emit_insn (gen_add3_dup (operands[0], tmp, operands[1])); DONE; }) diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc index 0ce5a29fbb5..56c832a483e 100644 --- a/gcc/config/gcn/gcn.cc +++ b/gcc/config/gcn/gcn.cc @@ -1995,8 +1995,8 @@ gcn_expand_vector_init (rtx op0, rtx vec) rtx addr = gen_reg_rtx (addrmode); int unit_size = GET_MODE_SIZE (GET_MODE_INNER (GET_MODE (op0))); - emit_insn (gen_mulvNsi3_dup (ramp, gen_rtx_REG (offsetmode, VGPR_REGNO (1)), - GEN_INT (unit_size))); + emit_insn (gen_mulvNsi3_dup (ramp, GEN_INT (unit_size), + gen_rtx_REG (offsetmode, VGPR_REGNO (1)))); bool simple_repeat = true; -- 2.47.2