git.ipfire.org Git - thirdparty/gcc.git/commitdiff
amdgcn: multi-size vector reductions
author: Andrew Stubbs <ams@codesourcery.com>
Fri, 28 Oct 2022 11:38:43 +0000 (12:38 +0100)
committer: Andrew Stubbs <ams@codesourcery.com>
Mon, 31 Oct 2022 12:20:52 +0000 (12:20 +0000)
Add support for vector reductions for any vector width by switching iterators
and generalising the code slightly.  There's no one-instruction way to move an
item from lane 31 to lane 0 (63, 15, 7, 3, and 1 are all fine though), and
vec_extract is probably fewer cycles anyway, so now we always reduce to an
SGPR.

gcc/ChangeLog:

* config/gcn/gcn-valu.md (V64_SI): Delete iterator.
(V64_DI): Likewise.
(V64_1REG): Likewise.
(V64_INT_1REG): Likewise.
(V64_2REG): Likewise.
(V64_ALL): Likewise.
(V64_FP): Likewise.
(reduc_<reduc_op>_scal_<mode>): Use V_ALL. Use gen_vec_extract.
(fold_left_plus_<mode>): Use V_FP.
(*<reduc_op>_dpp_shr_<mode>): Use V_1REG.
(*<reduc_op>_dpp_shr_<mode>): Use V_DI.
(*plus_carry_dpp_shr_<mode>): Use V_INT_1REG.
(*plus_carry_in_dpp_shr_<mode>): Use V_SI.
(*plus_carry_dpp_shr_<mode>): Use V_DI.
(mov_from_lane63_<mode>): Delete.
(mov_from_lane63_<mode>): Delete.
* config/gcn/gcn.cc (gcn_expand_reduc_scalar): Support partial vectors.
* config/gcn/gcn.md (unspec): Remove UNSPEC_MOV_FROM_LANE63.

gcc/config/gcn/gcn-valu.md
gcc/config/gcn/gcn.cc
gcc/config/gcn/gcn.md

index 00c0e3be1ea433c8fd7b17f17a74f8db87f81ad0..6274d2e922802b3ee912e6355a7f703cc96c7a87 100644 (file)
 (define_mode_iterator V_DF
                      [V2DF V4DF V8DF V16DF V32DF V64DF])
 
-(define_mode_iterator V64_SI
-                     [V64SI])
-(define_mode_iterator V64_DI
-                     [V64DI])
-
 ; Vector modes for sub-dword modes
 (define_mode_iterator V_QIHI
                      [V2QI V2HI
                       V32HF V32SF
                       V64HF V64SF])
 
-; V64_* modes are for where more general support is unimplemented
-; (e.g. reductions)
-(define_mode_iterator V64_1REG
-                     [V64QI V64HI V64SI V64HF V64SF])
-(define_mode_iterator V64_INT_1REG
-                     [V64QI V64HI V64SI])
-
 ; Vector modes for two vector registers
 (define_mode_iterator V_2REG
                      [V2DI V2DF
@@ -93,9 +81,6 @@
                       V32DI V32DF
                       V64DI V64DF])
 
-(define_mode_iterator V64_2REG
-                     [V64DI V64DF])
-
 ; Vector modes with native support
 (define_mode_iterator V_noQI
                      [V2HI V2HF V2SI V2SF V2DI V2DF
                       V32HF V32SF V32DF
                       V64HF V64SF V64DF])
 
-(define_mode_iterator V64_ALL
-                     [V64QI V64HI V64HF V64SI V64SF V64DI V64DF])
-(define_mode_iterator V64_FP
-                     [V64HF V64SF V64DF])
-
 (define_mode_attr scalar_mode
   [(V2QI "qi") (V2HI "hi") (V2SI "si")
    (V2HF "hf") (V2SF "sf") (V2DI "di") (V2DF "df")
 (define_expand "reduc_<reduc_op>_scal_<mode>"
   [(set (match_operand:<SCALAR_MODE> 0 "register_operand")
        (unspec:<SCALAR_MODE>
-         [(match_operand:V64_ALL 1 "register_operand")]
+         [(match_operand:V_ALL 1 "register_operand")]
          REDUC_UNSPEC))]
   ""
   {
     rtx tmp = gcn_expand_reduc_scalar (<MODE>mode, operands[1],
                                       <reduc_unspec>);
 
-    /* The result of the reduction is in lane 63 of tmp.  */
-    emit_insn (gen_mov_from_lane63_<mode> (operands[0], tmp));
+    rtx last_lane = GEN_INT (GET_MODE_NUNITS (<MODE>mode) - 1);
+    emit_insn (gen_vec_extract<mode><scalar_mode> (operands[0], tmp,
+                                                  last_lane));
 
     DONE;
   })
 (define_expand "fold_left_plus_<mode>"
  [(match_operand:<SCALAR_MODE> 0 "register_operand")
   (match_operand:<SCALAR_MODE> 1 "gcn_alu_operand")
-  (match_operand:V64_FP 2 "gcn_alu_operand")]
+  (match_operand:V_FP 2 "gcn_alu_operand")]
   "can_create_pseudo_p ()
    && (flag_openacc || flag_openmp
        || flag_associative_math)"
    })
 
 (define_insn "*<reduc_op>_dpp_shr_<mode>"
-  [(set (match_operand:V64_1REG 0 "register_operand"   "=v")
-       (unspec:V64_1REG
-         [(match_operand:V64_1REG 1 "register_operand" "v")
-          (match_operand:V64_1REG 2 "register_operand" "v")
-          (match_operand:SI 3 "const_int_operand"      "n")]
+  [(set (match_operand:V_1REG 0 "register_operand"   "=v")
+       (unspec:V_1REG
+         [(match_operand:V_1REG 1 "register_operand" "v")
+          (match_operand:V_1REG 2 "register_operand" "v")
+          (match_operand:SI 3 "const_int_operand"        "n")]
          REDUC_UNSPEC))]
   ; GCN3 requires a carry out, GCN5 not
   "!(TARGET_GCN3 && SCALAR_INT_MODE_P (<SCALAR_MODE>mode)
    (set_attr "length" "8")])
 
 (define_insn_and_split "*<reduc_op>_dpp_shr_<mode>"
-  [(set (match_operand:V64_DI 0 "register_operand"    "=v")
-       (unspec:V64_DI
-         [(match_operand:V64_DI 1 "register_operand" "v")
-          (match_operand:V64_DI 2 "register_operand" "v")
-          (match_operand:SI 3 "const_int_operand"    "n")]
+  [(set (match_operand:V_DI 0 "register_operand"    "=v")
+       (unspec:V_DI
+         [(match_operand:V_DI 1 "register_operand" "v")
+          (match_operand:V_DI 2 "register_operand" "v")
+          (match_operand:SI 3 "const_int_operand"  "n")]
          REDUC_2REG_UNSPEC))]
   ""
   "#"
 ; Special cases for addition.
 
 (define_insn "*plus_carry_dpp_shr_<mode>"
-  [(set (match_operand:V64_INT_1REG 0 "register_operand"   "=v")
-       (unspec:V64_INT_1REG
-         [(match_operand:V64_INT_1REG 1 "register_operand" "v")
-          (match_operand:V64_INT_1REG 2 "register_operand" "v")
+  [(set (match_operand:V_INT_1REG 0 "register_operand"   "=v")
+       (unspec:V_INT_1REG
+         [(match_operand:V_INT_1REG 1 "register_operand" "v")
+          (match_operand:V_INT_1REG 2 "register_operand" "v")
           (match_operand:SI 3 "const_int_operand"        "n")]
          UNSPEC_PLUS_CARRY_DPP_SHR))
    (clobber (reg:DI VCC_REG))]
    (set_attr "length" "8")])
 
 (define_insn "*plus_carry_in_dpp_shr_<mode>"
-  [(set (match_operand:V64_SI 0 "register_operand"    "=v")
-       (unspec:V64_SI
-         [(match_operand:V64_SI 1 "register_operand" "v")
-          (match_operand:V64_SI 2 "register_operand" "v")
-          (match_operand:SI 3 "const_int_operand"    "n")
-          (match_operand:DI 4 "register_operand"     "cV")]
+  [(set (match_operand:V_SI 0 "register_operand"    "=v")
+       (unspec:V_SI
+         [(match_operand:V_SI 1 "register_operand" "v")
+          (match_operand:V_SI 2 "register_operand" "v")
+          (match_operand:SI 3 "const_int_operand"  "n")
+          (match_operand:DI 4 "register_operand"   "cV")]
          UNSPEC_PLUS_CARRY_IN_DPP_SHR))
    (clobber (reg:DI VCC_REG))]
   ""
    (set_attr "length" "8")])
 
 (define_insn_and_split "*plus_carry_dpp_shr_<mode>"
-  [(set (match_operand:V64_DI 0 "register_operand"    "=v")
-       (unspec:V64_DI
-         [(match_operand:V64_DI 1 "register_operand" "v")
-          (match_operand:V64_DI 2 "register_operand" "v")
-          (match_operand:SI 3 "const_int_operand"    "n")]
+  [(set (match_operand:V_DI 0 "register_operand"    "=v")
+       (unspec:V_DI
+         [(match_operand:V_DI 1 "register_operand" "v")
+          (match_operand:V_DI 2 "register_operand" "v")
+          (match_operand:SI 3 "const_int_operand"  "n")]
          UNSPEC_PLUS_CARRY_DPP_SHR))
    (clobber (reg:DI VCC_REG))]
   ""
   [(set_attr "type" "vmult")
    (set_attr "length" "16")])
 
-; Instructions to move a scalar value from lane 63 of a vector register.
-(define_insn "mov_from_lane63_<mode>"
-  [(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=Sg,v")
-       (unspec:<SCALAR_MODE>
-         [(match_operand:V64_1REG 1 "register_operand"   "  v,v")]
-         UNSPEC_MOV_FROM_LANE63))]
-  ""
-  "@
-   v_readlane_b32\t%0, %1, 63
-   v_mov_b32\t%0, %1 wave_ror:1"
-  [(set_attr "type" "vop3a,vop_dpp")
-   (set_attr "exec" "none,*")
-   (set_attr "length" "8")])
-
-(define_insn "mov_from_lane63_<mode>"
-  [(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=Sg,v")
-       (unspec:<SCALAR_MODE>
-         [(match_operand:V64_2REG 1 "register_operand"   "  v,v")]
-         UNSPEC_MOV_FROM_LANE63))]
-  ""
-  "@
-   v_readlane_b32\t%L0, %L1, 63\;v_readlane_b32\t%H0, %H1, 63
-   * if (REGNO (operands[0]) <= REGNO (operands[1]))   \
-       return \"v_mov_b32\t%L0, %L1 wave_ror:1\;\"     \
-             \"v_mov_b32\t%H0, %H1 wave_ror:1\";       \
-     else                                              \
-       return \"v_mov_b32\t%H0, %H1 wave_ror:1\;\"     \
-             \"v_mov_b32\t%L0, %L1 wave_ror:1\";"
-  [(set_attr "type" "vop3a,vop_dpp")
-   (set_attr "exec" "none,*")
-   (set_attr "length" "8")])
-
 ;; }}}
 ;; {{{ Miscellaneous
 
index a561976d7f54235eb01b1db25eb8c5b134276948..b9d9170f167650dd336f616737b8760c950153f4 100644 (file)
@@ -4918,23 +4918,25 @@ gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn,
 
    The vector register SRC of mode MODE is reduced using the operation given
    by UNSPEC, and the scalar result is returned in lane 63 of a vector
-   register.  */
-/* FIXME: Implement reductions for sizes other than V64.
-          (They're currently disabled in the machine description.)  */
+   register (or lane 31, 15, 7, 3, 1 for partial vectors).  */
 
 rtx
 gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
 {
   machine_mode orig_mode = mode;
+  machine_mode scalar_mode = GET_MODE_INNER (mode);
+  int vf = GET_MODE_NUNITS (mode);
   bool use_moves = (((unspec == UNSPEC_SMIN_DPP_SHR
+                     || unspec == UNSPEC_SMIN_DPP_SHR
                      || unspec == UNSPEC_SMAX_DPP_SHR
                      || unspec == UNSPEC_UMIN_DPP_SHR
                      || unspec == UNSPEC_UMAX_DPP_SHR)
-                    && (mode == V64DImode
-                        || mode == V64DFmode))
+                    && (scalar_mode == DImode
+                        || scalar_mode == DFmode))
                    || (unspec == UNSPEC_PLUS_DPP_SHR
-                       && mode == V64DFmode));
+                       && scalar_mode == DFmode));
   rtx_code code = (unspec == UNSPEC_SMIN_DPP_SHR ? SMIN
+                  : unspec == UNSPEC_SMIN_DPP_SHR ? SMIN
                   : unspec == UNSPEC_SMAX_DPP_SHR ? SMAX
                   : unspec == UNSPEC_UMIN_DPP_SHR ? UMIN
                   : unspec == UNSPEC_UMAX_DPP_SHR ? UMAX
@@ -4944,23 +4946,23 @@ gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
                       || unspec == UNSPEC_SMAX_DPP_SHR
                       || unspec == UNSPEC_UMIN_DPP_SHR
                       || unspec == UNSPEC_UMAX_DPP_SHR)
-                     && (mode == V64QImode
-                         || mode == V64HImode));
+                     && (scalar_mode == QImode
+                         || scalar_mode == HImode));
   bool unsignedp = (unspec == UNSPEC_UMIN_DPP_SHR
                    || unspec == UNSPEC_UMAX_DPP_SHR);
   bool use_plus_carry = unspec == UNSPEC_PLUS_DPP_SHR
                        && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
-                       && (TARGET_GCN3 || mode == V64DImode);
+                       && (TARGET_GCN3 || scalar_mode == DImode);
 
   if (use_plus_carry)
     unspec = UNSPEC_PLUS_CARRY_DPP_SHR;
 
   if (use_extends)
     {
-      rtx tmp = gen_reg_rtx (V64SImode);
+      mode = VnMODE (vf, SImode);
+      rtx tmp = gen_reg_rtx (mode);
       convert_move (tmp, src, unsignedp);
       src = tmp;
-      mode = V64SImode;
     }
 
   /* Perform reduction by first performing the reduction operation on every
@@ -4968,7 +4970,8 @@ gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
      iteration (thereby effectively reducing every 4 lanes) and so on until
      all lanes are reduced.  */
   rtx in, out = force_reg (mode, src);
-  for (int i = 0, shift = 1; i < 6; i++, shift <<= 1)
+  int iterations = exact_log2 (vf);
+  for (int i = 0, shift = 1; i < iterations; i++, shift <<= 1)
     {
       rtx shift_val = gen_rtx_CONST_INT (VOIDmode, shift);
       in = out;
index a3c9523cd6dc6e31c3ba1ab1ce2b77965e4f4e61..6c1a438f9d1a0b82191f92bf49fd66d1b7f2a4dd 100644 (file)
@@ -78,7 +78,6 @@
   UNSPEC_PLUS_CARRY_DPP_SHR UNSPEC_PLUS_CARRY_IN_DPP_SHR
   UNSPEC_AND_DPP_SHR UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR
   UNSPEC_MOV_DPP_SHR
-  UNSPEC_MOV_FROM_LANE63
   UNSPEC_GATHER
   UNSPEC_SCATTER
   UNSPEC_RCP