RISC-V: Make zero-stride load broadcast a tunable.
author    Robin Dapp <rdapp@ventanamicro.com>
          Thu, 10 Jul 2025 07:41:48 +0000 (09:41 +0200)
committer Robin Dapp <rdapp@ventanamicro.com>
          Thu, 10 Jul 2025 13:56:20 +0000 (15:56 +0200)
This patch makes the zero-stride load broadcast idiom dependent on a
uarch-tunable "use_zero_stride_load".  Right now we have quite a few
paths that reach a strided load, and some of them are not exactly
straightforward.

While broadcast is relatively rare on rv64 targets, it is more common on
rv32 targets that want to vectorize 64-bit elements.
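
For illustration, the two idioms the backend chooses between look
roughly as follows in RVV intrinsics (a sketch of the code shapes
involved, not the backend's literal output): on rv64 a 64-bit broadcast
can use the register form vmv.v.x, while on rv32, where the scalar does
not fit a GPR, one common fallback is the zero-stride load vlse64.v
that this patch makes tunable.

  #include <riscv_vector.h>
  #include <stdint.h>

  /* Register broadcast: vmv.v.x vd,rs1 (scalar must fit a GPR).  */
  vuint64m1_t
  broadcast_reg (uint64_t x, size_t vl)
  {
    return __riscv_vmv_v_x_u64m1 (x, vl);
  }

  /* Zero-stride load broadcast: vlse64.v vd,(rs1),zero.  */
  vuint64m1_t
  broadcast_mem (const uint64_t *p, size_t vl)
  {
    return __riscv_vlse64_v_u64m1 (p, 0, vl);
  }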

While the patch is more involved than I would have liked, it could have
touched even more places.  The whole broadcast-like insn path feels a
bit hackish due to the various optimizations we employ.  Some of the
complications stem from the fact that we lump together real broadcasts,
vector single-element sets, and strided broadcasts.  The strided-load
alternatives currently require a memory_constraint to work properly,
which causes more complications when trying to disable just these.

In short, the whole pred_broadcast handling in combination with the
sew64_scalar_helper could use work in the future.  I was about to start
on it in this patch but soon realized that it would only distract from
the original intent.  What can help in the future is to split strided
and non-strided broadcasts entirely, as well as the single-element sets.

It is still unclear whether we need to pay special attention to
misaligned strided loads (PR120782).

I regtested on rv32 and rv64 with strided_load_broadcast_p forced to
true and false.  With either setting I didn't observe any new execution
failures, but there are obviously new scan failures with strided
broadcast turned off.

PR target/118734

gcc/ChangeLog:

* config/riscv/constraints.md (Wdm): Use tunable for Wdm
constraint.
* config/riscv/predicates.md (direct_broadcast_operand): Use
renamed function.
* config/riscv/riscv-protos.h (emit_avltype_insn): Declare.
(can_be_broadcasted_p): Rename to...
(can_be_broadcast_p): ...this.
(strided_load_broadcast_p): Declare.
* config/riscv/riscv-selftests.cc (run_broadcast_selftests):
Only run broadcast selftest if strided broadcasts are OK.
* config/riscv/riscv-v.cc (emit_avltype_insn): New function.
(sew64_scalar_helper): Only emit a pred_broadcast if the new
tunable says so.
(can_be_broadcasted_p): Rename to...
(can_be_broadcast_p): ...this and use new tunable.
* config/riscv/riscv.cc (struct riscv_tune_param): Add strided
broadcast tunable.
(strided_load_broadcast_p): Implement.
* config/riscv/vector.md: Use strided_load_broadcast_p () and
work around 64-bit broadcast on rv32 targets.

gcc/config/riscv/constraints.md
gcc/config/riscv/predicates.md
gcc/config/riscv/riscv-protos.h
gcc/config/riscv/riscv-selftests.cc
gcc/config/riscv/riscv-v.cc
gcc/config/riscv/riscv.cc
gcc/config/riscv/vector.md

diff --git a/gcc/config/riscv/constraints.md b/gcc/config/riscv/constraints.md
index ccab1a2e29dfa0f7ba3bbcd24d0d81b7ebac19db..5ecaa19eb0140cbba9aa25ac8b092797e5e9b09c 100644
  (and (match_code "const_vector")
       (match_test "rtx_equal_p (op, riscv_vector::gen_scalar_move_mask (GET_MODE (op)))")))
 
-(define_memory_constraint "Wdm"
+(define_constraint "Wdm"
   "Vector duplicate memory operand"
-  (and (match_code "mem")
-       (match_code "reg" "0")))
+  (and (match_test "strided_load_broadcast_p ()")
+       (and (match_code "mem")
+           (match_code "reg" "0"))))
 
 ;; Vendor ISA extension constraints.
 
diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md
index 8baad2fae7a9c128a5b87a84d97a25d061c9ee37..1f9a6b562e531fb41697160e6cc3c5b152897cad 100644
 
 ;; The scalar operand can be directly broadcast by RVV instructions.
 (define_predicate "direct_broadcast_operand"
-  (match_test "riscv_vector::can_be_broadcasted_p (op)"))
+  (match_test "riscv_vector::can_be_broadcast_p (op)"))
 
 ;; A CONST_INT operand that has exactly two bits cleared.
 (define_predicate "const_nottwobits_operand"
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 38f63ea84248a9ae9f461a7483b12691424de5f5..a41c4c299fac33e0fea43d62610822f9bed6f59d 100644
@@ -604,6 +604,7 @@ void emit_vlmax_vsetvl (machine_mode, rtx);
 void emit_hard_vlmax_vsetvl (machine_mode, rtx);
 void emit_vlmax_insn (unsigned, unsigned, rtx *);
 void emit_nonvlmax_insn (unsigned, unsigned, rtx *, rtx);
+void emit_avltype_insn (unsigned, unsigned, rtx *, avl_type, rtx = nullptr);
 void emit_vlmax_insn_lra (unsigned, unsigned, rtx *, rtx);
 enum vlmul_type get_vlmul (machine_mode);
 rtx get_vlmax_rtx (machine_mode);
@@ -760,7 +761,7 @@ uint8_t get_sew (rtx_insn *);
 enum vlmul_type get_vlmul (rtx_insn *);
 int count_regno_occurrences (rtx_insn *, unsigned int);
 bool imm_avl_p (machine_mode);
-bool can_be_broadcasted_p (rtx);
+bool can_be_broadcast_p (rtx);
 bool gather_scatter_valid_offset_p (machine_mode);
 HOST_WIDE_INT estimated_poly_value (poly_int64, unsigned int);
 bool whole_reg_to_reg_move_p (rtx *, machine_mode, int);
@@ -813,6 +814,7 @@ extern const char *th_output_move (rtx, rtx);
 extern bool th_print_operand_address (FILE *, machine_mode, rtx);
 #endif
 
+extern bool strided_load_broadcast_p (void);
 extern bool riscv_use_divmod_expander (void);
 void riscv_init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
 extern bool
diff --git a/gcc/config/riscv/riscv-selftests.cc b/gcc/config/riscv/riscv-selftests.cc
index 34d01ac76b75799f36d8da4815dcb7a5163e06d7..9ca1ffee394fac83a757183df2d55eeea23b1c16 100644
@@ -342,9 +342,13 @@ run_broadcast_selftests (void)
          expand_vector_broadcast (mode, mem);                                 \
          insn = get_last_insn ();                                             \
          src = SET_SRC (PATTERN (insn));                                      \
-         ASSERT_TRUE (MEM_P (XEXP (src, 0)));                                 \
-         ASSERT_TRUE (                                                        \
-           rtx_equal_p (src, gen_rtx_VEC_DUPLICATE (mode, XEXP (src, 0))));   \
+         if (strided_load_broadcast_p ())                                     \
+           {                                                                  \
+             ASSERT_TRUE (MEM_P (XEXP (src, 0)));                             \
+             ASSERT_TRUE (                                                    \
+               rtx_equal_p (src,                                              \
+                            gen_rtx_VEC_DUPLICATE (mode, XEXP (src, 0))));    \
+           }                                                                  \
          end_sequence ();                                                     \
          /* Test vmv.v.x or vfmv.v.f.  */                                     \
          start_sequence ();                                                   \
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 2efe56afc9eb19b74970de4e0c81e116e4ca361c..242ac087764ffd05df69fbc88a8c7a6e48434aaa 100644
@@ -437,6 +437,26 @@ emit_nonvlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
   e.emit_insn ((enum insn_code) icode, ops);
 }
 
+/* Emit either a VLMAX insn or a non-VLMAX insn depending on TYPE.  For a
+   non-VLMAX insn, the length must be specified in VL.  */
+
+void
+emit_avltype_insn (unsigned icode, unsigned insn_flags, rtx *ops,
+                  avl_type type, rtx vl)
+{
+  if (type != avl_type::VLMAX && vl != NULL_RTX)
+    {
+      insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
+      e.set_vl (vl);
+      e.emit_insn ((enum insn_code) icode, ops);
+    }
+  else
+    {
+      insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
+      e.emit_insn ((enum insn_code) icode, ops);
+    }
+}
+
 /* Return true if the vector duplicated by a super element which is the fusion
    of consecutive elements.
 
@@ -2144,21 +2164,40 @@ sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
       return false;
     }
 
+  bool avoid_strided_broadcast = false;
   if (CONST_INT_P (*scalar_op))
     {
       if (maybe_gt (GET_MODE_SIZE (scalar_mode), GET_MODE_SIZE (Pmode)))
-       *scalar_op = force_const_mem (scalar_mode, *scalar_op);
+       {
+         if (strided_load_broadcast_p ())
+           *scalar_op = force_const_mem (scalar_mode, *scalar_op);
+         else
+           avoid_strided_broadcast = true;
+       }
       else
        *scalar_op = force_reg (scalar_mode, *scalar_op);
     }
 
   rtx tmp = gen_reg_rtx (vector_mode);
-  rtx ops[] = {tmp, *scalar_op};
-  if (type == VLMAX)
-    emit_vlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops);
+  if (!avoid_strided_broadcast)
+    {
+      rtx ops[] = {tmp, *scalar_op};
+      emit_avltype_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops,
+                        type, vl);
+    }
   else
-    emit_nonvlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops,
-                       vl);
+    {
+      /* Load scalar as V1DI and broadcast via vrgather.vi.  */
+      rtx tmp1 = gen_reg_rtx (V1DImode);
+      emit_move_insn (tmp1, lowpart_subreg (V1DImode, *scalar_op,
+                                           scalar_mode));
+      tmp1 = lowpart_subreg (vector_mode, tmp1, V1DImode);
+
+      rtx ops[] = {tmp, tmp1, CONST0_RTX (Pmode)};
+      emit_vlmax_insn (code_for_pred_gather_scalar (vector_mode),
+                      BINARY_OP, ops);
+    }
+
   emit_vector_func (operands, tmp);
 
   return true;
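
In intrinsics terms the fallback path added above corresponds roughly
to the sketch below; in the patch itself the scalar reaches element 0
via a V1DImode subreg move rather than the one-element load used here
for illustration.

  #include <riscv_vector.h>
  #include <stdint.h>

  /* Put the scalar into element 0, then splat it with vrgather index 0.  */
  vuint64m1_t
  broadcast_gather (const uint64_t *p, size_t vl)
  {
    vuint64m1_t v = __riscv_vle64_v_u64m1 (p, 1);   /* one element */
    return __riscv_vrgather_vx_u64m1 (v, 0, vl);    /* splat element 0 */
  }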
@@ -5769,9 +5808,9 @@ count_regno_occurrences (rtx_insn *rinsn, unsigned int regno)
   return count;
 }
 
-/* Return true if the OP can be directly broadcasted.  */
+/* Return true if the OP can be directly broadcast.  */
 bool
-can_be_broadcasted_p (rtx op)
+can_be_broadcast_p (rtx op)
 {
   machine_mode mode = GET_MODE (op);
   /* We don't allow RA (register allocation) reload generate
@@ -5783,7 +5822,8 @@ can_be_broadcasted_p (rtx op)
     return false;
 
   if (satisfies_constraint_K (op) || register_operand (op, mode)
-      || satisfies_constraint_Wdm (op) || rtx_equal_p (op, CONST0_RTX (mode)))
+      || (strided_load_broadcast_p () && satisfies_constraint_Wdm (op))
+      || rtx_equal_p (op, CONST0_RTX (mode)))
     return true;
 
   return can_create_pseudo_p () && nonmemory_operand (op, mode);
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 023adc3284dfc2a394ecda0fd8bb687b6427d5af..a4428f0e96d2991158b56e6d27341a244bf371d6 100644
@@ -306,6 +306,7 @@ struct riscv_tune_param
   bool vector_unaligned_access;
   bool use_divmod_expansion;
   bool overlap_op_by_pieces;
+  bool use_zero_stride_load;
   bool speculative_sched_vsetvl;
   unsigned int fusible_ops;
   const struct cpu_vector_cost *vec_costs;
@@ -469,6 +470,7 @@ static const struct riscv_tune_param generic_tune_info = {
   false,                                       /* vector_unaligned_access */
   false,                                       /* use_divmod_expansion */
   false,                                       /* overlap_op_by_pieces */
+  true,                                                /* use_zero_stride_load */
   false,                                       /* speculative_sched_vsetvl */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
   NULL,                                                /* vector cost */
@@ -492,6 +494,7 @@ static const struct riscv_tune_param rocket_tune_info = {
   false,                                       /* vector_unaligned_access */
   false,                                       /* use_divmod_expansion */
   false,                                       /* overlap_op_by_pieces */
+  true,                                                /* use_zero_stride_load */
   false,                                       /* speculative_sched_vsetvl */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
   NULL,                                                /* vector cost */
@@ -515,6 +518,7 @@ static const struct riscv_tune_param sifive_7_tune_info = {
   false,                                       /* vector_unaligned_access */
   false,                                       /* use_divmod_expansion */
   false,                                       /* overlap_op_by_pieces */
+  true,                                                /* use_zero_stride_load */
   false,                                       /* speculative_sched_vsetvl */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
   NULL,                                                /* vector cost */
@@ -538,6 +542,7 @@ static const struct riscv_tune_param sifive_p400_tune_info = {
   false,                                       /* vector_unaligned_access */
   false,                                       /* use_divmod_expansion */
   false,                                       /* overlap_op_by_pieces */
+  true,                                                /* use_zero_stride_load */
   false,                                       /* speculative_sched_vsetvl */
   RISCV_FUSE_LUI_ADDI | RISCV_FUSE_AUIPC_ADDI,  /* fusible_ops */
   &generic_vector_cost,                                /* vector cost */
@@ -561,6 +566,7 @@ static const struct riscv_tune_param sifive_p600_tune_info = {
   false,                                       /* vector_unaligned_access */
   false,                                       /* use_divmod_expansion */
   false,                                       /* overlap_op_by_pieces */
+  true,                                                /* use_zero_stride_load */
   false,                                       /* speculative_sched_vsetvl */
   RISCV_FUSE_LUI_ADDI | RISCV_FUSE_AUIPC_ADDI,  /* fusible_ops */
   &generic_vector_cost,                                /* vector cost */
@@ -584,6 +590,7 @@ static const struct riscv_tune_param thead_c906_tune_info = {
   false,                                       /* vector_unaligned_access */
   false,       /* use_divmod_expansion */
   false,                                       /* overlap_op_by_pieces */
+  true,                                                /* use_zero_stride_load */
   false,                                       /* speculative_sched_vsetvl */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
   NULL,                                                /* vector cost */
@@ -607,6 +614,7 @@ static const struct riscv_tune_param xiangshan_nanhu_tune_info = {
   false,                                       /* vector_unaligned_access */
   false,                                       /* use_divmod_expansion */
   false,                                       /* overlap_op_by_pieces */
+  true,                                                /* use_zero_stride_load */
   false,                                       /* speculative_sched_vsetvl */
   RISCV_FUSE_ZEXTW | RISCV_FUSE_ZEXTH,          /* fusible_ops */
   NULL,                                                /* vector cost */
@@ -630,6 +638,7 @@ static const struct riscv_tune_param generic_ooo_tune_info = {
   true,                                                /* vector_unaligned_access */
   false,                                       /* use_divmod_expansion */
   true,                                                /* overlap_op_by_pieces */
+  true,                                                /* use_zero_stride_load */
   false,                                       /* speculative_sched_vsetvl */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
   &generic_vector_cost,                                /* vector cost */
@@ -653,6 +662,7 @@ static const struct riscv_tune_param tt_ascalon_d8_tune_info = {
   true,                                                /* vector_unaligned_access */
   true,                                                /* use_divmod_expansion */
   true,                                                /* overlap_op_by_pieces */
+  true,                                                /* use_zero_stride_load */
   false,                                       /* speculative_sched_vsetvl */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
   &generic_vector_cost,                                /* vector cost */
@@ -676,6 +686,7 @@ static const struct riscv_tune_param optimize_size_tune_info = {
   false,                                       /* vector_unaligned_access */
   false,                                       /* use_divmod_expansion */
   false,                                       /* overlap_op_by_pieces */
+  true,                                                /* use_zero_stride_load */
   false,                                       /* speculative_sched_vsetvl */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
   NULL,                                                /* vector cost */
@@ -699,6 +710,7 @@ static const struct riscv_tune_param mips_p8700_tune_info = {
   false,        /* vector_unaligned_access */
   true,         /* use_divmod_expansion */
   false,        /* overlap_op_by_pieces */
+  true,                                                /* use_zero_stride_load */
   false,                                       /* speculative_sched_vsetvl */
   RISCV_FUSE_NOTHING,                          /* fusible_ops */
   NULL,         /* vector cost */
@@ -12765,6 +12777,14 @@ riscv_lshift_subword (machine_mode mode ATTRIBUTE_UNUSED, rtx value, rtx shift,
                                                  gen_lowpart (QImode, shift)));
 }
 
+/* Return TRUE if we should use a zero-stride load broadcast, FALSE
+   otherwise.  */
+
+bool
+strided_load_broadcast_p ()
+{
+  return tune_param->use_zero_stride_load;
+}
+
 /* Return TRUE if we should use the divmod expander, FALSE otherwise.  This
    allows the behavior to be tuned for specific implementations as well as
    when optimizing for size.  */
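
With the implementation above, a uarch whose zero-stride loads are slow
(e.g. microcoded) can opt out by flipping the new field in its
riscv_tune_param initializer.  A sketch with a made-up tuning name, all
other fields elided:

  static const struct riscv_tune_param example_core_tune_info = {
    /* ... other fields as in the tables above ...  */
    false,                                       /* use_zero_stride_load */
    /* ...  */
  };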
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index 2046f04a02c5fc78300b5159e668ffaf1a6cb9af..c5b23b37f7dc89d72958009a45143658ab0b8d60 100644
   "&& 1"
   [(const_int 0)]
   {
-    riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode),
-                                   riscv_vector::UNARY_OP, operands);
+    if (!strided_load_broadcast_p ()
+       && TARGET_ZVFHMIN && !TARGET_ZVFH && <VEL>mode == HFmode)
+      {
+       /* For Float16, reinterpret as HImode, broadcast and reinterpret
+          back.  */
+       poly_uint64 nunits = GET_MODE_NUNITS (<MODE>mode);
+       machine_mode vmodehi
+         = riscv_vector::get_vector_mode (HImode, nunits).require ();
+       rtx ops[] = {lowpart_subreg (vmodehi, operands[0], <MODE>mode),
+                    lowpart_subreg (HImode, operands[1], HFmode)};
+       riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (vmodehi),
+                                      riscv_vector::UNARY_OP, ops);
+      }
+    else
+      riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode),
+                                    riscv_vector::UNARY_OP, operands);
     DONE;
   }
   [(set_attr "type" "vector")]
        }
     }
   else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)
-           && (immediate_operand (operands[3], Pmode)
+          && (immediate_operand (operands[3], Pmode)
               || (CONST_POLY_INT_P (operands[3])
                   && known_ge (rtx_to_poly_int64 (operands[3]), 0U)
                   && known_le (rtx_to_poly_int64 (operands[3]), GET_MODE_SIZE (<MODE>mode)))))
   "(register_operand (operands[3], <VEL>mode)
   || CONST_POLY_INT_P (operands[3]))
   && GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)"
-  [(set (match_dup 0)
-       (if_then_else:V_VLSI (unspec:<VM> [(match_dup 1) (match_dup 4)
-            (match_dup 5) (match_dup 6) (match_dup 7)
-            (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
-         (vec_duplicate:V_VLSI (match_dup 3))
-         (match_dup 2)))]
+  [(const_int 0)]
   {
     gcc_assert (can_create_pseudo_p ());
     if (CONST_POLY_INT_P (operands[3]))
        emit_move_insn (tmp, operands[3]);
        operands[3] = tmp;
       }
-    rtx m = assign_stack_local (<VEL>mode, GET_MODE_SIZE (<VEL>mode),
-                               GET_MODE_ALIGNMENT (<VEL>mode));
-    m = validize_mem (m);
-    emit_move_insn (m, operands[3]);
-    m = gen_rtx_MEM (<VEL>mode, force_reg (Pmode, XEXP (m, 0)));
-    operands[3] = m;
 
     /* For SEW = 64 in RV32 system, we expand vmv.s.x:
        andi a2,a2,1
        operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]);
        operands[1] = CONSTM1_RTX (<VM>mode);
       }
+
+    /* If the target doesn't want a strided-load broadcast, fall back to a
+       regular V1DImode load and a broadcast gather.  */
+    if (strided_load_broadcast_p ())
+      {
+       rtx mem = assign_stack_local (<VEL>mode, GET_MODE_SIZE (<VEL>mode),
+                                     GET_MODE_ALIGNMENT (<VEL>mode));
+       mem = validize_mem (mem);
+       emit_move_insn (mem, operands[3]);
+       mem = gen_rtx_MEM (<VEL>mode, force_reg (Pmode, XEXP (mem, 0)));
+
+       emit_insn
+         (gen_pred_broadcast<mode>
+          (operands[0], operands[1], operands[2], mem,
+           operands[4], operands[5], operands[6], operands[7]));
+      }
+    else
+      {
+       rtx tmp = gen_reg_rtx (V1DImode);
+       emit_move_insn (tmp, lowpart_subreg (V1DImode, operands[3],
+                                            <VEL>mode));
+       tmp = lowpart_subreg (<MODE>mode, tmp, V1DImode);
+
+       emit_insn
+         (gen_pred_gather<mode>_scalar
+          (operands[0], operands[1], operands[2], tmp, CONST0_RTX (Pmode),
+           operands[4], operands[5], operands[6], operands[7]));
+      }
+    DONE;
   }
   [(set_attr "type" "vimov,vimov,vlds,vlds,vlds,vlds,vimovxv,vimovxv")
    (set_attr "mode" "<MODE>")])
             (reg:SI VL_REGNUM)
             (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
          (vec_duplicate:V_VLSF_ZVFHMIN
-           (match_operand:<VEL>        3 "direct_broadcast_operand"      "Wdm, Wdm, Wdm, Wdm"))
+           (match_operand:<VEL>        3 "direct_broadcast_operand"      "  A,   A,   A,   A"))
          (match_operand:V_VLSF_ZVFHMIN 2 "vector_merge_operand"          " vu,   0,  vu,   0")))]
-  "TARGET_VECTOR"
+  "TARGET_VECTOR && strided_load_broadcast_p ()"
   "@
    vlse<sew>.v\t%0,%3,zero,%1.t
    vlse<sew>.v\t%0,%3,zero,%1.t