RISC-V: Make zero-stride load broadcast a tunable.
author    Robin Dapp <rdapp@ventanamicro.com>
          Thu, 10 Jul 2025 07:41:48 +0000 (09:41 +0200)
committer Robin Dapp <rdapp@ventanamicro.com>
          Thu, 10 Jul 2025 13:56:20 +0000 (15:56 +0200)
This patch makes the zero-stride load broadcast idiom dependent on a
uarch-tunable "use_zero_stride_load".  Right now we have quite a few
paths that reach a strided load, and some of them are not exactly
straightforward.

While broadcast is relatively rare on rv64 targets, it is more common on
rv32 targets that want to vectorize 64-bit elements.
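
For illustration, the two idioms the backend chooses between look
roughly as follows in RVV intrinsics (a sketch of the code shapes
involved, not the backend's literal output): on rv64 a 64-bit broadcast
can use the register form vmv.v.x, while on rv32, where the scalar does
not fit a GPR, one common fallback is the zero-stride load vlse64.v
that this patch makes tunable.

  #include <riscv_vector.h>
  #include <stdint.h>

  /* Register broadcast: vmv.v.x vd,rs1 (scalar must fit a GPR).  */
  vuint64m1_t
  broadcast_reg (uint64_t x, size_t vl)
  {
    return __riscv_vmv_v_x_u64m1 (x, vl);
  }

  /* Zero-stride load broadcast: vlse64.v vd,(rs1),zero.  */
  vuint64m1_t
  broadcast_mem (const uint64_t *p, size_t vl)
  {
    return __riscv_vlse64_v_u64m1 (p, 0, vl);
  }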

While the patch is more involved than I would have liked, it could have
touched even more places.  The whole broadcast-like insn path feels a
bit hackish due to the various optimizations we employ.  Some of the
complications stem from the fact that we lump together real broadcasts,
vector single-element sets, and strided broadcasts.  The strided-load
alternatives currently require a memory_constraint to work properly,
which causes more complications when trying to disable just these.

In short, the whole pred_broadcast handling in combination with the
sew64_scalar_helper could use work in the future.  I was about to start
on it in this patch but soon realized that it would only distract from
the original intent.  What can help in the future is to split strided
and non-strided broadcasts entirely, as well as the single-element sets.

It is still unclear whether we need to pay special attention to
misaligned strided loads (PR120782).

I regtested on rv32 and rv64 with strided_load_broadcast_p forced to
true and false.  With either setting I didn't observe any new execution
failures, but there are obviously new scan failures with strided
broadcast turned off.

PR target/118734

gcc/ChangeLog:

* config/riscv/constraints.md (Wdm): Use tunable for Wdm
constraint.
* config/riscv/predicates.md (direct_broadcast_operand): Use
renamed function.
* config/riscv/riscv-protos.h (emit_avltype_insn): Declare.
(can_be_broadcasted_p): Rename to...
(can_be_broadcast_p): ...this.
(strided_load_broadcast_p): Declare.
* config/riscv/riscv-selftests.cc (run_broadcast_selftests):
Only run broadcast selftest if strided broadcasts are OK.
* config/riscv/riscv-v.cc (emit_avltype_insn): New function.
(sew64_scalar_helper): Only emit a pred_broadcast if the new
tunable says so.
(can_be_broadcasted_p): Rename to...
(can_be_broadcast_p): ...this and use new tunable.
* config/riscv/riscv.cc (struct riscv_tune_param): Add strided
broadcast tunable.
(strided_load_broadcast_p): Implement.
* config/riscv/vector.md: Use strided_load_broadcast_p () and
work around 64-bit broadcast on rv32 targets.

gcc/config/riscv/constraints.md
gcc/config/riscv/predicates.md
gcc/config/riscv/riscv-protos.h
gcc/config/riscv/riscv-selftests.cc
gcc/config/riscv/riscv-v.cc
gcc/config/riscv/riscv.cc
gcc/config/riscv/vector.md

diff --git a/gcc/config/riscv/constraints.md b/gcc/config/riscv/constraints.md
index ccab1a2e29dfa0f7ba3bbcd24d0d81b7ebac19db..5ecaa19eb0140cbba9aa25ac8b092797e5e9b09c 100644
  (and (match_code "const_vector")
       (match_test "rtx_equal_p (op, riscv_vector::gen_scalar_move_mask (GET_MODE (op)))")))
 
-(define_memory_constraint "Wdm"
+(define_constraint "Wdm"
   "Vector duplicate memory operand"
-  (and (match_code "mem")
-       (match_code "reg" "0")))
+  (and (match_test "strided_load_broadcast_p ()")
+       (and (match_code "mem")
+           (match_code "reg" "0"))))
 
 ;; Vendor ISA extension constraints.
 
diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md
index 8baad2fae7a9c128a5b87a84d97a25d061c9ee37..1f9a6b562e531fb41697160e6cc3c5b152897cad 100644
 
 ;; The scalar operand can be directly broadcast by RVV instructions.
 (define_predicate "direct_broadcast_operand"
-  (match_test "riscv_vector::can_be_broadcasted_p (op)"))
+  (match_test "riscv_vector::can_be_broadcast_p (op)"))
 
 ;; A CONST_INT operand that has exactly two bits cleared.
 (define_predicate "const_nottwobits_operand"
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 38f63ea84248a9ae9f461a7483b12691424de5f5..a41c4c299fac33e0fea43d62610822f9bed6f59d 100644
@@ -604,6 +604,7 @@ void emit_vlmax_vsetvl (machine_mode, rtx);
 void emit_hard_vlmax_vsetvl (machine_mode, rtx);
 void emit_vlmax_insn (unsigned, unsigned, rtx *);
 void emit_nonvlmax_insn (unsigned, unsigned, rtx *, rtx);
+void emit_avltype_insn (unsigned, unsigned, rtx *, avl_type, rtx = nullptr);
 void emit_vlmax_insn_lra (unsigned, unsigned, rtx *, rtx);
 enum vlmul_type get_vlmul (machine_mode);
 rtx get_vlmax_rtx (machine_mode);
@@ -760,7 +761,7 @@ uint8_t get_sew (rtx_insn *);
 enum vlmul_type get_vlmul (rtx_insn *);
 int count_regno_occurrences (rtx_insn *, unsigned int);
 bool imm_avl_p (machine_mode);
-bool can_be_broadcasted_p (rtx);
+bool can_be_broadcast_p (rtx);
 bool gather_scatter_valid_offset_p (machine_mode);
 HOST_WIDE_INT estimated_poly_value (poly_int64, unsigned int);
 bool whole_reg_to_reg_move_p (rtx *, machine_mode, int);
@@ -813,6 +814,7 @@ extern const char *th_output_move (rtx, rtx);
 extern bool th_print_operand_address (FILE *, machine_mode, rtx);
 #endif
 
+extern bool strided_load_broadcast_p (void);
 extern bool riscv_use_divmod_expander (void);
 void riscv_init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
 extern bool
diff --git a/gcc/config/riscv/riscv-selftests.cc b/gcc/config/riscv/riscv-selftests.cc
index 34d01ac76b75799f36d8da4815dcb7a5163e06d7..9ca1ffee394fac83a757183df2d55eeea23b1c16 100644
@@ -342,9 +342,13 @@ run_broadcast_selftests (void)
          expand_vector_broadcast (mode, mem);                                 \
          insn = get_last_insn ();                                             \
          src = SET_SRC (PATTERN (insn));                                      \
-         ASSERT_TRUE (MEM_P (XEXP (src, 0)));                                 \
-         ASSERT_TRUE (                                                        \
-           rtx_equal_p (src, gen_rtx_VEC_DUPLICATE (mode, XEXP (src, 0))));   \
+         if (strided_load_broadcast_p ())                                     \
+           {                                                                  \
+             ASSERT_TRUE (MEM_P (XEXP (src, 0)));                             \
+             ASSERT_TRUE (                                                    \
+               rtx_equal_p (src,                                              \
+                            gen_rtx_VEC_DUPLICATE (mode, XEXP (src, 0))));    \
+           }                                                                  \
          end_sequence ();                                                     \
          /* Test vmv.v.x or vfmv.v.f.  */                                     \
          start_sequence ();                                                   \
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 2efe56afc9eb19b74970de4e0c81e116e4ca361c..242ac087764ffd05df69fbc88a8c7a6e48434aaa 100644
@@ -437,6 +437,26 @@ emit_nonvlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
   e.emit_insn ((enum insn_code) icode, ops);
 }
 
+/* Emit either a VLMAX insn or a non-VLMAX insn depending on TYPE.  For a
+   non-VLMAX insn, the length must be specified in VL.  */
+
+void
+emit_avltype_insn (unsigned icode, unsigned insn_flags, rtx *ops,
+                  avl_type type, rtx vl)
+{
+  if (type != avl_type::VLMAX && vl != NULL_RTX)
+    {
+      insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
+      e.set_vl (vl);
+      e.emit_insn ((enum insn_code) icode, ops);
+    }
+  else
+    {
+      insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
+      e.emit_insn ((enum insn_code) icode, ops);
+    }
+}
+
 /* Return true if the vector duplicated by a super element which is the fusion
    of consecutive elements.
 
@@ -2144,21 +2164,40 @@ sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
       return false;
     }
 
+  bool avoid_strided_broadcast = false;
   if (CONST_INT_P (*scalar_op))
     {
       if (maybe_gt (GET_MODE_SIZE (scalar_mode), GET_MODE_SIZE (Pmode)))
-       *scalar_op = force_const_mem (scalar_mode, *scalar_op);
+       {
+         if (strided_load_broadcast_p ())
+           *scalar_op = force_const_mem (scalar_mode, *scalar_op);
+         else
+           avoid_strided_broadcast = true;
+       }
       else
        *scalar_op = force_reg (scalar_mode, *scalar_op);
     }
 
   rtx tmp = gen_reg_rtx (vector_mode);
-  rtx ops[] = {tmp, *scalar_op};
-  if (type == VLMAX)
-    emit_vlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops);
+  if (!avoid_strided_broadcast)
+    {
+      rtx ops[] = {tmp, *scalar_op};
+      emit_avltype_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops,
+                        type, vl);
+    }
   else
-    emit_nonvlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops,
-                       vl);
+    {
+      /* Load scalar as V1DI and broadcast via vrgather.vi.  */
+      rtx tmp1 = gen_reg_rtx (V1DImode);
+      emit_move_insn (tmp1, lowpart_subreg (V1DImode, *scalar_op,
+                                           scalar_mode));
+      tmp1 = lowpart_subreg (vector_mode, tmp1, V1DImode);
+
+      rtx ops[] = {tmp, tmp1, CONST0_RTX (Pmode)};
+      emit_vlmax_insn (code_for_pred_gather_scalar (vector_mode),
+                      BINARY_OP, ops);
+    }
+
   emit_vector_func (operands, tmp);
 
   return true;
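
In intrinsics terms the fallback path added above corresponds roughly
to the sketch below; in the patch itself the scalar reaches element 0
via a V1DImode subreg move rather than the one-element load used here
for illustration.

  #include <riscv_vector.h>
  #include <stdint.h>

  /* Put the scalar into element 0, then splat it with vrgather index 0.  */
  vuint64m1_t
  broadcast_gather (const uint64_t *p, size_t vl)
  {
    vuint64m1_t v = __riscv_vle64_v_u64m1 (p, 1);   /* one element */
    return __riscv_vrgather_vx_u64m1 (v, 0, vl);    /* splat element 0 */
  }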
@@ -5769,9 +5808,9 @@ count_regno_occurrences (rtx_insn *rinsn, unsigned int regno)
   return count;
 }
 
-/* Return true if the OP can be directly broadcasted.  */
+/* Return true if the OP can be directly broadcast.  */
 bool
-can_be_broadcasted_p (rtx op)
+can_be_broadcast_p (rtx op)
 {
   machine_mode mode = GET_MODE (op);
   /* We don't allow RA (register allocation) reload generate
@@ -5783,7 +5822,8 @@ can_be_broadcasted_p (rtx op)
     return false;
 
   if (satisfies_constraint_K (op) || register_operand (op, mode)
-      || satisfies_constraint_Wdm (op) || rtx_equal_p (op, CONST0_RTX (mode)))
+      || (strided_load_broadcast_p () && satisfies_constraint_Wdm (op))
+      || rtx_equal_p (op, CONST0_RTX (mode)))
     return true;
 
   return can_create_pseudo_p () && nonmemory_operand (op, mode);
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 023adc3284dfc2a394ecda0fd8bb687b6427d5af..a4428f0e96d2991158b56e6d27341a244bf371d6 100644
@@ -306,6 +306,7 @@ struct riscv_tune_param
   bool vector_unaligned_access;
   bool use_divmod_expansion;
   bool overlap_op_by_pieces;
+  bool use_zero_stride_load;
   bool speculative_sched_vsetvl;
   unsigned int fusible_ops;
   const struct cpu_vector_cost *vec_costs;
@@ -469,6 +470,7 @@ static const struct riscv_tune_param generic_tune_info = {
   false,                                       /* vector_unaligned_access */
   false,                                       /* use_divmod_expansion */
   false,                                       /* overlap_op_by_pieces */
+  true,                                                /* use_zero_stride_load */
   false,                                       /* speculative_sched_vsetvl */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
   NULL,                                                /* vector cost */
@@ -492,6 +494,7 @@ static const struct riscv_tune_param rocket_tune_info = {
   false,                                       /* vector_unaligned_access */
   false,                                       /* use_divmod_expansion */
   false,                                       /* overlap_op_by_pieces */
+  true,                                                /* use_zero_stride_load */
   false,                                       /* speculative_sched_vsetvl */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
   NULL,                                                /* vector cost */
@@ -515,6 +518,7 @@ static const struct riscv_tune_param sifive_7_tune_info = {
   false,                                       /* vector_unaligned_access */
   false,                                       /* use_divmod_expansion */
   false,                                       /* overlap_op_by_pieces */
+  true,                                                /* use_zero_stride_load */
   false,                                       /* speculative_sched_vsetvl */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
   NULL,                                                /* vector cost */
@@ -538,6 +542,7 @@ static const struct riscv_tune_param sifive_p400_tune_info = {
   false,                                       /* vector_unaligned_access */
   false,                                       /* use_divmod_expansion */
   false,                                       /* overlap_op_by_pieces */
+  true,                                                /* use_zero_stride_load */
   false,                                       /* speculative_sched_vsetvl */
   RISCV_FUSE_LUI_ADDI | RISCV_FUSE_AUIPC_ADDI,  /* fusible_ops */
   &generic_vector_cost,                                /* vector cost */
@@ -561,6 +566,7 @@ static const struct riscv_tune_param sifive_p600_tune_info = {
   false,                                       /* vector_unaligned_access */
   false,                                       /* use_divmod_expansion */
   false,                                       /* overlap_op_by_pieces */
+  true,                                                /* use_zero_stride_load */
   false,                                       /* speculative_sched_vsetvl */
   RISCV_FUSE_LUI_ADDI | RISCV_FUSE_AUIPC_ADDI,  /* fusible_ops */
   &generic_vector_cost,                                /* vector cost */
@@ -584,6 +590,7 @@ static const struct riscv_tune_param thead_c906_tune_info = {
   false,                                       /* vector_unaligned_access */
   false,       /* use_divmod_expansion */
   false,                                       /* overlap_op_by_pieces */
+  true,                                                /* use_zero_stride_load */
   false,                                       /* speculative_sched_vsetvl */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
   NULL,                                                /* vector cost */
@@ -607,6 +614,7 @@ static const struct riscv_tune_param xiangshan_nanhu_tune_info = {
   false,                                       /* vector_unaligned_access */
   false,                                       /* use_divmod_expansion */
   false,                                       /* overlap_op_by_pieces */
+  true,                                                /* use_zero_stride_load */
   false,                                       /* speculative_sched_vsetvl */
   RISCV_FUSE_ZEXTW | RISCV_FUSE_ZEXTH,          /* fusible_ops */
   NULL,                                                /* vector cost */
@@ -630,6 +638,7 @@ static const struct riscv_tune_param generic_ooo_tune_info = {
   true,                                                /* vector_unaligned_access */
   false,                                       /* use_divmod_expansion */
   true,                                                /* overlap_op_by_pieces */
+  true,                                                /* use_zero_stride_load */
   false,                                       /* speculative_sched_vsetvl */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
   &generic_vector_cost,                                /* vector cost */
@@ -653,6 +662,7 @@ static const struct riscv_tune_param tt_ascalon_d8_tune_info = {
   true,                                                /* vector_unaligned_access */
   true,                                                /* use_divmod_expansion */
   true,                                                /* overlap_op_by_pieces */
+  true,                                                /* use_zero_stride_load */
   false,                                       /* speculative_sched_vsetvl */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
   &generic_vector_cost,                                /* vector cost */
@@ -676,6 +686,7 @@ static const struct riscv_tune_param optimize_size_tune_info = {
   false,                                       /* vector_unaligned_access */
   false,                                       /* use_divmod_expansion */
   false,                                       /* overlap_op_by_pieces */
+  true,                                                /* use_zero_stride_load */
   false,                                       /* speculative_sched_vsetvl */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
   NULL,                                                /* vector cost */
@@ -699,6 +710,7 @@ static const struct riscv_tune_param mips_p8700_tune_info = {
   false,        /* vector_unaligned_access */
   true,         /* use_divmod_expansion */
   false,        /* overlap_op_by_pieces */
+  true,                                                /* use_zero_stride_load */
   false,                                       /* speculative_sched_vsetvl */
   RISCV_FUSE_NOTHING,                          /* fusible_ops */
   NULL,         /* vector cost */
@@ -12765,6 +12777,14 @@ riscv_lshift_subword (machine_mode mode ATTRIBUTE_UNUSED, rtx value, rtx shift,
                                                  gen_lowpart (QImode, shift)));
 }
 
+/* Return TRUE if we should use a zero-stride load broadcast, FALSE
+   otherwise.  */
+
+bool
+strided_load_broadcast_p ()
+{
+  return tune_param->use_zero_stride_load;
+}
+
 /* Return TRUE if we should use the divmod expander, FALSE otherwise.  This
    allows the behavior to be tuned for specific implementations as well as
    when optimizing for size.  */
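
With the implementation above, a uarch whose zero-stride loads are slow
(e.g. microcoded) can opt out by flipping the new field in its
riscv_tune_param initializer.  A sketch with a made-up tuning name, all
other fields elided:

  static const struct riscv_tune_param example_core_tune_info = {
    /* ... other fields as in the tables above ...  */
    false,                                       /* use_zero_stride_load */
    /* ...  */
  };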
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index 2046f04a02c5fc78300b5159e668ffaf1a6cb9af..c5b23b37f7dc89d72958009a45143658ab0b8d60 100644
   "&& 1"
   [(const_int 0)]
   {
-    riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode),
-                                   riscv_vector::UNARY_OP, operands);
+    if (!strided_load_broadcast_p ()
+       && TARGET_ZVFHMIN && !TARGET_ZVFH && <VEL>mode == HFmode)
+      {
+       /* For Float16, reinterpret as HImode, broadcast and reinterpret
+          back.  */
+       poly_uint64 nunits = GET_MODE_NUNITS (<MODE>mode);
+       machine_mode vmodehi
+         = riscv_vector::get_vector_mode (HImode, nunits).require ();
+       rtx ops[] = {lowpart_subreg (vmodehi, operands[0], <MODE>mode),
+                    lowpart_subreg (HImode, operands[1], HFmode)};
+       riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (vmodehi),
+                                      riscv_vector::UNARY_OP, ops);
+      }
+    else
+      riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode),
+                                    riscv_vector::UNARY_OP, operands);
     DONE;
   }
   [(set_attr "type" "vector")]
        }
     }
   else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)
-           && (immediate_operand (operands[3], Pmode)
+          && (immediate_operand (operands[3], Pmode)
               || (CONST_POLY_INT_P (operands[3])
                   && known_ge (rtx_to_poly_int64 (operands[3]), 0U)
                   && known_le (rtx_to_poly_int64 (operands[3]), GET_MODE_SIZE (<MODE>mode)))))
   "(register_operand (operands[3], <VEL>mode)
   || CONST_POLY_INT_P (operands[3]))
   && GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)"
-  [(set (match_dup 0)
-       (if_then_else:V_VLSI (unspec:<VM> [(match_dup 1) (match_dup 4)
-            (match_dup 5) (match_dup 6) (match_dup 7)
-            (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
-         (vec_duplicate:V_VLSI (match_dup 3))
-         (match_dup 2)))]
+  [(const_int 0)]
   {
     gcc_assert (can_create_pseudo_p ());
     if (CONST_POLY_INT_P (operands[3]))
        emit_move_insn (tmp, operands[3]);
        operands[3] = tmp;
       }
-    rtx m = assign_stack_local (<VEL>mode, GET_MODE_SIZE (<VEL>mode),
-                               GET_MODE_ALIGNMENT (<VEL>mode));
-    m = validize_mem (m);
-    emit_move_insn (m, operands[3]);
-    m = gen_rtx_MEM (<VEL>mode, force_reg (Pmode, XEXP (m, 0)));
-    operands[3] = m;
 
     /* For SEW = 64 in RV32 system, we expand vmv.s.x:
        andi a2,a2,1
        operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]);
        operands[1] = CONSTM1_RTX (<VM>mode);
       }
+
+    /* If the target doesn't want a strided-load broadcast, fall back to a
+       regular V1DImode load and a broadcast gather.  */
+    if (strided_load_broadcast_p ())
+      {
+       rtx mem = assign_stack_local (<VEL>mode, GET_MODE_SIZE (<VEL>mode),
+                                     GET_MODE_ALIGNMENT (<VEL>mode));
+       mem = validize_mem (mem);
+       emit_move_insn (mem, operands[3]);
+       mem = gen_rtx_MEM (<VEL>mode, force_reg (Pmode, XEXP (mem, 0)));
+
+       emit_insn
+         (gen_pred_broadcast<mode>
+          (operands[0], operands[1], operands[2], mem,
+           operands[4], operands[5], operands[6], operands[7]));
+      }
+    else
+      {
+       rtx tmp = gen_reg_rtx (V1DImode);
+       emit_move_insn (tmp, lowpart_subreg (V1DImode, operands[3],
+                                            <VEL>mode));
+       tmp = lowpart_subreg (<MODE>mode, tmp, V1DImode);
+
+       emit_insn
+         (gen_pred_gather<mode>_scalar
+          (operands[0], operands[1], operands[2], tmp, CONST0_RTX (Pmode),
+           operands[4], operands[5], operands[6], operands[7]));
+      }
+    DONE;
   }
   [(set_attr "type" "vimov,vimov,vlds,vlds,vlds,vlds,vimovxv,vimovxv")
    (set_attr "mode" "<MODE>")])
             (reg:SI VL_REGNUM)
             (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
          (vec_duplicate:V_VLSF_ZVFHMIN
-           (match_operand:<VEL>        3 "direct_broadcast_operand"      "Wdm, Wdm, Wdm, Wdm"))
+           (match_operand:<VEL>        3 "direct_broadcast_operand"      "  A,   A,   A,   A"))
          (match_operand:V_VLSF_ZVFHMIN 2 "vector_merge_operand"          " vu,   0,  vu,   0")))]
-  "TARGET_VECTOR"
+  "TARGET_VECTOR && strided_load_broadcast_p ()"
   "@
    vlse<sew>.v\t%0,%3,zero,%1.t
    vlse<sew>.v\t%0,%3,zero,%1.t