RISC-V: Rework broadcast handling [PR121073].

author Robin Dapp <rdapp@ventanamicro.com>

Thu, 17 Jul 2025 09:09:43 +0000 (11:09 +0200)

committer Robin Dapp <rdapp@ventanamicro.com>

Wed, 23 Jul 2025 15:36:26 +0000 (17:36 +0200)
author Robin Dapp <rdapp@ventanamicro.com>
Thu, 17 Jul 2025 09:09:43 +0000 (11:09 +0200)
committer Robin Dapp <rdapp@ventanamicro.com>
Wed, 23 Jul 2025 15:36:26 +0000 (17:36 +0200)
diff --git a/gcc/config/riscv/autovec-opt.md b/gcc/config/riscv/autovec-opt.md

index d884942279130ecda5b4199b9883192727e79386..6dd4b77be00685ee92c8ebc56abe08061994865f 100644 (file)
--- a/gcc/config/riscv/autovec-opt.md
+++ b/gcc/config/riscv/autovec-opt.md
@@ -1900,8 +1900,7 @@
      emit_insn (gen_extend<vsubel><vel>2(tmp, operands[1]));
  
      rtx ops[] = {operands[0], tmp};
-    riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode),
-                                   riscv_vector::UNARY_OP, ops);
+    riscv_vector::expand_broadcast (<MODE>mode, ops);
      DONE;
    }
    [(set_attr "type" "vfwmuladd")]
diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md

index 1fff8ac2fc462db6f5ffa16bf83b998cf6da1687..48de5efdde52f98db7e1c18c772bae4fa19a4744 100644 (file)
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -1359,9 +1359,7 @@
    if (operands[2] == const0_rtx)
      {
        rtx ops[] = {operands[0], operands[0], operands[1]};
-      riscv_vector::emit_nonvlmax_insn (code_for_pred_broadcast (<MODE>mode),
-                                       riscv_vector::SCALAR_MOVE_MERGED_OP_TU,
-                                       ops, CONST1_RTX (Pmode));
+      riscv_vector::expand_set_first_tu (<MODE>mode, ops);
      }
    else
      {
@@ -1385,8 +1383,7 @@
          VL we need for the slide.  */
        rtx tmp = gen_reg_rtx (<MODE>mode);
        rtx ops1[] = {tmp, operands[1]};
-      emit_nonvlmax_insn (code_for_pred_broadcast (<MODE>mode),
-                           riscv_vector::UNARY_OP, ops1, length);
+      riscv_vector::expand_broadcast (<MODE>mode, ops1, length);
  
        /* Slide exactly one element up leaving the tail elements
          unchanged.  */
diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md

index 1f9a6b562e531fb41697160e6cc3c5b152897cad..381f96c3e72526578b03a6bf71f69971dd50c383 100644 (file)
--- a/gcc/config/riscv/predicates.md
+++ b/gcc/config/riscv/predicates.md
@@ -517,6 +517,10 @@
         (match_operand 0 "vector_all_trues_mask_operand")))
  
  (define_predicate "vector_broadcast_mask_operand"
+  (ior (match_operand 0 "vector_least_significant_set_mask_operand")
+       (match_operand 0 "vector_all_trues_mask_operand")))
+
+(define_predicate "strided_broadcast_mask_operand"
    (ior (match_operand 0 "vector_least_significant_set_mask_operand")
      (ior (match_operand 0 "register_operand")
           (match_operand 0 "vector_all_trues_mask_operand"))))
@@ -619,6 +623,15 @@
  (define_predicate "direct_broadcast_operand"
    (match_test "riscv_vector::can_be_broadcast_p (op)"))
  
+;; A strided broadcast is just a fallback pattern that loads from
+;; memory.
+(define_predicate "strided_broadcast_operand"
+  (match_test "riscv_vector::strided_broadcast_p (op)"))
+
+(define_predicate "any_broadcast_operand"
+  (ior (match_operand 0 "direct_broadcast_operand")
+       (match_operand 0 "strided_broadcast_operand")))
+
  ;; A CONST_INT operand that has exactly two bits cleared.
  (define_predicate "const_nottwobits_operand"
    (and (match_code "const_int")
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h

index a41c4c299fac33e0fea43d62610822f9bed6f59d..0379f2ce25678ea9d297c866b07683168c84de0a 100644 (file)
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -695,6 +695,9 @@ bool expand_block_move (rtx, rtx, rtx, bool);
  machine_mode preferred_simd_mode (scalar_mode);
  machine_mode get_mask_mode (machine_mode);
  void expand_vec_series (rtx, rtx, rtx, rtx = 0);
+void expand_broadcast (machine_mode, rtx *, rtx = 0);
+void expand_set_first (machine_mode, rtx *, rtx = 0);
+void expand_set_first_tu (machine_mode, rtx *, rtx = 0);
  void expand_vec_init (rtx, rtx);
  void expand_vec_perm (rtx, rtx, rtx, rtx);
  void expand_select_vl (rtx *);
@@ -762,6 +765,7 @@ enum vlmul_type get_vlmul (rtx_insn *);
  int count_regno_occurrences (rtx_insn *, unsigned int);
  bool imm_avl_p (machine_mode);
  bool can_be_broadcast_p (rtx);
+bool strided_broadcast_p (rtx);
  bool gather_scatter_valid_offset_p (machine_mode);
  HOST_WIDE_INT estimated_poly_value (poly_int64, unsigned int);
  bool whole_reg_to_reg_move_p (rtx *, machine_mode, int);
diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc

index 90801899ec8e3cabc4ab4670ec1bab8e336c8024..61c4a095ab4bb6809b14266fb656c25799d4bfbd 100644 (file)
--- a/gcc/config/riscv/riscv-string.cc
+++ b/gcc/config/riscv/riscv-string.cc
@@ -1625,16 +1625,14 @@ expand_vec_setmem (rtx dst_in, rtx length_in, rtx fill_value_in)
       Otherwise, use a predicated store.  */
    if (known_eq (GET_MODE_SIZE (info.vmode), INTVAL (info.avl)))
      {
-      emit_vlmax_insn (code_for_pred_broadcast (info.vmode), UNARY_OP,
-                      broadcast_ops);
+      riscv_vector::expand_broadcast (info.vmode, broadcast_ops);
        emit_move_insn (dst, fill_value);
      }
    else
      {
        if (!satisfies_constraint_vl (info.avl))
         info.avl = force_reg (Pmode, info.avl);
-      emit_nonvlmax_insn (code_for_pred_broadcast (info.vmode),
-                         riscv_vector::UNARY_OP, broadcast_ops, info.avl);
+      riscv_vector::expand_broadcast (info.vmode, broadcast_ops, info.avl);
        machine_mode mask_mode
         = riscv_vector::get_vector_mode (BImode, GET_MODE_NUNITS (info.vmode))
           .require ();
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc

index 242ac087764ffd05df69fbc88a8c7a6e48434aaa..9f6ae79326e4cab22d2fbc94fb27d60782651e92 100644 (file)
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -1190,6 +1190,59 @@ expand_vector_init_trailing_same_elem (rtx target,
    return false;
  }
  
+/* Helper function to emit a vmv.v.x/vmv.v.i and float variants.
+   If VL is not given a VLMAX insn will be emitted, otherwise
+   a non-VLMAX insn with length VL.
+   If the value to be broadcast is not suitable for vmv.vx
+   fall back to a vlse with zero stride.  This itself has a
+   fallback if the uarch prefers not to use a strided load
+   for broadcast.  */
+
+void
+expand_broadcast (machine_mode mode, rtx *ops, rtx vl)
+{
+  rtx elt = ops[1];
+  avl_type type = vl ? NONVLMAX : VLMAX;
+  if (can_be_broadcast_p (elt))
+    emit_avltype_insn (code_for_pred_broadcast (mode), UNARY_OP, ops,
+                      type, vl);
+  else
+    emit_avltype_insn (code_for_pred_strided_broadcast (mode),
+                      UNARY_OP, ops, type, vl);
+}
+
+/* Similar to expand_broadcast but emits a vmv.s.x/vfmv.s.f instead.  */
+
+void
+expand_set_first (machine_mode mode, rtx *ops, rtx vl)
+{
+  rtx elt = ops[1];
+  avl_type type = vl ? NONVLMAX : VLMAX;
+  if (can_be_broadcast_p (elt))
+    emit_avltype_insn (code_for_pred_broadcast (mode),
+                       SCALAR_MOVE_OP, ops, type, vl);
+  else
+    emit_avltype_insn (code_for_pred_strided_broadcast (mode),
+                       SCALAR_MOVE_OP, ops, type, vl);
+}
+
+/* Similar to expand_set_first but keeping the tail elements
+   unchanged (TU).  */
+
+void
+expand_set_first_tu (machine_mode mode, rtx *ops, rtx vl)
+{
+  rtx elt = ops[2];
+  if (!vl)
+    vl = const1_rtx;
+  if (can_be_broadcast_p (elt))
+    emit_nonvlmax_insn (code_for_pred_broadcast (mode),
+                       SCALAR_MOVE_MERGED_OP_TU, ops, vl);
+  else
+    emit_nonvlmax_insn (code_for_pred_strided_broadcast (mode),
+                       SCALAR_MOVE_MERGED_OP_TU, ops, vl);
+}
+
  static void
  expand_const_vec_duplicate (rtx target, rtx src, rtx elt)
  {
@@ -1226,7 +1279,7 @@ expand_const_vec_duplicate (rtx target, rtx src, rtx elt)
        if (lra_in_progress)
         {
           rtx ops[] = {result, elt};
-         emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops);
+         expand_broadcast (mode, ops);
         }
        else
         {
@@ -1278,8 +1331,7 @@ expand_const_vector_duplicate_repeating (rtx target, rvv_builder *builder)
      {
        dup = gen_reg_rtx (builder->new_mode ());
        rtx ops[] = {dup, ele};
-      emit_vlmax_insn (code_for_pred_broadcast (builder->new_mode ()),
-                      UNARY_OP, ops);
+      expand_broadcast (builder->new_mode (), ops);
      }
    else
      dup = expand_vector_broadcast (builder->new_mode (), ele);
@@ -1322,8 +1374,7 @@ expand_const_vector_duplicate_default (rtx target, rvv_builder *builder)
  
    rtx tmp1 = gen_reg_rtx (builder->mode ());
    rtx dup_ops[] = {tmp1, builder->elt (0)};
-  emit_vlmax_insn (code_for_pred_broadcast (builder->mode ()), UNARY_OP,
-                  dup_ops);
+  expand_broadcast (builder->mode (), dup_ops);
  
    for (unsigned int i = 1; i < builder->npatterns (); i++)
      {
@@ -2136,18 +2187,32 @@ has_vi_variant_p (rtx_code code, rtx x)
      }
  }
  
+/* This is a helper for binary ops with DImode scalar operands that are
+   broadcast (like vadd.vx v1, a1).
+   Instead of having similar code for all the expanders this function
+   unifies the handling.  For 64-bit targets all we do is choose
+   between the vi variant (if available) and the register variant.
+   For 32-bit targets we either create the sign-extending variant
+   of vop.vx (when the immediate fits 32 bits) or emit a vector
+   broadcast of the 64-bit register/immediate and switch to a
+   vop.vv (replacing the scalar op with the broadcast vector).  */
+
  bool
  sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
                      machine_mode vector_mode, bool has_vi_variant_p,
                      void (*emit_vector_func) (rtx *, rtx), enum avl_type type)
  {
    machine_mode scalar_mode = GET_MODE_INNER (vector_mode);
+
+  /* If the scalar broadcast op fits an immediate, use the
+     vop.vi variant if there is one.  */
    if (has_vi_variant_p)
      {
        *scalar_op = force_reg (scalar_mode, *scalar_op);
        return false;
      }
  
+  /* On a 64-bit target we can always use the vop.vx variant.  */
    if (TARGET_64BIT)
      {
        if (!rtx_equal_p (*scalar_op, const0_rtx))
@@ -2155,6 +2220,8 @@ sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
        return false;
      }
  
+  /* On a 32-bit target, for a 32-bit immediate without a vop.vi variant
+     we need to use the sign-extending (SI -> DI) vop.vx variants.  */
    if (immediate_operand (*scalar_op, Pmode))
      {
        if (!rtx_equal_p (*scalar_op, const0_rtx))
@@ -2164,40 +2231,29 @@ sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
        return false;
      }
  
-  bool avoid_strided_broadcast = false;
+  /* Now we're left with a 64-bit immediate or a register.
+     We cannot use a vop.vx variant but must broadcast the value first
+     and switch to a vop.vv variant.
+     Broadcast can either be done via vlse64.v v1, reg, zero
+     or by loading one 64-bit element (vle64.v) and using a
+     broadcast vrgather.vi.  This is decided when splitting
+     the strided broadcast insn.  */
+  gcc_assert (!TARGET_64BIT
+             && (CONST_INT_P (*scalar_op)
+                 || register_operand (*scalar_op, scalar_mode)));
+
    if (CONST_INT_P (*scalar_op))
      {
        if (maybe_gt (GET_MODE_SIZE (scalar_mode), GET_MODE_SIZE (Pmode)))
-       {
-         if (strided_load_broadcast_p ())
-           *scalar_op = force_const_mem (scalar_mode, *scalar_op);
-         else
-           avoid_strided_broadcast = true;
-       }
+       *scalar_op = force_const_mem (scalar_mode, *scalar_op);
        else
         *scalar_op = force_reg (scalar_mode, *scalar_op);
      }
  
    rtx tmp = gen_reg_rtx (vector_mode);
-  if (!avoid_strided_broadcast)
-    {
-      rtx ops[] = {tmp, *scalar_op};
-      emit_avltype_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops,
-                        type, vl);
-    }
-  else
-    {
-      /* Load scalar as V1DI and broadcast via vrgather.vi.  */
-      rtx tmp1 = gen_reg_rtx (V1DImode);
-      emit_move_insn (tmp1, lowpart_subreg (V1DImode, *scalar_op,
-                                           scalar_mode));
-      tmp1 = lowpart_subreg (vector_mode, tmp1, V1DImode);
-
-      rtx ops[] = {tmp, tmp1, CONST0_RTX (Pmode)};
-      emit_vlmax_insn (code_for_pred_gather_scalar (vector_mode),
-                      BINARY_OP, ops);
-    }
-
+  rtx ops[] = {tmp, *scalar_op};
+  emit_avltype_insn (code_for_pred_strided_broadcast (vector_mode),
+                    UNARY_OP, ops, type, vl);
    emit_vector_func (operands, tmp);
  
    return true;
@@ -2591,8 +2647,7 @@ expand_vector_init_merge_repeating_sequence (rtx target,
  
    /* Step 1: Broadcast the first pattern.  */
    rtx ops[] = {target, force_reg (builder.inner_mode (), builder.elt (0))};
-  emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()),
-                   UNARY_OP, ops);
+  expand_broadcast (builder.mode (), ops);
    /* Step 2: Merge the rest iteration of pattern.  */
    for (unsigned int i = 1; i < builder.npatterns (); i++)
      {
@@ -2605,8 +2660,7 @@ expand_vector_init_merge_repeating_sequence (rtx target,
        if (full_nelts <= builder.inner_bits_size ()) /* vmv.s.x.  */
         {
           rtx ops[] = {dup, merge_mask};
-         emit_nonvlmax_insn (code_for_pred_broadcast (GET_MODE (dup)),
-                              SCALAR_MOVE_OP, ops, CONST1_RTX (Pmode));
+         expand_set_first (GET_MODE (dup), ops);
         }
        else /* vmv.v.x.  */
         {
@@ -2614,8 +2668,7 @@ expand_vector_init_merge_repeating_sequence (rtx target,
                        force_reg (GET_MODE_INNER (mask_int_mode), merge_mask)};
           rtx vl = gen_int_mode (CEIL (full_nelts, builder.inner_bits_size ()),
                                  Pmode);
-         emit_nonvlmax_insn (code_for_pred_broadcast (mask_int_mode), UNARY_OP,
-                              ops, vl);
+         expand_broadcast (mask_int_mode, ops, vl);
         }
  
        emit_move_insn (mask, gen_lowpart (mask_bit_mode, dup));
@@ -4706,20 +4759,20 @@ expand_reduction (unsigned unspec, unsigned unspec_for_vl0_safe,
  
    rtx m1_tmp = gen_reg_rtx (m1_mode);
    rtx scalar_move_ops[] = {m1_tmp, init};
-  insn_code icode = code_for_pred_broadcast (m1_mode);
    if (need_mask_operand_p (insn_flags))
      {
        if (need_vl0_safe)
-       emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, const1_rtx);
+       expand_set_first (m1_mode, scalar_move_ops, const1_rtx);
        else
-       emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, vl_op);
+       expand_set_first (m1_mode, scalar_move_ops, vl_op);
      }
    else
-    emit_vlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops);
+    expand_set_first (m1_mode, scalar_move_ops);
  
    rtx m1_tmp2 = gen_reg_rtx (m1_mode);
    rtx reduc_ops[] = {m1_tmp2, vector_src, m1_tmp};
  
+  insn_code icode;
    if (need_vl0_safe)
      icode = code_for_pred (unspec_for_vl0_safe, vmode);
    else
@@ -5808,25 +5861,84 @@ count_regno_occurrences (rtx_insn *rinsn, unsigned int regno)
    return count;
  }
  
-/* Return true if the OP can be directly broadcast.  */
+/* Return true if the OP can be broadcast with a
+   v[f]mv.v.[xif] instruction.  */
+
  bool
  can_be_broadcast_p (rtx op)
  {
    machine_mode mode = GET_MODE (op);
-  /* We don't allow RA (register allocation) reload generate
-    (vec_duplicate:DI reg) in RV32 system wheras we allow
-    (vec_duplicate:DI mem) in RV32 system.  */
-  if (!can_create_pseudo_p () && !FLOAT_MODE_P (mode)
-      && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (Pmode))
-      && !satisfies_constraint_Wdm (op))
+
+  /* Zero always works and we can always put an immediate into a
+     register.
+     What's tricky is that for an immediate we don't know the
+     register's mode it will end up in, i.e. what element size
+     we want to broadcast.  So even if the immediate is small it might
+     still end up in a DImode register that we cannot broadcast.
+     vmv.s.x, i.e. a single-element set can handle this, though,
+     because it implicitly sign-extends to SEW.  */
+  if (rtx_equal_p (op, CONST0_RTX (mode))
+      || const_int_operand (op, Xmode))
+    return true;
+
+  /* Do not accept DImode broadcasts on !TARGET_64BIT.  Those
+     are handled by strided broadcast.  */
+  if (INTEGRAL_MODE_P (mode)
+      && maybe_gt (GET_MODE_SIZE (mode), UNITS_PER_WORD))
+    return false;
+
+  /* Non-register operands that can be forced into a register we can
+     handle.  These don't need to use strided broadcast. */
+  if (INTEGRAL_MODE_P (mode)
+      && (memory_operand (op, mode) || CONST_POLY_INT_P (op))
+      && can_create_pseudo_p ())
+    return true;
+
+  /* Likewise, do not accept HFmode broadcast if we don't have
+     vfmv.v.f for 16-bit registers available.  */
+  if (mode == HFmode && !TARGET_ZVFH)
+    return false;
+
+  /* Same for float, just that we can always handle 64-bit doubles
+     even on !TARGET_64BIT.  We have ruled out 16-bit HF already
+     above.  */
+  if (FLOAT_MODE_P (mode)
+      && (memory_operand (op, mode) || CONSTANT_P (op))
+      && can_create_pseudo_p ())
+    return true;
+
+  /* After excluding all the cases we cannot handle the register types
+     that remain can always be broadcast.  */
+  if (register_operand (op, mode))
+    return true;
+
+  return false;
+}
+
+/* Returns true for all operands that cannot use vmv.v.x, vfmv.v.f,
+   vmv.s.x, or vfmv.s.f but rather need to go via memory.  */
+
+bool
+strided_broadcast_p (rtx op)
+{
+  machine_mode mode = GET_MODE (op);
+  if (!memory_operand (op, mode)
+      && !register_operand (op, mode)
+      && !rtx_equal_p (op, CONST0_RTX (mode))
+      && !const_int_operand (op, mode))
      return false;
  
-  if (satisfies_constraint_K (op) || register_operand (op, mode)
-      || (strided_load_broadcast_p () && satisfies_constraint_Wdm (op))
-      || rtx_equal_p (op, CONST0_RTX (mode)))
+  /* !TARGET_64BIT does not have a vmv.v.x/vmv.s.x for 64-bit
+     DImode elements.  */
+  if (INTEGRAL_MODE_P (mode)
+      && maybe_gt (GET_MODE_SIZE (mode), UNITS_PER_WORD))
      return true;
  
-  return can_create_pseudo_p () && nonmemory_operand (op, mode);
+  /* Zvfhmin does not have a vfmv.v.f/vfmv.s.f for 16-bit elements.  */
+  if (!TARGET_ZVFH && mode == HFmode)
+    return true;
+
+  return false;
  }
  
  void
@@ -5941,7 +6053,10 @@ whole_reg_to_reg_move_p (rtx *ops, machine_mode mode, int avl_type_index)
    return false;
  }
  
-/* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f.  */
+/* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f.
+   That's the case if we're dealing with a scalar broadcast that
+   has VL = 1.  */
+
  bool
  splat_to_scalar_move_p (rtx *ops)
  {
diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc b/gcc/config/riscv/riscv-vector-builtins-bases.cc

index bf5172c6e041a640125fbfe870c2a420119b8fa4..7e4d396f05a614b78771681276eb25c781ef0c38 100644 (file)
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -643,7 +643,8 @@ public:
         return e.use_exact_insn (code_for_pred_mov (e.vector_mode ()));
        case OP_TYPE_x:
        case OP_TYPE_f:
-       return e.use_exact_insn (code_for_pred_broadcast (e.vector_mode ()));
+       return e.use_scalar_broadcast_insn
+         (code_for_pred_broadcast (e.vector_mode ()));
        default:
         gcc_unreachable ();
        }
diff --git a/gcc/config/riscv/riscv-vector-builtins.cc b/gcc/config/riscv/riscv-vector-builtins.cc

index 8810af0d9ccb89b3ca8b0f6a2a1480638136c99e..0db7549fc5ca688f4bd90435840191a0f842a581 100644 (file)
--- a/gcc/config/riscv/riscv-vector-builtins.cc
+++ b/gcc/config/riscv/riscv-vector-builtins.cc
@@ -4753,7 +4753,10 @@ function_expander::use_ternop_insn (bool vd_accum_p, insn_code icode)
  }
  
  /* Implement the call using instruction ICODE, with a 1:1 mapping between
-   arguments and input operands.  */
+   arguments and input operands.
+   There are operands that cannot be broadcast using v[f]mv.  In that case
+   we switch to a strided broadcast.  */
+
  rtx
  function_expander::use_widen_ternop_insn (insn_code icode)
  {
@@ -4794,7 +4797,10 @@ function_expander::use_widen_ternop_insn (insn_code icode)
  }
  
  /* Implement the call using instruction ICODE, with a 1:1 mapping between
-   arguments and input operands.  */
+   arguments and input operands.
+   There are operands that cannot be broadcast using v[f]mv.  In that case
+   we switch to a strided broadcast.  */
+
  rtx
  function_expander::use_scalar_move_insn (insn_code icode)
  {
@@ -4812,6 +4818,37 @@ function_expander::use_scalar_move_insn (insn_code icode)
    for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++)
      add_input_operand (argno);
  
+  if (!can_be_broadcast_p (m_ops[3].value))
+    icode = code_for_pred_strided_broadcast (vector_mode ());
+
+  add_input_operand (Pmode, get_tail_policy_for_pred (pred));
+  add_input_operand (Pmode, get_mask_policy_for_pred (pred));
+  add_input_operand (Pmode, get_avl_type_rtx (avl_type::NONVLMAX));
+  return generate_insn (icode);
+}
+
+/* Implement the call using instruction ICODE, with a 1:1 mapping between
+   arguments and input operands.  */
+rtx
+function_expander::use_scalar_broadcast_insn (insn_code icode)
+{
+  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
+
+  /* Record the offset to get the argument.  */
+  int arg_offset = 0;
+  add_all_one_mask_operand (mask_mode ());
+
+  if (use_real_merge_p (pred))
+    add_input_operand (arg_offset++);
+  else
+    add_vundef_operand (mode);
+
+  for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++)
+    add_input_operand (argno);
+
+  if (!can_be_broadcast_p (m_ops[3].value))
+    icode = code_for_pred_strided_broadcast (vector_mode ());
+
    add_input_operand (Pmode, get_tail_policy_for_pred (pred));
    add_input_operand (Pmode, get_mask_policy_for_pred (pred));
    add_input_operand (Pmode, get_avl_type_rtx (avl_type::NONVLMAX));
diff --git a/gcc/config/riscv/riscv-vector-builtins.h b/gcc/config/riscv/riscv-vector-builtins.h

index 1f2587ab6afa10de36efa4d5bb4e7ad6cc6d67d9..86d81154db934a2c326f9a478d8132b6dbdd6222 100644 (file)
--- a/gcc/config/riscv/riscv-vector-builtins.h
+++ b/gcc/config/riscv/riscv-vector-builtins.h
@@ -497,6 +497,7 @@ public:
    rtx use_ternop_insn (bool, insn_code);
    rtx use_widen_ternop_insn (insn_code);
    rtx use_scalar_move_insn (insn_code);
+  rtx use_scalar_broadcast_insn (insn_code);
    rtx generate_insn (insn_code);
  
    /* The function call expression.  */
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md

index c498166791e2b03dfa51242bbfea1140e26a0e5c..66b76701f5a81833c57a4b7fda80ef75ac3caef6 100644 (file)
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -1551,20 +1551,44 @@
  (define_expand "vec_duplicate<mode>"
    [(set (match_operand:V_VLS 0 "register_operand")
          (vec_duplicate:V_VLS
-          (match_operand:<VEL> 1 "direct_broadcast_operand")))]
+          (match_operand:<VEL> 1 "any_broadcast_operand")))]
    "TARGET_VECTOR"
    {
-    /* Early expand DImode broadcast in RV32 system to avoid RA reload
-       generate (set (reg) (vec_duplicate:DI)).  */
+    /* Don't keep a DImode broadcast for RV32 in the vec_duplicate form.
+       Otherwise combine or late combine could end up doing
+             "64-bit broadcast" (!= vmv.v.x)
+            + vadd.vv
+           = vadd.vx
+       which would be invalid.  */
      bool gt_p = maybe_gt (GET_MODE_SIZE (<VEL>mode), GET_MODE_SIZE (Pmode));
      if (!FLOAT_MODE_P (<VEL>mode) && gt_p)
        {
-        riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode),
-                                      riscv_vector::UNARY_OP, operands);
-       DONE;
+        riscv_vector::emit_vlmax_insn
+         (code_for_pred_strided_broadcast
+           (<MODE>mode), riscv_vector::UNARY_OP, operands);
+       DONE;
        }
-    /* Otherwise, allow it fall into general vec_duplicate pattern
-       which allow us to have vv->vx combine optimization in later pass.  */
+
+    /* Even though we can eventually broadcast any permissible
+       constant by moving it into a register we need to force
+       any non-immediate one into a register here.
+       If we didn't do that we couldn't fwprop/late-combine
+             vec_duplicate 123.45f
+           + vfadd.vv
+           = vfadd.vf
+       because the constant is valid for vec_duplicate but not
+       for vfadd.vf.  Therefore we need to do
+             fa0 = 123.45f
+             vec_duplicate fa0
+           + vfadd.vv
+           = vfadd.vf  */
+    if (!satisfies_constraint_P (operands[1])
+       && !satisfies_constraint_J (operands[1])
+       && !rtx_equal_p (operands[1], CONST0_RTX (<VEL>mode))
+       && !memory_operand (operands[1], <VEL>mode))
+      operands[1] = force_reg (<VEL>mode, operands[1]);
+
+    /* Otherwise keep the vec_duplicate pattern until split.  */
    })
  
  ;; According to GCC internal:
@@ -1574,28 +1598,20 @@
  (define_insn_and_split "*vec_duplicate<mode>"
    [(set (match_operand:V_VLS 0 "register_operand")
          (vec_duplicate:V_VLS
-          (match_operand:<VEL> 1 "direct_broadcast_operand")))]
+          (match_operand:<VEL> 1 "any_broadcast_operand")))]
    "TARGET_VECTOR && can_create_pseudo_p ()"
    "#"
    "&& 1"
    [(const_int 0)]
    {
-    if (!strided_load_broadcast_p ()
-       && TARGET_ZVFHMIN && !TARGET_ZVFH && <VEL>mode == HFmode)
-      {
-       /* For Float16, reinterpret as HImode, broadcast and reinterpret
-          back.  */
-       poly_uint64 nunits = GET_MODE_NUNITS (<MODE>mode);
-       machine_mode vmodehi
-         = riscv_vector::get_vector_mode (HImode, nunits).require ();
-       rtx ops[] = {lowpart_subreg (vmodehi, operands[0], <MODE>mode),
-                    lowpart_subreg (HImode, operands[1], HFmode)};
-       riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (vmodehi),
-                                      riscv_vector::UNARY_OP, ops);
-      }
-    else
+    if (riscv_vector::can_be_broadcast_p (operands[1]))
        riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode),
                                      riscv_vector::UNARY_OP, operands);
+    else
+      riscv_vector::emit_vlmax_insn (code_for_pred_strided_broadcast
+                                    (<MODE>mode), riscv_vector::UNARY_OP,
+                                    operands);
+
      DONE;
    }
    [(set_attr "type" "vector")]
@@ -2141,69 +2157,45 @@
           (match_operand:V_VLS 2 "vector_merge_operand")))]
    "TARGET_VECTOR"
  {
-  /* Transform vmv.v.x/vfmv.v.f (avl = 1) into vmv.s.x since vmv.s.x/vfmv.s.f
-     has better chances to do vsetvl fusion in vsetvl pass.  */
    bool wrap_vec_dup = true;
    rtx vec_cst = NULL_RTX;
-  if (riscv_vector::splat_to_scalar_move_p (operands))
-    {
-      operands[1] = riscv_vector::gen_scalar_move_mask (<VM>mode);
-      operands[3] = force_reg (<VEL>mode, operands[3]);
-    }
-  else if (immediate_operand (operands[3], <VEL>mode)
-          && (vec_cst = gen_const_vec_duplicate (<MODE>mode, operands[3]))
-          && (/* -> pred_broadcast<mode>_zero */
-              (vector_least_significant_set_mask_operand (operands[1],
-                                                          <VM>mode)
-               && vector_const_0_operand (vec_cst, <MODE>mode))
-              || (/* pred_broadcast<mode>_imm */
-                  vector_all_trues_mask_operand (operands[1], <VM>mode)
-                  && vector_const_int_or_double_0_operand (vec_cst,
-                                                           <MODE>mode))))
+  if (immediate_operand (operands[3], <VEL>mode)
+      && (vec_cst = gen_const_vec_duplicate (<MODE>mode, operands[3]))
+      && (/* -> pred_broadcast<mode>_zero */
+         (vector_least_significant_set_mask_operand (operands[1],
+                                                     <VM>mode)
+          && vector_const_0_operand (vec_cst, <MODE>mode))
+         || (/* pred_broadcast<mode>_imm */
+             vector_all_trues_mask_operand (operands[1], <VM>mode)
+             && vector_const_int_or_double_0_operand (vec_cst,
+                                                      <MODE>mode))))
      {
        operands[3] = vec_cst;
        wrap_vec_dup = false;
      }
-  /* Handle vmv.s.x instruction (Wb1 mask) which has memory scalar.  */
-  else if (satisfies_constraint_Wdm (operands[3]))
-    {
-      if (satisfies_constraint_Wb1 (operands[1]))
-       {
-         /* Case 1: vmv.s.x (TA, x == memory) ==> vlse.v (TA)  */
-         if (satisfies_constraint_vu (operands[2]))
-           operands[1] = CONSTM1_RTX (<VM>mode);
-         else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode))
-           {
-             /* Case 2: vmv.s.x (TU, x == memory) ==>
-                          vl = 0 or 1; + vlse.v (TU) in RV32 system  */
-             operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]);
-             operands[1] = CONSTM1_RTX (<VM>mode);
-           }
-         else
-           /* Case 3: load x (memory) to register.  */
-           operands[3] = force_reg (<VEL>mode, operands[3]);
-       }
-    }
-  else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)
-          && (immediate_operand (operands[3], Pmode)
+  else if (GET_MODE_SIZE (<VEL>mode) > UNITS_PER_WORD
+          && satisfies_constraint_Wb1 (operands[1])
+          && (immediate_operand (operands[3], Xmode)
                || (CONST_POLY_INT_P (operands[3])
                    && known_ge (rtx_to_poly_int64 (operands[3]), 0U)
-                  && known_le (rtx_to_poly_int64 (operands[3]), GET_MODE_SIZE (<MODE>mode)))))
+                  && known_le (rtx_to_poly_int64 (operands[3]),
+                               GET_MODE_SIZE (<MODE>mode)))))
      {
        rtx tmp = gen_reg_rtx (Pmode);
        poly_int64 value = rtx_to_poly_int64 (operands[3]);
-      emit_move_insn (tmp, gen_int_mode (value, Pmode));
+      emit_move_insn (tmp, gen_int_mode (value, Xmode));
        operands[3] = gen_rtx_SIGN_EXTEND (<VEL>mode, tmp);
      }
-  /* Never load (const_int 0) into a register, that's silly.  */
-  else if (operands[3] == CONST0_RTX (<VEL>mode))
+
+  /* For a vmv.v.x never load (const_int 0) or valid immediate operands
+     into a register, because we can use vmv.v.i.  */
+  else if (satisfies_constraint_Wc1 (operands[1])
+      && (satisfies_constraint_P (operands[3])
+         || operands[3] == CONST0_RTX (<VEL>mode)))
      ;
-  /* If we're broadcasting [-16..15] across more than just
-     element 0, then we can use vmv.v.i directly, thus avoiding
-     the load of the constant into a GPR.  */
-  else if (CONST_INT_P (operands[3])
-          && IN_RANGE (INTVAL (operands[3]), -16, 15)
-          && !satisfies_constraint_Wb1 (operands[1]))
+  /* For vmv.s.x we have vmv.s.x v1, zero.  */
+  else if (satisfies_constraint_Wb1 (operands[1])
+          && operands[3] == CONST0_RTX (<VEL>mode))
      ;
    else
      operands[3] = force_reg (<VEL>mode, operands[3]);
@@ -2211,131 +2203,68 @@
      operands[3] = gen_rtx_VEC_DUPLICATE (<MODE>mode, operands[3]);
  })
  
-(define_insn_and_split "*pred_broadcast<mode>"
-  [(set (match_operand:V_VLSI 0 "register_operand"                 "=vr, vr, vd, vd, vr, vr, vr, vr")
+(define_insn_and_rewrite "*pred_broadcast<mode>"
+  [(set (match_operand:V_VLSI 0 "register_operand"                 "=vr, vr, vr, vr")
         (if_then_else:V_VLSI
           (unspec:<VM>
-           [(match_operand:<VM> 1 "vector_broadcast_mask_operand" "Wc1,Wc1, vm, vm,Wc1,Wc1,Wb1,Wb1")
-            (match_operand 4 "vector_length_operand"              "rvl,rvl,rvl,rvl,rvl,rvl,rvl,rvl")
-            (match_operand 5 "const_int_operand"                  "  i,  i,  i,  i,  i,  i,  i,  i")
-            (match_operand 6 "const_int_operand"                  "  i,  i,  i,  i,  i,  i,  i,  i")
-            (match_operand 7 "const_int_operand"                  "  i,  i,  i,  i,  i,  i,  i,  i")
+           [(match_operand:<VM> 1 "vector_broadcast_mask_operand" "Wc1,Wc1,Wb1,Wb1")
+            (match_operand 4 "vector_length_operand"              "rvl,rvl,rvl,rvl")
+            (match_operand 5 "const_int_operand"                  "  i,  i,  i,  i")
+            (match_operand 6 "const_int_operand"                  "  i,  i,  i,  i")
+            (match_operand 7 "const_int_operand"                  "  i,  i,  i,  i")
              (reg:SI VL_REGNUM)
              (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
           (vec_duplicate:V_VLSI
-           (match_operand:<VEL> 3 "direct_broadcast_operand"       "rP,rP,Wdm,Wdm,Wdm,Wdm, rJ, rJ"))
-         (match_operand:V_VLSI 2 "vector_merge_operand"            "vu, 0, vu,  0, vu,  0, vu,  0")))]
+           (match_operand:<VEL> 3 "direct_broadcast_operand"      " rP, rP, rJ, rJ"))
+         (match_operand:V_VLSI 2 "vector_merge_operand"           " vu,  0, vu,  0")))]
    "TARGET_VECTOR"
    "@
     vmv.v.%o3\t%0,%3
     vmv.v.%o3\t%0,%3
-   vlse<sew>.v\t%0,%3,zero,%1.t
-   vlse<sew>.v\t%0,%3,zero,%1.t
-   vlse<sew>.v\t%0,%3,zero
-   vlse<sew>.v\t%0,%3,zero
     vmv.s.x\t%0,%z3
     vmv.s.x\t%0,%z3"
-  "(register_operand (operands[3], <VEL>mode)
-  || CONST_POLY_INT_P (operands[3]))
-  && GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)"
-  [(const_int 0)]
-  {
-    gcc_assert (can_create_pseudo_p ());
-    if (CONST_POLY_INT_P (operands[3]))
-      {
-       rtx tmp = gen_reg_rtx (<VEL>mode);
-       emit_move_insn (tmp, operands[3]);
-       operands[3] = tmp;
-      }
-
-    /* For SEW = 64 in RV32 system, we expand vmv.s.x:
-       andi a2,a2,1
-       vsetvl zero,a2,e64
-       vlse64.v  */
-    if (satisfies_constraint_Wb1 (operands[1]))
-      {
-       operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]);
-       operands[1] = CONSTM1_RTX (<VM>mode);
-      }
-
-    /* If the target doesn't want a strided-load broadcast we go with a regular
-       V1DImode load and a broadcast gather.  */
-    if (strided_load_broadcast_p ())
-      {
-       rtx mem = assign_stack_local (<VEL>mode, GET_MODE_SIZE (<VEL>mode),
-                                     GET_MODE_ALIGNMENT (<VEL>mode));
-       mem = validize_mem (mem);
-       emit_move_insn (mem, operands[3]);
-       mem = gen_rtx_MEM (<VEL>mode, force_reg (Pmode, XEXP (mem, 0)));
-
-       emit_insn
-         (gen_pred_broadcast<mode>
-          (operands[0], operands[1], operands[2], mem,
-           operands[4], operands[5], operands[6], operands[7]));
-      }
-    else
-      {
-       rtx tmp = gen_reg_rtx (V1DImode);
-       emit_move_insn (tmp, lowpart_subreg (V1DImode, operands[3],
-                                            <VEL>mode));
-       tmp = lowpart_subreg (<MODE>mode, tmp, V1DImode);
-
-       emit_insn
-         (gen_pred_gather<mode>_scalar
-          (operands[0], operands[1], operands[2], tmp, CONST0_RTX (Pmode),
-           operands[4], operands[5], operands[6], operands[7]));
-      }
-    DONE;
-  }
-  [(set_attr "type" "vimov,vimov,vlds,vlds,vlds,vlds,vimovxv,vimovxv")
+  "&& (operands[1] == CONSTM1_RTX (<VM>mode)
+       && operands[4] == CONST1_RTX (Pmode)
+       && (register_operand (operands[3], <VEL>mode)
+           || satisfies_constraint_J (operands[3])))"
+{
+  /* A broadcast of a single element is just a vmv.s.x.  */
+  operands[1] = riscv_vector::gen_scalar_move_mask (<VM>mode);
+}
+  [(set_attr "type" "vimov,vimov,vimovxv,vimovxv")
     (set_attr "mode" "<MODE>")])
  
-(define_insn "*pred_broadcast<mode>_zvfh"
-  [(set (match_operand:V_VLSF    0 "register_operand"              "=vr,  vr,  vr,  vr")
+(define_insn_and_rewrite "pred_broadcast<mode>_zvfh"
+  [(set (match_operand:V_VLSF    0 "register_operand"              "=vr, vr, vr, vr")
         (if_then_else:V_VLSF
           (unspec:<VM>
-           [(match_operand:<VM> 1 "vector_broadcast_mask_operand" "Wc1, Wc1, Wb1, Wb1")
-            (match_operand      4 "vector_length_operand"         "rvl, rvl, rvl, rvl")
-            (match_operand      5 "const_int_operand"             "  i,   i,   i,   i")
-            (match_operand      6 "const_int_operand"             "  i,   i,   i,   i")
-            (match_operand      7 "const_int_operand"             "  i,   i,   i,   i")
+           [(match_operand:<VM> 1 "vector_broadcast_mask_operand" "Wc1,Wc1,Wb1,Wb1")
+            (match_operand      4 "vector_length_operand"         "rvl,rvl,rvl,rvl")
+            (match_operand      5 "const_int_operand"             "  i,  i,  i,  i")
+            (match_operand      6 "const_int_operand"             "  i,  i,  i,  i")
+            (match_operand      7 "const_int_operand"             "  i,  i,  i,  i")
              (reg:SI VL_REGNUM)
              (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
           (vec_duplicate:V_VLSF
-           (match_operand:<VEL> 3 "direct_broadcast_operand"      "  f,   f,   f,   f"))
-         (match_operand:V_VLSF  2 "vector_merge_operand"          " vu,   0,  vu,   0")))]
+           (match_operand:<VEL> 3 "direct_broadcast_operand"      "  f,  f,  f,  f"))
+         (match_operand:V_VLSF  2 "vector_merge_operand"          " vu,  0, vu,  0")))]
    "TARGET_VECTOR"
    "@
     vfmv.v.f\t%0,%3
     vfmv.v.f\t%0,%3
     vfmv.s.f\t%0,%3
     vfmv.s.f\t%0,%3"
+  "&& (operands[1] == CONSTM1_RTX (<VM>mode)
+       && operands[4] == CONST1_RTX (Pmode)
+       && (register_operand (operands[3], <VEL>mode)
+           || satisfies_constraint_J (operands[3])))"
+{
+  /* A broadcast of a single element is just a vfmv.s.f.  */
+  operands[1] = riscv_vector::gen_scalar_move_mask (<VM>mode);
+}
    [(set_attr "type" "vfmov,vfmov,vfmovfv,vfmovfv")
     (set_attr "mode" "<MODE>")])
  
-(define_insn "*pred_broadcast<mode>_zvfhmin"
-  [(set (match_operand:V_VLSF_ZVFHMIN   0 "register_operand"              "=vr,  vr,  vr,  vr")
-       (if_then_else:V_VLSF_ZVFHMIN
-         (unspec:<VM>
-           [(match_operand:<VM>        1 "vector_broadcast_mask_operand" " vm,  vm, Wc1, Wc1")
-            (match_operand             4 "vector_length_operand"         "rvl, rvl, rvl, rvl")
-            (match_operand             5 "const_int_operand"             "  i,   i,   i,   i")
-            (match_operand             6 "const_int_operand"             "  i,   i,   i,   i")
-            (match_operand             7 "const_int_operand"             "  i,   i,   i,   i")
-            (reg:SI VL_REGNUM)
-            (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
-         (vec_duplicate:V_VLSF_ZVFHMIN
-           (match_operand:<VEL>        3 "direct_broadcast_operand"      "  A,   A,   A,   A"))
-         (match_operand:V_VLSF_ZVFHMIN 2 "vector_merge_operand"          " vu,   0,  vu,   0")))]
-  "TARGET_VECTOR && strided_load_broadcast_p ()"
-  "@
-   vlse<sew>.v\t%0,%3,zero,%1.t
-   vlse<sew>.v\t%0,%3,zero,%1.t
-   vlse<sew>.v\t%0,%3,zero
-   vlse<sew>.v\t%0,%3,zero"
-  [(set_attr "type" "vlds,vlds,vlds,vlds")
-   (set_attr "mode" "<MODE>")])
-
  (define_insn "*pred_broadcast<mode>_extended_scalar"
    [(set (match_operand:V_VLSI_D 0 "register_operand"               "=vr, vr, vr, vr")
         (if_then_else:V_VLSI_D
@@ -2398,6 +2327,117 @@
    [(set_attr "type" "vimov,vimov")
     (set_attr "mode" "<MODE>")])
  
+(define_expand "@pred_strided_broadcast<mode>"
+  [(set (match_operand:V_VLS 0 "register_operand")
+       (if_then_else:V_VLS
+         (unspec:<VM>
+           [(match_operand:<VM> 1 "strided_broadcast_mask_operand")
+            (match_operand 4 "vector_length_operand")
+            (match_operand 5 "const_int_operand")
+            (match_operand 6 "const_int_operand")
+            (match_operand 7 "const_int_operand")
+            (reg:SI VL_REGNUM)
+            (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+         (vec_duplicate:V_VLS
+           (match_operand:<VEL> 3 "strided_broadcast_operand"))
+         (match_operand:V_VLS 2 "vector_merge_operand")))]
+  "TARGET_VECTOR"
+{
+  if (satisfies_constraint_Wb1 (operands[1]))
+    {
+      /* If we're asked to set a single element (like vmv.s.x but we
+        need to go via memory here) and the tail policy is agnostic
+        we can overwrite all elements.
+        Thus, set the mask to broadcast.  */
+      operands[1] = CONSTM1_RTX (<VM>mode);
+      if (!satisfies_constraint_vu (operands[2])
+         && GET_MODE_SIZE (<VEL>mode) > UNITS_PER_WORD)
+       {
+         /* vmv.s.x (TU, x == memory) with an element wider than a word
+            (RV32 system) ==> vlse.v (TU) with vl = 0 or 1.  */
+         /* In this case we must not overwrite the residual elements,
+            so set the vector length to 0/1.  */
+         operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]);
+       }
+    }
+})
+
+(define_insn_and_split "*pred_strided_broadcast<mode>"
+  [(set (match_operand:V_VLSI 0 "register_operand"                  "=vd, vd, vr, vr")
+       (if_then_else:V_VLSI
+         (unspec:<VM>
+           [(match_operand:<VM> 1 "strided_broadcast_mask_operand" " vm, vm,Wc1,Wc1")
+            (match_operand 4 "vector_length_operand"               "rvl,rvl,rvl,rvl")
+            (match_operand 5 "const_int_operand"                   "  i,  i,  i,  i")
+            (match_operand 6 "const_int_operand"                   "  i,  i,  i,  i")
+            (match_operand 7 "const_int_operand"                   "  i,  i,  i,  i")
+            (reg:SI VL_REGNUM)
+            (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+         (vec_duplicate:V_VLSI
+           (match_operand:<VEL> 3 "strided_broadcast_operand"      "  A,  A,  A,  A"))
+         (match_operand:V_VLSI 2 "vector_merge_operand"            " vu,  0, vu,  0")))]
+  "TARGET_VECTOR"
+  "@
+   vlse<sew>.v\t%0,%3,zero,%1.t
+   vlse<sew>.v\t%0,%3,zero,%1.t
+   vlse<sew>.v\t%0,%3,zero
+   vlse<sew>.v\t%0,%3,zero"
+  "&& !strided_load_broadcast_p () && can_create_pseudo_p ()"
+  [(const_int 0)]
+  {
+    rtx tmp = gen_reg_rtx (V1DImode);
+    emit_move_insn (tmp, gen_lowpart (V1DImode, operands[3]));
+    tmp = lowpart_subreg (<MODE>mode, tmp, V1DImode);
+
+    emit_insn
+      (gen_pred_gather<mode>_scalar
+       (operands[0], operands[1], operands[2], tmp, CONST0_RTX (Pmode),
+       operands[4], operands[5], operands[6], operands[7]));
+    DONE;
+  }
+  [(set_attr "type" "vlds,vlds,vlds,vlds")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn_and_split "*pred_strided_broadcast<mode>_zvfhmin"
+  [(set (match_operand:V_VLSF_ZVFHMIN   0 "register_operand"               "=vr,  vr,  vr,  vr")
+       (if_then_else:V_VLSF_ZVFHMIN
+         (unspec:<VM>
+           [(match_operand:<VM>        1 "strided_broadcast_mask_operand" " vm,  vm, Wc1, Wc1")
+            (match_operand             4 "vector_length_operand"          "rvl, rvl, rvl, rvl")
+            (match_operand             5 "const_int_operand"              "  i,   i,   i,   i")
+            (match_operand             6 "const_int_operand"              "  i,   i,   i,   i")
+            (match_operand             7 "const_int_operand"              "  i,   i,   i,   i")
+            (reg:SI VL_REGNUM)
+            (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+         (vec_duplicate:V_VLSF_ZVFHMIN
+           (match_operand:<VEL>        3 "strided_broadcast_operand"      "  A,   A,   A,   A"))
+         (match_operand:V_VLSF_ZVFHMIN 2 "vector_merge_operand"           " vu,   0,  vu,   0")))]
+  "TARGET_VECTOR"
+  "@
+   vlse<sew>.v\t%0,%3,zero,%1.t
+   vlse<sew>.v\t%0,%3,zero,%1.t
+   vlse<sew>.v\t%0,%3,zero
+   vlse<sew>.v\t%0,%3,zero"
+  "&& !strided_load_broadcast_p ()
+   && <VEL>mode == HFmode
+   && can_create_pseudo_p ()"
+  [(const_int 0)]
+  {
+    poly_uint64 nunits = GET_MODE_NUNITS (<MODE>mode);
+    machine_mode vmodehi
+      = riscv_vector::get_vector_mode (HImode, nunits).require ();
+    rtx ops[] = {gen_lowpart (vmodehi, operands[0]),
+                gen_lowpart (HImode, operands[3])};
+    riscv_vector::emit_avltype_insn (code_for_pred_broadcast (vmodehi),
+                                    riscv_vector::UNARY_OP, ops,
+                                    (riscv_vector::avl_type) INTVAL (operands[7]),
+                                    operands[4]);
+    DONE;
+  }
+  [(set_attr "type" "vlds,vlds,vlds,vlds")
+   (set_attr "mode" "<MODE>")])
+
+
  ;; -------------------------------------------------------------------------------
  ;; ---- Predicated Strided loads/stores
  ;; -------------------------------------------------------------------------------
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-6.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-6.c

index 4dc5703d894794cb1047696e4d3cff6bc054dbaa..0fa1ea049b14e8e8bce642d4d4347ecfd5ba866f 100644 (file)
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-6.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-6.c
@@ -72,7 +72,7 @@ f_vnx128qi (int8_t *out)
    *(vnx128qi *) out = v;
  }
  
-/* { dg-final { scan-assembler-times {vmv.v.x\tv[0-9]+,\s*[a-x0-9]+} 6 } } */
+/* { dg-final { scan-assembler-times {vmv.v.x\tv[0-9]+,\s*[a-x0-9]+} 7 } } */
  /* { dg-final { scan-assembler-times {slli\t[a-x0-9]+,\s*[a-x0-9]+,\s*8} 6 } } */
  /* { dg-final { scan-assembler-times {or\t[a-x0-9]+,\s*[a-x0-9]+,\s*[a-x0-9]+} 6 } } */
  /* { dg-final { scan-assembler-times {vslide1down\.vx\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+} 1 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-5.c b/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-5.c

index 04dec7bc8dc73224a900b26285466edc57de37c5..4f6785ace68661ee797a960046b7a0ba476de510 100644 (file)
--- a/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-5.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-5.c
@@ -6,9 +6,9 @@
  
  /*
  ** foo:
-** addi\t[a-x0-9]+,\s*[a-x0-9]+,100
+** ...
  ** vsetvli\tzero,a2,e64,m2,t[au],m[au]
-** vlse64.v\tv[0-9]+,0\([a-x0-9]+\),zero
+** vmv.s.x\tv[0-9]+.*
  ** vs2r.v\tv[0-9]+,0\([a-x0-9]+\)
  ** ret
  */
@@ -23,7 +23,7 @@ void foo (void *base, void *out, size_t vl)
  ** foo2:
  ** fld\tfa[0-9]+,\s*100\(a0\)
  ** vsetvli\tzero,a2,e64,m2,t[au],m[au]
-** vfmv\.v\.f\tv[0-9]+,\s*fa[0-9]+
+** vfmv\.s\.f\tv[0-9]+,\s*fa[0-9]+
  ** vs2r.v\tv[0-9]+,0\([a-x0-9]+\)
  ** ret
  */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-6.c b/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-6.c

index 0ebb92eda422f4733a01ed0a038da83f4d8cedfc..a8c9263c4df6d2989db171128018388c37389cf2 100644 (file)
--- a/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-6.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-6.c
@@ -23,7 +23,7 @@ void foo (void *base, void *out, size_t vl)
  ** foo2:
  ** fld\tfa[0-9]+,\s*100\(a0\)
  ** vsetvli\tzero,a2,e64,m2,t[au],m[au]
-** vfmv\.v\.f\tv[0-9]+,\s*fa[0-9]+
+** vfmv\.s\.f\tv[0-9]+,\s*fa[0-9]+
  ** vs2r.v\tv[0-9]+,0\([a-x0-9]+\)
  ** ret
  */
@@ -52,7 +52,7 @@ void foo3 (void *base, void *out, size_t vl)
  /*
  ** foo4:
  ** ...
-** vfmv\.v\.f\tv[0-9]+,\s*fa[0-9]+
+** vfmv\.s\.f\tv[0-9]+,\s*fa[0-9]+
  ** ...
  ** ret
  */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-7.c b/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-7.c

index 512fa62858a96382f6254ca3d79be70730ae99ab..cf53aca5c6299f945faa00645a1a694d2c3c25df 100644 (file)
--- a/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-7.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-7.c
@@ -6,9 +6,9 @@
  
  /*
  ** foo:
-** addi\t[a-x0-9]+,\s*[a-x0-9]+,100
+** ...
  ** vsetvli\tzero,a2,e64,m2,t[au],m[au]
-** vlse64.v\tv[0-9]+,0\([a-x0-9]+\),zero
+** vmv\.v\.x\tv[0-9]+,\s*a[0-9]+
  ** vs2r.v\tv[0-9]+,0\([a-x0-9]+\)
  ** ret
  */
@@ -37,7 +37,7 @@ void foo2 (void *base, void *out, size_t vl)
  /*
  ** foo3:
  ** ...
-** vlse64.v\tv[0-9]+,0\([a-x0-9]+\),zero
+** vmv\.v\.x\tv[0-9]+,\s*a[0-9]+
  ** ...
  ** ret
  */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-8.c b/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-8.c

index d9d10f3702a4823365837fd87cde843e36c6d0de..fd3b7c57510b8b505767aa21f68c8f025091364c 100644 (file)
--- a/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-8.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-8.c
@@ -175,9 +175,8 @@ void foo12 (void *base, void *out, size_t vl)
  /*
  ** foo13:
  ** ...
-** vmv.v.x\tv[0-9]+,\s*[a-x0-9]+
+** vlse64.v\tv[0-9]+,0\([a-x0-9]+\),zero
  ** ...
-** ret
  */
  void foo13 (void *base, void *out, size_t vl)
  {
@@ -189,7 +188,7 @@ void foo13 (void *base, void *out, size_t vl)
  /*
  ** foo14:
  ** ...
-** vmv.v.x\tv[0-9]+,\s*[a-x0-9]+
+** vlse64.v\tv[0-9]+,0\([a-x0-9]+\),zero
  ** ...
  */
  void foo14 (void *base, void *out, size_t vl)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-9.c b/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-9.c

index 80ee1b5f0c9382d7065db09925f44742ba6d301e..64c22dd39e6f15b4135519c44ac87346edf68e1b 100644 (file)
--- a/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-9.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-9.c
@@ -23,4 +23,3 @@ vuint64m2_t f3(vuint64m2_t var_17, uint64_t var_60, size_t vl)
  
  /* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*0,\s*e64,\s*m2,\s*t[au],\s*m[au]} 1 } } */
  /* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*1,\s*e64,\s*m2,\s*t[au],\s*m[au]} 1 } } */
-/* { dg-final { scan-assembler-times {sgtu} 1 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/pr121073.c b/gcc/testsuite/gcc.target/riscv/rvv/pr121073.c

new file mode 100644 (file)

index 0000000..2789d0f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/pr121073.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -mrvv-vector-bits=zvl -fsigned-char -fno-strict-aliasing -fwrapv -Wno-stringop-overflow -Wno-aggressive-loop-optimizations" } */
+
+int a;
+unsigned char p[1][21];
+void init() {
+  for (int s = 0; s < 21; ++s)
+    for (int t = 0; t < 21; ++t)
+      p[s][t] = 39;
+  for (short t = 0; t < 9; t += -5077966496202321318LL + 28071)
+    a = p[3][t] && p[2][t];
+}
author	Robin Dapp <rdapp@ventanamicro.com>
	Thu, 17 Jul 2025 09:09:43 +0000 (11:09 +0200)
committer	Robin Dapp <rdapp@ventanamicro.com>
	Wed, 23 Jul 2025 15:36:26 +0000 (17:36 +0200)
gcc/config/riscv/autovec-opt.md		patch \| blob \| blame \| history
gcc/config/riscv/autovec.md		patch \| blob \| blame \| history
gcc/config/riscv/predicates.md		patch \| blob \| blame \| history
gcc/config/riscv/riscv-protos.h		patch \| blob \| blame \| history
gcc/config/riscv/riscv-string.cc		patch \| blob \| blame \| history
gcc/config/riscv/riscv-v.cc		patch \| blob \| blame \| history
gcc/config/riscv/riscv-vector-builtins-bases.cc		patch \| blob \| blame \| history
gcc/config/riscv/riscv-vector-builtins.cc		patch \| blob \| blame \| history
gcc/config/riscv/riscv-vector-builtins.h		patch \| blob \| blame \| history
gcc/config/riscv/vector.md		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-6.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-5.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-6.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-7.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-8.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-9.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/riscv/rvv/pr121073.c	[new file with mode: 0644]	patch \| blob