]> git.ipfire.org Git - thirdparty/gcc.git/commitdiff
RISC-V: Add RISC-V RVV main-loop overhead comparison in cost model
author Zhongyao Chen <chen.zhongyao@zte.com.cn>
Wed, 20 May 2026 09:30:22 +0000 (17:30 +0800)
committer Zhongyao Chen <chen.zhongyao@zte.com.cn>
Thu, 28 May 2026 10:53:17 +0000 (10:53 +0000)
Add an RVV-specific loop-overhead comparison in the RISC-V cost model and
use it after inside-loop cost comparison.

The RISC-V implementation prefers an RVV mode that eliminates the main
loop, and otherwise compares the main-loop overhead of the candidate modes.

Local testing shows no regressions.  This is likely because few testcases
have equal inside-loop costs, especially before VLS LMUL cost scaling is
supported.

I also ran the regression tests with temporary VLS LMUL cost scaling support.
Only 3 regressions were found:
  - dyn-lmul-conv-1.c & dyn-lmul-conv-2.c: The cost model now prefers smaller
LMULs due to VLS LMUL scaling, so this is reasonable; the test expectations
just need to be updated.
  - pr123414.c: This test relies on large LMULs to trigger a specific bug,
so this is reasonable too; it can be fixed by adding -fno-vect-cost-model.

The VLS LMUL cost scaling patch will be updated after this is pushed.

gcc/ChangeLog:
* config/riscv/riscv-vector-costs.cc
(estimated_loop_iters): New function.
(compare_loop_overhead): New function.
(costs::better_main_loop_than_p): Compare RVV loop overhead after
inside-loop cost.

Signed-off-by: Zhongyao Chen <chen.zhongyao@zte.com.cn>
gcc/config/riscv/riscv-vector-costs.cc

index 6d37519dbfee560734cdd8b4d0d5f5622c057c56..833a525abd65e167b9a51db3c0f8559c2562d4d1 100644 (file)
@@ -1095,6 +1095,74 @@ costs::prefer_unrolled_loop () const
              <= (unsigned int) param_max_completely_peeled_insns));
 }
 
+/* Return the estimated number of vector iterations for LOOP_VINFO, or
+   HOST_WIDE_INT_M1U if the scalar iteration count is not known.  */
+static unsigned HOST_WIDE_INT
+estimated_loop_iters (loop_vec_info loop_vinfo)
+{
+  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+    return HOST_WIDE_INT_M1U;
+
+  unsigned HOST_WIDE_INT scalar_niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
+  unsigned int vf = vect_vf_for_cost (loop_vinfo);
+  return (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
+         ? CEIL (scalar_niters, vf)  /* Round up with partial vectors.  */
+         : scalar_niters / vf);      /* Otherwise round down; may be 0.  */
+}
+
+/* Compare the estimated loop overheads of two loops whose iteration counts
+   are known.  Return -1 if THIS_LOOP_VINFO is preferable, 1 if
+   OTHER_LOOP_VINFO is, and 0 on a tie.  With LMUL cost scaling, inside-loop
+   costs can tie across LMULs; loop-back branch costs then break the tie.  */
+static int
+compare_loop_overhead (loop_vec_info this_loop_vinfo,
+                      loop_vec_info other_loop_vinfo)
+{
+  gcc_assert (LOOP_VINFO_NITERS_KNOWN_P (this_loop_vinfo));
+  gcc_assert (LOOP_VINFO_NITERS_KNOWN_P (other_loop_vinfo));
+
+  unsigned HOST_WIDE_INT this_niters = estimated_loop_iters (this_loop_vinfo);
+  unsigned HOST_WIDE_INT other_niters = estimated_loop_iters (other_loop_vinfo);
+  bool this_eliminate_loop_p = this_niters == 1;
+  bool other_eliminate_loop_p = other_niters == 1;
+  /* Exactly one vector iteration is treated as eliminating the main loop.  */
+  if (this_eliminate_loop_p != other_eliminate_loop_p)
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "Preferring %s loop because it is estimated to"
+                        " eliminate the main loop entirely\n",
+                        GET_MODE_NAME ((this_eliminate_loop_p
+                                        ? this_loop_vinfo
+                                        : other_loop_vinfo)->vector_mode));
+      return this_eliminate_loop_p ? -1 : 1;
+    }
+  /* Otherwise charge one taken branch per iteration after the first.  */
+  unsigned int branch_cost
+    = builtin_vectorization_cost (cond_branch_taken, NULL_TREE, 0);
+  unsigned HOST_WIDE_INT this_overhead
+    = this_niters > 1 ? (this_niters - 1) * branch_cost : 0;
+  unsigned HOST_WIDE_INT other_overhead
+    = other_niters > 1 ? (other_niters - 1) * branch_cost : 0;
+
+  if (this_overhead != other_overhead)
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "Preferring %s loop because it has lower"
+                        " loop overhead ("
+                        HOST_WIDE_INT_PRINT_UNSIGNED " vs. "
+                        HOST_WIDE_INT_PRINT_UNSIGNED ")\n",
+                        GET_MODE_NAME ((this_overhead < other_overhead
+                                        ? this_loop_vinfo
+                                        : other_loop_vinfo)->vector_mode),
+                        this_overhead, other_overhead);
+      return this_overhead < other_overhead ? -1 : 1;
+    }
+  /* Equal overhead: neither loop is preferred by this comparison.  */
+  return 0;
+}
+
 bool
 costs::better_main_loop_than_p (const vector_costs *uncast_other) const
 {
@@ -1213,7 +1281,28 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const
           && m_cost_type == VLS_VECTOR_COST)
     return false;
 
-  return vector_costs::better_main_loop_than_p (other);
+  /* Fall back to generic costing if either iteration count is unknown.  For
+     known iteration counts, include loop overhead when comparing different
+     LMULs.  This handles such cases better than the generic
+     vector_costs::better_main_loop_than_p, especially while outside costs
+     can still overestimate prologue costs (PR target/125476).  */
+  if (!LOOP_VINFO_NITERS_KNOWN_P (this_loop_vinfo)
+      || !LOOP_VINFO_NITERS_KNOWN_P (other_loop_vinfo))
+    return vector_costs::better_main_loop_than_p (other);
+
+  int diff = compare_inside_loop_cost (other);
+  if (diff != 0)
+    return diff < 0;
+
+  diff = compare_loop_overhead (this_loop_vinfo, other_loop_vinfo);
+  if (diff != 0)
+    return diff < 0;
+
+  diff = compare_outside_loop_cost (other);
+  if (diff != 0)
+    return diff < 0;
+
+  return false;
 }
 
 /* Returns the group size i.e. the number of vectors to be loaded by a