[PATCH v4] RISC-V: Add per-type reduction costs to the vector cost model

author Wang Yaduo <wangyaduo@linux.alibaba.com>

Fri, 22 May 2026 17:26:10 +0000 (11:26 -0600)

committer Jeff Law <jeffrey.law@oss.qualcomm.com>

Fri, 22 May 2026 17:28:33 +0000 (11:28 -0600)
author Wang Yaduo <wangyaduo@linux.alibaba.com>
Fri, 22 May 2026 17:26:10 +0000 (11:26 -0600)
committer Jeff Law <jeffrey.law@oss.qualcomm.com>
Fri, 22 May 2026 17:28:33 +0000 (11:28 -0600)
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h

index fb600d60168d0ac84855692c713fb4eb6b78f239..234d625441d8b1a79491f74aa7e921d19a124574 100644 (file)
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -280,6 +280,21 @@ struct common_vector_cost
  
    /* Cost of an unaligned vector store.  */
    const int unalign_store_cost;
+
+  /* Cost of vector reduction operations (unordered / tree reduction).
+     Indexed by element type.  */
+  const int reduc_i8_cost;
+  const int reduc_i16_cost;
+  const int reduc_i32_cost;
+  const int reduc_i64_cost;
+  const int reduc_f16_cost;
+  const int reduc_f32_cost;
+  const int reduc_f64_cost;
+
+  /* Cost of ordered (fold-left) floating-point reductions.  */
+  const int reduc_f16_ordered_cost;
+  const int reduc_f32_ordered_cost;
+  const int reduc_f64_ordered_cost;
  };
  
  /* scalable vectorization (VLA) specific cost.  */
@@ -289,8 +304,6 @@ struct scalable_vector_cost : common_vector_cost
      : common_vector_cost (base)
    {}
  
-  /* TODO: We will need more other kinds of vector cost for VLA.
-     E.g. fold_left reduction cost, lanes load/store cost, ..., etc.  */
  };
  
  /* Additional costs for register copies.  Cost is for one register.  */
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc

index e678e0de766e8395a235908746ecd17264ce86ea..6d37519dbfee560734cdd8b4d0d5f5622c057c56 100644 (file)
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1274,6 +1274,58 @@ get_lmul_cost_scaling (machine_mode mode)
      }
  }
  
+/* Return true if a non-null STMT_INFO or NODE is a reduction operation.  */
+
+static bool
+is_reduction (stmt_vec_info stmt_info, slp_tree node)
+{
+  return (stmt_info && vect_is_reduction (stmt_info))
+        || (node && vect_is_reduction (node));
+}
+
+/* Return the reduction cost for VECTYPE's element mode from the VLA or VLS
+   table of COSTS (ordered vs unordered for FP), or 0 if none applies.  */
+
+static int
+get_reduction_cost (vec_info *vinfo, const cpu_vector_cost *costs,
+                   loop_vec_info loop, slp_tree node, tree vectype)
+{
+  const common_vector_cost *common_costs
+    = loop && riscv_vla_mode_p (loop->vector_mode)
+      ? costs->vla : costs->vls;  /* VLS unless LOOP uses a VLA mode.  */
+
+  bool is_ordered = false;  /* Stays false for non-FP types or null NODE.  */
+  if (FLOAT_TYPE_P (vectype) && node)
+    {
+      int reduc_type = vect_reduc_type (vinfo, node);
+      is_ordered = (reduc_type == FOLD_LEFT_REDUCTION);
+    }
+
+  switch (GET_MODE_INNER (TYPE_MODE (vectype)))
+    {
+    case E_QImode:
+      return common_costs->reduc_i8_cost;
+    case E_HImode:
+      return common_costs->reduc_i16_cost;
+    case E_SImode:
+      return common_costs->reduc_i32_cost;
+    case E_DImode:
+      return common_costs->reduc_i64_cost;
+    case E_HFmode:
+    case E_BFmode:  /* BFmode shares the f16 cost entries.  */
+      return is_ordered ? common_costs->reduc_f16_ordered_cost
+                       : common_costs->reduc_f16_cost;
+    case E_SFmode:
+      return is_ordered ? common_costs->reduc_f32_ordered_cost
+                       : common_costs->reduc_f32_cost;
+    case E_DFmode:
+      return is_ordered ? common_costs->reduc_f64_ordered_cost
+                       : common_costs->reduc_f64_cost;
+    default:  /* No per-type cost for this element mode.  */
+      return 0;
+    }
+}
+
  /* Adjust vectorization cost after calling riscv_builtin_vectorization_cost.
     For some statement, we would like to further fine-grain tweak the cost on
     top of riscv_builtin_vectorization_cost handling which doesn't have any
@@ -1292,9 +1344,18 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
         += (FLOAT_TYPE_P (vectype) ? get_fr2vr_cost () : get_gr2vr_cost ());
        break;
      case vec_to_scalar:
-      stmt_cost
-       += (FLOAT_TYPE_P (vectype) ? get_vr2fr_cost () : get_vr2gr_cost ());
-      break;
+      {
+       int reduc_cost = 0;
+       if (vectype && is_reduction (stmt_info, node))
+         reduc_cost = get_reduction_cost (m_vinfo, costs, loop, node, vectype);
+
+       if (reduc_cost)
+         stmt_cost = reduc_cost;
+
+       stmt_cost
+         += (FLOAT_TYPE_P (vectype) ? get_vr2fr_cost () : get_vr2gr_cost ());
+       break;
+      }
      case vector_load:
      case vector_store:
         {
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc

index c66a6d1efeed1e007c14764946b6e15fa66d148f..8a737bb41b6651bde38efdbf3370d33818a1882f 100644 (file)
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -397,6 +397,16 @@ static const common_vector_cost rvv_vls_vector_cost = {
    1, /* align_store_cost  */
    2, /* unalign_load_cost  */
    2, /* unalign_store_cost  */
+  2, /* reduc_i8_cost  */
+  2, /* reduc_i16_cost  */
+  2, /* reduc_i32_cost  */
+  2, /* reduc_i64_cost  */
+  2, /* reduc_f16_cost  */
+  2, /* reduc_f32_cost  */
+  2, /* reduc_f64_cost  */
+  20, /* reduc_f16_ordered_cost  */
+  10, /* reduc_f32_ordered_cost  */
+  5, /* reduc_f64_ordered_cost  */
  };
  
  /* RVV costs for VLA vector operations.  */
@@ -420,6 +430,16 @@ static const scalable_vector_cost rvv_vla_vector_cost = {
      1, /* align_store_cost  */
      2, /* unalign_load_cost  */
      2, /* unalign_store_cost  */
+    2, /* reduc_i8_cost  */
+    2, /* reduc_i16_cost  */
+    2, /* reduc_i32_cost  */
+    2, /* reduc_i64_cost  */
+    2, /* reduc_f16_cost  */
+    2, /* reduc_f32_cost  */
+    2, /* reduc_f64_cost  */
+    20, /* reduc_f16_ordered_cost  */
+    10, /* reduc_f32_ordered_cost  */
+    5, /* reduc_f64_ordered_cost  */
    },
  };
  
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_ordered.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_ordered.c

new file mode 100644 (file)

index 0000000..367a801
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_ordered.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh -mabi=lp64d -O3 -mrvv-vector-bits=scalable -fdump-tree-vect-details" } */
+
+#include <stdint-gcc.h>
+/* Define reduc_plus_TYPE, returning the TYPE sum of A[0] .. A[N-1].  */
+#define DEF_REDUC_PLUS(TYPE)                           \
+  TYPE __attribute__ ((noinline, noclone))             \
+  reduc_plus_##TYPE (TYPE *restrict a, int n)          \
+  {                                                    \
+    TYPE r = 0;                                                \
+    for (int i = 0; i < n; ++i)                                \
+      r += a[i];                                       \
+    return r;                                          \
+  }
+
+DEF_REDUC_PLUS (_Float16)
+DEF_REDUC_PLUS (float)
+DEF_REDUC_PLUS (double)
+
+/* Ordered reduction cost: reduc_f*_ordered_cost (20/10/5) + vr2fr (2).  */
+/* { dg-final { scan-tree-dump "vec_to_scalar costs 22" "vect" } } */
+/* { dg-final { scan-tree-dump "vec_to_scalar costs 12" "vect" } } */
+/* { dg-final { scan-tree-dump "vec_to_scalar costs 7" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_unordered.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_unordered.c

new file mode 100644 (file)

index 0000000..b605eef
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_unordered.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh -mabi=lp64d -O3 -mrvv-vector-bits=scalable -ffast-math -fdump-tree-vect-details" } */
+
+#include <stdint-gcc.h>
+/* Define reduc_plus_TYPE, returning the TYPE sum of A[0] .. A[N-1].  */
+#define DEF_REDUC_PLUS(TYPE)                   \
+TYPE __attribute__ ((noinline, noclone))       \
+reduc_plus_##TYPE (TYPE *restrict a, int n)    \
+{                                              \
+  TYPE r = 0;                                  \
+  for (int i = 0; i < n; ++i)                  \
+    r += a[i];                                 \
+  return r;                                    \
+}
+
+DEF_REDUC_PLUS (int8_t)
+DEF_REDUC_PLUS (int16_t)
+DEF_REDUC_PLUS (int32_t)
+DEF_REDUC_PLUS (int64_t)
+DEF_REDUC_PLUS (_Float16)
+DEF_REDUC_PLUS (float)
+DEF_REDUC_PLUS (double)
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 7 "vect" } } */
+/* { dg-final { scan-assembler-times {vredsum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
+/* { dg-final { scan-assembler-times {vfredusum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 3 } } */
+
+/* Unordered reduction cost: reduc_*_cost (2) + vr2gr/vr2fr (2) = 4.  */
+/* { dg-final { scan-tree-dump "vec_to_scalar costs 4" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_ordered.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_ordered.c

new file mode 100644 (file)

index 0000000..6d7a52c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_ordered.c
@@ -0,0 +1,26 @@
+/* Ordered FP reduction costs in VLS mode.  */
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -fdump-tree-vect-details" } */
+
+#include <stdint-gcc.h>
+/* Define reduc_plus_TYPE_NUM, returning the TYPE sum of A[0] .. A[NUM-1].  */
+#define DEF_REDUC_PLUS(TYPE, NUM)                      \
+  TYPE __attribute__ ((noinline, noclone))             \
+  reduc_plus_##TYPE##_##NUM (TYPE *restrict a)         \
+  {                                                    \
+    TYPE r = 0;                                                \
+    for (int i = 0; i < NUM; ++i)                      \
+      r += a[i];                                       \
+    return r;                                          \
+  }
+
+DEF_REDUC_PLUS (_Float16, 8)
+DEF_REDUC_PLUS (float, 8)
+DEF_REDUC_PLUS (double, 8)
+
+/* { dg-final { scan-assembler-not {csrr} } } */
+
+/* f16: 20+2=22, f32: 10+2=12, f64: 5+2=7.  */
+/* { dg-final { scan-tree-dump "vec_to_scalar costs 22" "vect" } } */
+/* { dg-final { scan-tree-dump "vec_to_scalar costs 12" "vect" } } */
+/* { dg-final { scan-tree-dump "vec_to_scalar costs 7" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_unordered.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_unordered.c

new file mode 100644 (file)

index 0000000..fd3350a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_unordered.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -ffast-math -fdump-tree-vect-details" } */
+
+#include <stdint-gcc.h>
+/* Define reduc_plus_TYPE_NUM, returning the TYPE sum of A[0] .. A[NUM-1].  */
+#define DEF_REDUC_PLUS(TYPE, NUM)                      \
+  TYPE __attribute__ ((noinline, noclone))             \
+  reduc_plus_##TYPE##_##NUM (TYPE *restrict a)         \
+  {                                                    \
+    TYPE r = 0;                                                \
+    for (int i = 0; i < NUM; ++i)                      \
+      r += a[i];                                       \
+    return r;                                          \
+  }
+
+DEF_REDUC_PLUS (int8_t, 8)
+DEF_REDUC_PLUS (int16_t, 8)
+DEF_REDUC_PLUS (int32_t, 8)
+DEF_REDUC_PLUS (int64_t, 8)
+DEF_REDUC_PLUS (_Float16, 8)
+DEF_REDUC_PLUS (float, 8)
+DEF_REDUC_PLUS (double, 8)
+
+/* { dg-final { scan-assembler-times {vredsum\.vs} 4 } } */
+/* { dg-final { scan-assembler-times {vfredusum\.vs} 3 } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
+
+/* Unordered reduction cost: reduc_*_cost (2) + vr2gr/vr2fr (2) = 4.  */
+/* { dg-final { scan-tree-dump "vec_to_scalar costs 4" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c

index 08d983997e2ac199de31c21c42d25d83d1395f77..23ceb66b4b00c569648d9f7b6426387bcb1d32de 100644 (file)
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
@@ -1,5 +1,5 @@
  /* { dg-do run { target { riscv_v } } } */
-/* { dg-additional-options "-mrvv-vector-bits=scalable -fdump-tree-vect-details" } */
+/* { dg-additional-options "-mrvv-vector-bits=scalable -mmax-vectorization -fdump-tree-vect-details" } */
  
  double
  __attribute__ ((noipa))
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c

index 5a4df4824240e4bcda93865f2c9d41a29fee5a61..b09b38cce84499ced9cbef0d2db322d84eea05f4 100644 (file)
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
@@ -1,5 +1,5 @@
  /* { dg-do compile } */
-/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -fdump-tree-optimized-details" } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -mmax-vectorization -fdump-tree-optimized-details" } */
  
  #include "def.h"
  
@@ -14,7 +14,7 @@ DEF_REDUC_PLUS (_Float16, 512)
  DEF_REDUC_PLUS (_Float16, 1024)
  DEF_REDUC_PLUS (_Float16, 2048)
  
-/* { dg-final { scan-assembler-times {vfredosum\.vs} 9 } } */
+/* { dg-final { scan-assembler-times {vfredosum\.vs} 11 } } */
  /* { dg-final { scan-assembler-not {csrr} } } */
  /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
  /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c

index daf9c8a32a947359c34bf7578eb22d618dd9eeac..f37ebd6ea4809860a92058199bb7945c20359781 100644 (file)
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c
@@ -1,5 +1,5 @@
  /* { dg-do compile } */
-/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -fdump-tree-optimized-details" } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -mmax-vectorization -fdump-tree-optimized-details" } */
  
  #include "def.h"
  
@@ -13,7 +13,7 @@ DEF_REDUC_PLUS (float, 256)
  DEF_REDUC_PLUS (float, 512)
  DEF_REDUC_PLUS (float, 1024)
  
-/* { dg-final { scan-assembler-times {vfredosum\.vs} 8 } } */
+/* { dg-final { scan-assembler-times {vfredosum\.vs} 10 } } */
  /* { dg-final { scan-assembler-not {csrr} } } */
  /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
  /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c

index d1b8c2535ccab31659bd9946d6130d1de3024777..a67dda5e6fa564d314ec33102671511f525e52a6 100644 (file)
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c
@@ -1,5 +1,5 @@
  /* { dg-do compile } */
-/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -fdump-tree-optimized-details" } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -mmax-vectorization -fdump-tree-optimized-details" } */
  
  #include "def.h"
  
@@ -12,7 +12,7 @@ DEF_REDUC_PLUS (float, 128)
  DEF_REDUC_PLUS (float, 256)
  DEF_REDUC_PLUS (float, 512)
  
-/* { dg-final { scan-assembler-times {vfredosum\.vs} 7 } } */
+/* { dg-final { scan-assembler-times {vfredosum\.vs} 9 } } */
  /* { dg-final { scan-assembler-not {csrr} } } */
  /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
  /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/wred-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/wred-3.c

index 6e9456b23209bb2128fd479a87b5f62c62f2b3ee..2fad7ad4c9951cf902312df4bcff708815755c5a 100644 (file)
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/wred-3.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/wred-3.c
@@ -1,9 +1,9 @@
  /* { dg-do compile } */
-/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -fdump-tree-optimized" } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -mmax-vectorization -fdump-tree-optimized" } */
  
  #include "wred-2.c"
  
-/* { dg-final { scan-assembler-times {vfwredosum\.vs} 17 } } */
+/* { dg-final { scan-assembler-times {vfwredosum\.vs} 19 } } */
  /* { dg-final { scan-assembler-not {csrr} } } */
  /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
  /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
author	Wang Yaduo <wangyaduo@linux.alibaba.com>
	Fri, 22 May 2026 17:26:10 +0000 (11:26 -0600)
committer	Jeff Law <jeffrey.law@oss.qualcomm.com>
	Fri, 22 May 2026 17:28:33 +0000 (11:28 -0600)
gcc/config/riscv/riscv-protos.h		patch \| blob \| blame \| history
gcc/config/riscv/riscv-vector-costs.cc		patch \| blob \| blame \| history
gcc/config/riscv/riscv.cc		patch \| blob \| blame \| history
gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_ordered.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_unordered.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_ordered.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_unordered.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/wred-3.c		patch \| blob \| blame \| history