[PATCH v4] RISC-V: Apply LMUL cost scaling to vector operations

author Zhongyao Chen <chenzhongyao.hit@gmail.com>

Wed, 6 May 2026 13:04:25 +0000 (07:04 -0600)

committer Jeff Law <jeffrey.law@oss.qualcomm.com>

Wed, 6 May 2026 13:04:25 +0000 (07:04 -0600)
author Zhongyao Chen <chenzhongyao.hit@gmail.com>
Wed, 6 May 2026 13:04:25 +0000 (07:04 -0600)
committer Jeff Law <jeffrey.law@oss.qualcomm.com>
Wed, 6 May 2026 13:04:25 +0000 (07:04 -0600)
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc

index f582551eba79ea9e066da16bbbfcaa58459a66f3..e678e0de766e8395a235908746ecd17264ce86ea 100644 (file)
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1235,6 +1235,45 @@ segment_loadstore_group_size (enum vect_cost_for_stmt kind,
    return 0;
  }
  
+/* Calculate the LMUL-based cost scaling factor for MODE.
+   Larger LMUL values process more data but have proportionally
+   higher latency and register pressure.
+
+   Returns 2, 4 or 8 when MODE is a VLA mode whose LMUL is 2, 4 or 8.
+   Returns 1 (no scaling) for LMUL <= 1, fractional LMULs included, and
+   for any MODE for which riscv_vla_mode_p is false.  */
+static unsigned
+get_lmul_cost_scaling (machine_mode mode)
+{
+  if (!riscv_vla_mode_p (mode))
+    return 1;
+
+  enum vlmul_type vlmul = get_vlmul (mode);
+
+  /* Cost scaling based on LMUL and data processed.
+     Larger LMUL values have proportionally higher latency:
+     - m1 (LMUL_1): 1x (baseline)
+     - m2 (LMUL_2): 2x (processes 2x data, ~2x latency)
+     - m4 (LMUL_4): 4x (processes 4x data, ~4x latency)
+     - m8 (LMUL_8): 8x (processes 8x data, ~8x latency)
+     - mf2/mf4/mf8 and any other value: 1x (no scaling applied)  */
+  switch (vlmul)
+    {
+    case LMUL_2:
+      return 2;
+    case LMUL_4:
+      return 4;
+    case LMUL_8:
+      return 8;
+    case LMUL_1:
+    case LMUL_F2:
+    case LMUL_F4:
+    case LMUL_F8:
+    default:
+      return 1;
+    }
+}
+
  /* Adjust vectorization cost after calling riscv_builtin_vectorization_cost.
     For some statement, we would like to further fine-grain tweak the cost on
     top of riscv_builtin_vectorization_cost handling which doesn't have any
@@ -1379,6 +1418,17 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
      default:
        break;
      }
+
+  /* Apply LMUL cost scaling uniformly to all vector operations.
+     Larger LMUL values have higher latency and register pressure,
+     which affects performance regardless of loop structure.  */
+  if (vectype)
+    {
+      unsigned lmul_factor = get_lmul_cost_scaling (TYPE_MODE (vectype));
+      if (lmul_factor > 1)
+       stmt_cost *= lmul_factor;
+    }
+
    return stmt_cost;
  }
  
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c

index 2c91987480bfd1b8fea6764de992116112fb0933..8af5cc5fc0ccfa5edbb2966344a775b26ad87fdc 100644 (file)
--- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c
@@ -21,8 +21,9 @@ void move_replacements (rtx *x, rtx *y, int n_replacements)
        }
  }
  
-/* { dg-final { scan-assembler-not {e64,m2} { xfail *-*-* } } } */
-/* { dg-final { scan-assembler {e64,m4} { xfail *-*-* } } } */
+/* { dg-final { scan-assembler {e64,m1} } } */
+/* { dg-final { scan-assembler-not {e64,m2} } } */
+/* { dg-final { scan-assembler-not {e64,m4} } } */
  /* { dg-final { scan-assembler-not {jr} } } */
  /* { dg-final { scan-assembler {ret} } } */
  /* { dg-final { scan-assembler-not {sp} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/dyn-lmul-conv-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/dyn-lmul-conv-1.c

index b07bd86f76e46790f610aca5554ee3f0211e84c4..91d777a58a78c5d798fc96e306d69895a695faf8 100644 (file)
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/dyn-lmul-conv-1.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/dyn-lmul-conv-1.c
@@ -37,6 +37,7 @@ void foo8x (long *restrict a, char *restrict b, int n)
      a[i] = b[i];
  }
  
+/* { dg-final { scan-assembler-times ",m1," 3 } } */
  /* { dg-final { scan-assembler-times ",m2," 3 } } */
-/* { dg-final { scan-assembler-times ",m4," 2 } } */
-/* { dg-final { scan-assembler-times ",m8," 1 } } */
+/* { dg-final { scan-assembler-times ",m4," 4 } } */
+/* { dg-final { scan-assembler-times ",m8," 2 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/dyn-lmul-conv-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/dyn-lmul-conv-2.c

index c37e4dd63f2003f204d788d276b62b9d20a16500..468f061e3b1cc7688ea969e9a9f59ff30a4eab7a 100644 (file)
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/dyn-lmul-conv-2.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/dyn-lmul-conv-2.c
@@ -37,7 +37,8 @@ void foo8x (unsigned char *restrict a, unsigned long *restrict b, int n)
      a[i] = b[i];
  }
  
-/* { dg-final { scan-assembler-times ",m1," 6 } } */
+/* { dg-final { scan-assembler-times ",m1," 7 } } */
  /* { dg-final { scan-assembler-times ",m2," 3 } } */
  /* { dg-final { scan-assembler-times ",m4," 1 } } */
+/* { dg-final { scan-assembler-times ",m8," 1 } } */
  /* { dg-final { scan-assembler-not ",mf2," } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c

index 1c7503b770eef9730d53b441fa6484ed4d113f3b..b31453852b2319c070f852cfb885fb6a5ec06bb5 100644 (file)
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c
@@ -19,8 +19,7 @@ f (uint8_t *restrict a, uint8_t *restrict b, int n)
      }
  }
  
-/* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES are chosen
-   instead of SLP when rvv-autotec-max-lmul=m1.  */
-/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "-mrvv-max-lmul=m1" } } } } */
-/* { dg-final { scan-assembler {\tvid\.v} { xfail { any-opts "-mrvv-max-lmul=m1"} } } } */
+/* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES are chosen instead of SLP.  */
+/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "-mrvv-max-lmul=m1" "-mrvv-max-lmul=m2" "-mrvv-max-lmul=m4" "-mrvv-max-lmul=m8" } } } } */
+/* { dg-final { scan-assembler {\tvid\.v} { xfail { any-opts "-mrvv-max-lmul=m1" "-mrvv-max-lmul=m2" "-mrvv-max-lmul=m4" "-mrvv-max-lmul=m8" } } } } */
  /* { dg-final { scan-assembler-not {\tvmul} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c

index a10a7c831b1b5a9b5f021f10c1744148ced67f55..2b2099d6e604f63aee27b1d59839dd07725e51ac 100644 (file)
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c
@@ -20,5 +20,5 @@ f (int8_t *restrict a, int8_t *restrict b, int n)
  }
  
  /* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES are chosen
-   instead of SLP when rvv-autotec-max-lmul=m1.  */
-/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "-mrvv-max-lmul=m1" } } } } */
+   instead of SLP.  */
+/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "-mrvv-max-lmul=m1" "-mrvv-max-lmul=m2" } } } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c

new file mode 100644 (file)

index 0000000..c9dbba6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ftree-vectorize -mabi=lp64d -march=rv64gcv -mrvv-max-lmul=dynamic -fdump-tree-vect-all" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-O1" "-O2" "-Os" "-Og" "-Oz" } } */
+
+#include <stdint-gcc.h>
+
+void dct( int16_t d[16], int16_t dct[16] )
+{
+    int16_t tmp[16];  /* Pass-1 results, written transposed as tmp[k*4+i].  */
+    for( int i = 0; i < 4; i++ )  /* Pass 1: combine d[i*4+0..3].  */
+    {
+        int s03 = d[i*4+0] + d[i*4+3];
+        int s12 = d[i*4+1] + d[i*4+2];
+        int d03 = d[i*4+0] - d[i*4+3];
+        int d12 = d[i*4+1] - d[i*4+2];
+        tmp[0*4+i] =   s03 +   s12;
+        tmp[1*4+i] = 2*d03 +   d12;
+        tmp[2*4+i] =   s03 -   s12;
+        tmp[3*4+i] =   d03 - 2*d12;
+    }
+    for( int i = 0; i < 4; i++ )  /* Pass 2: same sums/differences on tmp.  */
+    {
+        int s03 = tmp[i*4+0] + tmp[i*4+3];
+        int s12 = tmp[i*4+1] + tmp[i*4+2];
+        int d03 = tmp[i*4+0] - tmp[i*4+3];
+        int d12 = tmp[i*4+1] - tmp[i*4+2];
+
+        dct[i*4+0] =   s03 +   s12;  /* int results stored as int16_t.  */
+        dct[i*4+1] = 2*d03 +   d12;
+        dct[i*4+2] =   s03 -   s12;
+        dct[i*4+3] =   d03 - 2*d12;
+    }
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "Choosing vector mode RVVMF2QI" "vect" } } */
+
author	Zhongyao Chen <chenzhongyao.hit@gmail.com>
	Wed, 6 May 2026 13:04:25 +0000 (07:04 -0600)
committer	Jeff Law <jeffrey.law@oss.qualcomm.com>
	Wed, 6 May 2026 13:04:25 +0000 (07:04 -0600)
gcc/config/riscv/riscv-vector-costs.cc		patch \| blob \| blame \| history
gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/riscv/rvv/autovec/dyn-lmul-conv-1.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/riscv/rvv/autovec/dyn-lmul-conv-2.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c	[new file with mode: 0644]	patch \| blob