git.ipfire.org Git - thirdparty/gcc.git/commitdiff
[PATCH v4] RISC-V: Apply LMUL cost scaling to vector operations
author Zhongyao Chen <chenzhongyao.hit@gmail.com>
Wed, 6 May 2026 13:04:25 +0000 (07:04 -0600)
committer Jeff Law <jeffrey.law@oss.qualcomm.com>
Wed, 6 May 2026 13:04:25 +0000 (07:04 -0600)
This patch introduces multiplicative cost scaling (x2/x4/x8) to model
the higher latency and register pressure of larger LMULs.  The scaling
is applied uniformly in adjust_stmt_cost for all vector operations.

In addition to VLA, VLS should also get the same LMUL cost scaling,
but doing so causes too many testsuite regressions currently,
mostly because these tests also need expectation updates.
This is left for future work.

All failures displayed in CI should have been fixed. Changes here are
all expectation updates, except for slp_run-17.c which is pre-existing
— I will open a PR for it later.

PR target/122558

gcc/ChangeLog:

* config/riscv/riscv-vector-costs.cc (get_lmul_cost_scaling):
New function to calculate multiplicative scaling factors.
(costs::adjust_stmt_cost): Apply LMUL scaling uniformly to all
vector statements.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/pr122558.c: New test.
* gcc.target/riscv/rvv/autovec/dyn-lmul-conv-1.c: Update expected
dump counts after VLA LMUL cost scaling.
* gcc.target/riscv/rvv/autovec/dyn-lmul-conv-2.c: Likewise.
* gcc.target/riscv/rvv/autovec/partial/slp-16.c: Likewise.
* gcc.target/riscv/rvv/autovec/partial/slp-5.c: Likewise.
* gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c: Likewise.

Signed-off-by: Zhongyao Chen <chen.zhongyao@zte.com.cn>
gcc/config/riscv/riscv-vector-costs.cc
gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c
gcc/testsuite/gcc.target/riscv/rvv/autovec/dyn-lmul-conv-1.c
gcc/testsuite/gcc.target/riscv/rvv/autovec/dyn-lmul-conv-2.c
gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c
gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c
gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c [new file with mode: 0644]

index f582551eba79ea9e066da16bbbfcaa58459a66f3..e678e0de766e8395a235908746ecd17264ce86ea 100644 (file)
@@ -1235,6 +1235,45 @@ segment_loadstore_group_size (enum vect_cost_for_stmt kind,
   return 0;
 }
 
+/* Return the multiplicative cost factor for vector mode MODE.
+   Larger LMUL values process more data but have proportionally
+   higher latency and register pressure, so LMUL_2, LMUL_4 and LMUL_8
+   yield 2, 4 and 8 respectively.
+
+   Every other case yields 1 (no scaling): LMUL_1, the fractional LMULs,
+   and any mode for which riscv_vla_mode_p is false.  */
+static unsigned
+get_lmul_cost_scaling (machine_mode mode)
+{
+  if (!riscv_vla_mode_p (mode))
+    return 1;
+
+  enum vlmul_type vlmul = get_vlmul (mode);
+
+  /* Cost scaling based on LMUL and data processed.
+     Larger LMUL values have proportionally higher latency:
+     - m1 (LMUL_1): 1x (baseline)
+     - m2 (LMUL_2): 2x (processes 2x data, ~2x latency)
+     - m4 (LMUL_4): 4x (processes 4x data, ~4x latency)
+     - m8 (LMUL_8): 8x (processes 8x data, ~8x latency)
+     - mf2/mf4/mf8: 1x (fractional LMUL, already efficient)  */
+  switch (vlmul)
+    {
+    case LMUL_2:
+      return 2;
+    case LMUL_4:
+      return 4;
+    case LMUL_8:
+      return 8;
+    case LMUL_1:
+    case LMUL_F2:
+    case LMUL_F4:
+    case LMUL_F8:
+    default:
+      return 1;
+    }
+}
+
 /* Adjust vectorization cost after calling riscv_builtin_vectorization_cost.
    For some statement, we would like to further fine-grain tweak the cost on
    top of riscv_builtin_vectorization_cost handling which doesn't have any
@@ -1379,6 +1418,17 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
     default:
       break;
     }
+
+  /* Apply LMUL cost scaling uniformly to all vector operations.
+     Larger LMUL values have higher latency and register pressure,
+     which affects performance regardless of loop structure.  */
+  if (vectype)
+    {
+      unsigned lmul_factor = get_lmul_cost_scaling (TYPE_MODE (vectype));
+      if (lmul_factor > 1)
+       stmt_cost *= lmul_factor;
+    }
+
   return stmt_cost;
 }
 
index 2c91987480bfd1b8fea6764de992116112fb0933..8af5cc5fc0ccfa5edbb2966344a775b26ad87fdc 100644 (file)
@@ -21,8 +21,9 @@ void move_replacements (rtx *x, rtx *y, int n_replacements)
       }
 }
 
-/* { dg-final { scan-assembler-not {e64,m2} { xfail *-*-* } } } */
-/* { dg-final { scan-assembler {e64,m4} { xfail *-*-* } } } */
+/* { dg-final { scan-assembler {e64,m1} } } */
+/* { dg-final { scan-assembler-not {e64,m2} } } */
+/* { dg-final { scan-assembler-not {e64,m4} } } */
 /* { dg-final { scan-assembler-not {jr} } } */
 /* { dg-final { scan-assembler {ret} } } */
 /* { dg-final { scan-assembler-not {sp} } } */
index b07bd86f76e46790f610aca5554ee3f0211e84c4..91d777a58a78c5d798fc96e306d69895a695faf8 100644 (file)
@@ -37,6 +37,7 @@ void foo8x (long *restrict a, char *restrict b, int n)
     a[i] = b[i];
 }
 
+/* { dg-final { scan-assembler-times ",m1," 3 } } */
 /* { dg-final { scan-assembler-times ",m2," 3 } } */
-/* { dg-final { scan-assembler-times ",m4," 2 } } */
-/* { dg-final { scan-assembler-times ",m8," 1 } } */
+/* { dg-final { scan-assembler-times ",m4," 4 } } */
+/* { dg-final { scan-assembler-times ",m8," 2 } } */
index c37e4dd63f2003f204d788d276b62b9d20a16500..468f061e3b1cc7688ea969e9a9f59ff30a4eab7a 100644 (file)
@@ -37,7 +37,8 @@ void foo8x (unsigned char *restrict a, unsigned long *restrict b, int n)
     a[i] = b[i];
 }
 
-/* { dg-final { scan-assembler-times ",m1," 6 } } */
+/* { dg-final { scan-assembler-times ",m1," 7 } } */
 /* { dg-final { scan-assembler-times ",m2," 3 } } */
 /* { dg-final { scan-assembler-times ",m4," 1 } } */
+/* { dg-final { scan-assembler-times ",m8," 1 } } */
 /* { dg-final { scan-assembler-not ",mf2," } } */
index 1c7503b770eef9730d53b441fa6484ed4d113f3b..b31453852b2319c070f852cfb885fb6a5ec06bb5 100644 (file)
@@ -19,8 +19,7 @@ f (uint8_t *restrict a, uint8_t *restrict b, int n)
     }
 }
 
-/* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES are chosen
-   instead of SLP when rvv-autotec-max-lmul=m1.  */
-/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "-mrvv-max-lmul=m1" } } } } */
-/* { dg-final { scan-assembler {\tvid\.v} { xfail { any-opts "-mrvv-max-lmul=m1"} } } } */
+/* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES are chosen instead of SLP.  */
+/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "-mrvv-max-lmul=m1" "-mrvv-max-lmul=m2" "-mrvv-max-lmul=m4" "-mrvv-max-lmul=m8" } } } } */
+/* { dg-final { scan-assembler {\tvid\.v} { xfail { any-opts "-mrvv-max-lmul=m1" "-mrvv-max-lmul=m2" "-mrvv-max-lmul=m4" "-mrvv-max-lmul=m8" } } } } */
 /* { dg-final { scan-assembler-not {\tvmul} } } */
index a10a7c831b1b5a9b5f021f10c1744148ced67f55..2b2099d6e604f63aee27b1d59839dd07725e51ac 100644 (file)
@@ -20,5 +20,5 @@ f (int8_t *restrict a, int8_t *restrict b, int n)
 }
 
 /* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES are chosen
-   instead of SLP when rvv-autotec-max-lmul=m1.  */
-/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "-mrvv-max-lmul=m1" } } } } */
+   instead of SLP.  */
+/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "-mrvv-max-lmul=m1" "-mrvv-max-lmul=m2" } } } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c
new file mode 100644 (file)
index 0000000..c9dbba6
--- /dev/null
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ftree-vectorize -mabi=lp64d -march=rv64gcv -mrvv-max-lmul=dynamic -fdump-tree-vect-all" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-O1" "-O2" "-Os" "-Og" "-Oz" } } */
+
+#include <stdint-gcc.h>
+
+void dct( int16_t d[16], int16_t dct[16] )
+{
+    int16_t tmp[16];  /* First-pass output, consumed by the second pass.  */
+    for( int i = 0; i < 4; i++ )  /* Pass 1: read d[i*4+k], write tmp[k*4+i].  */
+    {
+        int s03 = d[i*4+0] + d[i*4+3];
+        int s12 = d[i*4+1] + d[i*4+2];
+        int d03 = d[i*4+0] - d[i*4+3];
+        int d12 = d[i*4+1] - d[i*4+2];
+        tmp[0*4+i] =   s03 +   s12;
+        tmp[1*4+i] = 2*d03 +   d12;
+        tmp[2*4+i] =   s03 -   s12;
+        tmp[3*4+i] =   d03 - 2*d12;
+    }
+    for( int i = 0; i < 4; i++ )  /* Pass 2: read tmp[i*4+k], write dct[i*4+k].  */
+    {
+        int s03 = tmp[i*4+0] + tmp[i*4+3];
+        int s12 = tmp[i*4+1] + tmp[i*4+2];
+        int d03 = tmp[i*4+0] - tmp[i*4+3];
+        int d12 = tmp[i*4+1] - tmp[i*4+2];
+
+        dct[i*4+0] =   s03 +   s12;
+        dct[i*4+1] = 2*d03 +   d12;
+        dct[i*4+2] =   s03 -   s12;
+        dct[i*4+3] =   d03 - 2*d12;
+    }
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "Choosing vector mode RVVMF2QI" "vect" } } */
+