return 0;
}
+/* Return the multiplier applied to a vector statement's cost to account
+   for the LMUL of MODE.
+
+   The result is 2, 4 or 8 when MODE is a VLA mode whose LMUL is m2, m4
+   or m8 respectively, and 1 (no scaling) in every other case: MODE is
+   not a VLA mode, or its LMUL is m1 or fractional (mf2/mf4/mf8).
+
+   The linear scaling models a larger register group as processing
+   proportionally more data at proportionally higher latency and
+   register pressure; fractional LMUL is treated as the m1 baseline.  */
+static unsigned
+get_lmul_cost_scaling (machine_mode mode)
+{
+  /* Only VLA modes are subject to scaling.  */
+  if (!riscv_vla_mode_p (mode))
+    return 1;
+
+  const enum vlmul_type lmul = get_vlmul (mode);
+
+  if (lmul == LMUL_2)
+    return 2;
+  if (lmul == LMUL_4)
+    return 4;
+  if (lmul == LMUL_8)
+    return 8;
+
+  /* LMUL_1, LMUL_F2, LMUL_F4, LMUL_F8 and any other value: baseline.  */
+  return 1;
+}
+
/* Adjust vectorization cost after calling riscv_builtin_vectorization_cost.
For some statement, we would like to further fine-grain tweak the cost on
top of riscv_builtin_vectorization_cost handling which doesn't have any
default:
break;
}
+
+ /* Apply LMUL cost scaling uniformly to all vector operations.
+ Larger LMUL values have higher latency and register pressure,
+ which affects performance regardless of loop structure. */
+ if (vectype)
+ {
+ unsigned lmul_factor = get_lmul_cost_scaling (TYPE_MODE (vectype));
+ if (lmul_factor > 1)
+ stmt_cost *= lmul_factor;
+ }
+
return stmt_cost;
}
}
}
-/* { dg-final { scan-assembler-not {e64,m2} { xfail *-*-* } } } */
-/* { dg-final { scan-assembler {e64,m4} { xfail *-*-* } } } */
+/* { dg-final { scan-assembler {e64,m1} } } */
+/* { dg-final { scan-assembler-not {e64,m2} } } */
+/* { dg-final { scan-assembler-not {e64,m4} } } */
/* { dg-final { scan-assembler-not {jr} } } */
/* { dg-final { scan-assembler {ret} } } */
/* { dg-final { scan-assembler-not {sp} } } */
a[i] = b[i];
}
+/* { dg-final { scan-assembler-times ",m1," 3 } } */
/* { dg-final { scan-assembler-times ",m2," 3 } } */
-/* { dg-final { scan-assembler-times ",m4," 2 } } */
-/* { dg-final { scan-assembler-times ",m8," 1 } } */
+/* { dg-final { scan-assembler-times ",m4," 4 } } */
+/* { dg-final { scan-assembler-times ",m8," 2 } } */
a[i] = b[i];
}
-/* { dg-final { scan-assembler-times ",m1," 6 } } */
+/* { dg-final { scan-assembler-times ",m1," 7 } } */
/* { dg-final { scan-assembler-times ",m2," 3 } } */
/* { dg-final { scan-assembler-times ",m4," 1 } } */
+/* { dg-final { scan-assembler-times ",m8," 1 } } */
/* { dg-final { scan-assembler-not ",mf2," } } */
}
}
-/* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES are chosen
- instead of SLP when rvv-autotec-max-lmul=m1. */
-/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "-mrvv-max-lmul=m1" } } } } */
-/* { dg-final { scan-assembler {\tvid\.v} { xfail { any-opts "-mrvv-max-lmul=m1"} } } } */
+/* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES are chosen instead of SLP. */
+/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "-mrvv-max-lmul=m1" "-mrvv-max-lmul=m2" "-mrvv-max-lmul=m4" "-mrvv-max-lmul=m8" } } } } */
+/* { dg-final { scan-assembler {\tvid\.v} { xfail { any-opts "-mrvv-max-lmul=m1" "-mrvv-max-lmul=m2" "-mrvv-max-lmul=m4" "-mrvv-max-lmul=m8" } } } } */
/* { dg-final { scan-assembler-not {\tvmul} } } */
}
/* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES are chosen
- instead of SLP when rvv-autotec-max-lmul=m1. */
-/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "-mrvv-max-lmul=m1" } } } } */
+ instead of SLP. */
+/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "-mrvv-max-lmul=m1" "-mrvv-max-lmul=m2" } } } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3 -ftree-vectorize -mabi=lp64d -march=rv64gcv -mrvv-max-lmul=dynamic -fdump-tree-vect-all" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-O1" "-O2" "-Os" "-Og" "-Oz" } } */
+
+#include <stdint-gcc.h>
+
+/* Vectorizer test input: a two-pass transform over 16 int16_t values.
+   D is the input and DCT the output; inside the body the parameter
+   named `dct' hides the function's own name.  Each pass handles four
+   groups of four elements, forming the sums and differences of the
+   outer pair (indices 0 and 3) and the inner pair (indices 1 and 2)
+   and combining them into four results.  The arithmetic is done in
+   int and narrowed implicitly when stored to int16_t.  */
+void dct( int16_t d[16], int16_t dct[16] )
+{
+ int16_t tmp[16];
+ /* First pass: read group I of D contiguously (d[i*4+k]) and store the
+    four results with stride 4 (tmp[k*4+i]), i.e. transposed.  */
+ for( int i = 0; i < 4; i++ )
+ {
+ int s03 = d[i*4+0] + d[i*4+3];
+ int s12 = d[i*4+1] + d[i*4+2];
+ int d03 = d[i*4+0] - d[i*4+3];
+ int d12 = d[i*4+1] - d[i*4+2];
+ tmp[0*4+i] = s03 + s12;
+ tmp[1*4+i] = 2*d03 + d12;
+ tmp[2*4+i] = s03 - s12;
+ tmp[3*4+i] = d03 - 2*d12;
+ }
+ /* Second pass: same computation on group I of TMP, but both the reads
+    (tmp[i*4+k]) and the stores (dct[i*4+k]) are contiguous.  */
+ for( int i = 0; i < 4; i++ )
+ {
+ int s03 = tmp[i*4+0] + tmp[i*4+3];
+ int s12 = tmp[i*4+1] + tmp[i*4+2];
+ int d03 = tmp[i*4+0] - tmp[i*4+3];
+ int d12 = tmp[i*4+1] - tmp[i*4+2];
+
+ dct[i*4+0] = s03 + s12;
+ dct[i*4+1] = 2*d03 + d12;
+ dct[i*4+2] = s03 - s12;
+ dct[i*4+3] = d03 - 2*d12;
+ }
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "Choosing vector mode RVVMF2QI" "vect" } } */
+