This patch removes the AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS tunable and
the use_new_vector_costs entry in aarch64-tuning-flags.def, and makes the
AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS paths in the backend the default.
To that end, the function aarch64_use_new_vector_costs_p and its uses were
removed.
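
For illustration, here is a minimal self-contained sketch of the kind of
predicate this removes (modeled on, but not identical to, the GCC sources;
the struct and flag definitions are simplified stand-ins):

#include <cstdint>

/* Simplified stand-ins for the backend's tuning machinery.  */
constexpr uint64_t AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS = 1ULL << 0;
struct tune_params { uint64_t extra_tuning_flags; };
static tune_params aarch64_tune_params = { 0 };

/* Removed by this patch: every caller now behaves as if this
   returned true.  */
static bool
aarch64_use_new_vector_costs_p ()
{
  return (aarch64_tune_params.extra_tuning_flags
          & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS) != 0;
}

A former call site of the form
"if (aarch64_use_new_vector_costs_p ()) ... else ..." therefore collapses
to its then-branch.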
To prevent vec_to_scalar operations from being costed as 0, as described in
https://gcc.gnu.org/pipermail/gcc-patches/2024-October/665481.html,
we adjusted vectorizable_store such that the variable n_adjacent_stores
also covers vec_to_scalar operations. This way, vec_to_scalar operations
are not costed individually, but as a group.
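
As a rough sketch of the grouping idea (self-contained; record_stmt_cost,
n_adjacent_stores and vec_to_scalar are the real identifiers from
tree-vect-stmts.cc, everything else is a simplified stand-in):

#include <vector>

enum vect_cost_for_stmt { scalar_store, vec_to_scalar };
struct stmt_cost { int count; vect_cost_for_stmt kind; };

/* Simplified stand-in for the vectorizer's cost-recording hook.  */
static void
record_stmt_cost (std::vector<stmt_cost> &cost_vec, int count,
                  vect_cost_for_stmt kind)
{
  cost_vec.push_back ({ count, kind });
}

/* Cost a strided store of NSTORES elements.  Instead of recording one
   vec_to_scalar cost per element inside the loop, the element count is
   accumulated in n_adjacent_stores and recorded once, so the target's
   cost hook sees the whole group at once.  */
static void
cost_strided_store (std::vector<stmt_cost> &cost_vec, int nstores)
{
  int n_adjacent_stores = 0;
  for (int i = 0; i < nstores; i++)
    /* Previously: record_stmt_cost (cost_vec, 1, vec_to_scalar) here.  */
    n_adjacent_stores++;

  record_stmt_cost (cost_vec, n_adjacent_stores, vec_to_scalar);
  record_stmt_cost (cost_vec, n_adjacent_stores, scalar_store);
}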
As suggested by Richard Sandiford, the "known_ne" in the multi-lane check
was replaced by "maybe_ne" in order to treat nunits == 1+1X as a vector
rather than a scalar.
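
The distinction matters because an SVE element count such as 1+1X equals 1
only when the runtime vector-length parameter X is 0, so known_ne (nunits, 1U)
is false while maybe_ne (nunits, 1U) is true. Below is a small model of the
two poly_int predicates (GCC's poly-int.h reduced to one constant comparison;
not the real implementation):

#include <cstdio>

/* value = coeff0 + coeff1 * X, where X >= 0 is a runtime parameter.  */
struct poly_uint64 { unsigned long coeff0, coeff1; };

/* Known not-equal: A differs from B for every possible X.  */
static bool
known_ne (poly_uint64 a, unsigned long b)
{
  if (a.coeff1 == 0)
    return a.coeff0 != b;
  return b < a.coeff0 || (b - a.coeff0) % a.coeff1 != 0;
}

/* Maybe not-equal: A differs from B for at least one X.  */
static bool
maybe_ne (poly_uint64 a, unsigned long b)
{
  return a.coeff1 != 0 || a.coeff0 != b;
}

int
main ()
{
  poly_uint64 nunits = { 1, 1 };  /* the 1+1X case from the patch */
  printf ("known_ne: %d  maybe_ne: %d\n",
          known_ne (nunits, 1), maybe_ne (nunits, 1));  /* prints 0 and 1 */
  return 0;
}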
Two tests were adjusted due to changes in codegen. In both cases, the
old code unrolled the loop once, whereas the new code does not:
Example from gcc.target/aarch64/sve/strided_load_2.c (compiled with
-O2 -ftree-vectorize -march=armv8.2-a+sve -mtune=generic -moverride=tune=none):
f_int64_t_32:
cbz w3, .L92
mov x4, 0
uxtw x3, w3
+ cntd x5
+ whilelo p7.d, xzr, x3
+ mov z29.s, w5
mov z31.s, w2
- whilelo p6.d, xzr, x3
- mov x2, x3
- index z30.s, #0, #1
- uqdecd x2
- ptrue p5.b, all
- whilelo p7.d, xzr, x2
+ index z30.d, #0, #1
+ ptrue p6.b, all
.p2align 3,,7
.L94:
- ld1d z27.d, p7/z, [x0, #1, mul vl]
- ld1d z28.d, p6/z, [x0]
- movprfx z29, z31
- mul z29.s, p5/m, z29.s, z30.s
- incw x4
- uunpklo z0.d, z29.s
- uunpkhi z29.d, z29.s
- ld1d z25.d, p6/z, [x1, z0.d, lsl 3]
- ld1d z26.d, p7/z, [x1, z29.d, lsl 3]
- add z25.d, z28.d, z25.d
+ ld1d z27.d, p7/z, [x0, x4, lsl 3]
+ movprfx z28, z31
+ mul z28.s, p6/m, z28.s, z30.s
+ ld1d z26.d, p7/z, [x1, z28.d, uxtw 3]
add z26.d, z27.d, z26.d
- st1d z26.d, p7, [x0, #1, mul vl]
- whilelo p7.d, x4, x2
- st1d z25.d, p6, [x0]
- incw z30.s
- incb x0, all, mul #2
- whilelo p6.d, x4, x3
+ st1d z26.d, p7, [x0, x4, lsl 3]
+ add z30.s, z30.s, z29.s
+ incd x4
+ whilelo p7.d, x4, x3
b.any .L94
.L92:
ret