From: Tamar Christina
Date: Sun, 22 Sep 2024 12:34:10 +0000 (+0100)
Subject: aarch64: Take into account when VF is higher than known scalar iters
X-Git-Tag: basepoints/gcc-16~5786
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e84e5d034124c6733d3b36d8623c56090d4d17f7;p=thirdparty%2Fgcc.git

aarch64: Take into account when VF is higher than known scalar iters

Consider low-overhead loops like:

void
foo (char *restrict a, int *restrict b, int *restrict c, int n)
{
  for (int i = 0; i < 9; i++)
    {
      int res = c[i];
      int t = b[i];
      if (a[i] != 0)
	res = t;
      c[i] = res;
    }
}

For such loops we use latency-only costing, since the loop bounds are
known and small.

The current costing, however, does not consider the case where
niters < VF.  When comparing the scalar and vector costs it therefore
charges the scalar loop for VF iterations that it can never execute.
This overestimates the cost of the scalar loop and we incorrectly
vectorize.

This patch takes the minimum of the VF and niters in such cases.
Before the patch we generate:

note: Original vector body cost = 46
note: Vector loop iterates at most 1 times
note: Scalar issue estimate:
note:   load operations = 2
note:   store operations = 1
note:   general operations = 1
note:   reduction latency = 0
note:   estimated min cycles per iteration = 1.000000
note:   estimated cycles per vector iteration (for VF 32) = 32.000000
note: SVE issue estimate:
note:   load operations = 5
note:   store operations = 4
note:   general operations = 11
note:   predicate operations = 12
note:   reduction latency = 0
note:   estimated min cycles per iteration without predication = 5.500000
note:   estimated min cycles per iteration for predication = 12.000000
note:   estimated min cycles per iteration = 12.000000
note: Low iteration count, so using pure latency costs
note: Cost model analysis:

and after:

note: Original vector body cost = 46
note: Known loop bounds, capping VF to 9 for analysis
note: Vector loop iterates at most 1 times
note: Scalar issue estimate:
note:   load operations = 2
note:   store operations = 1
note:   general operations = 1
note:   reduction latency = 0
note:   estimated min cycles per iteration = 1.000000
note:   estimated cycles per vector iteration (for VF 9) = 9.000000
note: SVE issue estimate:
note:   load operations = 5
note:   store operations = 4
note:   general operations = 11
note:   predicate operations = 12
note:   reduction latency = 0
note:   estimated min cycles per iteration without predication = 5.500000
note:   estimated min cycles per iteration for predication = 12.000000
note:   estimated min cycles per iteration = 12.000000
note: Increasing body cost to 1472 because the scalar code could issue within the limit imposed by predicate operations
note: Low iteration count, so using pure latency costs
note: Cost model analysis:

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (adjust_body_cost): Cap VF for low
	iteration loops.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/asrdiv_4.c: Update bounds.
	* gcc.target/aarch64/sve/cond_asrd_2.c: Likewise.
	* gcc.target/aarch64/sve/cond_uxt_6.c: Likewise.
	* gcc.target/aarch64/sve/cond_uxt_7.c: Likewise.
	* gcc.target/aarch64/sve/cond_uxt_8.c: Likewise.
	* gcc.target/aarch64/sve/miniloop_1.c: Likewise.
	* gcc.target/aarch64/sve/spill_6.c: Likewise.
	* gcc.target/aarch64/sve/sve_iters_low_1.c: New test.
	* gcc.target/aarch64/sve/sve_iters_low_2.c: New test.
---
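To make the effect of the cap concrete, the comparison from the dumps
above can be replayed in a small standalone C sketch.  This is an
illustration only, not the GCC cost-model code: fractional_cost is
simplified to a double, and the VF and cycle estimates are the ones
printed in the dumps.

#include <stdio.h>

int
main (void)
{
  long niters = 9;	       /* Known scalar iteration count.  */
  int estimated_vf = 32;       /* Estimated VF from the first dump.  */
  double scalar_cycles = 1.0;  /* Scalar min cycles per iteration.  */
  double sve_cycles = 12.0;    /* SVE min cycles per iteration.  */

  /* Before the patch the scalar loop is charged for a full VF's worth
     of iterations: 1.0 * 32 = 32 cycles, which loses to SVE's 12.  */
  printf ("uncapped: scalar %.1f vs sve %.1f\n",
	  scalar_cycles * estimated_vf, sve_cycles);

  /* The scalar loop can only ever run niters iterations, so cap the
     VF used for costing, as the patch below does with MIN.  */
  if (niters < estimated_vf)
    estimated_vf = niters;

  /* Now the scalar estimate is 1.0 * 9 = 9 cycles: the scalar loop
     wins and the loop is no longer vectorized.  */
  printf ("capped:   scalar %.1f vs sve %.1f\n",
	  scalar_cycles * estimated_vf, sve_cycles);
  return 0;
}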
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 92763d403c7..68913beaee2 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -17565,6 +17565,19 @@ adjust_body_cost (loop_vec_info loop_vinfo,
     dump_printf_loc (MSG_NOTE, vect_location,
 		     "Original vector body cost = %d\n", body_cost);
 
+  /* If we know we have a single partial vector iteration, cap the VF
+     to the number of scalar iterations for costing purposes.  */
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+    {
+      auto niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
+      if (niters < estimated_vf && dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "Scalar loop iterates at most %wd times.  Capping VF "
+			 "from %d to %wd\n", niters, estimated_vf, niters);
+
+      estimated_vf = MIN (estimated_vf, niters);
+    }
+
   fractional_cost scalar_cycles_per_iter
     = scalar_ops.min_cycles_per_iter () * estimated_vf;
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c b/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c
index 6684fe1c124..10a96a894af 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c
@@ -15,12 +15,12 @@
 }
 
 #define TEST_ALL(T) \
-  T (int16_t, int8_t, 7) \
-  T (int32_t, int8_t, 3) \
-  T (int32_t, int16_t, 3) \
-  T (int64_t, int8_t, 5) \
-  T (int64_t, int16_t, 5) \
-  T (int64_t, int32_t, 5)
+  T (int16_t, int8_t, 70) \
+  T (int32_t, int8_t, 30) \
+  T (int32_t, int16_t, 30) \
+  T (int64_t, int8_t, 50) \
+  T (int64_t, int16_t, 50) \
+  T (int64_t, int32_t, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c
index e4040ee3520..db1721efbc7 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c
@@ -14,12 +14,12 @@
 }
 
 #define TEST_ALL(T) \
-  T (int16_t, int8_t, 7) \
-  T (int32_t, int8_t, 3) \
-  T (int32_t, int16_t, 3) \
-  T (int64_t, int8_t, 5) \
-  T (int64_t, int16_t, 5) \
-  T (int64_t, int32_t, 5)
+  T (int16_t, int8_t, 70) \
+  T (int32_t, int8_t, 30) \
+  T (int32_t, int16_t, 30) \
+  T (int64_t, int8_t, 50) \
+  T (int64_t, int16_t, 50) \
+  T (int64_t, int32_t, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c
index e47276a3a35..b8b3e862d0a 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c
@@ -14,11 +14,11 @@
 }
 
 #define TEST_ALL(T) \
-  T (int32_t, uint16_t, 0xff, 3) \
+  T (int32_t, uint16_t, 0xff, 30) \
   \
-  T (int64_t, uint16_t, 0xff, 5) \
-  T (int64_t, uint32_t, 0xff, 5) \
-  T (int64_t, uint32_t, 0xffff, 5)
+  T (int64_t, uint16_t, 0xff, 50) \
+  T (int64_t, uint32_t, 0xff, 50) \
+  T (int64_t, uint32_t, 0xffff, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c
index f49915c4ac1..2d02fb70f33 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c
@@ -14,11 +14,11 @@
 }
 
 #define TEST_ALL(T) \
-  T (int32_t, uint16_t, 0xff, 3) \
+  T (int32_t, uint16_t, 0xff, 30) \
   \
-  T (int64_t, uint16_t, 0xff, 5) \
-  T (int64_t, uint32_t, 0xff, 5) \
-  T (int64_t, uint32_t, 0xffff, 5)
+  T (int64_t, uint16_t, 0xff, 50) \
+  T (int64_t, uint32_t, 0xff, 50) \
+  T (int64_t, uint32_t, 0xffff, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c
index 42eb4b2661b..8fe2455687b 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c
@@ -14,11 +14,11 @@
 }
 
 #define TEST_ALL(T) \
-  T (int32_t, uint16_t, 0xff, 3) \
+  T (int32_t, uint16_t, 0xff, 30) \
   \
-  T (int64_t, uint16_t, 0xff, 5) \
-  T (int64_t, uint32_t, 0xff, 5) \
-  T (int64_t, uint32_t, 0xffff, 5)
+  T (int64_t, uint16_t, 0xff, 50) \
+  T (int64_t, uint32_t, 0xff, 50) \
+  T (int64_t, uint32_t, 0xffff, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c b/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c
index 09eb4146816..cd1fd2b8a07 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c
@@ -6,7 +6,7 @@ void loop (int * __restrict__ a, int * __restrict__ b, int * __restrict__ c,
 	   int * __restrict__ g, int * __restrict__ h)
 {
   int i = 0;
-  for (i = 0; i < 3; i++)
+  for (i = 0; i < 30; i++)
     {
       a[i] += i;
       b[i] += i;
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c b/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c
index ae9c338f569..2ff969ced00 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c
@@ -11,20 +11,20 @@ void consumer (void *);
   {						\
     if (which)					\
       {						\
-	for (int i = 0; i < 7; ++i)		\
+	for (int i = 0; i < 70; ++i)		\
 	  x1[i] += VAL;				\
 	consumer (x1);				\
-	for (int i = 0; i < 7; ++i)		\
+	for (int i = 0; i < 70; ++i)		\
 	  x2[i] -= VAL;				\
 	consumer (x2);				\
       }						\
     else					\
       {						\
-	for (int i = 0; i < 7; ++i)		\
+	for (int i = 0; i < 70; ++i)		\
 	  x3[i] &= VAL;				\
 	consumer (x3);				\
       }						\
-    for (int i = 0; i < 7; ++i)			\
+    for (int i = 0; i < 70; ++i)		\
       x4[i] |= VAL;				\
     consumer (x4);				\
   }
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_1.c b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_1.c
new file mode 100644
index 00000000000..952a4b1cd58
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_1.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv9-a -Ofast -fdump-tree-vect-details" } */
+
+void
+foo (char *restrict a, int *restrict b, int *restrict c, int n)
+{
+  for (int i = 0; i < 9; i++)
+    {
+      int res = c[i];
+      int t = b[i];
+      if (a[i] != 0)
+        res = t;
+      c[i] = res;
+    }
+}
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_2.c b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_2.c
new file mode 100644
index 00000000000..02d10de2a62
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv9-a -Ofast -fdump-tree-vect-details" } */
+
+void
+foo (char *restrict a, int *restrict b, int *restrict c, int n, int stride)
+{
+  if (stride <= 1)
+    return;
+
+  for (int i = 0; i < 9; i++)
+    {
+      int res = c[i];
+      int t = b[i*stride];
+      if (a[i] != 0)
+        res = t;
+      c[i] = res;
+    }
+}
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
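
The two new tests check that the loop from the commit message, and a
variant whose loads depend on a runtime stride, are no longer
vectorized.  Assuming an aarch64 native or cross build, they can be run
on their own through the SVE test harness in the usual DejaGnu way,
e.g. from the build directory:

make -C gcc check-gcc RUNTESTFLAGS="aarch64-sve.exp=sve_iters_low_*.c"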