From: Richard Sandiford Date: Wed, 13 Nov 2019 09:12:17 +0000 (+0000) Subject: Account for the cost of generating loop masks X-Git-Tag: misc/cutover-git~1312 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=61e5f2df0345bcc1f7675125922692d727e20603;p=thirdparty%2Fgcc.git Account for the cost of generating loop masks We didn't take the cost of generating loop masks into account, and so tended to underestimate the cost of loops that need multiple masks. 2019-11-13 Richard Sandiford gcc/ * tree-vect-loop.c (vect_estimate_min_profitable_iters): Include the cost of generating loop masks. gcc/testsuite/ * gcc.target/aarch64/sve/mask_struct_store_3.c: Add -fno-vect-cost-model. * gcc.target/aarch64/sve/mask_struct_store_3_run.c: Likewise. * gcc.target/aarch64/sve/peel_ind_2.c: Likewise. * gcc.target/aarch64/sve/peel_ind_2_run.c: Likewise. * gcc.target/aarch64/sve/peel_ind_3.c: Likewise. * gcc.target/aarch64/sve/peel_ind_3_run.c: Likewise. From-SVN: r278125 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index e7b04334fb51..047052835f0e 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2019-11-13 Richard Sandiford + + * tree-vect-loop.c (vect_estimate_min_profitable_iters): Include + the cost of generating loop masks. + 2019-11-13 Richard Sandiford * tree-vectorizer.h (vect_apply_runtime_profitability_check_p): diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index a253a5397cc0..834c17a6d7f7 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,13 @@ +2019-11-13 Richard Sandiford + + * gcc.target/aarch64/sve/mask_struct_store_3.c: Add + -fno-vect-cost-model. + * gcc.target/aarch64/sve/mask_struct_store_3_run.c: Likewise. + * gcc.target/aarch64/sve/peel_ind_2.c: Likewise. + * gcc.target/aarch64/sve/peel_ind_2_run.c: Likewise. + * gcc.target/aarch64/sve/peel_ind_3.c: Likewise. + * gcc.target/aarch64/sve/peel_ind_3_run.c: Likewise. + 2019-11-13 Richard Sandiford PR c++/92206 diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3.c index 001f5be8ff58..1765d54a483e 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math -fno-vect-cost-model" } */ #include diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3_run.c index 31d661b65945..4dbe0335c72f 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3_run.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3_run.c @@ -1,5 +1,5 @@ /* { dg-do run { target aarch64_sve_hw } } */ -/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math -fno-vect-cost-model" } */ #include "mask_struct_store_3.c" diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c index e792cdf2cad2..df82d58ea770 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c @@ -1,7 +1,7 @@ /* { dg-do compile } */ /* Pick an arbitrary target for which unaligned accesses are more expensive. */ -/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx" } */ +/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx -fno-vect-cost-model" } */ #define N 512 #define START 7 diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2_run.c index 9c5ae1bd0686..b9785356d182 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2_run.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2_run.c @@ -1,6 +1,6 @@ /* { dg-do run { target aarch64_sve_hw } } */ /* { dg-options "-O3 -mtune=thunderx" } */ -/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */ +/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256 -fno-vect-cost-model" { target aarch64_sve256_hw } } */ #include "peel_ind_2.c" diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c index 441589eef600..1707f02fe92a 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c @@ -1,7 +1,7 @@ /* { dg-do compile } */ /* Pick an arbitrary target for which unaligned accesses are more expensive. */ -/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx" } */ +/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx -fno-vect-cost-model" } */ #define N 32 #define MAX_START 8 diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c index 384a38eb8ec5..98389675d79d 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c @@ -1,6 +1,6 @@ /* { dg-do run { target aarch64_sve_hw } } */ -/* { dg-options "-O3 -mtune=thunderx" } */ -/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */ +/* { dg-options "-O3 -mtune=thunderx -fno-vect-cost-model" } */ +/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256 -fno-vect-cost-model" { target aarch64_sve256_hw } } */ #include "peel_ind_3.c" diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 83fb8486640d..005fa308911b 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -3291,6 +3291,32 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, si->kind, si->stmt_info, si->misalign, vect_epilogue); } + + /* Calculate how many masks we need to generate. */ + unsigned int num_masks = 0; + rgroup_masks *rgm; + unsigned int num_vectors_m1; + FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm) + if (rgm->mask_type) + num_masks += num_vectors_m1 + 1; + gcc_assert (num_masks > 0); + + /* In the worst case, we need to generate each mask in the prologue + and in the loop body. One of the loop body mask instructions + replaces the comparison in the scalar loop, and since we don't + count the scalar comparison against the scalar body, we shouldn't + count that vector instruction against the vector body either. + + Sometimes we can use unpacks instead of generating prologue + masks and sometimes the prologue mask will fold to a constant, + so the actual prologue cost might be smaller. However, it's + simpler and safer to use the worst-case cost; if this ends up + being the tie-breaker between vectorizing or not, then it's + probably better not to vectorize. */ + (void) add_stmt_cost (target_cost_data, num_masks, vector_stmt, + NULL, 0, vect_prologue); + (void) add_stmt_cost (target_cost_data, num_masks - 1, vector_stmt, + NULL, 0, vect_body); } else if (npeel < 0) {