From: Jan Hubicka Date: Sun, 6 Aug 2023 19:23:31 +0000 (+0200) Subject: Fix profile update after peeled epilogues X-Git-Tag: basepoints/gcc-15~7130 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=838237aeeba578fc2cf42bfd3ecb9d9a4fb7a2b4;p=thirdparty%2Fgcc.git Fix profile update after peeled epilogues Epilogue peeling expects the scalar loop to have same number of executions as the vector loop which is true at the beggining of vectorization. However if the epilogues are vectorized, this is no longer the case. In this situation the loop preheader is replaced by new guard code with correct profile, however loop body is left unscaled. This leads to loop that exists more often then it is entered. This patch add slogic to scale the frequencies down and also to fix profile of original preheader where necesary. Bootstrapped/regtested x86_64-linux, comitted. gcc/ChangeLog: * tree-vect-loop-manip.cc (vect_do_peeling): Fix profile update of peeled epilogues. gcc/testsuite/ChangeLog: * gcc.dg/vect/vect-bitfield-read-1.c: Check profile consistency. * gcc.dg/vect/vect-bitfield-read-2.c: Check profile consistency. * gcc.dg/vect/vect-bitfield-read-3.c: Check profile consistency. * gcc.dg/vect/vect-bitfield-read-4.c: Check profile consistency. * gcc.dg/vect/vect-bitfield-read-5.c: Check profile consistency. * gcc.dg/vect/vect-bitfield-read-6.c: Check profile consistency. * gcc.dg/vect/vect-bitfield-read-7.c: Check profile consistency. * gcc.dg/vect/vect-bitfield-write-1.c: Check profile consistency. * gcc.dg/vect/vect-bitfield-write-2.c: Check profile consistency. * gcc.dg/vect/vect-bitfield-write-3.c: Check profile consistency. * gcc.dg/vect/vect-bitfield-write-4.c: Check profile consistency. * gcc.dg/vect/vect-bitfield-write-5.c: Check profile consistency. * gcc.dg/vect/vect-epilogues-2.c: Check profile consistency. * gcc.dg/vect/vect-epilogues.c: Check profile consistency. * gcc.dg/vect/vect-mask-store-move-1.c: Check profile consistency. --- diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c index 42e50d9f0c8c..147c959568de 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-require-effective-target vect_int } */ /* { dg-require-effective-target vect_shift } */ @@ -39,3 +40,4 @@ int main (void) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-2.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-2.c index a9aeefcd72c0..982e6a7967b0 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-2.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-2.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-require-effective-target vect_shift } */ /* { dg-require-effective-target vect_long_long } */ @@ -42,3 +43,4 @@ int main (void) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-3.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-3.c index c7d0fd26bad5..f2a43c39f50e 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-3.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-3.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-require-effective-target vect_int } */ /* { dg-require-effective-target vect_shift } */ @@ -43,3 +44,4 @@ int main (void) } /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-4.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-4.c index 6a3ed8c0c6fa..9f6f0220664d 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-4.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-4.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-require-effective-target vect_shift } */ /* { dg-require-effective-target vect_long_long } */ @@ -44,3 +45,4 @@ int main (void) /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-5.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-5.c index b2889df8a0a6..662aed104cf6 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-5.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-5.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-require-effective-target vect_int } */ /* { dg-require-effective-target vect_shift } */ @@ -41,3 +42,4 @@ int main (void) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-6.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-6.c index 2445f531be25..9b315d6be86a 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-6.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-6.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-require-effective-target vect_int } */ /* { dg-require-effective-target vect_shift } */ @@ -41,3 +42,4 @@ int main (void) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-7.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-7.c index 4b1ec8a6dab7..6d1043dd9710 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-7.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-7.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-require-effective-target vect_int } */ /* { dg-require-effective-target vect_shift } */ @@ -42,3 +43,4 @@ int main (void) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-1.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-1.c index 22e623530141..7c710cf5a57a 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-1.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-1.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-require-effective-target vect_int } */ #include @@ -38,3 +39,4 @@ int main (void) /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-2.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-2.c index 0c8291c9363d..3b609183c541 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-2.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-2.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-require-effective-target vect_int } */ /* { dg-require-effective-target vect_long_long } */ @@ -42,3 +43,4 @@ int main (void) /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-3.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-3.c index 46fcb02b2f1b..e96da82c214a 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-3.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-3.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-require-effective-target vect_int } */ /* { dg-require-effective-target vect_long_long } */ @@ -43,3 +44,4 @@ int main (void) /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-4.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-4.c index 5a7227a93e46..66442213c9f6 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-4.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-4.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-require-effective-target vect_int } */ #include @@ -41,3 +42,4 @@ int main (void) /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-5.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-5.c index e0b36e411a4a..386de504aad8 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-5.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-5.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-require-effective-target vect_int } */ #include @@ -41,3 +42,4 @@ int main (void) /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-epilogues-2.c b/gcc/testsuite/gcc.dg/vect/vect-epilogues-2.c index b251e1f2dfd2..63c5e231f85b 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-epilogues-2.c +++ b/gcc/testsuite/gcc.dg/vect/vect-epilogues-2.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-do compile } */ int @@ -55,3 +56,4 @@ f6 (int *x, int a) x[a] += 1; return res; } +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-epilogues.c b/gcc/testsuite/gcc.dg/vect/vect-epilogues.c index ab7e8a1a7599..11b8c83b7bae 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-epilogues.c +++ b/gcc/testsuite/gcc.dg/vect/vect-epilogues.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-do compile } */ /* Copied from PR 88915. */ @@ -17,3 +18,4 @@ void pixel_avg( unsigned char *dst, int i_dst_stride, } /* { dg-final { scan-tree-dump "LOOP EPILOGUE VECTORIZED" "vect" { target vect_multiple_sizes xfail { { arm32 && be } || vect_partial_vectors_usage_2 } } } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-mask-store-move-1.c b/gcc/testsuite/gcc.dg/vect/vect-mask-store-move-1.c index 1e06b588c0ff..700adf9e1d41 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-mask-store-move-1.c +++ b/gcc/testsuite/gcc.dg/vect/vect-mask-store-move-1.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-do compile } */ /* { dg-additional-options "-mavx2" { target { i?86-*-* x86_64-*-* } } } */ @@ -16,3 +17,4 @@ void foo (int n) } /* { dg-final { scan-tree-dump-times "Move stmt to created bb" 4 "vect" { target { i?86-*-* x86_64-*-* } xfail { i?86-*-* x86_64-*-* } } } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc index 9de897d05a5f..0e7e223f22a3 100644 --- a/gcc/tree-vect-loop-manip.cc +++ b/gcc/tree-vect-loop-manip.cc @@ -3271,6 +3271,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, adjust_vec_debug_stmts (); scev_reset (); } + basic_block bb_before_epilog = NULL; if (epilog_peeling) { @@ -3290,6 +3291,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, epilog->force_vectorize = false; slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false); + bb_before_epilog = loop_preheader_edge (epilog)->src; /* Scalar version loop may be preferred. In this case, add guard and skip to epilog. Note this only happens when the number of @@ -3317,6 +3319,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, /* Simply propagate profile info from guard_bb to guard_to which is a merge point of control flow. */ + profile_count old_count = guard_to->count; guard_to->count = guard_bb->count; /* Restore the counts of the epilog loop if we didn't use the scalar loop. */ @@ -3332,9 +3335,15 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, free (bbs); free (original_bbs); } - } + else + scale_loop_profile (epilog, guard_to->count.probability_in (old_count), -1); - basic_block bb_before_epilog = loop_preheader_edge (epilog)->src; + /* Only need to handle basic block before epilog loop if it's not + the guard_bb, which is the case when skip_vector is true. */ + if (guard_bb != bb_before_epilog) + bb_before_epilog->count = single_pred_edge (bb_before_epilog)->count (); + bb_before_epilog = loop_preheader_edge (epilog)->src; + } /* If loop is peeled for non-zero constant times, now niters refers to orig_niters - prolog_peeling, it won't overflow even the orig_niters overflows. */