]> git.ipfire.org Git - thirdparty/gcc.git/commitdiff
Fix profile update after peeled epilogues
authorJan Hubicka <jh@suse.cz>
Sun, 6 Aug 2023 19:23:31 +0000 (21:23 +0200)
committerJan Hubicka <jh@suse.cz>
Sun, 6 Aug 2023 19:23:31 +0000 (21:23 +0200)
Epilogue peeling expects the scalar loop to have the same number of executions as
the vector loop, which is true at the beginning of vectorization.  However, if the
epilogues are vectorized, this is no longer the case.  In this situation the
loop preheader is replaced by new guard code with a correct profile, but the
loop body is left unscaled.  This leads to a loop that exits more often than
it is entered.

This patch adds logic to scale the frequencies down and also to fix the profile
of the original preheader where necessary.

Bootstrapped/regtested x86_64-linux, committed.

gcc/ChangeLog:

* tree-vect-loop-manip.cc (vect_do_peeling): Fix profile update of peeled epilogues.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/vect-bitfield-read-1.c: Check profile consistency.
* gcc.dg/vect/vect-bitfield-read-2.c: Check profile consistency.
* gcc.dg/vect/vect-bitfield-read-3.c: Check profile consistency.
* gcc.dg/vect/vect-bitfield-read-4.c: Check profile consistency.
* gcc.dg/vect/vect-bitfield-read-5.c: Check profile consistency.
* gcc.dg/vect/vect-bitfield-read-6.c: Check profile consistency.
* gcc.dg/vect/vect-bitfield-read-7.c: Check profile consistency.
* gcc.dg/vect/vect-bitfield-write-1.c: Check profile consistency.
* gcc.dg/vect/vect-bitfield-write-2.c: Check profile consistency.
* gcc.dg/vect/vect-bitfield-write-3.c: Check profile consistency.
* gcc.dg/vect/vect-bitfield-write-4.c: Check profile consistency.
* gcc.dg/vect/vect-bitfield-write-5.c: Check profile consistency.
* gcc.dg/vect/vect-epilogues-2.c: Check profile consistency.
* gcc.dg/vect/vect-epilogues.c: Check profile consistency.
* gcc.dg/vect/vect-mask-store-move-1.c: Check profile consistency.

16 files changed:
gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c
gcc/testsuite/gcc.dg/vect/vect-bitfield-read-2.c
gcc/testsuite/gcc.dg/vect/vect-bitfield-read-3.c
gcc/testsuite/gcc.dg/vect/vect-bitfield-read-4.c
gcc/testsuite/gcc.dg/vect/vect-bitfield-read-5.c
gcc/testsuite/gcc.dg/vect/vect-bitfield-read-6.c
gcc/testsuite/gcc.dg/vect/vect-bitfield-read-7.c
gcc/testsuite/gcc.dg/vect/vect-bitfield-write-1.c
gcc/testsuite/gcc.dg/vect/vect-bitfield-write-2.c
gcc/testsuite/gcc.dg/vect/vect-bitfield-write-3.c
gcc/testsuite/gcc.dg/vect/vect-bitfield-write-4.c
gcc/testsuite/gcc.dg/vect/vect-bitfield-write-5.c
gcc/testsuite/gcc.dg/vect/vect-epilogues-2.c
gcc/testsuite/gcc.dg/vect/vect-epilogues.c
gcc/testsuite/gcc.dg/vect/vect-mask-store-move-1.c
gcc/tree-vect-loop-manip.cc

index 42e50d9f0c8c8ba0debd7281a0d5102444ebd0b9..147c959568de047ed36f69e5d2eea3532bbc9532 100644 (file)
@@ -1,3 +1,4 @@
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-require-effective-target vect_int } */
 /* { dg-require-effective-target vect_shift } */
 
@@ -39,3 +40,4 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
index a9aeefcd72c0f8d1bdaae48c78acd8dafd9f319f..982e6a7967b0b08daf74a5c419eb50773e048514 100644 (file)
@@ -1,3 +1,4 @@
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-require-effective-target vect_shift } */
 /* { dg-require-effective-target vect_long_long } */
 
@@ -42,3 +43,4 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
index c7d0fd26bad5a561139e5f963b21faca0c34e06c..f2a43c39f50ee4f4a517e49f252610b39509e10e 100644 (file)
@@ -1,3 +1,4 @@
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-require-effective-target vect_int } */
 /* { dg-require-effective-target vect_shift } */
 
@@ -43,3 +44,4 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
index 6a3ed8c0c6fa9633e06a0986ac2b6dc95a71a36e..9f6f0220664d8b481a59b8495bf0584be04a7fdd 100644 (file)
@@ -1,3 +1,4 @@
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-require-effective-target vect_shift } */
 /* { dg-require-effective-target vect_long_long } */
 
@@ -44,3 +45,4 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
 
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
index b2889df8a0a68809b6b049fc40740bead3c33c05..662aed104cf6078fd8cd9c5987c4a0d7c5ea3eb4 100644 (file)
@@ -1,3 +1,4 @@
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-require-effective-target vect_int } */
 /* { dg-require-effective-target vect_shift } */
 
@@ -41,3 +42,4 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
index 2445f531be2591874b9803f24004d1f335d90dc4..9b315d6be86a6dc2be0f33e614254336cf7754e5 100644 (file)
@@ -1,3 +1,4 @@
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-require-effective-target vect_int } */
 /* { dg-require-effective-target vect_shift } */
 
@@ -41,3 +42,4 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
index 4b1ec8a6dab7922dda9dd0b3e41a6904e5f11759..6d1043dd9710a51d9b55806f0e91377140ba9af8 100644 (file)
@@ -1,3 +1,4 @@
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-require-effective-target vect_int } */
 /* { dg-require-effective-target vect_shift } */
 
@@ -42,3 +43,4 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
index 22e6235301417d72e1f85ecbdd96d8e498500991..7c710cf5a57a609f8f8646a1b3053cbc9759b251 100644 (file)
@@ -1,3 +1,4 @@
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-require-effective-target vect_int } */
 
 #include <stdarg.h>
@@ -38,3 +39,4 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
 
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
index 0c8291c9363d0de4c09f81525015b7b88004bc94..3b609183c5410672a2118d237d6b27dc7bf11c58 100644 (file)
@@ -1,3 +1,4 @@
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-require-effective-target vect_int } */
 /* { dg-require-effective-target vect_long_long } */
 
@@ -42,3 +43,4 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
 
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
index 46fcb02b2f1b6bb2689a6b709901584605cc9a45..e96da82c214a03c4b14379be27a52eec6b3ef243 100644 (file)
@@ -1,3 +1,4 @@
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-require-effective-target vect_int } */
 /* { dg-require-effective-target vect_long_long } */
 
@@ -43,3 +44,4 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
 
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
index 5a7227a93e4665cd10ee564c8b15165dc6cef303..66442213c9f69d534b91de7f0a7aa96d93d0b945 100644 (file)
@@ -1,3 +1,4 @@
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-require-effective-target vect_int } */
 
 #include <stdarg.h>
@@ -41,3 +42,4 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
 
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
index e0b36e411a4a72335d4043f0f360c2e88b667397..386de504aad82c0ca7d05f80cf5c5f8380c79bc8 100644 (file)
@@ -1,3 +1,4 @@
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-require-effective-target vect_int } */
 
 #include <stdarg.h>
@@ -41,3 +42,4 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
 
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
index b251e1f2dfd2c0490c9eb826ec9717c10eaf7b34..63c5e231f85b106111a1637a2c0e7b82bbc17c56 100644 (file)
@@ -1,3 +1,4 @@
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-do compile } */
 
 int
@@ -55,3 +56,4 @@ f6 (int *x, int a)
   x[a] += 1;
   return res;
 }
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
index ab7e8a1a75991545205069a3134250009befdd0c..11b8c83b7baefb54e710c766b4b50b93792304e5 100644 (file)
@@ -1,3 +1,4 @@
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-do compile } */
 
 /* Copied from PR 88915.  */
@@ -17,3 +18,4 @@ void pixel_avg( unsigned char *dst, int i_dst_stride,
  }
 
 /* { dg-final { scan-tree-dump "LOOP EPILOGUE VECTORIZED" "vect" { target vect_multiple_sizes xfail { { arm32 && be } || vect_partial_vectors_usage_2 } } } } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
index 1e06b588c0ffa3e30ea156391ca3425ddf1d9256..700adf9e1d41a5ef4807197be30da1e0783016fb 100644 (file)
@@ -1,3 +1,4 @@
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-do compile } */
 /* { dg-additional-options "-mavx2" { target { i?86-*-* x86_64-*-* } } } */
 
@@ -16,3 +17,4 @@ void foo (int n)
 }
 
 /* { dg-final { scan-tree-dump-times "Move stmt to created bb" 4 "vect" { target { i?86-*-* x86_64-*-* } xfail { i?86-*-* x86_64-*-* } } } } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
index 9de897d05a5f98cde36f7c8c7720b0ff21183dee..0e7e223f22a3ea3055caf830a0b74898b54b54ae 100644 (file)
@@ -3271,6 +3271,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
       adjust_vec_debug_stmts ();
       scev_reset ();
     }
+  basic_block bb_before_epilog = NULL;
 
   if (epilog_peeling)
     {
@@ -3290,6 +3291,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 
       epilog->force_vectorize = false;
       slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
+      bb_before_epilog = loop_preheader_edge (epilog)->src;
 
       /* Scalar version loop may be preferred.  In this case, add guard
         and skip to epilog.  Note this only happens when the number of
@@ -3317,6 +3319,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 
          /* Simply propagate profile info from guard_bb to guard_to which is
             a merge point of control flow.  */
+         profile_count old_count = guard_to->count;
          guard_to->count = guard_bb->count;
 
          /* Restore the counts of the epilog loop if we didn't use the scalar loop. */
@@ -3332,9 +3335,15 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
              free (bbs);
              free (original_bbs);
            }
-       }
+         else
+           scale_loop_profile (epilog, guard_to->count.probability_in (old_count), -1);
 
-      basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
+         /* Only need to handle basic block before epilog loop if it's not
+            the guard_bb, which is the case when skip_vector is true.  */
+         if (guard_bb != bb_before_epilog)
+           bb_before_epilog->count = single_pred_edge (bb_before_epilog)->count ();
+         bb_before_epilog = loop_preheader_edge (epilog)->src;
+       }
       /* If loop is peeled for non-zero constant times, now niters refers to
         orig_niters - prolog_peeling, it won't overflow even the orig_niters
         overflows.  */