]> git.ipfire.org Git - thirdparty/gcc.git/commitdiff
Fix profile update after cancelled loop distribution
authorJan Hubicka <jh@suse.cz>
Wed, 2 Aug 2023 07:25:12 +0000 (09:25 +0200)
committerJan Hubicka <jh@suse.cz>
Wed, 2 Aug 2023 07:25:12 +0000 (09:25 +0200)
Loop distribution and ifcvt introduce versions of loops which may be removed
later if vectorization fails.  Ifcvt does this by temporarily breaking the profile
and producing a conditional that has two arms with 100% probability because we
know one of the versions will be removed.

Loop distribution is trickier, since it introduces a test for alignment that
either survives to the final code if vectorization succeeds or is turned off if it
fails.

Here we need to assign some reasonable probabilities for the case where vectorization
goes well, so this code adds logic to scale the profile back in case we remove the
call.

This is not perfect since we drop precise BB counts to guessed.  It is not a big
deal since we do not rely much on BB counts after this point.  Another
option would be to apply the scale only if vectorization succeeds, which however
needs a bit more work on the tree-loop-distribution side and would need all code in
this patch with the small change that fold_loop_internal_call will have to know how
to adjust if the conditional stays. I decided to go for the easier solution for now.

Bootstrapped/regtested x86_64-linux, committed.

gcc/ChangeLog:

* cfg.cc (scale_strictly_dominated_blocks): New function.
* cfg.h (scale_strictly_dominated_blocks): Declare.
* tree-cfg.cc (fold_loop_internal_call): Fixup CFG profile.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/pr98308.c: Check that profile is consistent.

gcc/cfg.cc
gcc/cfg.h
gcc/testsuite/gcc.dg/vect/pr98308.c
gcc/tree-cfg.cc

index 0de6d6b9e71d030227fce6ace2e8984219ee3dad..9eb9916f61aab63bbbeefe9a2e7134717fac8aec 100644 (file)
@@ -1195,3 +1195,27 @@ get_loop_copy (class loop *loop)
   else
     return NULL;
 }
+
+/* Scales the frequencies of all basic blocks that are strictly
+   dominated by BB by NUM/DEN.  */
+
+void
+scale_strictly_dominated_blocks (basic_block bb,
+                                profile_count num, profile_count den)
+{
+  basic_block son;
+
+  if (!den.nonzero_p () && !(num == profile_count::zero ()))
+    return;
+  auto_vec <basic_block, 8> worklist;
+  worklist.safe_push (bb);
+
+  while (!worklist.is_empty ())
+    for (son = first_dom_son (CDI_DOMINATORS, worklist.pop ());
+        son;
+        son = next_dom_son (CDI_DOMINATORS, son))
+      {
+       son->count = son->count.apply_scale (num, den);
+       worklist.safe_push (son);
+      }
+}
index 4bf4263ebfc593f98cb76523e1950b51ed0a22dc..a0e944979c8743c2b90912d82f963d11d60cdeca 100644 (file)
--- a/gcc/cfg.h
+++ b/gcc/cfg.h
@@ -127,6 +127,8 @@ extern void set_bb_copy (basic_block, basic_block);
 extern basic_block get_bb_copy (basic_block);
 void set_loop_copy (class loop *, class loop *);
 class loop *get_loop_copy (class loop *);
+void scale_strictly_dominated_blocks (basic_block,
+                                     profile_count, profile_count);
 
 /* Generic RAII class to allocate a bit from storage of integer type T.
    The allocated bit is accessible as mask with the single bit set
index 7d717b1ee51e8068b5e8877345b90caa8bc6920a..aeec9771c55334b4abfa23192c0a7904563869b8 100644 (file)
@@ -1,6 +1,7 @@
 /* { dg-do compile } */
 /* { dg-additional-options "-O3" } */
 /* { dg-additional-options "-march=skylake-avx512" { target avx512f } } */
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 
 extern unsigned long long int arr_86[];
 extern unsigned long long int arr_87[][15];
@@ -14,3 +15,4 @@ void test(_Bool a, unsigned short c[][15], unsigned char d[])
        arr_87[h][0] = a ? c[h][i] : 0;
       }
 }
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
index c65af8cc8003d1930d4dbccc11669fde18d2cdd2..c158454946c623876213dd1a990330b5270d9e90 100644 (file)
@@ -7703,6 +7703,44 @@ fold_loop_internal_call (gimple *g, tree value)
       FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
        SET_USE (use_p, value);
       update_stmt (use_stmt);
+      /* If we turn conditional to constant, scale profile counts.
+        We know that the conditional was created by loop distribution
+        and all basic blocks dominated by the taken edge are part of
+        the loop distributed.  */
+      if (gimple_code (use_stmt) == GIMPLE_COND)
+       {
+         edge true_edge, false_edge;
+         extract_true_false_edges_from_block (gimple_bb (use_stmt),
+                                              &true_edge, &false_edge);
+         edge taken_edge = NULL, other_edge = NULL;
+         if (gimple_cond_true_p (as_a <gcond *>(use_stmt)))
+           {
+             taken_edge = true_edge;
+             other_edge = false_edge;
+           }
+         else if (gimple_cond_false_p (as_a <gcond *>(use_stmt)))
+           {
+             taken_edge = false_edge;
+             other_edge = true_edge;
+           }
+         if (taken_edge
+             && !(taken_edge->probability == profile_probability::always ()))
+           {
+             profile_count old_count = taken_edge->count ();
+             profile_count new_count = taken_edge->src->count;
+             taken_edge->probability = profile_probability::always ();
+             other_edge->probability = profile_probability::never ();
+             /* If we have multiple predecessors, we can't use the dominance
+                test.  This should not happen as the guarded code should
+                start with pre-header.  */
+             gcc_assert (single_pred_edge (taken_edge->dest));
+             taken_edge->dest->count
+               = taken_edge->dest->count.apply_scale (new_count,
+                                                      old_count);
+             scale_strictly_dominated_blocks (taken_edge->dest,
+                                              new_count, old_count);
+           }
+       }
     }
 }