]> git.ipfire.org Git - thirdparty/gcc.git/commitdiff
Improve static and AFDO profile combination
authorJan Hubicka <hubicka@ucw.cz>
Tue, 17 Jun 2025 15:26:18 +0000 (17:26 +0200)
committerJan Hubicka <hubicka@ucw.cz>
Tue, 17 Jun 2025 15:26:18 +0000 (17:26 +0200)
This patch makes afdo_adjust_guessed_profile more agressive on finding scales
on the boundaries of connected components with no annotation.  Originaly I
looked for edges into or out of the component with known AFDO counts and I also
haled edges from basic block with known AFDO count and known static probability
estimate.

Common problem is with components not containing any in edges, but only out
edges (i.e.  those with ENTRY_BLOCK).  In this case I added logic that looks
for edges out of the component to BBs with known AFDO count.  If all flow to
the BB is either from the component or has AFDO count, we can deterine scale
precisely.  It may happen that there are edges from other components. In this
case we know upper bound and use it, since it is better than nothing.

I also noticed that some components have 0 count in all profile and then scaling
gives up, which is fixed.  I also optimized the code a bit by replacing
map holding current component with an array holding component ID and broke out
saling logic into separate functions.

The patch fixes perl regression I introduced in last change.
according to
https://lnt.opensuse.org/db_default/v4/SPEC/67674
there were improvements (percentage is runtime change):

538.imagick_r  -32.52%
549.fotonik3d_r -22.68%
520.omnetpp_r  -12.37%
503.bwaves_r  -8.71%
508.namd_r  -5.10%
526.blender_r  -2.11%

and regressions:

554.roms_r  45.95%
527.cam4_r  21.69%
511.povray_r  13.59%
500.perlbench_r 10.19%
507.cactuBSSN_r 9.81%
510.parest_r  9.69%
548.exchange2_r 8.42%
502.gcc_r  5.10%
544.nab_r  3.76%
519.lbm_r  2.34%
541.leela_r  2.16%
525.x264_r 2.14%

This is a bit wild, but hope things will settle donw once we chase out
obvious problems (such as losing the profile of functions that has not been
inlined).

gcc/ChangeLog:

* auto-profile.cc (afdo_indirect_call): Compute speculative edge
probability.
(add_scale): Break out from ...
(scale_bbs): Break out from ...
(afdo_adjust_guessed_profile): ... here; use componet array instead of
current_component hash_map; handle components with only 0 profile;
be more agressive on finding scales along the boundary.

gcc/auto-profile.cc

index 8918d1e9f09dc09eb3b4937ed2ba9021d0adf9a2..3272cbec9b07be1789aae5365ea31cc90166551c 100644 (file)
@@ -1076,10 +1076,10 @@ afdo_indirect_call (gimple_stmt_iterator *gsi, const icall_target_map &map,
       fprintf (dump_file, "\n");
     }
 
-  /* FIXME: Count should be initialized.  */
   struct cgraph_edge *new_edge
-      = indirect_edge->make_speculative (direct_call,
-                                        profile_count::uninitialized ());
+      = indirect_edge->make_speculative
+               (direct_call,
+                gimple_bb (stmt)->count.apply_scale (99, 100));
   cgraph_edge::redirect_call_stmt_to_callee (new_edge);
   gimple_remove_histogram_value (cfun, stmt, hist);
   inline_call (new_edge, true, NULL, NULL, false);
@@ -1549,6 +1549,60 @@ cmp (const void *a, const void *b)
   return 0;
 }
 
+/* Add scale ORIG/ANNOTATED to SCALES.  */
+
+static void
+add_scale (vec <sreal> *scales, profile_count annotated, profile_count orig)
+{
+  if (dump_file)
+    {
+      orig.dump (dump_file);
+      fprintf (dump_file, " should be ");
+      annotated.dump (dump_file);
+      fprintf (dump_file, "\n");
+    }
+  if (orig.nonzero_p ())
+    {
+      sreal scale
+       = annotated.guessed_local ()
+               .to_sreal_scale (orig);
+      if (dump_file)
+       fprintf (dump_file, "    adding scale %.16f\n",
+                scale.to_double ());
+      scales->safe_push (scale);
+    }
+}
+
+/* Scale counts of all basic blocks in BBS by SCALE and convert them to
+   IPA quality.  */
+
+static void
+scale_bbs (const vec <basic_block> &bbs, sreal scale)
+{
+  if (dump_file)
+    fprintf (dump_file, "  Scaling by %.16f\n", scale.to_double ());
+  for (basic_block b : bbs)
+    if (!(b->count == profile_count::zero ())
+       && b->count.initialized_p ())
+      {
+       profile_count o = b->count;
+       b->count = b->count.force_guessed () * scale;
+
+       /* If we scaled to 0, make it auto-fdo since that is treated
+          less agressively.  */
+       if (!b->count.nonzero_p () && o.nonzero_p ())
+         b->count = profile_count::zero ().afdo ();
+       if (dump_file)
+         {
+           fprintf (dump_file, "    bb %i count updated ", b->index);
+           o.dump (dump_file);
+           fprintf (dump_file, " -> ");
+           b->count.dump (dump_file);
+           fprintf (dump_file, "\n");
+         }
+      }
+}
+
 /* In case given basic block was fully optimized out, AutoFDO
    will have no data about it.  In this case try to preserve static profile.
    Identify connected components (in undirected form of CFG) which has
@@ -1558,26 +1612,33 @@ cmp (const void *a, const void *b)
 void
 afdo_adjust_guessed_profile (bb_set *annotated_bb)
 {
-  auto_sbitmap visited (last_basic_block_for_fn (cfun));
   /* Basic blocks of connected component currently processed.  */
-  auto_vec <basic_block, 20> bbs (n_basic_blocks_for_fn (cfun) + 1);
+  auto_vec <basic_block, 20> bbs (n_basic_blocks_for_fn (cfun));
   /* Scale factors found.  */
-  auto_vec <sreal, 20> scales (n_basic_blocks_for_fn (cfun) + 1);
-  auto_vec <basic_block, 20> stack (n_basic_blocks_for_fn (cfun) + 1);
-
-  bitmap_clear (visited);
+  auto_vec <sreal, 20> scales;
+  auto_vec <basic_block, 20> stack (n_basic_blocks_for_fn (cfun));
 
   basic_block seed_bb;
-  FOR_BB_BETWEEN (seed_bb, ENTRY_BLOCK_PTR_FOR_FN (cfun),
-                 EXIT_BLOCK_PTR_FOR_FN (cfun), next_bb)
-   if (!is_bb_annotated (seed_bb, *annotated_bb)
-       && bitmap_set_bit (visited, seed_bb->index))
+  unsigned int component_id = 1;
+
+  /* Map from basic block to its component.
+     0   is used for univisited BBs,
+     1   means that BB is annotated,
+     >=2 is an id of the component BB belongs to.  */
+  auto_vec <unsigned int, 20> component;
+  component.safe_grow (last_basic_block_for_fn (cfun));
+  FOR_ALL_BB_FN (seed_bb, cfun)
+    component[seed_bb->index]
+       = is_bb_annotated (seed_bb, *annotated_bb) ? 1 : 0;
+  FOR_ALL_BB_FN (seed_bb, cfun)
+   if (!component[seed_bb->index])
      {
-       hash_set <basic_block> current_component;
-
        stack.quick_push (seed_bb);
+       component_id++;
        bbs.truncate (0);
        scales.truncate (0);
+       component[seed_bb->index] = component_id;
+       profile_count max_count = profile_count::zero ();
 
        /* Identify connected component starting in BB.  */
        if (dump_file)
@@ -1588,19 +1649,33 @@ afdo_adjust_guessed_profile (bb_set *annotated_bb)
           basic_block b = stack.pop ();
 
           bbs.quick_push (b);
-          current_component.add (b);
+          max_count = max_count.max (b->count);
 
           for (edge e: b->preds)
-            if (!is_bb_annotated (e->src, *annotated_bb)
-                && bitmap_set_bit (visited, e->src->index))
-               stack.quick_push (e->src);
+            if (!component[e->src->index])
+              {
+                 stack.quick_push (e->src);
+                 component[e->src->index] = component_id;
+              }
           for (edge e: b->succs)
-            if (!is_bb_annotated (e->dest, *annotated_bb)
-                && bitmap_set_bit (visited, e->dest->index))
-               stack.quick_push (e->dest);
+            if (!component[e->dest->index])
+              {
+                 stack.quick_push (e->dest);
+                 component[e->dest->index] = component_id;
+              }
         }
        while (!stack.is_empty ());
 
+       /* If all blocks in components has 0 count, we do not need
+         to scale, only we must convert to IPA quality.  */
+       if (!max_count.nonzero_p ())
+        {
+          if (dump_file)
+            fprintf (dump_file, "  All counts are 0; scale = 1\n");
+          scale_bbs (bbs, 1);
+          continue;
+        }
+
        /* Now visit the component and try to figure out its desired
          frequency.  */
        for (basic_block b : bbs)
@@ -1653,84 +1728,83 @@ afdo_adjust_guessed_profile (bb_set *annotated_bb)
               }
             else
               {
-                gcc_checking_assert (current_component.contains (e->src));
                 current_component_count += e->count ();
+                gcc_checking_assert (component[e->src->index] == component_id);
               }
           if (boundary && current_component_count.initialized_p ())
             {
-              profile_count in_count = b->count - current_component_count;
               if (dump_file)
-                {
-                  fprintf (dump_file, "    bb %i in count ", b->index);
-                  in_count.dump (dump_file);
-                  fprintf (dump_file, " should be ");
-                  annotated_count.dump (dump_file);
-                  fprintf (dump_file, "\n");
-                }
-              if (in_count.nonzero_p ())
-                {
-                  sreal scale
-                    = annotated_count.guessed_local ()
-                            .to_sreal_scale (in_count);
-                  if (dump_file)
-                    fprintf (dump_file, "    adding scale %.16f\n",
-                             scale.to_double ());
-                  scales.safe_push (scale);
-                }
+                fprintf (dump_file, "    bb %i in count ", b->index);
+              add_scale (&scales,
+                         annotated_count,
+                         b->count - current_component_count);
             }
           for (edge e: b->succs)
             if (AFDO_EINFO (e)->is_annotated ())
               {
-                profile_count out_count = e->count ();
-                profile_count annotated_count = AFDO_EINFO (e)->get_count ();
                 if (dump_file)
-                  {
-                    fprintf (dump_file, "    edge %i->%i count ",
-                             b->index, e->dest->index);
-                    out_count.dump (dump_file);
-                    fprintf (dump_file, " should be ");
-                    annotated_count.dump (dump_file);
-                    fprintf (dump_file, "\n");
-                  }
-                if (out_count.nonzero_p ())
-                  {
-                    sreal scale
-                      = annotated_count.guessed_local ()
-                              .to_sreal_scale (out_count);
-                    if (dump_file)
-                      fprintf (dump_file, "    adding scale %.16f\n",
-                               scale.to_double ());
-                    scales.safe_push (scale);
-                  }
+                  fprintf (dump_file, "    edge %i->%i count ",
+                           b->index, e->dest->index);
+                add_scale (&scales, AFDO_EINFO (e)->get_count (), e->count ());
+              }
+            else if (is_bb_annotated (e->dest, *annotated_bb))
+              {
+                profile_count annotated_count = e->dest->count;
+                profile_count out_count = profile_count::zero ();
+                bool ok = true;
+                for (edge e2: e->dest->preds)
+                  if (AFDO_EINFO (e2)->is_annotated ())
+                    annotated_count -= AFDO_EINFO (e2)->get_count ();
+                  else if (component[e->src->index] == component_id)
+                    out_count += e->count ();
+                  else if (e->probability.nonzero_p ())
+                    {
+                      ok = false;
+                      break;
+                    }
+                if (!ok)
+                  continue;
+                if (dump_file)
+                  fprintf (dump_file,
+                           "    edge %i->%i has annotated sucessor; count ",
+                           b->index, e->dest->index);
+                add_scale (&scales, annotated_count, e->count ());
               }
 
         }
+
+       /* If we failed to find annotated entry or exit edge,
+         look for exit edges and scale profile so the dest
+         BB get all flow it needs.  This is inprecise because
+         the edge is not annotated and thus BB has more than
+         one such predecessor.  */
+       if (!scales.length ())
+        for (basic_block b : bbs)
+          if (b->count.nonzero_p ())
+            for (edge e: b->succs)
+              if (is_bb_annotated (e->dest, *annotated_bb))
+                {
+                  profile_count annotated_count = e->dest->count;
+                  for (edge e2: e->dest->preds)
+                    if (AFDO_EINFO (e2)->is_annotated ())
+                      annotated_count -= AFDO_EINFO (e2)->get_count ();
+                  if (dump_file)
+                    fprintf (dump_file,
+                             "    edge %i->%i has annotated sucessor;"
+                             " upper bound count ",
+                             b->index, e->dest->index);
+                  add_scale (&scales, annotated_count, e->count ());
+                }
        if (!scales.length ())
-        continue;
+        {
+          if (dump_file)
+            fprintf (dump_file,
+                     "  Can not determine count from the boundary; giving up");
+          continue;
+        }
+       gcc_checking_assert (scales.length ());
        scales.qsort (cmp);
-       sreal scale = scales[scales.length () / 2];
-       if (dump_file)
-        fprintf (dump_file, "  Scaling by %.16f\n", scale.to_double ());
-       for (basic_block b : bbs)
-        if (!(b->count == profile_count::zero ())
-            && b->count.initialized_p ())
-          {
-            profile_count o = b->count;
-            b->count = b->count.force_guessed () * scale;
-
-            /* If we scaled to 0, make it auto-fdo since that is treated
-               less agressively.  */
-            if (!b->count.nonzero_p () && o.nonzero_p ())
-              b->count = profile_count::zero ().afdo ();
-            if (dump_file)
-              {
-                fprintf (dump_file, "    bb %i count updated ", b->index);
-                o.dump (dump_file);
-                fprintf (dump_file, " -> ");
-                b->count.dump (dump_file);
-                fprintf (dump_file, "\n");
-              }
-          }
+       scale_bbs (bbs, scales[scales.length () / 2]);
      }
 }