]> git.ipfire.org Git - thirdparty/gcc.git/commitdiff
Implement afdo inliner
authorJan Hubicka <hubicka@ucw.cz>
Wed, 18 Jun 2025 10:10:25 +0000 (12:10 +0200)
committerJan Hubicka <hubicka@ucw.cz>
Sat, 21 Jun 2025 03:37:02 +0000 (05:37 +0200)
This patch moves afdo inlining from early inliner into specialized one.
The reason is that early inliner is by design non-recursive while afdo
inliner needs to recurse.  In the past Google handled it by increasing
early inliner iterations, but it can be done easily and cheaply without
that by simply recursing into inlined functions.

I will also look into moving VPT to early inliner now.

Bootstrapped/regtested x86_64-linux, committed.

gcc/ChangeLog:

* auto-profile.cc (get_inline_stack): Add fn parameter.
* ipa-inline.cc (want_early_inline_function_p): Do not care
about AFDO.
(inline_functions_by_afdo): New function.
(early_inliner): Use it.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-prof/afdo-vpt-earlyinline.c: Update template.
* gcc.dg/tree-prof/indir-call-prof-2.c: Likewise.
* gcc.dg/tree-prof/afdo-inline.c: New test.

gcc/auto-profile.cc
gcc/ipa-inline.cc
gcc/testsuite/gcc.dg/tree-prof/afdo-inline.c [new file with mode: 0644]
gcc/testsuite/gcc.dg/tree-prof/afdo-vpt-earlyinline.c
gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c

index 3272cbec9b07be1789aae5365ea31cc90166551c..07580f8cc9987bf599aad677ceb9ff8c833bb9ef 100644 (file)
@@ -386,7 +386,8 @@ get_function_decl_from_block (tree block)
 /* Store inline stack for STMT in STACK.  */
 
 static void
-get_inline_stack (location_t locus, inline_stack *stack)
+get_inline_stack (location_t locus, inline_stack *stack,
+                 tree fn = current_function_decl)
 {
   if (LOCATION_LOCUS (locus) == UNKNOWN_LOCATION)
     return;
@@ -408,9 +409,7 @@ get_inline_stack (location_t locus, inline_stack *stack)
           locus = tmp_locus;
         }
     }
-  stack->safe_push (
-      std::make_pair (current_function_decl,
-                      get_combined_location (locus, current_function_decl)));
+  stack->safe_push (std::make_pair (fn, get_combined_location (locus, fn)));
 }
 
 /* Return STMT's combined location, which is a 32bit integer in which
@@ -822,7 +821,19 @@ autofdo_source_profile::get_callsite_total_count (
 {
   inline_stack stack;
   stack.safe_push (std::make_pair (edge->callee->decl, 0));
-  get_inline_stack (gimple_location (edge->call_stmt), &stack);
+
+  cgraph_edge *e = edge;
+  do
+    {
+      get_inline_stack (gimple_location (e->call_stmt), &stack,
+                       e->caller->decl);
+      /* If caller is inlined, continue building stack.  */
+      if (!e->caller->inlined_to)
+       e = NULL;
+      else
+       e = e->caller->callers;
+    }
+  while (e);
 
   function_instance *s = get_function_instance_by_inline_stack (stack);
   if (s == NULL
index 35e5496d8463a601f7dd3e0a772d5e2fa8a1ce3d..c4ea37820913cd63db1747b6c4805001c5a5e553 100644 (file)
@@ -782,14 +782,6 @@ want_early_inline_function_p (struct cgraph_edge *e)
 
   if (DECL_DISREGARD_INLINE_LIMITS (callee->decl))
     ;
-  /* For AutoFDO, we need to make sure that before profile summary, all
-     hot paths' IR look exactly the same as profiled binary. As a result,
-     in einliner, we will disregard size limit and inline those callsites
-     that are:
-       * inlined in the profiled binary, and
-       * the cloned callee has enough samples to be considered "hot".  */
-  else if (flag_auto_profile && afdo_callsite_hot_enough_for_early_inline (e))
-    ;
   else if (!DECL_DECLARED_INLINE_P (callee->decl)
           && !opt_for_fn (e->caller->decl, flag_inline_small_functions))
     {
@@ -3117,6 +3109,81 @@ early_inline_small_functions (struct cgraph_node *node)
   return inlined;
 }
 
+/* With auto-fdo inline all functions that was inlined in the train run
+   and inlining seems useful.  That is there are enough samples in the callee
+   function.
+
+   Unlike early inlining, we inline recursively.
+   TODO: We should also integrate VPT.  */
+
+static bool
+inline_functions_by_afdo (struct cgraph_node *node)
+{
+  if (!flag_auto_profile)
+    return false;
+  struct cgraph_edge *e;
+  bool inlined = false;
+
+  for (e = node->callees; e; e = e->next_callee)
+    {
+      struct cgraph_node *callee = e->callee->ultimate_alias_target ();
+
+      if (!e->inline_failed)
+       {
+         inlined |= inline_functions_by_afdo (e->callee);
+         continue;
+       }
+      if (!afdo_callsite_hot_enough_for_early_inline (e))
+       continue;
+
+      if (callee->definition
+         && !ipa_fn_summaries->get (callee))
+       compute_fn_summary (callee, true);
+
+      if (!can_early_inline_edge_p (e))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, e->call_stmt,
+                            "Not inlining %C -> %C using auto-profile, %s.",
+                            e->caller, e->callee,
+                            cgraph_inline_failed_string (e->inline_failed));
+         continue;
+       }
+      /* We can handle recursive inlining by first producing
+        inline clone.  */
+      if (e->recursive_p ())
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, e->call_stmt,
+                            "Not inlining %C recursively"
+                            " using auto-profile.\n",
+                            e->callee);
+         continue;
+       }
+
+      if (dump_enabled_p ())
+       {
+         if (e->caller->inlined_to)
+           dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, e->call_stmt,
+                            "Inlining using auto-profile %C into %C "
+                            "which is transitively inlined to %C.\n",
+                            callee, e->caller, e->caller->inlined_to);
+         else
+           dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, e->call_stmt,
+                            "Inlining using auto-profile %C into %C.\n",
+                            callee, e->caller);
+       }
+      inline_call (e, true, NULL, NULL, false);
+      inlined |= inline_functions_by_afdo (e->callee);
+      inlined = true;
+    }
+
+  if (inlined && !node->inlined_to)
+    ipa_update_overall_fn_summary (node);
+
+  return inlined;
+}
+
 unsigned int
 early_inliner (function *fun)
 {
@@ -3192,9 +3259,12 @@ early_inliner (function *fun)
       /* We iterate incremental inlining to get trivial cases of indirect
         inlining.  */
       while (iterations < opt_for_fn (node->decl,
-                                     param_early_inliner_max_iterations)
-            && early_inline_small_functions (node))
+                                     param_early_inliner_max_iterations))
        {
+         bool inlined = early_inline_small_functions (node);
+         inlined |= inline_functions_by_afdo (node);
+         if (!inlined)
+           break;
          timevar_push (TV_INTEGRATION);
          todo |= optimize_inline_calls (current_function_decl);
 
diff --git a/gcc/testsuite/gcc.dg/tree-prof/afdo-inline.c b/gcc/testsuite/gcc.dg/tree-prof/afdo-inline.c
new file mode 100644 (file)
index 0000000..b67b3cb
--- /dev/null
@@ -0,0 +1,27 @@
+/* { dg-options "-O2 -fdump-tree-einline-details --param early-inlining-insns=1" } */
+/* { dg-require-profiling "-fauto-profile" } */ 
+volatile int a[1000];
+int reta (int i)
+{
+       if (a[i])
+               __builtin_printf ("It is one\n");
+       if (a[i] == 2)
+               __builtin_printf ("It is two\n");
+       return a[i];
+}
+int test ()
+{
+       int s = 0;
+       for (int pos = 0; pos < 1000; pos++)
+         reta(pos);
+       if (s)
+               __builtin_printf ("sum error\n");
+}
+int main()
+{
+       for (int i = 0; i < 10000; i++)
+               test();
+       return 0;
+}
+/* { dg-final-use-autofdo { scan-tree-dump "Inlining using auto-profile test" "einline"} } */
+/* { dg-final-use-autofdo { scan-tree-dump "Inlining using auto-profile reta.*transitively inlined to main" "einline"} } */
index 3b51ea9f8a9db9f18ed35d882498a3ef6f4faa83..48a404942db7060fc181e124c1ada0a4d08a2b32 100644 (file)
@@ -1,4 +1,4 @@
-/* { dg-options "-O2 -fdump-ipa-afdo-details -fdump-tree-einline-details" } */
+/* { dg-options "-O2 -fdump-ipa-afdo-details -fdump-tree-einline-details --param early-inlining-insns=1" } */
 /* { dg-require-profiling "-fauto-profile" } */ 
 
 volatile int array[1000];
@@ -25,8 +25,8 @@ int main()
                test(&p);
        return 0;
 }
-/* { dg-final-use-autofdo { scan-tree-dump "Inlining test" "einline"} } */
+/* { dg-final-use-autofdo { scan-tree-dump "Inlining using auto-profile test" "einline"} } */
 /* { dg-final-use-autofdo { scan-ipa-dump "Checking indirect call -> direct call reta" "afdo"} } */
-/* { dg-final-use-autofdo { scan-ipa-dump "looks good" "afdo"} } */
+/* { dg-final-use-autofdo { scan-ipa-dump-times "looks good" 0 "afdo"} } */
 /* If we inlined reta->test->main, it will contian array[pos].  */
 /* { dg-final-use-autofdo { scan-ipa-dump "array.pos_" "afdo"} } */
index 1d64d9f3f62276bbe48d51d2cd14253956550ade..53cc753cab53c3e710eaa5d82a029ed752ee8adb 100644 (file)
@@ -31,5 +31,5 @@ main (void)
 }
 /* { dg-final-use-not-autofdo { scan-ipa-dump "Indirect call -> direct call.* add1 .will resolve by ipa-profile" "profile"} } */
 /* { dg-final-use-not-autofdo { scan-ipa-dump "Indirect call -> direct call.* sub1 .will resolve by ipa-profile" "profile"} } */
-/* { dg-final-use-autofdo { scan-ipa-dump "Inlining add1/1 into main/4." "afdo"} } */
-/* { dg-final-use-autofdo { scan-ipa-dump "Inlining sub1/2 into main/4." "afdo"} } */
+/* { dg-final-use-autofdo { scan-ipa-dump "Inlining add1/. into main/" "afdo"} } */
+/* { dg-final-use-autofdo { scan-ipa-dump "Inlining sub1/. into main/" "afdo"} } */